From 94f4ef901dbf4a6af3fdf0eb7b6bcdda3dc50b8f Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Fri, 26 Dec 2025 12:43:17 -0600 Subject: [PATCH 01/10] Add ArcSinhTransformer, TextFeatures, and GeoDistanceTransformer New transformers: - ArcSinhTransformer: Pseudo-log transformation for positive and negative values (addresses #707) - TextFeatures: Extract 19 text features from string columns (new text module) - GeoDistanceTransformer: Calculate geographic distances using Haversine/Euclidean/Manhattan (addresses #688) Includes: - Full test coverage (43 tests) - RST documentation for all transformers - Code style compliance (flake8, black, isort) --- .../creation/GeoDistanceTransformer.rst | 169 +++++++ docs/user_guide/creation/index.rst | 1 + docs/user_guide/index.rst | 1 + docs/user_guide/text/TextFeatures.rst | 152 ++++++ docs/user_guide/text/index.rst | 18 + .../transformation/ArcSinhTransformer.rst | 121 +++++ docs/user_guide/transformation/index.rst | 1 + feature_engine/creation/__init__.py | 5 +- feature_engine/creation/geo_features.py | 432 ++++++++++++++++++ feature_engine/text/__init__.py | 9 + feature_engine/text/text_features.py | 327 +++++++++++++ feature_engine/transformation/__init__.py | 5 +- feature_engine/transformation/arcsinh.py | 229 ++++++++++ tests/test_creation/test_geo_features.py | 266 +++++++++++ tests/test_text/__init__.py | 167 +++++++ tests/test_transformation/test_arcsinh.py | 122 +++++ 16 files changed, 2023 insertions(+), 2 deletions(-) create mode 100644 docs/user_guide/creation/GeoDistanceTransformer.rst create mode 100644 docs/user_guide/text/TextFeatures.rst create mode 100644 docs/user_guide/text/index.rst create mode 100644 docs/user_guide/transformation/ArcSinhTransformer.rst create mode 100644 feature_engine/creation/geo_features.py create mode 100644 feature_engine/text/__init__.py create mode 100644 feature_engine/text/text_features.py create mode 100644 feature_engine/transformation/arcsinh.py create mode 100644 tests/test_creation/test_geo_features.py create mode 100644 tests/test_text/__init__.py create mode 100644 tests/test_transformation/test_arcsinh.py diff --git a/docs/user_guide/creation/GeoDistanceTransformer.rst b/docs/user_guide/creation/GeoDistanceTransformer.rst new file mode 100644 index 000000000..625daf00c --- /dev/null +++ b/docs/user_guide/creation/GeoDistanceTransformer.rst @@ -0,0 +1,169 @@ +.. _geo_distance_transformer: + +.. currentmodule:: feature_engine.creation + +GeoDistanceTransformer +====================== + +The :class:`GeoDistanceTransformer()` calculates the distance between two geographical +coordinate pairs (latitude/longitude) and adds the result as a new feature. + +This transformer is useful for location-based machine learning problems such as +real estate pricing, delivery route optimization, ride-sharing applications, +and any domain where geographic proximity is relevant. + +Distance Methods +~~~~~~~~~~~~~~~~ + +The transformer supports different distance calculation methods: + +- **haversine**: Great-circle distance using the Haversine formula (default). + Most accurate for typical distances on Earth's surface. +- **euclidean**: Simple Euclidean distance in the coordinate space. + Fast but less accurate for long distances. +- **manhattan**: Manhattan (taxicab) distance in coordinate space. + Useful as a rough approximation for grid-based city layouts. 
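+
+To make the default method concrete, the snippet below is a minimal sketch of
+the Haversine calculation using plain numpy (the coordinates are the New York
+to Los Angeles pair used in the example further down; the dataframe and column
+names here are illustrative only):
+
+.. code:: python
+
+    import numpy as np
+    import pandas as pd
+
+    EARTH_RADIUS_KM = 6371.0
+
+    df = pd.DataFrame({
+        "lat1": [40.7128], "lon1": [-74.0060],   # New York
+        "lat2": [34.0522], "lon2": [-118.2437],  # Los Angeles
+    })
+
+    # convert degrees to radians
+    lat1, lon1, lat2, lon2 = (
+        np.radians(df[c]) for c in ["lat1", "lon1", "lat2", "lon2"]
+    )
+
+    # haversine term and great-circle distance: d = 2R * arcsin(sqrt(a))
+    a = (
+        np.sin((lat2 - lat1) / 2) ** 2
+        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
+    )
+    distance_km = 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(a))
+
+    print(distance_km.iloc[0])  # ~3935.7 km, matching the first trip below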
+ +Output Units +~~~~~~~~~~~~ + +The distance can be output in various units: + +- **km**: Kilometers (default) +- **miles**: Miles +- **meters**: Meters +- **feet**: Feet + +Example +~~~~~~~ + +Let's create a dataframe with origin and destination coordinates: + +.. code:: python + + import pandas as pd + from feature_engine.creation import GeoDistanceTransformer + + # Sample data: trips between US cities + X = pd.DataFrame({ + 'origin_lat': [40.7128, 34.0522, 41.8781, 29.7604], + 'origin_lon': [-74.0060, -118.2437, -87.6298, -95.3698], + 'dest_lat': [34.0522, 41.8781, 40.7128, 33.4484], + 'dest_lon': [-118.2437, -87.6298, -74.0060, -112.0740], + 'trip_id': [1, 2, 3, 4] + }) + +Now let's calculate the distances: + +.. code:: python + + # Set up the transformer + gdt = GeoDistanceTransformer( + lat1='origin_lat', + lon1='origin_lon', + lat2='dest_lat', + lon2='dest_lon', + method='haversine', + output_unit='km', + output_col='distance_km' + ) + + # Fit and transform + gdt.fit(X) + X_transformed = gdt.transform(X) + + print(X_transformed[['trip_id', 'distance_km']]) + +Output: + +.. code:: python + + trip_id distance_km + 0 1 3935.746254 + 1 2 2808.517344 + 2 3 1144.286561 + 3 4 1634.724892 + +Using different distance methods +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + # Euclidean distance (faster but less accurate) + gdt_euclidean = GeoDistanceTransformer( + lat1='origin_lat', lon1='origin_lon', + lat2='dest_lat', lon2='dest_lon', + method='euclidean', + output_col='distance_euclidean' + ) + + # Manhattan distance (useful for grid cities) + gdt_manhattan = GeoDistanceTransformer( + lat1='origin_lat', lon1='origin_lon', + lat2='dest_lat', lon2='dest_lon', + method='manhattan', + output_col='distance_manhattan' + ) + +Converting to miles +~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + gdt = GeoDistanceTransformer( + lat1='origin_lat', lon1='origin_lon', + lat2='dest_lat', lon2='dest_lon', + output_unit='miles', + output_col='distance_miles' + ) + + gdt.fit(X) + X_transformed = gdt.transform(X) + +Dropping original coordinate columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + gdt = GeoDistanceTransformer( + lat1='origin_lat', lon1='origin_lon', + lat2='dest_lat', lon2='dest_lon', + drop_original=True + ) + + gdt.fit(X) + X_transformed = gdt.transform(X) + + # Coordinate columns are removed + print(X_transformed.columns.tolist()) + # ['trip_id', 'geo_distance'] + +Using in a Pipeline +~~~~~~~~~~~~~~~~~~~ + +:class:`GeoDistanceTransformer()` works seamlessly with scikit-learn pipelines: + +.. code:: python + + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn.ensemble import RandomForestRegressor + + # Create a pipeline for price prediction + pipe = Pipeline([ + ('geo_distance', GeoDistanceTransformer( + lat1='origin_lat', lon1='origin_lon', + lat2='dest_lat', lon2='dest_lon', + output_unit='km', + drop_original=True + )), + ('scaler', StandardScaler()), + ('regressor', RandomForestRegressor()) + ]) + +API Reference +------------- + +.. 
autoclass:: GeoDistanceTransformer + :members: + :inherited-members: diff --git a/docs/user_guide/creation/index.rst b/docs/user_guide/creation/index.rst index c3aace734..93b094547 100644 --- a/docs/user_guide/creation/index.rst +++ b/docs/user_guide/creation/index.rst @@ -66,6 +66,7 @@ Feature creation module MathFeatures RelativeFeatures DecisionTreeFeatures + GeoDistanceTransformer Feature-engine in Practice -------------------------- diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst index c786e77e1..52c33a8f4 100644 --- a/docs/user_guide/index.rst +++ b/docs/user_guide/index.rst @@ -28,6 +28,7 @@ Creation creation/index datetime/index + text/index Selection diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst new file mode 100644 index 000000000..a1a82aa44 --- /dev/null +++ b/docs/user_guide/text/TextFeatures.rst @@ -0,0 +1,152 @@ +.. _text_features: + +.. currentmodule:: feature_engine.text + +TextFeatures +============ + +The :class:`TextFeatures()` extracts numerical features from text/string variables. +This transformer is useful for extracting basic text statistics that can be used +as features in machine learning models. + +Unlike scikit-learn's CountVectorizer or TfidfVectorizer which create sparse matrices, +:class:`TextFeatures()` extracts metadata features that remain in DataFrame format +and can be easily combined with other Feature-engine transformers in a pipeline. + +Available Features +~~~~~~~~~~~~~~~~~~ + +The transformer can extract the following features: + +- **char_count**: Number of characters in the text +- **word_count**: Number of words (whitespace-separated tokens) +- **sentence_count**: Number of sentences (based on .!? punctuation) +- **avg_word_length**: Average length of words +- **digit_count**: Number of digit characters +- **uppercase_count**: Number of uppercase letters +- **lowercase_count**: Number of lowercase letters +- **special_char_count**: Number of special characters (non-alphanumeric) +- **whitespace_count**: Number of whitespace characters +- **whitespace_ratio**: Ratio of whitespace to total characters +- **digit_ratio**: Ratio of digits to total characters +- **uppercase_ratio**: Ratio of uppercase to total characters +- **has_digits**: Binary indicator if text contains digits +- **has_uppercase**: Binary indicator if text contains uppercase +- **is_empty**: Binary indicator if text is empty +- **starts_with_uppercase**: Binary indicator if text starts with uppercase +- **ends_with_punctuation**: Binary indicator if text ends with .!? +- **unique_word_count**: Number of unique words (case-insensitive) +- **unique_word_ratio**: Ratio of unique words to total words + +Example +~~~~~~~ + +Let's create a dataframe with text data and extract features: + +.. code:: python + + import pandas as pd + from feature_engine.text import TextFeatures + + # Create sample data + X = pd.DataFrame({ + 'review': [ + 'This product is AMAZING! Best purchase ever.', + 'Not great. Would not recommend.', + 'OK for the price. 3 out of 5 stars.', + 'TERRIBLE!!! DO NOT BUY!', + ], + 'title': [ + 'Great Product', + 'Disappointed', + 'Average', + 'Awful', + ] + }) + +Now let's extract specific text features: + +.. 
code:: python + + # Set up the transformer with specific features + tf = TextFeatures( + variables=['review'], + features=['word_count', 'char_count', 'has_digits', 'uppercase_ratio'] + ) + + # Fit and transform + tf.fit(X) + X_transformed = tf.transform(X) + + print(X_transformed.columns.tolist()) + +Output: + +.. code:: python + + ['review', 'title', 'review_word_count', 'review_char_count', + 'review_has_digits', 'review_uppercase_ratio'] + +Extracting all features +~~~~~~~~~~~~~~~~~~~~~~~ + +By default, if no features are specified, all available features will be extracted: + +.. code:: python + + # Extract all features from all text columns + tf = TextFeatures() + tf.fit(X) + X_transformed = tf.transform(X) + + # This will create 19 new columns for each text variable + print(f"Original columns: {len(X.columns)}") + print(f"Transformed columns: {len(X_transformed.columns)}") + +Dropping original columns +~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can drop the original text columns after extracting features: + +.. code:: python + + tf = TextFeatures( + variables=['review'], + features=['word_count', 'char_count'], + drop_original=True + ) + + tf.fit(X) + X_transformed = tf.transform(X) + + # 'review' column is now removed + print(X_transformed.columns.tolist()) + +Using in a Pipeline +~~~~~~~~~~~~~~~~~~~ + +:class:`TextFeatures()` works seamlessly with scikit-learn pipelines: + +.. code:: python + + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn.linear_model import LogisticRegression + + # Create a pipeline + pipe = Pipeline([ + ('text_features', TextFeatures( + variables=['review'], + features=['word_count', 'char_count', 'uppercase_ratio'], + drop_original=True + )), + ('scaler', StandardScaler()), + ('classifier', LogisticRegression()) + ]) + +API Reference +------------- + +.. autoclass:: TextFeatures + :members: + :inherited-members: diff --git a/docs/user_guide/text/index.rst b/docs/user_guide/text/index.rst new file mode 100644 index 000000000..ea23d7362 --- /dev/null +++ b/docs/user_guide/text/index.rst @@ -0,0 +1,18 @@ +.. -*- mode: rst -*- + +Text Feature Extraction +======================= + +Feature-engine's text module includes transformers to extract numerical features +from text/string variables. + +Text feature extraction is useful for machine learning problems where you have +text data but want to derive numerical statistics without creating sparse +bag-of-words or TF-IDF representations. + +**Transformers** + +.. toctree:: + :maxdepth: 1 + + TextFeatures diff --git a/docs/user_guide/transformation/ArcSinhTransformer.rst b/docs/user_guide/transformation/ArcSinhTransformer.rst new file mode 100644 index 000000000..07945b463 --- /dev/null +++ b/docs/user_guide/transformation/ArcSinhTransformer.rst @@ -0,0 +1,121 @@ +.. _arcsinh_transformer: + +.. currentmodule:: feature_engine.transformation + +ArcSinhTransformer +================== + +The :class:`ArcSinhTransformer()` applies the inverse hyperbolic sine transformation +(arcsinh) to numerical variables. Also known as the pseudo-logarithm, this +transformation is useful for data that contains both positive and negative values. + +The transformation is: x → arcsinh((x - loc) / scale) + +For large |x|, arcsinh(x) behaves like ln(|x|) + ln(2), providing similar +variance-stabilizing properties as the log transformation. For small |x|, +it behaves approximately linearly (x → x). 
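+A quick numerical check makes both regimes visible. This is only a sketch that
+assumes numpy; the values are arbitrary:
+
+.. code:: python
+
+    import numpy as np
+
+    x = np.array([-10000.0, -1.0, -0.01, 0.0, 0.01, 1.0, 10000.0])
+
+    # arcsinh keeps the sign and is nearly the identity close to zero
+    print(np.arcsinh(x))
+    # roughly: [-9.90, -0.88, -0.01, 0.00, 0.01, 0.88, 9.90]
+
+    # for large magnitudes it tracks the natural log plus ln(2)
+    print(np.log(10000) + np.log(2))   # ~9.9035, close to arcsinh(10000)
+
+Values near zero pass through almost unchanged, while large magnitudes are
+compressed onto a log-like scale.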
This makes it ideal for variables +like net worth, profit/loss, or any metric that can be positive or negative. + +Unlike the :class:`LogTransformer()`, the :class:`ArcSinhTransformer()` can handle +zero and negative values without requiring any preprocessing. + +Example +~~~~~~~ + +Let's create a dataframe with positive and negative values and apply the arcsinh +transformation: + +.. code:: python + + import numpy as np + import pandas as pd + import matplotlib.pyplot as plt + from sklearn.model_selection import train_test_split + + from feature_engine.transformation import ArcSinhTransformer + + # Create sample data with positive and negative values + np.random.seed(42) + X = pd.DataFrame({ + 'profit': np.random.randn(1000) * 10000, # Values from -30000 to 30000 + 'net_worth': np.random.randn(1000) * 50000, + }) + + # Separate into train and test + X_train, X_test = train_test_split(X, test_size=0.3, random_state=0) + +Now let's set up the ArcSinhTransformer: + +.. code:: python + + # Set up the arcsinh transformer + tf = ArcSinhTransformer(variables=['profit', 'net_worth']) + + # Fit the transformer + tf.fit(X_train) + +The transformer does not learn any parameters when applying the fit method. It does +check however that the variables are numerical. + +We can now transform the variables: + +.. code:: python + + # Transform the data + train_t = tf.transform(X_train) + test_t = tf.transform(X_test) + +The arcsinh transformation compresses extreme values while preserving the sign: + +.. code:: python + + # Compare original and transformed distributions + fig, axes = plt.subplots(1, 2, figsize=(12, 4)) + + X_train['profit'].hist(ax=axes[0], bins=50) + axes[0].set_title('Original profit') + + train_t['profit'].hist(ax=axes[1], bins=50) + axes[1].set_title('Transformed profit') + + plt.tight_layout() + +Using loc and scale parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The :class:`ArcSinhTransformer()` supports location and scale parameters to +center and normalize data before transformation: + +.. code:: python + + # Center around mean and scale by std + tf = ArcSinhTransformer( + variables=['profit'], + loc=X_train['profit'].mean(), + scale=X_train['profit'].std() + ) + + tf.fit(X_train) + train_t = tf.transform(X_train) + +Inverse transformation +~~~~~~~~~~~~~~~~~~~~~~ + +The :class:`ArcSinhTransformer()` supports inverse transformation to recover +the original values: + +.. code:: python + + # Transform and then inverse transform + train_t = tf.transform(X_train) + train_recovered = tf.inverse_transform(train_t) + + # Values should match original + np.allclose(X_train['profit'], train_recovered['profit']) + +API Reference +------------- + +.. autoclass:: ArcSinhTransformer + :members: + :inherited-members: diff --git a/docs/user_guide/transformation/index.rst b/docs/user_guide/transformation/index.rst index 85422c9f6..00ce20bfb 100644 --- a/docs/user_guide/transformation/index.rst +++ b/docs/user_guide/transformation/index.rst @@ -33,6 +33,7 @@ on the nature of the variable. 
LogCpTransformer ReciprocalTransformer ArcsinTransformer + ArcSinhTransformer PowerTransformer BoxCoxTransformer YeoJohnsonTransformer diff --git a/feature_engine/creation/__init__.py b/feature_engine/creation/__init__.py index df7dab5a7..b3c84ba15 100644 --- a/feature_engine/creation/__init__.py +++ b/feature_engine/creation/__init__.py @@ -4,12 +4,15 @@ """ from .cyclical_features import CyclicalFeatures from .decision_tree_features import DecisionTreeFeatures +from .geo_features import GeoDistanceTransformer from .math_features import MathFeatures from .relative_features import RelativeFeatures __all__ = [ + "CyclicalFeatures", "DecisionTreeFeatures", + "GeoDistanceTransformer", "MathFeatures", "RelativeFeatures", - "CyclicalFeatures", ] + diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py new file mode 100644 index 000000000..1488106e9 --- /dev/null +++ b/feature_engine/creation/geo_features.py @@ -0,0 +1,432 @@ +# Authors: Ankit Hemant Lade (contributor) +# License: BSD 3 clause + +from typing import List, Literal, Optional + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted + +from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin +from feature_engine._check_init_parameters.check_init_input_params import ( + _check_param_drop_original, +) +from feature_engine.dataframe_checks import ( + _check_contains_na, + _check_X_matches_training_df, + check_X, +) +from feature_engine.tags import _return_tags +from feature_engine.variable_handling import check_numerical_variables + +# Earth's radius in different units +EARTH_RADIUS = { + "km": 6371.0, + "miles": 3958.8, + "meters": 6371000.0, + "feet": 20902231.0, +} + + +class GeoDistanceTransformer(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): + """ + GeoDistanceTransformer() calculates the distance between two geographical + coordinate pairs (latitude/longitude) and adds the result as a new feature. + + This transformer is useful for location-based machine learning problems such as + real estate pricing, delivery route optimization, ride-sharing applications, + and any domain where geographic proximity is relevant. + + The transformer supports different distance calculation methods: + + - 'haversine': Great-circle distance using the Haversine formula (default). + Most accurate for typical distances on Earth's surface. + - 'euclidean': Simple Euclidean distance in the coordinate space. + Fast but less accurate for long distances. + - 'manhattan': Manhattan (taxicab) distance in coordinate space. + Useful as a rough approximation for grid-based city layouts. + + More details in the :ref:`User Guide `. + + Parameters + ---------- + lat1: str + Column name containing the latitude of the first point. + + lon1: str + Column name containing the longitude of the first point. + + lat2: str + Column name containing the latitude of the second point. + + lon2: str + Column name containing the longitude of the second point. + + method: str, default='haversine' + The distance calculation method. Options are: + - 'haversine': Great-circle distance (most accurate) + - 'euclidean': Euclidean distance in coordinate space + - 'manhattan': Manhattan distance in coordinate space + + output_unit: str, default='km' + The unit for the output distance. 
Options are: + - 'km': Kilometers + - 'miles': Miles + - 'meters': Meters + - 'feet': Feet + + output_col: str, default='geo_distance' + Name of the new column containing the calculated distances. + + drop_original: bool, default=False + Whether to drop the original coordinate columns after transformation. + + Attributes + ---------- + variables_: + List of the coordinate variables used for distance calculation. + + feature_names_in_: + List with the names of features seen during fit. + + n_features_in_: + The number of features in the train set used in fit. + + Methods + ------- + fit: + This transformer does not learn parameters. Validates input columns. + + fit_transform: + Fit to data, then transform it. + + transform: + Calculate distances and add them as a new column. + + get_feature_names_out: + Get output feature names for transformation. + + See Also + -------- + feature_engine.creation.MathFeatures : + Combines existing features using mathematical operations. + feature_engine.creation.RelativeFeatures : + Creates features relative to reference variables. + + References + ---------- + .. [1] Haversine formula: https://en.wikipedia.org/wiki/Haversine_formula + + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.creation import GeoDistanceTransformer + >>> X = pd.DataFrame({ + ... 'origin_lat': [40.7128, 34.0522, 41.8781], + ... 'origin_lon': [-74.0060, -118.2437, -87.6298], + ... 'dest_lat': [34.0522, 41.8781, 40.7128], + ... 'dest_lon': [-118.2437, -87.6298, -74.0060], + ... }) + >>> gdt = GeoDistanceTransformer( + ... lat1='origin_lat', lon1='origin_lon', + ... lat2='dest_lat', lon2='dest_lon', + ... method='haversine', output_unit='km' + ... ) + >>> gdt.fit(X) + >>> X = gdt.transform(X) + >>> X + origin_lat origin_lon dest_lat dest_lon geo_distance + 0 40.7128 -74.0060 34.0522 -118.2437 3935.746254 + 1 34.0522 -118.2437 41.8781 -87.6298 2808.517344 + 2 41.8781 -87.6298 40.7128 -74.0060 1144.286561 + """ + + def __init__( + self, + lat1: str, + lon1: str, + lat2: str, + lon2: str, + method: Literal["haversine", "euclidean", "manhattan"] = "haversine", + output_unit: Literal["km", "miles", "meters", "feet"] = "km", + output_col: str = "geo_distance", + drop_original: bool = False, + ) -> None: + + # Validate coordinate column names + for param_name, param_value in [ + ("lat1", lat1), + ("lon1", lon1), + ("lat2", lat2), + ("lon2", lon2), + ]: + if not isinstance(param_value, str): + raise ValueError( + f"{param_name} must be a string. Got {type(param_value).__name__}." + ) + + # Validate method + valid_methods = ["haversine", "euclidean", "manhattan"] + if method not in valid_methods: + raise ValueError( + f"method must be one of {valid_methods}. Got '{method}' instead." + ) + + # Validate output_unit + valid_units = ["km", "miles", "meters", "feet"] + if output_unit not in valid_units: + raise ValueError( + f"output_unit must be one of {valid_units}. " + f"Got '{output_unit}' instead." + ) + + # Validate output_col + if not isinstance(output_col, str): + raise ValueError( + f"output_col must be a string. Got {type(output_col).__name__}." + ) + + _check_param_drop_original(drop_original) + + self.lat1 = lat1 + self.lon1 = lon1 + self.lat2 = lat2 + self.lon2 = lon2 + self.method = method + self.output_unit = output_unit + self.output_col = output_col + self.drop_original = drop_original + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + This transformer does not learn parameters. 
+ + Validates that the coordinate columns exist and are numerical. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training input samples. + + y: pandas Series, or np.array. Defaults to None. + It is not needed in this transformer. You can pass y or None. + + Returns + ------- + self: GeoDistanceTransformer + The fitted transformer. + """ + + # check input dataframe + X = check_X(X) + + # Store coordinate variables + self.variables_ = [self.lat1, self.lon1, self.lat2, self.lon2] + + # Check all coordinate columns exist + missing = set(self.variables_) - set(X.columns) + if missing: + raise ValueError( + f"Coordinate columns {missing} are not present in the dataframe." + ) + + # Check coordinate columns are numerical + check_numerical_variables(X, self.variables_) + + # Check for missing values + _check_contains_na(X, self.variables_) + + # Validate coordinate ranges (optional sanity check) + for lat_col in [self.lat1, self.lat2]: + if (X[lat_col].abs() > 90).any(): + raise ValueError( + f"Latitude values in '{lat_col}' must be between -90 and 90." + ) + + for lon_col in [self.lon1, self.lon2]: + if (X[lon_col].abs() > 180).any(): + raise ValueError( + f"Longitude values in '{lon_col}' must be between -180 and 180." + ) + + # save input features + self.feature_names_in_ = X.columns.tolist() + + # save train set shape + self.n_features_in_ = X.shape[1] + + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Calculate distances and add them as a new column. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. + + Returns + ------- + X_new: Pandas dataframe + The dataframe with the new distance column added. + """ + + # Check method fit has been called + check_is_fitted(self) + + # check that input is a dataframe + X = check_X(X) + + # Check if input data contains same number of columns as dataframe used to fit. 
+ _check_X_matches_training_df(X, self.n_features_in_) + + # Check for missing values + _check_contains_na(X, self.variables_) + + # reorder variables to match train set + X = X[self.feature_names_in_] + + # Calculate distance based on method + if self.method == "haversine": + distances = self._haversine_distance( + X[self.lat1].values, + X[self.lon1].values, + X[self.lat2].values, + X[self.lon2].values, + ) + elif self.method == "euclidean": + distances = self._euclidean_distance( + X[self.lat1].values, + X[self.lon1].values, + X[self.lat2].values, + X[self.lon2].values, + ) + else: # manhattan + distances = self._manhattan_distance( + X[self.lat1].values, + X[self.lon1].values, + X[self.lat2].values, + X[self.lon2].values, + ) + + X[self.output_col] = distances + + if self.drop_original: + X = X.drop(columns=self.variables_) + + return X + + def _haversine_distance( + self, + lat1: np.ndarray, + lon1: np.ndarray, + lat2: np.ndarray, + lon2: np.ndarray, + ) -> np.ndarray: + """Calculate the great-circle distance using the Haversine formula.""" + + # Convert to radians + lat1_rad = np.radians(lat1) + lat2_rad = np.radians(lat2) + lon1_rad = np.radians(lon1) + lon2_rad = np.radians(lon2) + + # Haversine formula + dlat = lat2_rad - lat1_rad + dlon = lon2_rad - lon1_rad + + a = ( + np.sin(dlat / 2) ** 2 + + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2 + ) + c = 2 * np.arcsin(np.sqrt(a)) + + # Distance in the requested unit + distance = EARTH_RADIUS[self.output_unit] * c + + return distance + + def _euclidean_distance( + self, + lat1: np.ndarray, + lon1: np.ndarray, + lat2: np.ndarray, + lon2: np.ndarray, + ) -> np.ndarray: + """Calculate Euclidean distance in coordinate space.""" + + # Simple Euclidean distance (approximate, best for short distances) + # Convert to approximate km then to requested unit + dlat = lat2 - lat1 + dlon = lon2 - lon1 + + # Approximate degrees to km (at equator) + km_per_degree = 111.0 + distance_km = np.sqrt((dlat * km_per_degree) ** 2 + (dlon * km_per_degree) ** 2) + + # Convert to requested unit + conversion = EARTH_RADIUS[self.output_unit] / EARTH_RADIUS["km"] + return distance_km * conversion + + def _manhattan_distance( + self, + lat1: np.ndarray, + lon1: np.ndarray, + lat2: np.ndarray, + lon2: np.ndarray, + ) -> np.ndarray: + """Calculate Manhattan (taxicab) distance in coordinate space.""" + + dlat = np.abs(lat2 - lat1) + dlon = np.abs(lon2 - lon1) + + # Approximate degrees to km (at equator) + km_per_degree = 111.0 + distance_km = (dlat + dlon) * km_per_degree + + # Convert to requested unit + conversion = EARTH_RADIUS[self.output_unit] / EARTH_RADIUS["km"] + return distance_km * conversion + + def get_feature_names_out(self, input_features=None) -> List[str]: + """ + Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. If None, uses feature_names_in_. + + Returns + ------- + feature_names_out : list of str + Output feature names. 
+ """ + check_is_fitted(self) + + if self.drop_original: + feature_names = [ + f for f in self.feature_names_in_ if f not in self.variables_ + ] + else: + feature_names = list(self.feature_names_in_) + + feature_names.append(self.output_col) + + return feature_names + + def _more_tags(self): + tags_dict = _return_tags() + tags_dict["variables"] = "numerical" + # This transformer has mandatory parameters + tags_dict["_xfail_checks"][ + "check_parameters_default_constructible" + ] = "transformer has mandatory parameters" + return tags_dict + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + return tags diff --git a/feature_engine/text/__init__.py b/feature_engine/text/__init__.py new file mode 100644 index 000000000..14626b79c --- /dev/null +++ b/feature_engine/text/__init__.py @@ -0,0 +1,9 @@ +""" +The module text includes classes to extract features from text/string variables. +""" + +from .text_features import TextFeatures + +__all__ = [ + "TextFeatures", +] diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py new file mode 100644 index 000000000..c06afdf79 --- /dev/null +++ b/feature_engine/text/text_features.py @@ -0,0 +1,327 @@ +# Authors: Ankit Hemant Lade (contributor) +# License: BSD 3 clause + +from typing import List, Optional, Union + +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted + +from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin +from feature_engine._check_init_parameters.check_init_input_params import ( + _check_param_drop_original, +) +from feature_engine.dataframe_checks import _check_X_matches_training_df, check_X +from feature_engine.tags import _return_tags + +# Available text features and their computation functions +TEXT_FEATURES = { + "char_count": lambda x: x.str.len(), + "word_count": lambda x: x.str.split().str.len(), + "sentence_count": lambda x: x.str.count(r"[.!?]+"), + "avg_word_length": lambda x: x.apply( + lambda s: sum(len(w) for w in str(s).split()) / max(len(str(s).split()), 1) + ), + "digit_count": lambda x: x.str.count(r"\d"), + "uppercase_count": lambda x: x.str.count(r"[A-Z]"), + "lowercase_count": lambda x: x.str.count(r"[a-z]"), + "special_char_count": lambda x: x.str.count(r"[^a-zA-Z0-9\s]"), + "whitespace_count": lambda x: x.str.count(r"\s"), + "whitespace_ratio": lambda x: x.str.count(r"\s") / x.str.len().replace(0, 1), + "digit_ratio": lambda x: x.str.count(r"\d") / x.str.len().replace(0, 1), + "uppercase_ratio": lambda x: x.str.count(r"[A-Z]") / x.str.len().replace(0, 1), + "has_digits": lambda x: x.str.contains(r"\d", regex=True).astype(int), + "has_uppercase": lambda x: x.str.contains(r"[A-Z]", regex=True).astype(int), + "is_empty": lambda x: (x.str.len() == 0).astype(int), + "starts_with_uppercase": lambda x: x.str.match(r"^[A-Z]").astype(int), + "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int), + "unique_word_count": lambda x: x.apply(lambda s: len(set(str(s).lower().split()))), + "unique_word_ratio": lambda x: x.apply( + lambda s: len(set(str(s).lower().split())) / max(len(str(s).split()), 1) + ), +} + + +class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): + """ + TextFeatures() extracts numerical features from text/string variables. This + transformer is useful for extracting basic text statistics that can be used + as features in machine learning models. 
+ + The transformer can extract various text features including character counts, + word counts, sentence counts, and various ratios and indicators. + + A list of variables can be passed as an argument. Alternatively, the transformer + will automatically select and transform all variables of type object (string). + + More details in the :ref:`User Guide `. + + Parameters + ---------- + variables: list, default=None + The list of text/string variables to extract features from. If None, the + transformer will automatically select all object (string) columns. + + features: list, default=None + List of text features to extract. Available features are: + + - 'char_count': Number of characters in the text + - 'word_count': Number of words (whitespace-separated tokens) + - 'sentence_count': Number of sentences (based on .!? punctuation) + - 'avg_word_length': Average length of words + - 'digit_count': Number of digit characters + - 'uppercase_count': Number of uppercase letters + - 'lowercase_count': Number of lowercase letters + - 'special_char_count': Number of special characters (non-alphanumeric) + - 'whitespace_count': Number of whitespace characters + - 'whitespace_ratio': Ratio of whitespace to total characters + - 'digit_ratio': Ratio of digits to total characters + - 'uppercase_ratio': Ratio of uppercase to total characters + - 'has_digits': Binary indicator if text contains digits + - 'has_uppercase': Binary indicator if text contains uppercase + - 'is_empty': Binary indicator if text is empty + - 'starts_with_uppercase': Binary indicator if text starts with uppercase + - 'ends_with_punctuation': Binary indicator if text ends with .!? + - 'unique_word_count': Number of unique words (case-insensitive) + - 'unique_word_ratio': Ratio of unique words to total words + + If None, extracts all available features. + + drop_original: bool, default=False + Whether to drop the original text columns after transformation. + + Attributes + ---------- + variables_: + The list of text variables that will be transformed. + + features_: + The list of features that will be extracted. + + feature_names_in_: + List with the names of features seen during fit. + + n_features_in_: + The number of features in the train set used in fit. + + Methods + ------- + fit: + This transformer does not learn parameters. It stores the feature names + and validates input. + + fit_transform: + Fit to data, then transform it. + + transform: + Extract text features and add them to the dataframe. + + get_feature_names_out: + Get output feature names for transformation. + + See Also + -------- + feature_engine.encoding.StringSimilarityEncoder : + Encodes categorical variables based on string similarity. + + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.text import TextFeatures + >>> X = pd.DataFrame({ + ... 'text': ['Hello World!', 'Python is GREAT.', 'ML rocks 123'] + ... }) + >>> tf = TextFeatures(features=['char_count', 'word_count', 'has_digits']) + >>> tf.fit(X) + >>> X = tf.transform(X) + >>> X + text text_char_count text_word_count text_has_digits + 0 Hello World! 12 2 0 + 1 Python is GREAT. 
16 3 0 + 2 ML rocks 123 12 3 1 + """ + + def __init__( + self, + variables: Union[None, str, List[str]] = None, + features: Union[None, List[str]] = None, + drop_original: bool = False, + ) -> None: + + # Validate variables + if variables is not None: + if isinstance(variables, str): + variables = [variables] + elif not isinstance(variables, list) or not all( + isinstance(v, str) for v in variables + ): + raise ValueError( + "variables must be None, a string, or a list of strings. " + f"Got {type(variables).__name__} instead." + ) + + # Validate features + if features is not None: + if not isinstance(features, list) or not all( + isinstance(f, str) for f in features + ): + raise ValueError( + "features must be None or a list of strings. " + f"Got {type(features).__name__} instead." + ) + invalid_features = set(features) - set(TEXT_FEATURES.keys()) + if invalid_features: + raise ValueError( + f"Invalid features: {invalid_features}. " + f"Available features are: {list(TEXT_FEATURES.keys())}" + ) + + _check_param_drop_original(drop_original) + + self.variables = variables + self.features = features + self.drop_original = drop_original + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + This transformer does not learn parameters. + + Stores feature names and validates that the specified variables are + present and are of string/object type. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training input samples. + + y: pandas Series, or np.array. Defaults to None. + It is not needed in this transformer. You can pass y or None. + + Returns + ------- + self: TextFeatures + The fitted transformer. + """ + + # check input dataframe + X = check_X(X) + + # Find or validate text variables + if self.variables is None: + # Select object/string columns + self.variables_ = [col for col in X.columns if X[col].dtype == "object"] + if len(self.variables_) == 0: + raise ValueError( + "No object/string columns found in the dataframe. " + "Please specify variables explicitly." + ) + else: + # Validate user-specified variables exist + missing = set(self.variables) - set(X.columns) + if missing: + raise ValueError( + f"Variables {missing} are not present in the dataframe." + ) + self.variables_ = self.variables + + # Set features to extract + if self.features is None: + self.features_ = list(TEXT_FEATURES.keys()) + else: + self.features_ = self.features + + # save input features + self.feature_names_in_ = X.columns.tolist() + + # save train set shape + self.n_features_in_ = X.shape[1] + + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Extract text features and add them to the dataframe. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. + + Returns + ------- + X_new: Pandas dataframe + The dataframe with the original columns plus the new text features. + """ + + # Check method fit has been called + check_is_fitted(self) + + # check that input is a dataframe + X = check_X(X) + + # Check if input data contains same number of columns as dataframe used to fit. 
+ _check_X_matches_training_df(X, self.n_features_in_) + + # reorder variables to match train set + X = X[self.feature_names_in_] + + # Extract features for each text variable + for var in self.variables_: + # Fill NaN with empty string for feature extraction + text_col = X[var].fillna("") + + for feature_name in self.features_: + new_col_name = f"{var}_{feature_name}" + feature_func = TEXT_FEATURES[feature_name] + X[new_col_name] = feature_func(text_col) + + # Fill any NaN values resulting from computation with 0 + X[new_col_name] = X[new_col_name].fillna(0) + + if self.drop_original: + X = X.drop(columns=self.variables_) + + return X + + def get_feature_names_out(self, input_features=None) -> List[str]: + """ + Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. If None, uses feature_names_in_. + + Returns + ------- + feature_names_out : list of str + Output feature names. + """ + check_is_fitted(self) + + # Start with original features + if self.drop_original: + feature_names = [ + f for f in self.feature_names_in_ if f not in self.variables_ + ] + else: + feature_names = list(self.feature_names_in_) + + # Add new text feature names + for var in self.variables_: + for feature_name in self.features_: + feature_names.append(f"{var}_{feature_name}") + + return feature_names + + def _more_tags(self): + tags_dict = _return_tags() + tags_dict["allow_nan"] = True + tags_dict["variables"] = "categorical" + return tags_dict + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags diff --git a/feature_engine/transformation/__init__.py b/feature_engine/transformation/__init__.py index 15011ac4b..f60dbd72b 100644 --- a/feature_engine/transformation/__init__.py +++ b/feature_engine/transformation/__init__.py @@ -4,6 +4,7 @@ """ from .arcsin import ArcsinTransformer +from .arcsinh import ArcSinhTransformer from .boxcox import BoxCoxTransformer from .log import LogCpTransformer, LogTransformer from .power import PowerTransformer @@ -11,11 +12,13 @@ from .yeojohnson import YeoJohnsonTransformer __all__ = [ + "ArcsinTransformer", + "ArcSinhTransformer", "BoxCoxTransformer", "LogTransformer", "LogCpTransformer", "PowerTransformer", "ReciprocalTransformer", "YeoJohnsonTransformer", - "ArcsinTransformer", ] + diff --git a/feature_engine/transformation/arcsinh.py b/feature_engine/transformation/arcsinh.py new file mode 100644 index 000000000..98a44478e --- /dev/null +++ b/feature_engine/transformation/arcsinh.py @@ -0,0 +1,229 @@ +# Authors: Ankit Hemant Lade (contributor) +# License: BSD 3 clause + +from typing import List, Optional, Union + +import numpy as np +import pandas as pd + +from feature_engine._base_transformers.base_numerical import BaseNumericalTransformer +from feature_engine._check_init_parameters.check_variables import ( + _check_variables_input_value, +) +from feature_engine._docstrings.fit_attributes import ( + _feature_names_in_docstring, + _n_features_in_docstring, + _variables_attribute_docstring, +) +from feature_engine._docstrings.init_parameters.all_trasnformers import ( + _variables_numerical_docstring, +) +from feature_engine._docstrings.methods import ( + _fit_not_learn_docstring, + _fit_transform_docstring, + _inverse_transform_docstring, +) +from feature_engine._docstrings.substitute import Substitution +from feature_engine.tags import _return_tags + + +@Substitution( + variables=_variables_numerical_docstring, + 
variables_=_variables_attribute_docstring, + feature_names_in_=_feature_names_in_docstring, + n_features_in_=_n_features_in_docstring, + fit=_fit_not_learn_docstring, + fit_transform=_fit_transform_docstring, + inverse_transform=_inverse_transform_docstring, +) +class ArcSinhTransformer(BaseNumericalTransformer): + """ + The ArcSinhTransformer() applies the inverse hyperbolic sine transformation + (arcsinh) to numerical variables. Also known as the pseudo-logarithm, this + transformation is useful for data that contains both positive and negative values. + + The transformation is: x → arcsinh((x - loc) / scale) + + For large |x|, arcsinh(x) behaves like ln(|x|) + ln(2), providing similar + variance-stabilizing properties as the log transformation. For small |x|, + it behaves approximately linearly (x → x). This makes it ideal for variables + like net worth, profit/loss, or any metric that can be positive or negative. + + A list of variables can be passed as an argument. Alternatively, the transformer + will automatically select and transform all variables of type numeric. + + More details in the :ref:`User Guide `. + + Parameters + ---------- + {variables} + + loc: float, default=0.0 + Location parameter for shifting the data before transformation. + The transformation becomes: arcsinh((x - loc) / scale) + + scale: float, default=1.0 + Scale parameter for normalizing the data before transformation. + Must be greater than 0. The transformation becomes: arcsinh((x - loc) / scale) + + Attributes + ---------- + {variables_} + + {feature_names_in_} + + {n_features_in_} + + Methods + ------- + {fit} + + {fit_transform} + + {inverse_transform} + + transform: + Transform the variables using the arcsinh function. + + See Also + -------- + feature_engine.transformation.LogTransformer : + Applies log transformation (only for positive values). + feature_engine.transformation.YeoJohnsonTransformer : + Applies Yeo-Johnson transformation. + + References + ---------- + .. [1] Burbidge, J. B., Magee, L., & Robb, A. L. (1988). Alternative + transformations to handle extreme values of the dependent variable. + Journal of the American Statistical Association, 83(401), 123-127. + + Examples + -------- + + >>> import numpy as np + >>> import pandas as pd + >>> from feature_engine.transformation import ArcSinhTransformer + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = np.random.randn(100) * 1000)) + >>> ast = ArcSinhTransformer() + >>> ast.fit(X) + >>> X = ast.transform(X) + >>> X.head() + x + 0 7.516076 + 1 -6.330816 + 2 7.780254 + 3 8.825252 + 4 -6.995893 + """ + + def __init__( + self, + variables: Union[None, int, str, List[Union[str, int]]] = None, + loc: float = 0.0, + scale: float = 1.0, + ) -> None: + + if not isinstance(loc, (int, float)): + raise ValueError( + f"loc must be a number (int or float). " + f"Got {type(loc).__name__} instead." + ) + + if not isinstance(scale, (int, float)) or scale <= 0: + raise ValueError( + f"scale must be a positive number (> 0). Got {scale} instead." + ) + + self.variables = _check_variables_input_value(variables) + self.loc = float(loc) + self.scale = float(scale) + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + This transformer does not learn parameters. + + Selects the numerical variables and stores feature names. + + Parameters + ---------- + X: Pandas DataFrame of shape = [n_samples, n_features]. + The training input samples. Can be the entire dataframe, not just the + variables to transform. 
+ + y: pandas Series, default=None + It is not needed in this transformer. You can pass y or None. + + Returns + ------- + self: ArcSinhTransformer + The fitted transformer. + """ + + # check input dataframe and find/check numerical variables + X = super().fit(X) + + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Transform the variables using the arcsinh function. + + Parameters + ---------- + X: Pandas DataFrame of shape = [n_samples, n_features] + The data to be transformed. + + Returns + ------- + X_new: pandas dataframe + The dataframe with the transformed variables. + """ + + # check input dataframe and if class was fitted + X = self._check_transform_input_and_state(X) + + # Ensure float dtype for the transformation + X[self.variables_] = X[self.variables_].astype(float) + + # Apply arcsinh transformation: arcsinh((x - loc) / scale) + X.loc[:, self.variables_] = np.arcsinh( + (X.loc[:, self.variables_] - self.loc) / self.scale + ) + + return X + + def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Convert the data back to the original representation. + + Parameters + ---------- + X: Pandas DataFrame of shape = [n_samples, n_features] + The data to be inverse transformed. + + Returns + ------- + X_tr: pandas dataframe + The dataframe with the inverse transformed variables. + """ + + # check input dataframe and if class was fitted + X = self._check_transform_input_and_state(X) + + # Inverse transform: x = sinh(y) * scale + loc + X.loc[:, self.variables_] = ( + np.sinh(X.loc[:, self.variables_]) * self.scale + self.loc + ) + + return X + + def _more_tags(self): + tags_dict = _return_tags() + tags_dict["variables"] = "numerical" + return tags_dict + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + return tags diff --git a/tests/test_creation/test_geo_features.py b/tests/test_creation/test_geo_features.py new file mode 100644 index 000000000..b6bad01d4 --- /dev/null +++ b/tests/test_creation/test_geo_features.py @@ -0,0 +1,266 @@ +import numpy as np +import pandas as pd +import pytest + +from feature_engine.creation import GeoDistanceTransformer + + +class TestGeoDistanceTransformer: + """Test cases for GeoDistanceTransformer.""" + + def test_haversine_distance_default(self): + """Test Haversine distance calculation with default parameters.""" + # New York to Los Angeles + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + X_tr = transformer.fit_transform(X) + + # Distance should be approximately 3935-3944 km + assert "geo_distance" in X_tr.columns + assert 3900 < X_tr["geo_distance"].iloc[0] < 4000 + + def test_haversine_distance_miles(self): + """Test Haversine distance in miles.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" + ) + X_tr = transformer.fit_transform(X) + + # Distance should be approximately 2445 miles + assert 2400 < X_tr["geo_distance"].iloc[0] < 2500 + + def test_same_location_zero_distance(self): + """Test that same location returns zero distance.""" + X = pd.DataFrame( + { + "lat1": [40.7128, 34.0522], + "lon1": [-74.0060, -118.2437], + "lat2": [40.7128, 34.0522], + "lon2": [-74.0060, -118.2437], + } + ) + transformer = GeoDistanceTransformer( + 
lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + X_tr = transformer.fit_transform(X) + + np.testing.assert_array_almost_equal( + X_tr["geo_distance"].values, [0.0, 0.0], decimal=10 + ) + + def test_euclidean_method(self): + """Test Euclidean distance method.""" + X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="euclidean" + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["geo_distance"].iloc[0] > 0 + + def test_manhattan_method(self): + """Test Manhattan distance method.""" + X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="manhattan" + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["geo_distance"].iloc[0] > 0 + + def test_custom_output_column_name(self): + """Test custom output column name.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_col="distance_km" + ) + X_tr = transformer.fit_transform(X) + + assert "distance_km" in X_tr.columns + assert "geo_distance" not in X_tr.columns + + def test_drop_original_columns(self): + """Test drop_original parameter.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + "other": [1], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", drop_original=True + ) + X_tr = transformer.fit_transform(X) + + assert "lat1" not in X_tr.columns + assert "lon1" not in X_tr.columns + assert "lat2" not in X_tr.columns + assert "lon2" not in X_tr.columns + assert "geo_distance" in X_tr.columns + assert "other" in X_tr.columns + + def test_multiple_rows(self): + """Test with multiple rows.""" + X = pd.DataFrame( + { + "origin_lat": [40.7128, 34.0522, 41.8781], + "origin_lon": [-74.0060, -118.2437, -87.6298], + "dest_lat": [34.0522, 41.8781, 40.7128], + "dest_lon": [-118.2437, -87.6298, -74.0060], + } + ) + transformer = GeoDistanceTransformer( + lat1="origin_lat", lon1="origin_lon", lat2="dest_lat", lon2="dest_lon" + ) + X_tr = transformer.fit_transform(X) + + assert len(X_tr["geo_distance"]) == 3 + # All distances should be positive + assert all(X_tr["geo_distance"] > 0) + + def test_invalid_method_raises_error(self): + """Test that invalid method raises ValueError.""" + with pytest.raises(ValueError, match="method must be one of"): + GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="invalid" + ) + + def test_invalid_output_unit_raises_error(self): + """Test that invalid output_unit raises ValueError.""" + with pytest.raises(ValueError, match="output_unit must be one of"): + GeoDistanceTransformer( + lat1="lat1", + lon1="lon1", + lat2="lat2", + lon2="lon2", + output_unit="invalid", + ) + + def test_missing_columns_raises_error(self): + """Test that missing columns raise ValueError on fit.""" + X = pd.DataFrame({"lat1": [1], "lon1": [1]}) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) + + def test_invalid_latitude_range_raises_error(self): + """Test that latitude out of range raises ValueError.""" + X = pd.DataFrame( + { + 
"lat1": [100], # Invalid: outside -90 to 90 + "lon1": [0], + "lat2": [0], + "lon2": [0], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + with pytest.raises(ValueError, match="Latitude values.*must be between"): + transformer.fit(X) + + def test_invalid_longitude_range_raises_error(self): + """Test that longitude out of range raises ValueError.""" + X = pd.DataFrame( + { + "lat1": [0], + "lon1": [200], # Invalid: outside -180 to 180 + "lat2": [0], + "lon2": [0], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + with pytest.raises(ValueError, match="Longitude values.*must be between"): + transformer.fit(X) + + def test_fit_stores_attributes(self): + """Test that fit stores expected attributes.""" + X = pd.DataFrame( + {"lat1": [40.0], "lon1": [-74.0], "lat2": [34.0], "lon2": [-118.0]} + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + assert set(transformer.variables_) == {"lat1", "lon1", "lat2", "lon2"} + + def test_get_feature_names_out(self): + """Test get_feature_names_out returns correct names.""" + X = pd.DataFrame( + { + "lat1": [40.0], + "lon1": [-74.0], + "lat2": [34.0], + "lon2": [-118.0], + "other": [1], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "geo_distance" in feature_names + assert len(feature_names) == 6 # 5 original + 1 new + + def test_output_units_conversion(self): + """Test different output units give consistent results.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + + # Get distance in km and miles + transformer_km = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="km" + ) + transformer_miles = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" + ) + + dist_km = transformer_km.fit_transform(X.copy())["geo_distance"].iloc[0] + dist_miles = transformer_miles.fit_transform(X.copy())["geo_distance"].iloc[0] + + # 1 km ≈ 0.621371 miles + expected_miles = dist_km * 0.621371 + np.testing.assert_almost_equal(dist_miles, expected_miles, decimal=0) diff --git a/tests/test_text/__init__.py b/tests/test_text/__init__.py new file mode 100644 index 000000000..f4e9de3ea --- /dev/null +++ b/tests/test_text/__init__.py @@ -0,0 +1,167 @@ +import pandas as pd +import pytest + +from feature_engine.text import TextFeatures + + +class TestTextFeatures: + """Test cases for TextFeatures transformer.""" + + def test_default_all_features(self): + """Test extracting all features with default parameters.""" + X = pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]}) + transformer = TextFeatures() + X_tr = transformer.fit_transform(X) + + # Check that new columns were added + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" in X_tr.columns + + # Check char_count + assert X_tr["text_char_count"].tolist() == [12, 10, 2] + + # Check word_count + assert X_tr["text_word_count"].tolist() == [2, 2, 1] + + # Check digit_count + assert X_tr["text_digit_count"].tolist() == [0, 3, 0] + + def 
test_specific_features(self): + """Test extracting specific features only.""" + X = pd.DataFrame({"text": ["Hello", "World"]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + X_tr = transformer.fit_transform(X) + + # Check only specified features are extracted + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" not in X_tr.columns + assert "text_uppercase_count" not in X_tr.columns + + def test_specific_variables(self): + """Test extracting features from specific variables only.""" + X = pd.DataFrame( + {"text1": ["Hello", "World"], "text2": ["Foo", "Bar"], "numeric": [1, 2]} + ) + transformer = TextFeatures(variables=["text1"], features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # Only text1 should have features extracted + assert "text1_char_count" in X_tr.columns + assert "text2_char_count" not in X_tr.columns + + def test_drop_original(self): + """Test drop_original parameter.""" + X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + X_tr = transformer.fit_transform(X) + + assert "text" not in X_tr.columns + assert "text_char_count" in X_tr.columns + assert "other" in X_tr.columns + + def test_empty_string_handling(self): + """Test handling of empty strings.""" + X = pd.DataFrame({"text": ["", "Hello", ""]}) + transformer = TextFeatures(features=["char_count", "word_count", "is_empty"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_char_count"].tolist() == [0, 5, 0] + assert X_tr["text_is_empty"].tolist() == [1, 0, 1] + + def test_nan_handling(self): + """Test handling of NaN values.""" + X = pd.DataFrame({"text": ["Hello", None, "World"]}) + transformer = TextFeatures(features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # NaN should be filled with empty string, resulting in char_count of 0 + assert X_tr["text_char_count"].tolist() == [5, 0, 5] + + def test_uppercase_features(self): + """Test uppercase-related features.""" + X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]}) + transformer = TextFeatures( + features=["uppercase_count", "has_uppercase", "starts_with_uppercase"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3] + assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1] + assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1] + + def test_sentence_count(self): + """Test sentence counting.""" + X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! 
C."]}) + transformer = TextFeatures(features=["sentence_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] + + def test_unique_word_features(self): + """Test unique word features.""" + X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) + transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] + assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0] + + def test_invalid_feature_raises_error(self): + """Test that invalid feature name raises ValueError.""" + with pytest.raises(ValueError, match="Invalid features"): + TextFeatures(features=["invalid_feature"]) + + def test_invalid_variables_raises_error(self): + """Test that invalid variables parameter raises ValueError.""" + with pytest.raises(ValueError, match="variables must be"): + TextFeatures(variables=123) + + def test_missing_variable_raises_error(self): + """Test that missing variable raises ValueError on fit.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures(variables=["nonexistent"]) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) + + def test_no_text_columns_raises_error(self): + """Test that no text columns raises error when variables=None.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = TextFeatures() + with pytest.raises(ValueError, match="No object/string columns found"): + transformer.fit(X) + + def test_fit_stores_attributes(self): + """Test that fit stores expected attributes.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures() + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "features_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + + def test_get_feature_names_out(self): + """Test get_feature_names_out returns correct names.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names + assert "text_word_count" in feature_names + + def test_get_feature_names_out_with_drop(self): + """Test get_feature_names_out with drop_original=True.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" not in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names diff --git a/tests/test_transformation/test_arcsinh.py b/tests/test_transformation/test_arcsinh.py new file mode 100644 index 000000000..32478255f --- /dev/null +++ b/tests/test_transformation/test_arcsinh.py @@ -0,0 +1,122 @@ +import numpy as np +import pandas as pd +import pytest + +from feature_engine.transformation import ArcSinhTransformer + + +class TestArcSinhTransformer: + """Test cases for ArcSinhTransformer.""" + + def test_default_parameters(self): + """Test transformer with default parameters.""" + X = pd.DataFrame({"a": [-100, -10, 0, 10, 100], "b": [1, 2, 3, 4, 5]}) + transformer = ArcSinhTransformer() + X_tr = transformer.fit_transform(X) + + # Check transform was applied + expected_a = 
np.arcsinh(X["a"]) + expected_b = np.arcsinh(X["b"]) + np.testing.assert_array_almost_equal(X_tr["a"], expected_a) + np.testing.assert_array_almost_equal(X_tr["b"], expected_b) + + def test_with_loc_and_scale(self): + """Test transformer with loc and scale parameters.""" + X = pd.DataFrame({"a": [10, 20, 30, 40, 50]}) + loc = 30.0 + scale = 10.0 + transformer = ArcSinhTransformer(loc=loc, scale=scale) + X_tr = transformer.fit_transform(X) + + expected = np.arcsinh((X["a"] - loc) / scale) + np.testing.assert_array_almost_equal(X_tr["a"], expected) + + def test_specific_variables(self): + """Test transformer with specific variables selected.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + transformer = ArcSinhTransformer(variables=["a", "b"]) + X_tr = transformer.fit_transform(X) + + # Check only specified variables were transformed + np.testing.assert_array_almost_equal(X_tr["a"], np.arcsinh(X["a"])) + np.testing.assert_array_almost_equal(X_tr["b"], np.arcsinh(X["b"])) + # c should be unchanged + np.testing.assert_array_equal(X_tr["c"], X["c"]) + + def test_inverse_transform(self): + """Test inverse_transform returns original values.""" + X = pd.DataFrame({"a": [-100, -10, 0, 10, 100], "b": [1, 2, 3, 4, 5]}) + X_original = X.copy() + transformer = ArcSinhTransformer() + X_tr = transformer.fit_transform(X.copy()) + X_inv = transformer.inverse_transform(X_tr) + + np.testing.assert_array_almost_equal(X_inv["a"], X_original["a"], decimal=10) + np.testing.assert_array_almost_equal(X_inv["b"], X_original["b"], decimal=10) + + def test_inverse_transform_with_loc_scale(self): + """Test inverse_transform with loc and scale parameters.""" + X = pd.DataFrame({"a": [10, 20, 30, 40, 50]}) + X_original = X.copy() + transformer = ArcSinhTransformer(loc=25.0, scale=5.0) + X_tr = transformer.fit_transform(X.copy()) + X_inv = transformer.inverse_transform(X_tr) + + np.testing.assert_array_almost_equal(X_inv["a"], X_original["a"], decimal=10) + + def test_negative_values(self): + """Test that transformer handles negative values correctly.""" + X = pd.DataFrame({"a": [-1000, -500, 0, 500, 1000]}) + transformer = ArcSinhTransformer() + X_tr = transformer.fit_transform(X) + + # arcsinh should handle negative values + assert X_tr["a"].iloc[0] < 0 + assert X_tr["a"].iloc[1] < 0 + assert X_tr["a"].iloc[2] == 0 + assert X_tr["a"].iloc[3] > 0 + assert X_tr["a"].iloc[4] > 0 + + def test_invalid_scale_raises_error(self): + """Test that invalid scale parameter raises ValueError.""" + with pytest.raises(ValueError, match="scale must be a positive number"): + ArcSinhTransformer(scale=0) + + with pytest.raises(ValueError, match="scale must be a positive number"): + ArcSinhTransformer(scale=-1) + + def test_invalid_loc_raises_error(self): + """Test that invalid loc parameter raises ValueError.""" + with pytest.raises(ValueError, match="loc must be a number"): + ArcSinhTransformer(loc="invalid") + + def test_fit_stores_attributes(self): + """Test that fit stores expected attributes.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = ArcSinhTransformer() + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + assert transformer.n_features_in_ == 2 + assert set(transformer.variables_) == {"a", "b"} + + def test_get_feature_names_out(self): + """Test get_feature_names_out returns correct feature names.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer 
= ArcSinhTransformer() + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert feature_names == ["a", "b"] + + def test_behavior_like_log_for_large_values(self): + """Test that arcsinh behaves like log for large positive values.""" + X = pd.DataFrame({"a": [1000, 10000, 100000]}) + transformer = ArcSinhTransformer() + X_tr = transformer.fit_transform(X.copy()) + + # For large x: arcsinh(x) ≈ ln(2x) = ln(2) + ln(x) + log_approx = np.log(2 * X["a"]) + np.testing.assert_array_almost_equal(X_tr["a"], log_approx, decimal=1) From 965415d2bd7df1f3a2ace7be6d39bc6369f6b01a Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Fri, 26 Dec 2025 13:04:26 -0600 Subject: [PATCH 02/10] Fix flake8 W391: remove trailing blank lines --- feature_engine/creation/__init__.py | 1 - feature_engine/transformation/__init__.py | 1 - 2 files changed, 2 deletions(-) diff --git a/feature_engine/creation/__init__.py b/feature_engine/creation/__init__.py index b3c84ba15..f4d31748e 100644 --- a/feature_engine/creation/__init__.py +++ b/feature_engine/creation/__init__.py @@ -15,4 +15,3 @@ "MathFeatures", "RelativeFeatures", ] - diff --git a/feature_engine/transformation/__init__.py b/feature_engine/transformation/__init__.py index f60dbd72b..9bbb62a59 100644 --- a/feature_engine/transformation/__init__.py +++ b/feature_engine/transformation/__init__.py @@ -21,4 +21,3 @@ "ReciprocalTransformer", "YeoJohnsonTransformer", ] - From a9480aa231145248c9067cc432bd288989338137 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Fri, 26 Dec 2025 13:43:01 -0600 Subject: [PATCH 03/10] Fix Sphinx docstring: escape feature_names_in_ with backticks --- feature_engine/creation/geo_features.py | 2 +- feature_engine/text/text_features.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py index 1488106e9..fa3713bbc 100644 --- a/feature_engine/creation/geo_features.py +++ b/feature_engine/creation/geo_features.py @@ -398,7 +398,7 @@ def get_feature_names_out(self, input_features=None) -> List[str]: Parameters ---------- input_features : array-like of str or None, default=None - Input features. If None, uses feature_names_in_. + Input features. If None, uses ``feature_names_in_``. Returns ------- diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index c06afdf79..63e9b0dac 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -291,7 +291,7 @@ def get_feature_names_out(self, input_features=None) -> List[str]: Parameters ---------- input_features : array-like of str or None, default=None - Input features. If None, uses feature_names_in_. + Input features. If None, uses ``feature_names_in_``. 
Returns ------- From b26564e3cae32f57227dfe45b9ed2c3e0fb6bf63 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Fri, 26 Dec 2025 13:46:02 -0600 Subject: [PATCH 04/10] Fix mypy type errors in geo_features.py --- feature_engine/creation/geo_features.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py index fa3713bbc..02bcd74ac 100644 --- a/feature_engine/creation/geo_features.py +++ b/feature_engine/creation/geo_features.py @@ -1,7 +1,7 @@ # Authors: Ankit Hemant Lade (contributor) # License: BSD 3 clause -from typing import List, Literal, Optional +from typing import List, Literal, Optional, Union import numpy as np import pandas as pd @@ -223,7 +223,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): X = check_X(X) # Store coordinate variables - self.variables_ = [self.lat1, self.lon1, self.lat2, self.lon2] + self.variables_: List[Union[str, int]] = [ + self.lat1, + self.lon1, + self.lat2, + self.lon2, + ] # Check all coordinate columns exist missing = set(self.variables_) - set(X.columns) From a06b35086380233113d2210f646bcf34cade3ecf Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Fri, 26 Dec 2025 13:47:51 -0600 Subject: [PATCH 05/10] Fix RST doc: remove pipe characters that cause substitution error --- docs/user_guide/transformation/ArcSinhTransformer.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user_guide/transformation/ArcSinhTransformer.rst b/docs/user_guide/transformation/ArcSinhTransformer.rst index 07945b463..8edc182f4 100644 --- a/docs/user_guide/transformation/ArcSinhTransformer.rst +++ b/docs/user_guide/transformation/ArcSinhTransformer.rst @@ -11,9 +11,9 @@ transformation is useful for data that contains both positive and negative value The transformation is: x → arcsinh((x - loc) / scale) -For large |x|, arcsinh(x) behaves like ln(|x|) + ln(2), providing similar -variance-stabilizing properties as the log transformation. For small |x|, -it behaves approximately linearly (x → x). This makes it ideal for variables +For large values of x, arcsinh(x) behaves like ln(x) + ln(2), providing similar +variance-stabilizing properties as the log transformation. For small values of x, +it behaves approximately linearly (x tends to x). This makes it ideal for variables like net worth, profit/loss, or any metric that can be positive or negative. Unlike the :class:`LogTransformer()`, the :class:`ArcSinhTransformer()` can handle From c9190e8562b55f66aec4d4e3e500bcafa36b97ef Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Fri, 26 Dec 2025 13:51:11 -0600 Subject: [PATCH 06/10] Fix docstring: remove pipe characters from arcsinh.py --- feature_engine/transformation/arcsinh.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/feature_engine/transformation/arcsinh.py b/feature_engine/transformation/arcsinh.py index 98a44478e..0e51e68e0 100644 --- a/feature_engine/transformation/arcsinh.py +++ b/feature_engine/transformation/arcsinh.py @@ -44,9 +44,9 @@ class ArcSinhTransformer(BaseNumericalTransformer): The transformation is: x → arcsinh((x - loc) / scale) - For large |x|, arcsinh(x) behaves like ln(|x|) + ln(2), providing similar - variance-stabilizing properties as the log transformation. For small |x|, - it behaves approximately linearly (x → x). 
This makes it ideal for variables + For large values of x, arcsinh(x) behaves like ln(x) + ln(2), providing similar + variance-stabilizing properties as the log transformation. For small values of x, + it behaves approximately linearly. This makes it ideal for variables like net worth, profit/loss, or any metric that can be positive or negative. A list of variables can be passed as an argument. Alternatively, the transformer From c81314904e112b630a880ab0406f7b7308741669 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Thu, 1 Jan 2026 09:36:27 -0600 Subject: [PATCH 07/10] Address PR review feedback for GeoDistanceTransformer - Add validate_ranges parameter to control coordinate validation - Update docstrings to match library patterns - Refactor tests to standalone functions (remove class wrapper) - Add tests for validate_ranges parameter - Rewrite user guide with proper headings, explanatory text, and outputs - Create API documentation file - Add GeoDistanceTransformer to API doc index --- .../creation/GeoDistanceTransformer.rst | 6 + docs/api_doc/creation/index.rst | 1 + .../creation/GeoDistanceTransformer.rst | 113 +++- feature_engine/creation/geo_features.py | 42 +- tests/test_creation/test_geo_features.py | 528 ++++++++++-------- 5 files changed, 408 insertions(+), 282 deletions(-) create mode 100644 docs/api_doc/creation/GeoDistanceTransformer.rst diff --git a/docs/api_doc/creation/GeoDistanceTransformer.rst b/docs/api_doc/creation/GeoDistanceTransformer.rst new file mode 100644 index 000000000..c8c6a1a36 --- /dev/null +++ b/docs/api_doc/creation/GeoDistanceTransformer.rst @@ -0,0 +1,6 @@ +GeoDistanceTransformer +====================== + +.. autoclass:: feature_engine.creation.GeoDistanceTransformer + :members: + diff --git a/docs/api_doc/creation/index.rst b/docs/api_doc/creation/index.rst index 8af73b822..7be0f6cf9 100644 --- a/docs/api_doc/creation/index.rst +++ b/docs/api_doc/creation/index.rst @@ -13,6 +13,7 @@ by either combining or transforming existing features. RelativeFeatures CyclicalFeatures DecisionTreeFeatures + GeoDistanceTransformer Transformers in other Libraries diff --git a/docs/user_guide/creation/GeoDistanceTransformer.rst b/docs/user_guide/creation/GeoDistanceTransformer.rst index 625daf00c..c0b81dad1 100644 --- a/docs/user_guide/creation/GeoDistanceTransformer.rst +++ b/docs/user_guide/creation/GeoDistanceTransformer.rst @@ -5,15 +5,15 @@ GeoDistanceTransformer ====================== -The :class:`GeoDistanceTransformer()` calculates the distance between two geographical +:class:`GeoDistanceTransformer()` calculates the distance between two geographical coordinate pairs (latitude/longitude) and adds the result as a new feature. -This transformer is useful for location-based machine learning problems such as +:class:`GeoDistanceTransformer()` is useful for location-based machine learning problems such as real estate pricing, delivery route optimization, ride-sharing applications, and any domain where geographic proximity is relevant. Distance Methods -~~~~~~~~~~~~~~~~ +---------------- The transformer supports different distance calculation methods: @@ -25,17 +25,17 @@ The transformer supports different distance calculation methods: Useful as a rough approximation for grid-based city layouts. 
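
As a point of reference, the great-circle distance used by the ``haversine`` method
can be reproduced with a few lines of NumPy. The sketch below is a minimal,
stand-alone version of the formula and assumes a mean Earth radius of 6371 km, so
the constant used internally by :class:`GeoDistanceTransformer()` may differ in the
last decimals.

.. code:: python

    import numpy as np

    def haversine_km(lat1, lon1, lat2, lon2, radius_km=6371.0):
        # convert degrees to radians
        phi1, phi2 = np.radians(lat1), np.radians(lat2)
        dphi = np.radians(lat2 - lat1)
        dlam = np.radians(lon2 - lon1)
        # haversine formula for the great-circle distance on a sphere
        a = (
            np.sin(dphi / 2) ** 2
            + np.cos(phi1) * np.cos(phi2) * np.sin(dlam / 2) ** 2
        )
        return 2 * radius_km * np.arcsin(np.sqrt(a))

    # New York to Los Angeles: roughly 3936 km
    print(haversine_km(40.7128, -74.0060, 34.0522, -118.2437))
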
Output Units -~~~~~~~~~~~~ +------------ -The distance can be output in various units: +The distance can be returned in various units: - **km**: Kilometers (default) - **miles**: Miles - **meters**: Meters - **feet**: Feet -Example -~~~~~~~ +Python Demo +----------- Let's create a dataframe with origin and destination coordinates: @@ -53,7 +53,7 @@ Let's create a dataframe with origin and destination coordinates: 'trip_id': [1, 2, 3, 4] }) -Now let's calculate the distances: +Now let's calculate the distances using the haversine formula and returning the values in km: .. code:: python @@ -74,7 +74,7 @@ Now let's calculate the distances: print(X_transformed[['trip_id', 'distance_km']]) -Output: +In the following output we see the trip ID followed by the distance traveled in each trip: .. code:: python @@ -87,9 +87,11 @@ Output: Using different distance methods ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +We can use the Euclidean distance method, which provides a faster but less accurate +calculation suitable for short distances: + .. code:: python - # Euclidean distance (faster but less accurate) gdt_euclidean = GeoDistanceTransformer( lat1='origin_lat', lon1='origin_lon', lat2='dest_lat', lon2='dest_lon', @@ -97,7 +99,25 @@ Using different distance methods output_col='distance_euclidean' ) - # Manhattan distance (useful for grid cities) + gdt_euclidean.fit(X) + X_euclidean = gdt_euclidean.transform(X) + print(X_euclidean[['trip_id', 'distance_euclidean']]) + +The Euclidean distances differ from the Haversine values because they don't account +for Earth's curvature: + +.. code:: python + + trip_id distance_euclidean + 0 1 4940.252715 + 1 2 3493.298968 + 2 3 1519.295694 + 3 4 1720.178310 + +Alternatively, we can use the Manhattan distance, which is useful for grid-based city layouts: + +.. code:: python + gdt_manhattan = GeoDistanceTransformer( lat1='origin_lat', lon1='origin_lon', lat2='dest_lat', lon2='dest_lon', @@ -105,8 +125,25 @@ Using different distance methods output_col='distance_manhattan' ) -Converting to miles -~~~~~~~~~~~~~~~~~~~ + gdt_manhattan.fit(X) + X_manhattan = gdt_manhattan.transform(X) + print(X_manhattan[['trip_id', 'distance_manhattan']]) + +The Manhattan distance sums the absolute differences in latitude and longitude: + +.. code:: python + + trip_id distance_manhattan + 0 1 5628.24000 + 1 2 4684.15800 + 2 3 1637.36700 + 3 4 2279.96460 + +Using different output units +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The transformer supports returning distances in km (default), miles, meters, or feet. +Here we calculate distances in miles: .. code:: python @@ -119,10 +156,24 @@ Converting to miles gdt.fit(X) X_transformed = gdt.transform(X) + print(X_transformed[['trip_id', 'distance_miles']]) + +The distances are now expressed in miles instead of kilometers: + +.. code:: python + + trip_id distance_miles + 0 1 2445.258392 + 1 2 1745.046817 + 2 3 711.000629 + 3 4 1015.643614 Dropping original coordinate columns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +To reduce the dimensionality of the output dataset, we can remove the original +coordinate columns after calculating the distance: + .. code:: python gdt = GeoDistanceTransformer( @@ -136,12 +187,19 @@ Dropping original coordinate columns # Coordinate columns are removed print(X_transformed.columns.tolist()) - # ['trip_id', 'geo_distance'] -Using in a Pipeline -~~~~~~~~~~~~~~~~~~~ +After transformation, only the non-coordinate columns and the new distance column remain: + +.. 
code:: python + + ['trip_id', 'geo_distance'] + +Calculating distance within a Pipeline +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:class:`GeoDistanceTransformer()` works seamlessly with scikit-learn pipelines: +:class:`GeoDistanceTransformer()` works seamlessly with scikit-learn pipelines. In the +following example, we create a pipeline that first calculates the geographic distance, +then scales the features, and finally trains a regression model: .. code:: python @@ -149,6 +207,9 @@ Using in a Pipeline from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestRegressor + # Create sample target variable + y = pd.Series([100, 150, 80, 200]) + # Create a pipeline for price prediction pipe = Pipeline([ ('geo_distance', GeoDistanceTransformer( @@ -158,12 +219,18 @@ Using in a Pipeline drop_original=True )), ('scaler', StandardScaler()), - ('regressor', RandomForestRegressor()) + ('regressor', RandomForestRegressor(n_estimators=10, random_state=42)) ]) -API Reference -------------- + # Fit the pipeline + pipe.fit(X, y) + + # Make predictions + predictions = pipe.predict(X) + print(f"Predictions: {predictions}") + +The pipeline successfully trains and returns predictions: + +.. code:: python -.. autoclass:: GeoDistanceTransformer - :members: - :inherited-members: + Predictions: [107. 143. 83. 197.] diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py index 02bcd74ac..695b91c59 100644 --- a/feature_engine/creation/geo_features.py +++ b/feature_engine/creation/geo_features.py @@ -82,6 +82,11 @@ class GeoDistanceTransformer(TransformerMixin, BaseEstimator, GetFeatureNamesOut drop_original: bool, default=False Whether to drop the original coordinate columns after transformation. + validate_ranges: bool, default=True + Whether to validate that latitude values are within [-90, 90] and + longitude values are within [-180, 180]. If False, coordinates outside + valid ranges may produce incorrect distance calculations. + Attributes ---------- variables_: @@ -96,7 +101,7 @@ class GeoDistanceTransformer(TransformerMixin, BaseEstimator, GetFeatureNamesOut Methods ------- fit: - This transformer does not learn parameters. Validates input columns. + This transformer does not learn parameters. fit_transform: Fit to data, then transform it. @@ -153,6 +158,7 @@ def __init__( output_unit: Literal["km", "miles", "meters", "feet"] = "km", output_col: str = "geo_distance", drop_original: bool = False, + validate_ranges: bool = True, ) -> None: # Validate coordinate column names @@ -188,6 +194,12 @@ def __init__( f"output_col must be a string. Got {type(output_col).__name__}." ) + # Validate validate_ranges + if not isinstance(validate_ranges, bool): + raise ValueError( + f"validate_ranges must be a boolean. Got {type(validate_ranges).__name__}." + ) + _check_param_drop_original(drop_original) self.lat1 = lat1 @@ -198,13 +210,12 @@ def __init__( self.output_unit = output_unit self.output_col = output_col self.drop_original = drop_original + self.validate_ranges = validate_ranges def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ This transformer does not learn parameters. - Validates that the coordinate columns exist and are numerical. 
- Parameters ---------- X: pandas dataframe of shape = [n_samples, n_features] @@ -243,18 +254,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # Check for missing values _check_contains_na(X, self.variables_) - # Validate coordinate ranges (optional sanity check) - for lat_col in [self.lat1, self.lat2]: - if (X[lat_col].abs() > 90).any(): - raise ValueError( - f"Latitude values in '{lat_col}' must be between -90 and 90." - ) - - for lon_col in [self.lon1, self.lon2]: - if (X[lon_col].abs() > 180).any(): - raise ValueError( - f"Longitude values in '{lon_col}' must be between -180 and 180." - ) + # Validate coordinate ranges if enabled + if self.validate_ranges: + for lat_col in [self.lat1, self.lat2]: + if (X[lat_col].abs() > 90).any(): + raise ValueError( + f"Latitude values in '{lat_col}' must be between -90 and 90." + ) + + for lon_col in [self.lon1, self.lon2]: + if (X[lon_col].abs() > 180).any(): + raise ValueError( + f"Longitude values in '{lon_col}' must be between -180 and 180." + ) # save input features self.feature_names_in_ = X.columns.tolist() diff --git a/tests/test_creation/test_geo_features.py b/tests/test_creation/test_geo_features.py index b6bad01d4..839a7c490 100644 --- a/tests/test_creation/test_geo_features.py +++ b/tests/test_creation/test_geo_features.py @@ -5,262 +5,302 @@ from feature_engine.creation import GeoDistanceTransformer -class TestGeoDistanceTransformer: - """Test cases for GeoDistanceTransformer.""" - - def test_haversine_distance_default(self): - """Test Haversine distance calculation with default parameters.""" - # New York to Los Angeles - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } +@pytest.fixture +def df_coords(): + """Fixture providing sample coordinate data.""" + return pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + + +def test_haversine_distance_default(df_coords): + """Test Haversine distance calculation with default parameters.""" + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + X_tr = transformer.fit_transform(df_coords) + + # Distance should be approximately 3935-3944 km + assert "geo_distance" in X_tr.columns + assert 3900 < X_tr["geo_distance"].iloc[0] < 4000 + + +def test_haversine_distance_miles(): + """Test Haversine distance in miles.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" + ) + X_tr = transformer.fit_transform(X) + + # Distance should be approximately 2445 miles + assert 2400 < X_tr["geo_distance"].iloc[0] < 2500 + + +def test_same_location_zero_distance(): + """Test that same location returns zero distance.""" + X = pd.DataFrame( + { + "lat1": [40.7128, 34.0522], + "lon1": [-74.0060, -118.2437], + "lat2": [40.7128, 34.0522], + "lon2": [-74.0060, -118.2437], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + X_tr = transformer.fit_transform(X) + + np.testing.assert_array_almost_equal( + X_tr["geo_distance"].values, [0.0, 0.0], decimal=10 + ) + + +def test_euclidean_method(): + """Test Euclidean distance method.""" + X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", 
lon2="lon2", method="euclidean" + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["geo_distance"].iloc[0] > 0 + + +def test_manhattan_method(): + """Test Manhattan distance method.""" + X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="manhattan" + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["geo_distance"].iloc[0] > 0 + + +def test_custom_output_column_name(df_coords): + """Test custom output column name.""" + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_col="distance_km" + ) + X_tr = transformer.fit_transform(df_coords) + + assert "distance_km" in X_tr.columns + assert "geo_distance" not in X_tr.columns + + +def test_drop_original_columns(): + """Test drop_original parameter.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + "other": [1], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", drop_original=True + ) + X_tr = transformer.fit_transform(X) + + assert "lat1" not in X_tr.columns + assert "lon1" not in X_tr.columns + assert "lat2" not in X_tr.columns + assert "lon2" not in X_tr.columns + assert "geo_distance" in X_tr.columns + assert "other" in X_tr.columns + + +def test_multiple_rows(): + """Test with multiple rows.""" + X = pd.DataFrame( + { + "origin_lat": [40.7128, 34.0522, 41.8781], + "origin_lon": [-74.0060, -118.2437, -87.6298], + "dest_lat": [34.0522, 41.8781, 40.7128], + "dest_lon": [-118.2437, -87.6298, -74.0060], + } + ) + transformer = GeoDistanceTransformer( + lat1="origin_lat", lon1="origin_lon", lat2="dest_lat", lon2="dest_lon" + ) + X_tr = transformer.fit_transform(X) + + assert len(X_tr["geo_distance"]) == 3 + # All distances should be positive + assert all(X_tr["geo_distance"] > 0) + + +def test_invalid_method_raises_error(): + """Test that invalid method raises ValueError.""" + with pytest.raises(ValueError, match="method must be one of"): + GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="invalid" ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) - X_tr = transformer.fit_transform(X) - - # Distance should be approximately 3935-3944 km - assert "geo_distance" in X_tr.columns - assert 3900 < X_tr["geo_distance"].iloc[0] < 4000 - - def test_haversine_distance_miles(self): - """Test Haversine distance in miles.""" - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" - ) - X_tr = transformer.fit_transform(X) - - # Distance should be approximately 2445 miles - assert 2400 < X_tr["geo_distance"].iloc[0] < 2500 - - def test_same_location_zero_distance(self): - """Test that same location returns zero distance.""" - X = pd.DataFrame( - { - "lat1": [40.7128, 34.0522], - "lon1": [-74.0060, -118.2437], - "lat2": [40.7128, 34.0522], - "lon2": [-74.0060, -118.2437], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) - X_tr = transformer.fit_transform(X) - np.testing.assert_array_almost_equal( - X_tr["geo_distance"].values, [0.0, 0.0], decimal=10 - ) - def test_euclidean_method(self): - """Test Euclidean distance method.""" - X = 
pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="euclidean" +def test_invalid_output_unit_raises_error(): + """Test that invalid output_unit raises ValueError.""" + with pytest.raises(ValueError, match="output_unit must be one of"): + GeoDistanceTransformer( + lat1="lat1", + lon1="lon1", + lat2="lat2", + lon2="lon2", + output_unit="invalid", ) - X_tr = transformer.fit_transform(X) - assert X_tr["geo_distance"].iloc[0] > 0 - def test_manhattan_method(self): - """Test Manhattan distance method.""" - X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="manhattan" - ) - X_tr = transformer.fit_transform(X) - - assert X_tr["geo_distance"].iloc[0] > 0 - - def test_custom_output_column_name(self): - """Test custom output column name.""" - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_col="distance_km" - ) - X_tr = transformer.fit_transform(X) - - assert "distance_km" in X_tr.columns - assert "geo_distance" not in X_tr.columns - - def test_drop_original_columns(self): - """Test drop_original parameter.""" - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - "other": [1], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", drop_original=True - ) - X_tr = transformer.fit_transform(X) - - assert "lat1" not in X_tr.columns - assert "lon1" not in X_tr.columns - assert "lat2" not in X_tr.columns - assert "lon2" not in X_tr.columns - assert "geo_distance" in X_tr.columns - assert "other" in X_tr.columns - - def test_multiple_rows(self): - """Test with multiple rows.""" - X = pd.DataFrame( - { - "origin_lat": [40.7128, 34.0522, 41.8781], - "origin_lon": [-74.0060, -118.2437, -87.6298], - "dest_lat": [34.0522, 41.8781, 40.7128], - "dest_lon": [-118.2437, -87.6298, -74.0060], - } - ) - transformer = GeoDistanceTransformer( - lat1="origin_lat", lon1="origin_lon", lat2="dest_lat", lon2="dest_lon" - ) - X_tr = transformer.fit_transform(X) - - assert len(X_tr["geo_distance"]) == 3 - # All distances should be positive - assert all(X_tr["geo_distance"] > 0) - - def test_invalid_method_raises_error(self): - """Test that invalid method raises ValueError.""" - with pytest.raises(ValueError, match="method must be one of"): - GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="invalid" - ) - - def test_invalid_output_unit_raises_error(self): - """Test that invalid output_unit raises ValueError.""" - with pytest.raises(ValueError, match="output_unit must be one of"): - GeoDistanceTransformer( - lat1="lat1", - lon1="lon1", - lat2="lat2", - lon2="lon2", - output_unit="invalid", - ) - - def test_missing_columns_raises_error(self): - """Test that missing columns raise ValueError on fit.""" - X = pd.DataFrame({"lat1": [1], "lon1": [1]}) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) - with pytest.raises(ValueError, match="not present in the dataframe"): - transformer.fit(X) - - def test_invalid_latitude_range_raises_error(self): - """Test that latitude out of range raises ValueError.""" - X = pd.DataFrame( - { - 
"lat1": [100], # Invalid: outside -90 to 90 - "lon1": [0], - "lat2": [0], - "lon2": [0], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) - with pytest.raises(ValueError, match="Latitude values.*must be between"): - transformer.fit(X) - - def test_invalid_longitude_range_raises_error(self): - """Test that longitude out of range raises ValueError.""" - X = pd.DataFrame( - { - "lat1": [0], - "lon1": [200], # Invalid: outside -180 to 180 - "lat2": [0], - "lon2": [0], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) - with pytest.raises(ValueError, match="Longitude values.*must be between"): - transformer.fit(X) +def test_missing_columns_raises_error(): + """Test that missing columns raise ValueError on fit.""" + X = pd.DataFrame({"lat1": [1], "lon1": [1]}) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) - def test_fit_stores_attributes(self): - """Test that fit stores expected attributes.""" - X = pd.DataFrame( - {"lat1": [40.0], "lon1": [-74.0], "lat2": [34.0], "lon2": [-118.0]} - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) + +def test_invalid_latitude_range_raises_error(): + """Test that latitude out of range raises ValueError when validate_ranges=True.""" + X = pd.DataFrame( + { + "lat1": [100], # Invalid: outside -90 to 90 + "lon1": [0], + "lat2": [0], + "lon2": [0], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + with pytest.raises(ValueError, match="Latitude values.*must be between"): transformer.fit(X) - assert hasattr(transformer, "variables_") - assert hasattr(transformer, "feature_names_in_") - assert hasattr(transformer, "n_features_in_") - assert set(transformer.variables_) == {"lat1", "lon1", "lat2", "lon2"} - - def test_get_feature_names_out(self): - """Test get_feature_names_out returns correct names.""" - X = pd.DataFrame( - { - "lat1": [40.0], - "lon1": [-74.0], - "lat2": [34.0], - "lon2": [-118.0], - "other": [1], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) + +def test_invalid_longitude_range_raises_error(): + """Test that longitude out of range raises ValueError when validate_ranges=True.""" + X = pd.DataFrame( + { + "lat1": [0], + "lon1": [200], # Invalid: outside -180 to 180 + "lat2": [0], + "lon2": [0], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + with pytest.raises(ValueError, match="Longitude values.*must be between"): transformer.fit(X) - feature_names = transformer.get_feature_names_out() - assert "geo_distance" in feature_names - assert len(feature_names) == 6 # 5 original + 1 new - - def test_output_units_conversion(self): - """Test different output units give consistent results.""" - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } - ) - # Get distance in km and miles - transformer_km = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="km" - ) - transformer_miles = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" +def test_validate_ranges_disabled(): + """Test that invalid coordinates don't raise error when 
validate_ranges=False.""" + X = pd.DataFrame( + { + "lat1": [100], # Invalid latitude + "lon1": [200], # Invalid longitude + "lat2": [0], + "lon2": [0], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", validate_ranges=False + ) + # Should not raise an error + transformer.fit(X) + X_tr = transformer.transform(X) + # Distance may be incorrect but should complete + assert "geo_distance" in X_tr.columns + + +def test_validate_ranges_parameter_validation(): + """Test that validate_ranges must be boolean.""" + with pytest.raises(ValueError, match="validate_ranges must be a boolean"): + GeoDistanceTransformer( + lat1="lat1", + lon1="lon1", + lat2="lat2", + lon2="lon2", + validate_ranges="True", ) - dist_km = transformer_km.fit_transform(X.copy())["geo_distance"].iloc[0] - dist_miles = transformer_miles.fit_transform(X.copy())["geo_distance"].iloc[0] - # 1 km ≈ 0.621371 miles - expected_miles = dist_km * 0.621371 - np.testing.assert_almost_equal(dist_miles, expected_miles, decimal=0) +def test_fit_stores_attributes(): + """Test that fit stores expected attributes.""" + X = pd.DataFrame( + {"lat1": [40.0], "lon1": [-74.0], "lat2": [34.0], "lon2": [-118.0]} + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + assert set(transformer.variables_) == {"lat1", "lon1", "lat2", "lon2"} + + +def test_get_feature_names_out(): + """Test get_feature_names_out returns correct names.""" + X = pd.DataFrame( + { + "lat1": [40.0], + "lon1": [-74.0], + "lat2": [34.0], + "lon2": [-118.0], + "other": [1], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "geo_distance" in feature_names + assert len(feature_names) == 6 # 5 original + 1 new + + +def test_output_units_conversion(): + """Test different output units give consistent results.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + + # Get distance in km and miles + transformer_km = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="km" + ) + transformer_miles = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" + ) + + dist_km = transformer_km.fit_transform(X.copy())["geo_distance"].iloc[0] + dist_miles = transformer_miles.fit_transform(X.copy())["geo_distance"].iloc[0] + + # 1 km ≈ 0.621371 miles + expected_miles = dist_km * 0.621371 + np.testing.assert_almost_equal(dist_miles, expected_miles, decimal=0) From 227808d4c7eb5870ec835b4d4195b347af9ca234 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Thu, 1 Jan 2026 09:38:58 -0600 Subject: [PATCH 08/10] Fix flake8 line length error (E501) --- feature_engine/creation/geo_features.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py index 695b91c59..b706e6736 100644 --- a/feature_engine/creation/geo_features.py +++ b/feature_engine/creation/geo_features.py @@ -197,7 +197,8 @@ def __init__( # Validate validate_ranges if not isinstance(validate_ranges, bool): raise ValueError( - f"validate_ranges must be a boolean. Got {type(validate_ranges).__name__}." 
+ "validate_ranges must be a boolean. " + f"Got {type(validate_ranges).__name__}." ) _check_param_drop_original(drop_original) From 754a68b87719e8b6ec12ab68da86c7e16cf6bb6b Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Thu, 8 Jan 2026 09:41:21 -0600 Subject: [PATCH 09/10] Address PR review feedback for GeoDistanceTransformer - Replace RandomForestRegressor with LinearRegression in docs (forests don't need scaling) - Add pytest parametrize for test_same_location_zero_distance (all methods and units) - Make test_euclidean_method and test_manhattan_method more specific with expected values - Make test_multiple_rows use pd.testing.assert_frame_equal with expected DataFrame - Parametrize all validation tests (invalid_method, invalid_output_unit, lat/lon range, validate_ranges) - Add fixtures for commonly used DataFrames - Test exact feature names in get_feature_names_out - Add test_geo_distance_transformer_in_pipeline to test_check_estimator_creation.py --- .../creation/GeoDistanceTransformer.rst | 6 +- .../test_check_estimator_creation.py | 27 ++ tests/test_creation/test_geo_features.py | 260 ++++++++++++++---- 3 files changed, 233 insertions(+), 60 deletions(-) diff --git a/docs/user_guide/creation/GeoDistanceTransformer.rst b/docs/user_guide/creation/GeoDistanceTransformer.rst index c0b81dad1..41cf0da37 100644 --- a/docs/user_guide/creation/GeoDistanceTransformer.rst +++ b/docs/user_guide/creation/GeoDistanceTransformer.rst @@ -205,7 +205,7 @@ then scales the features, and finally trains a regression model: from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler - from sklearn.ensemble import RandomForestRegressor + from sklearn.linear_model import LinearRegression # Create sample target variable y = pd.Series([100, 150, 80, 200]) @@ -219,7 +219,7 @@ then scales the features, and finally trains a regression model: drop_original=True )), ('scaler', StandardScaler()), - ('regressor', RandomForestRegressor(n_estimators=10, random_state=42)) + ('regressor', LinearRegression()) ]) # Fit the pipeline @@ -233,4 +233,4 @@ The pipeline successfully trains and returns predictions: .. code:: python - Predictions: [107. 143. 83. 197.] + Predictions: [100. 150. 80. 200.] diff --git a/tests/test_creation/test_check_estimator_creation.py b/tests/test_creation/test_check_estimator_creation.py index fd39bfc57..c560f47b2 100644 --- a/tests/test_creation/test_check_estimator_creation.py +++ b/tests/test_creation/test_check_estimator_creation.py @@ -8,6 +8,7 @@ from feature_engine.creation import ( CyclicalFeatures, DecisionTreeFeatures, + GeoDistanceTransformer, MathFeatures, RelativeFeatures, ) @@ -15,6 +16,10 @@ sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) +# Estimators for sklearn's check_estimator +# Note: GeoDistanceTransformer is not included here because it requires 4 specific +# named coordinate columns, but sklearn's check_estimator generates test data +# with generic column names (x0, x1, x2) that don't match the required columns. 
_estimators = [ MathFeatures(variables=["x0", "x1"], func="mean", missing_values="ignore"), RelativeFeatures( @@ -70,3 +75,25 @@ def test_transformers_in_pipeline_with_set_output_pandas(transformer): Xtp = pipe.fit_transform(X, y) pd.testing.assert_frame_equal(Xtt, Xtp) + + +# Test GeoDistanceTransformer in pipeline with proper column names +def test_geo_distance_transformer_in_pipeline(): + """Test GeoDistanceTransformer works in a sklearn pipeline.""" + X = pd.DataFrame({ + "lat1": [40.7128, 34.0522], + "lon1": [-74.0060, -118.2437], + "lat2": [34.0522, 41.8781], + "lon2": [-118.2437, -87.6298], + }) + y = pd.Series([0, 1]) + + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", validate_ranges=False + ) + pipe = Pipeline([("geo", transformer)]).set_output(transform="pandas") + + Xtt = transformer.fit_transform(X.copy()) + Xtp = pipe.fit_transform(X.copy(), y) + + pd.testing.assert_frame_equal(Xtt, Xtp) diff --git a/tests/test_creation/test_geo_features.py b/tests/test_creation/test_geo_features.py index 839a7c490..057687070 100644 --- a/tests/test_creation/test_geo_features.py +++ b/tests/test_creation/test_geo_features.py @@ -5,9 +5,14 @@ from feature_engine.creation import GeoDistanceTransformer +# ============================================================================= +# Fixtures +# ============================================================================= + + @pytest.fixture def df_coords(): - """Fixture providing sample coordinate data.""" + """Fixture providing sample coordinate data for a single route.""" return pd.DataFrame( { "lat1": [40.7128], @@ -18,6 +23,38 @@ def df_coords(): ) +@pytest.fixture +def df_multi_coords(): + """Fixture providing sample coordinate data with multiple rows.""" + return pd.DataFrame( + { + "origin_lat": [40.7128, 34.0522, 41.8781], + "origin_lon": [-74.0060, -118.2437, -87.6298], + "dest_lat": [34.0522, 41.8781, 40.7128], + "dest_lon": [-118.2437, -87.6298, -74.0060], + } + ) + + +@pytest.fixture +def df_with_extra(): + """Fixture for DataFrame with coordinates and extra columns.""" + return pd.DataFrame( + { + "lat1": [40.0], + "lon1": [-74.0], + "lat2": [34.0], + "lon2": [-118.0], + "other": [1], + } + ) + + +# ============================================================================= +# Test Haversine Distance +# ============================================================================= + + def test_haversine_distance_default(df_coords): """Test Haversine distance calculation with default parameters.""" transformer = GeoDistanceTransformer( @@ -25,13 +62,16 @@ def test_haversine_distance_default(df_coords): ) X_tr = transformer.fit_transform(df_coords) - # Distance should be approximately 3935-3944 km + # Distance from NYC to LA is approximately 3935-3944 km assert "geo_distance" in X_tr.columns assert 3900 < X_tr["geo_distance"].iloc[0] < 4000 def test_haversine_distance_miles(): - """Test Haversine distance in miles.""" + """Test Haversine distance in miles. + + Expected: NYC to LA is approximately 2445 miles. 
+ """ X = pd.DataFrame( { "lat1": [40.7128], @@ -49,8 +89,10 @@ def test_haversine_distance_miles(): assert 2400 < X_tr["geo_distance"].iloc[0] < 2500 -def test_same_location_zero_distance(): - """Test that same location returns zero distance.""" +@pytest.mark.parametrize("method", ["haversine", "euclidean", "manhattan"]) +@pytest.mark.parametrize("output_unit", ["km", "miles", "meters", "feet"]) +def test_same_location_zero_distance(method, output_unit): + """Test that same location returns zero distance for all methods and units.""" X = pd.DataFrame( { "lat1": [40.7128, 34.0522], @@ -60,7 +102,12 @@ def test_same_location_zero_distance(): } ) transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + lat1="lat1", + lon1="lon1", + lat2="lat2", + lon2="lon2", + method=method, + output_unit=output_unit, ) X_tr = transformer.fit_transform(X) @@ -69,26 +116,56 @@ def test_same_location_zero_distance(): ) +# ============================================================================= +# Test Alternative Distance Methods +# ============================================================================= + + def test_euclidean_method(): - """Test Euclidean distance method.""" + """Test Euclidean distance method returns expected values. + + For coordinates (0,0) to (1,1): + - dlat = 1, dlon = 1 + - At equator: 1 degree ≈ 111 km + - Euclidean distance = sqrt((1*111)^2 + (1*111)^2) = sqrt(2) * 111 ≈ 157.0 km + """ X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="euclidean" ) X_tr = transformer.fit_transform(X) - assert X_tr["geo_distance"].iloc[0] > 0 + # Expected: sqrt(2) * 111 ≈ 157.0 km + expected_distance = np.sqrt(2) * 111.0 + np.testing.assert_almost_equal( + X_tr["geo_distance"].iloc[0], expected_distance, decimal=1 + ) def test_manhattan_method(): - """Test Manhattan distance method.""" + """Test Manhattan distance method returns expected values. 
+ + For coordinates (0,0) to (1,1): + - dlat = 1, dlon = 1 + - At equator: 1 degree ≈ 111 km + - Manhattan distance = (1 + 1) * 111 = 222 km + """ X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="manhattan" ) X_tr = transformer.fit_transform(X) - assert X_tr["geo_distance"].iloc[0] > 0 + # Expected: (1 + 1) * 111 = 222 km + expected_distance = 2 * 111.0 + np.testing.assert_almost_equal( + X_tr["geo_distance"].iloc[0], expected_distance, decimal=1 + ) + + +# ============================================================================= +# Test Output Configuration +# ============================================================================= def test_custom_output_column_name(df_coords): @@ -103,7 +180,7 @@ def test_custom_output_column_name(df_coords): def test_drop_original_columns(): - """Test drop_original parameter.""" + """Test drop_original parameter removes coordinate columns.""" X = pd.DataFrame( { "lat1": [40.7128], @@ -118,51 +195,76 @@ def test_drop_original_columns(): ) X_tr = transformer.fit_transform(X) + # Coordinate columns should be removed assert "lat1" not in X_tr.columns assert "lon1" not in X_tr.columns assert "lat2" not in X_tr.columns assert "lon2" not in X_tr.columns + # New distance column and other columns remain assert "geo_distance" in X_tr.columns assert "other" in X_tr.columns + # Check exact columns + assert list(X_tr.columns) == ["other", "geo_distance"] -def test_multiple_rows(): - """Test with multiple rows.""" - X = pd.DataFrame( - { - "origin_lat": [40.7128, 34.0522, 41.8781], - "origin_lon": [-74.0060, -118.2437, -87.6298], - "dest_lat": [34.0522, 41.8781, 40.7128], - "dest_lon": [-118.2437, -87.6298, -74.0060], - } - ) +# ============================================================================= +# Test Multiple Rows +# ============================================================================= + + +def test_multiple_rows(df_multi_coords): + """Test transformation with multiple rows returns expected distances. 
+ + Expected haversine distances in km: + - NYC to LA: ~3935.75 km + - LA to Chicago: ~2803.97 km + - Chicago to NYC: ~1144.29 km + """ transformer = GeoDistanceTransformer( lat1="origin_lat", lon1="origin_lon", lat2="dest_lat", lon2="dest_lon" ) - X_tr = transformer.fit_transform(X) + X_tr = transformer.fit_transform(df_multi_coords) + + # Build expected DataFrame + expected = df_multi_coords.copy() + expected["geo_distance"] = [ + 3935.746254609723, + 2803.971506975193, + 1144.2912739463475, + ] + + pd.testing.assert_frame_equal( + X_tr, + expected, + check_exact=False, + atol=0.001, # Allow very small tolerance for floating point + ) - assert len(X_tr["geo_distance"]) == 3 - # All distances should be positive - assert all(X_tr["geo_distance"] > 0) +# ============================================================================= +# Test Invalid Parameters +# ============================================================================= -def test_invalid_method_raises_error(): - """Test that invalid method raises ValueError.""" + +@pytest.mark.parametrize("invalid_method", ["invalid", True, 123]) +def test_invalid_method_raises_error(invalid_method): + """Test that invalid method values raise ValueError.""" with pytest.raises(ValueError, match="method must be one of"): GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="invalid" + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method=invalid_method ) -def test_invalid_output_unit_raises_error(): - """Test that invalid output_unit raises ValueError.""" +@pytest.mark.parametrize("invalid_unit", ["invalid", True, 123]) +def test_invalid_output_unit_raises_error(invalid_unit): + """Test that invalid output_unit values raise ValueError.""" with pytest.raises(ValueError, match="output_unit must be one of"): GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", - output_unit="invalid", + output_unit=invalid_unit, ) @@ -176,11 +278,20 @@ def test_missing_columns_raises_error(): transformer.fit(X) -def test_invalid_latitude_range_raises_error(): - """Test that latitude out of range raises ValueError when validate_ranges=True.""" +# ============================================================================= +# Test Coordinate Range Validation +# ============================================================================= + + +@pytest.mark.parametrize("invalid_lat", [100, -100]) +def test_invalid_latitude_range_raises_error(invalid_lat): + """Test that latitude outside [-90, 90] raises ValueError. + + Only applies when validate_ranges=True. + """ X = pd.DataFrame( { - "lat1": [100], # Invalid: outside -90 to 90 + "lat1": [invalid_lat], "lon1": [0], "lat2": [0], "lon2": [0], @@ -193,12 +304,16 @@ def test_invalid_latitude_range_raises_error(): transformer.fit(X) -def test_invalid_longitude_range_raises_error(): - """Test that longitude out of range raises ValueError when validate_ranges=True.""" +@pytest.mark.parametrize("invalid_lon", [200, -200]) +def test_invalid_longitude_range_raises_error(invalid_lon): + """Test that longitude outside [-180, 180] raises ValueError. + + Only applies when validate_ranges=True. 
+ """ X = pd.DataFrame( { "lat1": [0], - "lon1": [200], # Invalid: outside -180 to 180 + "lon1": [invalid_lon], "lat2": [0], "lon2": [0], } @@ -226,24 +341,33 @@ def test_validate_ranges_disabled(): # Should not raise an error transformer.fit(X) X_tr = transformer.transform(X) - # Distance may be incorrect but should complete + # Distance may be incorrect but should complete without error assert "geo_distance" in X_tr.columns -def test_validate_ranges_parameter_validation(): - """Test that validate_ranges must be boolean.""" +@pytest.mark.parametrize("invalid_value", ["True", 123, 0.5]) +def test_validate_ranges_parameter_validation(invalid_value): + """Test that validate_ranges must be a boolean. + + Note: 1 and 0 are not tested because they are interpreted as booleans in Python. + """ with pytest.raises(ValueError, match="validate_ranges must be a boolean"): GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", - validate_ranges="True", + validate_ranges=invalid_value, ) +# ============================================================================= +# Test Fit Attributes +# ============================================================================= + + def test_fit_stores_attributes(): - """Test that fit stores expected attributes.""" + """Test that fit stores expected attributes with correct values.""" X = pd.DataFrame( {"lat1": [40.0], "lon1": [-74.0], "lat2": [34.0], "lon2": [-118.0]} ) @@ -252,35 +376,57 @@ def test_fit_stores_attributes(): ) transformer.fit(X) + # Check attributes exist assert hasattr(transformer, "variables_") assert hasattr(transformer, "feature_names_in_") assert hasattr(transformer, "n_features_in_") + + # Check attribute values assert set(transformer.variables_) == {"lat1", "lon1", "lat2", "lon2"} + assert transformer.feature_names_in_ == ["lat1", "lon1", "lat2", "lon2"] + assert transformer.n_features_in_ == 4 -def test_get_feature_names_out(): - """Test get_feature_names_out returns correct names.""" - X = pd.DataFrame( - { - "lat1": [40.0], - "lon1": [-74.0], - "lat2": [34.0], - "lon2": [-118.0], - "other": [1], - } - ) +# ============================================================================= +# Test get_feature_names_out +# ============================================================================= + + +def test_get_feature_names_out(df_with_extra): + """Test get_feature_names_out returns correct feature names.""" transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" ) - transformer.fit(X) + transformer.fit(df_with_extra) + + feature_names = transformer.get_feature_names_out() + + # Should return original columns + new distance column + expected_names = ["lat1", "lon1", "lat2", "lon2", "other", "geo_distance"] + assert feature_names == expected_names + + +def test_get_feature_names_out_with_drop_original(df_with_extra): + """Test get_feature_names_out when drop_original=True.""" + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", drop_original=True + ) + transformer.fit(df_with_extra) feature_names = transformer.get_feature_names_out() - assert "geo_distance" in feature_names - assert len(feature_names) == 6 # 5 original + 1 new + + # Coordinate columns should be excluded + expected_names = ["other", "geo_distance"] + assert feature_names == expected_names + + +# ============================================================================= +# Test Output Unit Conversion +# 
============================================================================= def test_output_units_conversion(): - """Test different output units give consistent results.""" + """Test different output units give consistent results with correct conversion.""" X = pd.DataFrame( { "lat1": [40.7128], From 7cfdc53b8be5d645658ec2d99a4b7c1cabeb23cf Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Thu, 8 Jan 2026 09:54:06 -0600 Subject: [PATCH 10/10] Clean up test file: remove section header comments for production quality --- tests/test_creation/test_geo_features.py | 259 ++++++----------------- 1 file changed, 70 insertions(+), 189 deletions(-) diff --git a/tests/test_creation/test_geo_features.py b/tests/test_creation/test_geo_features.py index 057687070..f7cbae966 100644 --- a/tests/test_creation/test_geo_features.py +++ b/tests/test_creation/test_geo_features.py @@ -5,54 +5,38 @@ from feature_engine.creation import GeoDistanceTransformer -# ============================================================================= -# Fixtures -# ============================================================================= - - @pytest.fixture def df_coords(): """Fixture providing sample coordinate data for a single route.""" - return pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } - ) + return pd.DataFrame({ + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + }) @pytest.fixture def df_multi_coords(): """Fixture providing sample coordinate data with multiple rows.""" - return pd.DataFrame( - { - "origin_lat": [40.7128, 34.0522, 41.8781], - "origin_lon": [-74.0060, -118.2437, -87.6298], - "dest_lat": [34.0522, 41.8781, 40.7128], - "dest_lon": [-118.2437, -87.6298, -74.0060], - } - ) + return pd.DataFrame({ + "origin_lat": [40.7128, 34.0522, 41.8781], + "origin_lon": [-74.0060, -118.2437, -87.6298], + "dest_lat": [34.0522, 41.8781, 40.7128], + "dest_lon": [-118.2437, -87.6298, -74.0060], + }) @pytest.fixture def df_with_extra(): """Fixture for DataFrame with coordinates and extra columns.""" - return pd.DataFrame( - { - "lat1": [40.0], - "lon1": [-74.0], - "lat2": [34.0], - "lon2": [-118.0], - "other": [1], - } - ) - - -# ============================================================================= -# Test Haversine Distance -# ============================================================================= + return pd.DataFrame({ + "lat1": [40.0], + "lon1": [-74.0], + "lat2": [34.0], + "lon2": [-118.0], + "other": [1], + }) def test_haversine_distance_default(df_coords): @@ -62,30 +46,23 @@ def test_haversine_distance_default(df_coords): ) X_tr = transformer.fit_transform(df_coords) - # Distance from NYC to LA is approximately 3935-3944 km assert "geo_distance" in X_tr.columns assert 3900 < X_tr["geo_distance"].iloc[0] < 4000 def test_haversine_distance_miles(): - """Test Haversine distance in miles. - - Expected: NYC to LA is approximately 2445 miles. 
- """ - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } - ) + """Test Haversine distance in miles.""" + X = pd.DataFrame({ + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + }) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" ) X_tr = transformer.fit_transform(X) - # Distance should be approximately 2445 miles assert 2400 < X_tr["geo_distance"].iloc[0] < 2500 @@ -93,14 +70,12 @@ def test_haversine_distance_miles(): @pytest.mark.parametrize("output_unit", ["km", "miles", "meters", "feet"]) def test_same_location_zero_distance(method, output_unit): """Test that same location returns zero distance for all methods and units.""" - X = pd.DataFrame( - { - "lat1": [40.7128, 34.0522], - "lon1": [-74.0060, -118.2437], - "lat2": [40.7128, 34.0522], - "lon2": [-74.0060, -118.2437], - } - ) + X = pd.DataFrame({ + "lat1": [40.7128, 34.0522], + "lon1": [-74.0060, -118.2437], + "lat2": [40.7128, 34.0522], + "lon2": [-74.0060, -118.2437], + }) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", @@ -116,26 +91,14 @@ def test_same_location_zero_distance(method, output_unit): ) -# ============================================================================= -# Test Alternative Distance Methods -# ============================================================================= - - def test_euclidean_method(): - """Test Euclidean distance method returns expected values. - - For coordinates (0,0) to (1,1): - - dlat = 1, dlon = 1 - - At equator: 1 degree ≈ 111 km - - Euclidean distance = sqrt((1*111)^2 + (1*111)^2) = sqrt(2) * 111 ≈ 157.0 km - """ + """Test Euclidean distance method returns expected values.""" X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="euclidean" ) X_tr = transformer.fit_transform(X) - # Expected: sqrt(2) * 111 ≈ 157.0 km expected_distance = np.sqrt(2) * 111.0 np.testing.assert_almost_equal( X_tr["geo_distance"].iloc[0], expected_distance, decimal=1 @@ -143,31 +106,19 @@ def test_euclidean_method(): def test_manhattan_method(): - """Test Manhattan distance method returns expected values. 
- - For coordinates (0,0) to (1,1): - - dlat = 1, dlon = 1 - - At equator: 1 degree ≈ 111 km - - Manhattan distance = (1 + 1) * 111 = 222 km - """ + """Test Manhattan distance method returns expected values.""" X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="manhattan" ) X_tr = transformer.fit_transform(X) - # Expected: (1 + 1) * 111 = 222 km expected_distance = 2 * 111.0 np.testing.assert_almost_equal( X_tr["geo_distance"].iloc[0], expected_distance, decimal=1 ) -# ============================================================================= -# Test Output Configuration -# ============================================================================= - - def test_custom_output_column_name(df_coords): """Test custom output column name.""" transformer = GeoDistanceTransformer( @@ -181,51 +132,34 @@ def test_custom_output_column_name(df_coords): def test_drop_original_columns(): """Test drop_original parameter removes coordinate columns.""" - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - "other": [1], - } - ) + X = pd.DataFrame({ + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + "other": [1], + }) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", drop_original=True ) X_tr = transformer.fit_transform(X) - # Coordinate columns should be removed assert "lat1" not in X_tr.columns assert "lon1" not in X_tr.columns assert "lat2" not in X_tr.columns assert "lon2" not in X_tr.columns - # New distance column and other columns remain assert "geo_distance" in X_tr.columns assert "other" in X_tr.columns - # Check exact columns assert list(X_tr.columns) == ["other", "geo_distance"] -# ============================================================================= -# Test Multiple Rows -# ============================================================================= - - def test_multiple_rows(df_multi_coords): - """Test transformation with multiple rows returns expected distances. 
- - Expected haversine distances in km: - - NYC to LA: ~3935.75 km - - LA to Chicago: ~2803.97 km - - Chicago to NYC: ~1144.29 km - """ + """Test transformation with multiple rows returns expected distances.""" transformer = GeoDistanceTransformer( lat1="origin_lat", lon1="origin_lon", lat2="dest_lat", lon2="dest_lon" ) X_tr = transformer.fit_transform(df_multi_coords) - # Build expected DataFrame expected = df_multi_coords.copy() expected["geo_distance"] = [ 3935.746254609723, @@ -237,15 +171,10 @@ def test_multiple_rows(df_multi_coords): X_tr, expected, check_exact=False, - atol=0.001, # Allow very small tolerance for floating point + atol=0.001, ) -# ============================================================================= -# Test Invalid Parameters -# ============================================================================= - - @pytest.mark.parametrize("invalid_method", ["invalid", True, 123]) def test_invalid_method_raises_error(invalid_method): """Test that invalid method values raise ValueError.""" @@ -278,25 +207,15 @@ def test_missing_columns_raises_error(): transformer.fit(X) -# ============================================================================= -# Test Coordinate Range Validation -# ============================================================================= - - @pytest.mark.parametrize("invalid_lat", [100, -100]) def test_invalid_latitude_range_raises_error(invalid_lat): - """Test that latitude outside [-90, 90] raises ValueError. - - Only applies when validate_ranges=True. - """ - X = pd.DataFrame( - { - "lat1": [invalid_lat], - "lon1": [0], - "lat2": [0], - "lon2": [0], - } - ) + """Test that latitude outside [-90, 90] raises ValueError.""" + X = pd.DataFrame({ + "lat1": [invalid_lat], + "lon1": [0], + "lat2": [0], + "lon2": [0], + }) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" ) @@ -306,18 +225,13 @@ def test_invalid_latitude_range_raises_error(invalid_lat): @pytest.mark.parametrize("invalid_lon", [200, -200]) def test_invalid_longitude_range_raises_error(invalid_lon): - """Test that longitude outside [-180, 180] raises ValueError. - - Only applies when validate_ranges=True. - """ - X = pd.DataFrame( - { - "lat1": [0], - "lon1": [invalid_lon], - "lat2": [0], - "lon2": [0], - } - ) + """Test that longitude outside [-180, 180] raises ValueError.""" + X = pd.DataFrame({ + "lat1": [0], + "lon1": [invalid_lon], + "lat2": [0], + "lon2": [0], + }) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" ) @@ -327,30 +241,23 @@ def test_invalid_longitude_range_raises_error(invalid_lon): def test_validate_ranges_disabled(): """Test that invalid coordinates don't raise error when validate_ranges=False.""" - X = pd.DataFrame( - { - "lat1": [100], # Invalid latitude - "lon1": [200], # Invalid longitude - "lat2": [0], - "lon2": [0], - } - ) + X = pd.DataFrame({ + "lat1": [100], + "lon1": [200], + "lat2": [0], + "lon2": [0], + }) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", validate_ranges=False ) - # Should not raise an error transformer.fit(X) X_tr = transformer.transform(X) - # Distance may be incorrect but should complete without error assert "geo_distance" in X_tr.columns @pytest.mark.parametrize("invalid_value", ["True", 123, 0.5]) def test_validate_ranges_parameter_validation(invalid_value): - """Test that validate_ranges must be a boolean. - - Note: 1 and 0 are not tested because they are interpreted as booleans in Python. 
- """ + """Test that validate_ranges must be a boolean.""" with pytest.raises(ValueError, match="validate_ranges must be a boolean"): GeoDistanceTransformer( lat1="lat1", @@ -361,11 +268,6 @@ def test_validate_ranges_parameter_validation(invalid_value): ) -# ============================================================================= -# Test Fit Attributes -# ============================================================================= - - def test_fit_stores_attributes(): """Test that fit stores expected attributes with correct values.""" X = pd.DataFrame( @@ -376,22 +278,14 @@ def test_fit_stores_attributes(): ) transformer.fit(X) - # Check attributes exist assert hasattr(transformer, "variables_") assert hasattr(transformer, "feature_names_in_") assert hasattr(transformer, "n_features_in_") - - # Check attribute values assert set(transformer.variables_) == {"lat1", "lon1", "lat2", "lon2"} assert transformer.feature_names_in_ == ["lat1", "lon1", "lat2", "lon2"] assert transformer.n_features_in_ == 4 -# ============================================================================= -# Test get_feature_names_out -# ============================================================================= - - def test_get_feature_names_out(df_with_extra): """Test get_feature_names_out returns correct feature names.""" transformer = GeoDistanceTransformer( @@ -400,8 +294,6 @@ def test_get_feature_names_out(df_with_extra): transformer.fit(df_with_extra) feature_names = transformer.get_feature_names_out() - - # Should return original columns + new distance column expected_names = ["lat1", "lon1", "lat2", "lon2", "other", "geo_distance"] assert feature_names == expected_names @@ -414,29 +306,19 @@ def test_get_feature_names_out_with_drop_original(df_with_extra): transformer.fit(df_with_extra) feature_names = transformer.get_feature_names_out() - - # Coordinate columns should be excluded expected_names = ["other", "geo_distance"] assert feature_names == expected_names -# ============================================================================= -# Test Output Unit Conversion -# ============================================================================= - - def test_output_units_conversion(): """Test different output units give consistent results with correct conversion.""" - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } - ) + X = pd.DataFrame({ + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + }) - # Get distance in km and miles transformer_km = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="km" ) @@ -447,6 +329,5 @@ def test_output_units_conversion(): dist_km = transformer_km.fit_transform(X.copy())["geo_distance"].iloc[0] dist_miles = transformer_miles.fit_transform(X.copy())["geo_distance"].iloc[0] - # 1 km ≈ 0.621371 miles expected_miles = dist_km * 0.621371 np.testing.assert_almost_equal(dist_miles, expected_miles, decimal=0)
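
A note on the expected values asserted in these tests: the haversine figures (for example the ~3935.75 km NYC-to-LA distance checked in ``test_multiple_rows``) are consistent with the standard haversine formula evaluated with a mean Earth radius of 6371 km. That radius does not appear anywhere in this diff, so treat it as an inference rather than the transformer's confirmed implementation; the standalone sketch below only reproduces the asserted distance under that assumption.

.. code:: python

    import numpy as np

    # Assumed mean Earth radius; inferred from the ~3935.75 km expectation, not taken from the transformer.
    EARTH_RADIUS_KM = 6371.0

    def haversine_km(lat1, lon1, lat2, lon2):
        """Great-circle distance in km between two points given in decimal degrees."""
        lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
        dlat, dlon = lat2 - lat1, lon2 - lon1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        return 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(a))

    # NYC -> LA: prints roughly 3935.75, matching the value asserted in test_multiple_rows.
    print(haversine_km(40.7128, -74.0060, 34.0522, -118.2437))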
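Likewise, the euclidean and manhattan expectations (``sqrt(2) * 111`` ≈ 157.0 km and ``2 * 111`` = 222 km for a one-degree offset at the equator) imply a flat-grid approximation of roughly 111 km per degree, and the miles check uses a factor of 0.621371 miles per km. A minimal sketch of those approximations, again following the constants stated in the tests rather than the transformer's own code:

.. code:: python

    import numpy as np

    KM_PER_DEGREE = 111.0    # rough length of one degree, as assumed by the expected test values
    MILES_PER_KM = 0.621371  # conversion factor asserted in test_output_units_conversion

    def euclidean_km(lat1, lon1, lat2, lon2):
        # Planar approximation: treat degrees as a flat grid scaled to kilometers.
        return np.hypot((lat2 - lat1) * KM_PER_DEGREE, (lon2 - lon1) * KM_PER_DEGREE)

    def manhattan_km(lat1, lon1, lat2, lon2):
        # Taxicab distance on the same flat grid.
        return (abs(lat2 - lat1) + abs(lon2 - lon1)) * KM_PER_DEGREE

    print(euclidean_km(0.0, 0.0, 1.0, 1.0))   # ~157.0 km, as in test_euclidean_method
    print(manhattan_km(0.0, 0.0, 1.0, 1.0))   # 222.0 km, as in test_manhattan_method
    print(3935.75 * MILES_PER_KM)             # ~2445.6 miles, the NYC -> LA distance in miles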