From 94f4ef901dbf4a6af3fdf0eb7b6bcdda3dc50b8f Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Fri, 26 Dec 2025 12:43:17 -0600 Subject: [PATCH 01/10] Add ArcSinhTransformer, TextFeatures, and GeoDistanceTransformer New transformers: - ArcSinhTransformer: Pseudo-log transformation for positive and negative values (addresses #707) - TextFeatures: Extract 19 text features from string columns (new text module) - GeoDistanceTransformer: Calculate geographic distances using Haversine/Euclidean/Manhattan (addresses #688) Includes: - Full test coverage (43 tests) - RST documentation for all transformers - Code style compliance (flake8, black, isort) --- .../creation/GeoDistanceTransformer.rst | 169 +++++++ docs/user_guide/creation/index.rst | 1 + docs/user_guide/index.rst | 1 + docs/user_guide/text/TextFeatures.rst | 152 ++++++ docs/user_guide/text/index.rst | 18 + .../transformation/ArcSinhTransformer.rst | 121 +++++ docs/user_guide/transformation/index.rst | 1 + feature_engine/creation/__init__.py | 5 +- feature_engine/creation/geo_features.py | 432 ++++++++++++++++++ feature_engine/text/__init__.py | 9 + feature_engine/text/text_features.py | 327 +++++++++++++ feature_engine/transformation/__init__.py | 5 +- feature_engine/transformation/arcsinh.py | 229 ++++++++++ tests/test_creation/test_geo_features.py | 266 +++++++++++ tests/test_text/__init__.py | 167 +++++++ tests/test_transformation/test_arcsinh.py | 122 +++++ 16 files changed, 2023 insertions(+), 2 deletions(-) create mode 100644 docs/user_guide/creation/GeoDistanceTransformer.rst create mode 100644 docs/user_guide/text/TextFeatures.rst create mode 100644 docs/user_guide/text/index.rst create mode 100644 docs/user_guide/transformation/ArcSinhTransformer.rst create mode 100644 feature_engine/creation/geo_features.py create mode 100644 feature_engine/text/__init__.py create mode 100644 feature_engine/text/text_features.py create mode 100644 feature_engine/transformation/arcsinh.py create mode 100644 tests/test_creation/test_geo_features.py create mode 100644 tests/test_text/__init__.py create mode 100644 tests/test_transformation/test_arcsinh.py diff --git a/docs/user_guide/creation/GeoDistanceTransformer.rst b/docs/user_guide/creation/GeoDistanceTransformer.rst new file mode 100644 index 000000000..625daf00c --- /dev/null +++ b/docs/user_guide/creation/GeoDistanceTransformer.rst @@ -0,0 +1,169 @@ +.. _geo_distance_transformer: + +.. currentmodule:: feature_engine.creation + +GeoDistanceTransformer +====================== + +The :class:`GeoDistanceTransformer()` calculates the distance between two geographical +coordinate pairs (latitude/longitude) and adds the result as a new feature. + +This transformer is useful for location-based machine learning problems such as +real estate pricing, delivery route optimization, ride-sharing applications, +and any domain where geographic proximity is relevant. + +Distance Methods +~~~~~~~~~~~~~~~~ + +The transformer supports different distance calculation methods: + +- **haversine**: Great-circle distance using the Haversine formula (default). + Most accurate for typical distances on Earth's surface. +- **euclidean**: Simple Euclidean distance in the coordinate space. + Fast but less accurate for long distances. +- **manhattan**: Manhattan (taxicab) distance in coordinate space. + Useful as a rough approximation for grid-based city layouts. 
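+
+To make the default method concrete, the snippet below is a minimal sketch of
+the Haversine calculation using plain numpy (the coordinates are the New York
+to Los Angeles pair used in the example further down; the dataframe and column
+names here are illustrative only):
+
+.. code:: python
+
+    import numpy as np
+    import pandas as pd
+
+    EARTH_RADIUS_KM = 6371.0
+
+    df = pd.DataFrame({
+        "lat1": [40.7128], "lon1": [-74.0060],   # New York
+        "lat2": [34.0522], "lon2": [-118.2437],  # Los Angeles
+    })
+
+    # convert degrees to radians
+    lat1, lon1, lat2, lon2 = (
+        np.radians(df[c]) for c in ["lat1", "lon1", "lat2", "lon2"]
+    )
+
+    # haversine term and great-circle distance: d = 2R * arcsin(sqrt(a))
+    a = (
+        np.sin((lat2 - lat1) / 2) ** 2
+        + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
+    )
+    distance_km = 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(a))
+
+    print(distance_km.iloc[0])  # ~3935.7 km, matching the first trip below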
+ +Output Units +~~~~~~~~~~~~ + +The distance can be output in various units: + +- **km**: Kilometers (default) +- **miles**: Miles +- **meters**: Meters +- **feet**: Feet + +Example +~~~~~~~ + +Let's create a dataframe with origin and destination coordinates: + +.. code:: python + + import pandas as pd + from feature_engine.creation import GeoDistanceTransformer + + # Sample data: trips between US cities + X = pd.DataFrame({ + 'origin_lat': [40.7128, 34.0522, 41.8781, 29.7604], + 'origin_lon': [-74.0060, -118.2437, -87.6298, -95.3698], + 'dest_lat': [34.0522, 41.8781, 40.7128, 33.4484], + 'dest_lon': [-118.2437, -87.6298, -74.0060, -112.0740], + 'trip_id': [1, 2, 3, 4] + }) + +Now let's calculate the distances: + +.. code:: python + + # Set up the transformer + gdt = GeoDistanceTransformer( + lat1='origin_lat', + lon1='origin_lon', + lat2='dest_lat', + lon2='dest_lon', + method='haversine', + output_unit='km', + output_col='distance_km' + ) + + # Fit and transform + gdt.fit(X) + X_transformed = gdt.transform(X) + + print(X_transformed[['trip_id', 'distance_km']]) + +Output: + +.. code:: python + + trip_id distance_km + 0 1 3935.746254 + 1 2 2808.517344 + 2 3 1144.286561 + 3 4 1634.724892 + +Using different distance methods +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + # Euclidean distance (faster but less accurate) + gdt_euclidean = GeoDistanceTransformer( + lat1='origin_lat', lon1='origin_lon', + lat2='dest_lat', lon2='dest_lon', + method='euclidean', + output_col='distance_euclidean' + ) + + # Manhattan distance (useful for grid cities) + gdt_manhattan = GeoDistanceTransformer( + lat1='origin_lat', lon1='origin_lon', + lat2='dest_lat', lon2='dest_lon', + method='manhattan', + output_col='distance_manhattan' + ) + +Converting to miles +~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + gdt = GeoDistanceTransformer( + lat1='origin_lat', lon1='origin_lon', + lat2='dest_lat', lon2='dest_lon', + output_unit='miles', + output_col='distance_miles' + ) + + gdt.fit(X) + X_transformed = gdt.transform(X) + +Dropping original coordinate columns +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: python + + gdt = GeoDistanceTransformer( + lat1='origin_lat', lon1='origin_lon', + lat2='dest_lat', lon2='dest_lon', + drop_original=True + ) + + gdt.fit(X) + X_transformed = gdt.transform(X) + + # Coordinate columns are removed + print(X_transformed.columns.tolist()) + # ['trip_id', 'geo_distance'] + +Using in a Pipeline +~~~~~~~~~~~~~~~~~~~ + +:class:`GeoDistanceTransformer()` works seamlessly with scikit-learn pipelines: + +.. code:: python + + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn.ensemble import RandomForestRegressor + + # Create a pipeline for price prediction + pipe = Pipeline([ + ('geo_distance', GeoDistanceTransformer( + lat1='origin_lat', lon1='origin_lon', + lat2='dest_lat', lon2='dest_lon', + output_unit='km', + drop_original=True + )), + ('scaler', StandardScaler()), + ('regressor', RandomForestRegressor()) + ]) + +API Reference +------------- + +.. 
autoclass:: GeoDistanceTransformer + :members: + :inherited-members: diff --git a/docs/user_guide/creation/index.rst b/docs/user_guide/creation/index.rst index c3aace734..93b094547 100644 --- a/docs/user_guide/creation/index.rst +++ b/docs/user_guide/creation/index.rst @@ -66,6 +66,7 @@ Feature creation module MathFeatures RelativeFeatures DecisionTreeFeatures + GeoDistanceTransformer Feature-engine in Practice -------------------------- diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst index c786e77e1..52c33a8f4 100644 --- a/docs/user_guide/index.rst +++ b/docs/user_guide/index.rst @@ -28,6 +28,7 @@ Creation creation/index datetime/index + text/index Selection diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst new file mode 100644 index 000000000..a1a82aa44 --- /dev/null +++ b/docs/user_guide/text/TextFeatures.rst @@ -0,0 +1,152 @@ +.. _text_features: + +.. currentmodule:: feature_engine.text + +TextFeatures +============ + +The :class:`TextFeatures()` extracts numerical features from text/string variables. +This transformer is useful for extracting basic text statistics that can be used +as features in machine learning models. + +Unlike scikit-learn's CountVectorizer or TfidfVectorizer which create sparse matrices, +:class:`TextFeatures()` extracts metadata features that remain in DataFrame format +and can be easily combined with other Feature-engine transformers in a pipeline. + +Available Features +~~~~~~~~~~~~~~~~~~ + +The transformer can extract the following features: + +- **char_count**: Number of characters in the text +- **word_count**: Number of words (whitespace-separated tokens) +- **sentence_count**: Number of sentences (based on .!? punctuation) +- **avg_word_length**: Average length of words +- **digit_count**: Number of digit characters +- **uppercase_count**: Number of uppercase letters +- **lowercase_count**: Number of lowercase letters +- **special_char_count**: Number of special characters (non-alphanumeric) +- **whitespace_count**: Number of whitespace characters +- **whitespace_ratio**: Ratio of whitespace to total characters +- **digit_ratio**: Ratio of digits to total characters +- **uppercase_ratio**: Ratio of uppercase to total characters +- **has_digits**: Binary indicator if text contains digits +- **has_uppercase**: Binary indicator if text contains uppercase +- **is_empty**: Binary indicator if text is empty +- **starts_with_uppercase**: Binary indicator if text starts with uppercase +- **ends_with_punctuation**: Binary indicator if text ends with .!? +- **unique_word_count**: Number of unique words (case-insensitive) +- **unique_word_ratio**: Ratio of unique words to total words + +Example +~~~~~~~ + +Let's create a dataframe with text data and extract features: + +.. code:: python + + import pandas as pd + from feature_engine.text import TextFeatures + + # Create sample data + X = pd.DataFrame({ + 'review': [ + 'This product is AMAZING! Best purchase ever.', + 'Not great. Would not recommend.', + 'OK for the price. 3 out of 5 stars.', + 'TERRIBLE!!! DO NOT BUY!', + ], + 'title': [ + 'Great Product', + 'Disappointed', + 'Average', + 'Awful', + ] + }) + +Now let's extract specific text features: + +.. 
code:: python + + # Set up the transformer with specific features + tf = TextFeatures( + variables=['review'], + features=['word_count', 'char_count', 'has_digits', 'uppercase_ratio'] + ) + + # Fit and transform + tf.fit(X) + X_transformed = tf.transform(X) + + print(X_transformed.columns.tolist()) + +Output: + +.. code:: python + + ['review', 'title', 'review_word_count', 'review_char_count', + 'review_has_digits', 'review_uppercase_ratio'] + +Extracting all features +~~~~~~~~~~~~~~~~~~~~~~~ + +By default, if no features are specified, all available features will be extracted: + +.. code:: python + + # Extract all features from all text columns + tf = TextFeatures() + tf.fit(X) + X_transformed = tf.transform(X) + + # This will create 19 new columns for each text variable + print(f"Original columns: {len(X.columns)}") + print(f"Transformed columns: {len(X_transformed.columns)}") + +Dropping original columns +~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can drop the original text columns after extracting features: + +.. code:: python + + tf = TextFeatures( + variables=['review'], + features=['word_count', 'char_count'], + drop_original=True + ) + + tf.fit(X) + X_transformed = tf.transform(X) + + # 'review' column is now removed + print(X_transformed.columns.tolist()) + +Using in a Pipeline +~~~~~~~~~~~~~~~~~~~ + +:class:`TextFeatures()` works seamlessly with scikit-learn pipelines: + +.. code:: python + + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn.linear_model import LogisticRegression + + # Create a pipeline + pipe = Pipeline([ + ('text_features', TextFeatures( + variables=['review'], + features=['word_count', 'char_count', 'uppercase_ratio'], + drop_original=True + )), + ('scaler', StandardScaler()), + ('classifier', LogisticRegression()) + ]) + +API Reference +------------- + +.. autoclass:: TextFeatures + :members: + :inherited-members: diff --git a/docs/user_guide/text/index.rst b/docs/user_guide/text/index.rst new file mode 100644 index 000000000..ea23d7362 --- /dev/null +++ b/docs/user_guide/text/index.rst @@ -0,0 +1,18 @@ +.. -*- mode: rst -*- + +Text Feature Extraction +======================= + +Feature-engine's text module includes transformers to extract numerical features +from text/string variables. + +Text feature extraction is useful for machine learning problems where you have +text data but want to derive numerical statistics without creating sparse +bag-of-words or TF-IDF representations. + +**Transformers** + +.. toctree:: + :maxdepth: 1 + + TextFeatures diff --git a/docs/user_guide/transformation/ArcSinhTransformer.rst b/docs/user_guide/transformation/ArcSinhTransformer.rst new file mode 100644 index 000000000..07945b463 --- /dev/null +++ b/docs/user_guide/transformation/ArcSinhTransformer.rst @@ -0,0 +1,121 @@ +.. _arcsinh_transformer: + +.. currentmodule:: feature_engine.transformation + +ArcSinhTransformer +================== + +The :class:`ArcSinhTransformer()` applies the inverse hyperbolic sine transformation +(arcsinh) to numerical variables. Also known as the pseudo-logarithm, this +transformation is useful for data that contains both positive and negative values. + +The transformation is: x → arcsinh((x - loc) / scale) + +For large |x|, arcsinh(x) behaves like ln(|x|) + ln(2), providing similar +variance-stabilizing properties as the log transformation. For small |x|, +it behaves approximately linearly (x → x). 
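+A quick numerical check makes both regimes visible. This is only a sketch that
+assumes numpy; the values are arbitrary:
+
+.. code:: python
+
+    import numpy as np
+
+    x = np.array([-10000.0, -1.0, -0.01, 0.0, 0.01, 1.0, 10000.0])
+
+    # arcsinh keeps the sign and is nearly the identity close to zero
+    print(np.arcsinh(x))
+    # roughly: [-9.90, -0.88, -0.01, 0.00, 0.01, 0.88, 9.90]
+
+    # for large magnitudes it tracks the natural log plus ln(2)
+    print(np.log(10000) + np.log(2))   # ~9.9035, close to arcsinh(10000)
+
+Values near zero pass through almost unchanged, while large magnitudes are
+compressed onto a log-like scale.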
This makes it ideal for variables +like net worth, profit/loss, or any metric that can be positive or negative. + +Unlike the :class:`LogTransformer()`, the :class:`ArcSinhTransformer()` can handle +zero and negative values without requiring any preprocessing. + +Example +~~~~~~~ + +Let's create a dataframe with positive and negative values and apply the arcsinh +transformation: + +.. code:: python + + import numpy as np + import pandas as pd + import matplotlib.pyplot as plt + from sklearn.model_selection import train_test_split + + from feature_engine.transformation import ArcSinhTransformer + + # Create sample data with positive and negative values + np.random.seed(42) + X = pd.DataFrame({ + 'profit': np.random.randn(1000) * 10000, # Values from -30000 to 30000 + 'net_worth': np.random.randn(1000) * 50000, + }) + + # Separate into train and test + X_train, X_test = train_test_split(X, test_size=0.3, random_state=0) + +Now let's set up the ArcSinhTransformer: + +.. code:: python + + # Set up the arcsinh transformer + tf = ArcSinhTransformer(variables=['profit', 'net_worth']) + + # Fit the transformer + tf.fit(X_train) + +The transformer does not learn any parameters when applying the fit method. It does +check however that the variables are numerical. + +We can now transform the variables: + +.. code:: python + + # Transform the data + train_t = tf.transform(X_train) + test_t = tf.transform(X_test) + +The arcsinh transformation compresses extreme values while preserving the sign: + +.. code:: python + + # Compare original and transformed distributions + fig, axes = plt.subplots(1, 2, figsize=(12, 4)) + + X_train['profit'].hist(ax=axes[0], bins=50) + axes[0].set_title('Original profit') + + train_t['profit'].hist(ax=axes[1], bins=50) + axes[1].set_title('Transformed profit') + + plt.tight_layout() + +Using loc and scale parameters +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The :class:`ArcSinhTransformer()` supports location and scale parameters to +center and normalize data before transformation: + +.. code:: python + + # Center around mean and scale by std + tf = ArcSinhTransformer( + variables=['profit'], + loc=X_train['profit'].mean(), + scale=X_train['profit'].std() + ) + + tf.fit(X_train) + train_t = tf.transform(X_train) + +Inverse transformation +~~~~~~~~~~~~~~~~~~~~~~ + +The :class:`ArcSinhTransformer()` supports inverse transformation to recover +the original values: + +.. code:: python + + # Transform and then inverse transform + train_t = tf.transform(X_train) + train_recovered = tf.inverse_transform(train_t) + + # Values should match original + np.allclose(X_train['profit'], train_recovered['profit']) + +API Reference +------------- + +.. autoclass:: ArcSinhTransformer + :members: + :inherited-members: diff --git a/docs/user_guide/transformation/index.rst b/docs/user_guide/transformation/index.rst index 85422c9f6..00ce20bfb 100644 --- a/docs/user_guide/transformation/index.rst +++ b/docs/user_guide/transformation/index.rst @@ -33,6 +33,7 @@ on the nature of the variable. 
LogCpTransformer ReciprocalTransformer ArcsinTransformer + ArcSinhTransformer PowerTransformer BoxCoxTransformer YeoJohnsonTransformer diff --git a/feature_engine/creation/__init__.py b/feature_engine/creation/__init__.py index df7dab5a7..b3c84ba15 100644 --- a/feature_engine/creation/__init__.py +++ b/feature_engine/creation/__init__.py @@ -4,12 +4,15 @@ """ from .cyclical_features import CyclicalFeatures from .decision_tree_features import DecisionTreeFeatures +from .geo_features import GeoDistanceTransformer from .math_features import MathFeatures from .relative_features import RelativeFeatures __all__ = [ + "CyclicalFeatures", "DecisionTreeFeatures", + "GeoDistanceTransformer", "MathFeatures", "RelativeFeatures", - "CyclicalFeatures", ] + diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py new file mode 100644 index 000000000..1488106e9 --- /dev/null +++ b/feature_engine/creation/geo_features.py @@ -0,0 +1,432 @@ +# Authors: Ankit Hemant Lade (contributor) +# License: BSD 3 clause + +from typing import List, Literal, Optional + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted + +from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin +from feature_engine._check_init_parameters.check_init_input_params import ( + _check_param_drop_original, +) +from feature_engine.dataframe_checks import ( + _check_contains_na, + _check_X_matches_training_df, + check_X, +) +from feature_engine.tags import _return_tags +from feature_engine.variable_handling import check_numerical_variables + +# Earth's radius in different units +EARTH_RADIUS = { + "km": 6371.0, + "miles": 3958.8, + "meters": 6371000.0, + "feet": 20902231.0, +} + + +class GeoDistanceTransformer(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): + """ + GeoDistanceTransformer() calculates the distance between two geographical + coordinate pairs (latitude/longitude) and adds the result as a new feature. + + This transformer is useful for location-based machine learning problems such as + real estate pricing, delivery route optimization, ride-sharing applications, + and any domain where geographic proximity is relevant. + + The transformer supports different distance calculation methods: + + - 'haversine': Great-circle distance using the Haversine formula (default). + Most accurate for typical distances on Earth's surface. + - 'euclidean': Simple Euclidean distance in the coordinate space. + Fast but less accurate for long distances. + - 'manhattan': Manhattan (taxicab) distance in coordinate space. + Useful as a rough approximation for grid-based city layouts. + + More details in the :ref:`User Guide `. + + Parameters + ---------- + lat1: str + Column name containing the latitude of the first point. + + lon1: str + Column name containing the longitude of the first point. + + lat2: str + Column name containing the latitude of the second point. + + lon2: str + Column name containing the longitude of the second point. + + method: str, default='haversine' + The distance calculation method. Options are: + - 'haversine': Great-circle distance (most accurate) + - 'euclidean': Euclidean distance in coordinate space + - 'manhattan': Manhattan distance in coordinate space + + output_unit: str, default='km' + The unit for the output distance. 
Options are: + - 'km': Kilometers + - 'miles': Miles + - 'meters': Meters + - 'feet': Feet + + output_col: str, default='geo_distance' + Name of the new column containing the calculated distances. + + drop_original: bool, default=False + Whether to drop the original coordinate columns after transformation. + + Attributes + ---------- + variables_: + List of the coordinate variables used for distance calculation. + + feature_names_in_: + List with the names of features seen during fit. + + n_features_in_: + The number of features in the train set used in fit. + + Methods + ------- + fit: + This transformer does not learn parameters. Validates input columns. + + fit_transform: + Fit to data, then transform it. + + transform: + Calculate distances and add them as a new column. + + get_feature_names_out: + Get output feature names for transformation. + + See Also + -------- + feature_engine.creation.MathFeatures : + Combines existing features using mathematical operations. + feature_engine.creation.RelativeFeatures : + Creates features relative to reference variables. + + References + ---------- + .. [1] Haversine formula: https://en.wikipedia.org/wiki/Haversine_formula + + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.creation import GeoDistanceTransformer + >>> X = pd.DataFrame({ + ... 'origin_lat': [40.7128, 34.0522, 41.8781], + ... 'origin_lon': [-74.0060, -118.2437, -87.6298], + ... 'dest_lat': [34.0522, 41.8781, 40.7128], + ... 'dest_lon': [-118.2437, -87.6298, -74.0060], + ... }) + >>> gdt = GeoDistanceTransformer( + ... lat1='origin_lat', lon1='origin_lon', + ... lat2='dest_lat', lon2='dest_lon', + ... method='haversine', output_unit='km' + ... ) + >>> gdt.fit(X) + >>> X = gdt.transform(X) + >>> X + origin_lat origin_lon dest_lat dest_lon geo_distance + 0 40.7128 -74.0060 34.0522 -118.2437 3935.746254 + 1 34.0522 -118.2437 41.8781 -87.6298 2808.517344 + 2 41.8781 -87.6298 40.7128 -74.0060 1144.286561 + """ + + def __init__( + self, + lat1: str, + lon1: str, + lat2: str, + lon2: str, + method: Literal["haversine", "euclidean", "manhattan"] = "haversine", + output_unit: Literal["km", "miles", "meters", "feet"] = "km", + output_col: str = "geo_distance", + drop_original: bool = False, + ) -> None: + + # Validate coordinate column names + for param_name, param_value in [ + ("lat1", lat1), + ("lon1", lon1), + ("lat2", lat2), + ("lon2", lon2), + ]: + if not isinstance(param_value, str): + raise ValueError( + f"{param_name} must be a string. Got {type(param_value).__name__}." + ) + + # Validate method + valid_methods = ["haversine", "euclidean", "manhattan"] + if method not in valid_methods: + raise ValueError( + f"method must be one of {valid_methods}. Got '{method}' instead." + ) + + # Validate output_unit + valid_units = ["km", "miles", "meters", "feet"] + if output_unit not in valid_units: + raise ValueError( + f"output_unit must be one of {valid_units}. " + f"Got '{output_unit}' instead." + ) + + # Validate output_col + if not isinstance(output_col, str): + raise ValueError( + f"output_col must be a string. Got {type(output_col).__name__}." + ) + + _check_param_drop_original(drop_original) + + self.lat1 = lat1 + self.lon1 = lon1 + self.lat2 = lat2 + self.lon2 = lon2 + self.method = method + self.output_unit = output_unit + self.output_col = output_col + self.drop_original = drop_original + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + This transformer does not learn parameters. 
+ + Validates that the coordinate columns exist and are numerical. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training input samples. + + y: pandas Series, or np.array. Defaults to None. + It is not needed in this transformer. You can pass y or None. + + Returns + ------- + self: GeoDistanceTransformer + The fitted transformer. + """ + + # check input dataframe + X = check_X(X) + + # Store coordinate variables + self.variables_ = [self.lat1, self.lon1, self.lat2, self.lon2] + + # Check all coordinate columns exist + missing = set(self.variables_) - set(X.columns) + if missing: + raise ValueError( + f"Coordinate columns {missing} are not present in the dataframe." + ) + + # Check coordinate columns are numerical + check_numerical_variables(X, self.variables_) + + # Check for missing values + _check_contains_na(X, self.variables_) + + # Validate coordinate ranges (optional sanity check) + for lat_col in [self.lat1, self.lat2]: + if (X[lat_col].abs() > 90).any(): + raise ValueError( + f"Latitude values in '{lat_col}' must be between -90 and 90." + ) + + for lon_col in [self.lon1, self.lon2]: + if (X[lon_col].abs() > 180).any(): + raise ValueError( + f"Longitude values in '{lon_col}' must be between -180 and 180." + ) + + # save input features + self.feature_names_in_ = X.columns.tolist() + + # save train set shape + self.n_features_in_ = X.shape[1] + + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Calculate distances and add them as a new column. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. + + Returns + ------- + X_new: Pandas dataframe + The dataframe with the new distance column added. + """ + + # Check method fit has been called + check_is_fitted(self) + + # check that input is a dataframe + X = check_X(X) + + # Check if input data contains same number of columns as dataframe used to fit. 
+ _check_X_matches_training_df(X, self.n_features_in_) + + # Check for missing values + _check_contains_na(X, self.variables_) + + # reorder variables to match train set + X = X[self.feature_names_in_] + + # Calculate distance based on method + if self.method == "haversine": + distances = self._haversine_distance( + X[self.lat1].values, + X[self.lon1].values, + X[self.lat2].values, + X[self.lon2].values, + ) + elif self.method == "euclidean": + distances = self._euclidean_distance( + X[self.lat1].values, + X[self.lon1].values, + X[self.lat2].values, + X[self.lon2].values, + ) + else: # manhattan + distances = self._manhattan_distance( + X[self.lat1].values, + X[self.lon1].values, + X[self.lat2].values, + X[self.lon2].values, + ) + + X[self.output_col] = distances + + if self.drop_original: + X = X.drop(columns=self.variables_) + + return X + + def _haversine_distance( + self, + lat1: np.ndarray, + lon1: np.ndarray, + lat2: np.ndarray, + lon2: np.ndarray, + ) -> np.ndarray: + """Calculate the great-circle distance using the Haversine formula.""" + + # Convert to radians + lat1_rad = np.radians(lat1) + lat2_rad = np.radians(lat2) + lon1_rad = np.radians(lon1) + lon2_rad = np.radians(lon2) + + # Haversine formula + dlat = lat2_rad - lat1_rad + dlon = lon2_rad - lon1_rad + + a = ( + np.sin(dlat / 2) ** 2 + + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2 + ) + c = 2 * np.arcsin(np.sqrt(a)) + + # Distance in the requested unit + distance = EARTH_RADIUS[self.output_unit] * c + + return distance + + def _euclidean_distance( + self, + lat1: np.ndarray, + lon1: np.ndarray, + lat2: np.ndarray, + lon2: np.ndarray, + ) -> np.ndarray: + """Calculate Euclidean distance in coordinate space.""" + + # Simple Euclidean distance (approximate, best for short distances) + # Convert to approximate km then to requested unit + dlat = lat2 - lat1 + dlon = lon2 - lon1 + + # Approximate degrees to km (at equator) + km_per_degree = 111.0 + distance_km = np.sqrt((dlat * km_per_degree) ** 2 + (dlon * km_per_degree) ** 2) + + # Convert to requested unit + conversion = EARTH_RADIUS[self.output_unit] / EARTH_RADIUS["km"] + return distance_km * conversion + + def _manhattan_distance( + self, + lat1: np.ndarray, + lon1: np.ndarray, + lat2: np.ndarray, + lon2: np.ndarray, + ) -> np.ndarray: + """Calculate Manhattan (taxicab) distance in coordinate space.""" + + dlat = np.abs(lat2 - lat1) + dlon = np.abs(lon2 - lon1) + + # Approximate degrees to km (at equator) + km_per_degree = 111.0 + distance_km = (dlat + dlon) * km_per_degree + + # Convert to requested unit + conversion = EARTH_RADIUS[self.output_unit] / EARTH_RADIUS["km"] + return distance_km * conversion + + def get_feature_names_out(self, input_features=None) -> List[str]: + """ + Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. If None, uses feature_names_in_. + + Returns + ------- + feature_names_out : list of str + Output feature names. 
+ """ + check_is_fitted(self) + + if self.drop_original: + feature_names = [ + f for f in self.feature_names_in_ if f not in self.variables_ + ] + else: + feature_names = list(self.feature_names_in_) + + feature_names.append(self.output_col) + + return feature_names + + def _more_tags(self): + tags_dict = _return_tags() + tags_dict["variables"] = "numerical" + # This transformer has mandatory parameters + tags_dict["_xfail_checks"][ + "check_parameters_default_constructible" + ] = "transformer has mandatory parameters" + return tags_dict + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + return tags diff --git a/feature_engine/text/__init__.py b/feature_engine/text/__init__.py new file mode 100644 index 000000000..14626b79c --- /dev/null +++ b/feature_engine/text/__init__.py @@ -0,0 +1,9 @@ +""" +The module text includes classes to extract features from text/string variables. +""" + +from .text_features import TextFeatures + +__all__ = [ + "TextFeatures", +] diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py new file mode 100644 index 000000000..c06afdf79 --- /dev/null +++ b/feature_engine/text/text_features.py @@ -0,0 +1,327 @@ +# Authors: Ankit Hemant Lade (contributor) +# License: BSD 3 clause + +from typing import List, Optional, Union + +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted + +from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin +from feature_engine._check_init_parameters.check_init_input_params import ( + _check_param_drop_original, +) +from feature_engine.dataframe_checks import _check_X_matches_training_df, check_X +from feature_engine.tags import _return_tags + +# Available text features and their computation functions +TEXT_FEATURES = { + "char_count": lambda x: x.str.len(), + "word_count": lambda x: x.str.split().str.len(), + "sentence_count": lambda x: x.str.count(r"[.!?]+"), + "avg_word_length": lambda x: x.apply( + lambda s: sum(len(w) for w in str(s).split()) / max(len(str(s).split()), 1) + ), + "digit_count": lambda x: x.str.count(r"\d"), + "uppercase_count": lambda x: x.str.count(r"[A-Z]"), + "lowercase_count": lambda x: x.str.count(r"[a-z]"), + "special_char_count": lambda x: x.str.count(r"[^a-zA-Z0-9\s]"), + "whitespace_count": lambda x: x.str.count(r"\s"), + "whitespace_ratio": lambda x: x.str.count(r"\s") / x.str.len().replace(0, 1), + "digit_ratio": lambda x: x.str.count(r"\d") / x.str.len().replace(0, 1), + "uppercase_ratio": lambda x: x.str.count(r"[A-Z]") / x.str.len().replace(0, 1), + "has_digits": lambda x: x.str.contains(r"\d", regex=True).astype(int), + "has_uppercase": lambda x: x.str.contains(r"[A-Z]", regex=True).astype(int), + "is_empty": lambda x: (x.str.len() == 0).astype(int), + "starts_with_uppercase": lambda x: x.str.match(r"^[A-Z]").astype(int), + "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int), + "unique_word_count": lambda x: x.apply(lambda s: len(set(str(s).lower().split()))), + "unique_word_ratio": lambda x: x.apply( + lambda s: len(set(str(s).lower().split())) / max(len(str(s).split()), 1) + ), +} + + +class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): + """ + TextFeatures() extracts numerical features from text/string variables. This + transformer is useful for extracting basic text statistics that can be used + as features in machine learning models. 
+ + The transformer can extract various text features including character counts, + word counts, sentence counts, and various ratios and indicators. + + A list of variables can be passed as an argument. Alternatively, the transformer + will automatically select and transform all variables of type object (string). + + More details in the :ref:`User Guide `. + + Parameters + ---------- + variables: list, default=None + The list of text/string variables to extract features from. If None, the + transformer will automatically select all object (string) columns. + + features: list, default=None + List of text features to extract. Available features are: + + - 'char_count': Number of characters in the text + - 'word_count': Number of words (whitespace-separated tokens) + - 'sentence_count': Number of sentences (based on .!? punctuation) + - 'avg_word_length': Average length of words + - 'digit_count': Number of digit characters + - 'uppercase_count': Number of uppercase letters + - 'lowercase_count': Number of lowercase letters + - 'special_char_count': Number of special characters (non-alphanumeric) + - 'whitespace_count': Number of whitespace characters + - 'whitespace_ratio': Ratio of whitespace to total characters + - 'digit_ratio': Ratio of digits to total characters + - 'uppercase_ratio': Ratio of uppercase to total characters + - 'has_digits': Binary indicator if text contains digits + - 'has_uppercase': Binary indicator if text contains uppercase + - 'is_empty': Binary indicator if text is empty + - 'starts_with_uppercase': Binary indicator if text starts with uppercase + - 'ends_with_punctuation': Binary indicator if text ends with .!? + - 'unique_word_count': Number of unique words (case-insensitive) + - 'unique_word_ratio': Ratio of unique words to total words + + If None, extracts all available features. + + drop_original: bool, default=False + Whether to drop the original text columns after transformation. + + Attributes + ---------- + variables_: + The list of text variables that will be transformed. + + features_: + The list of features that will be extracted. + + feature_names_in_: + List with the names of features seen during fit. + + n_features_in_: + The number of features in the train set used in fit. + + Methods + ------- + fit: + This transformer does not learn parameters. It stores the feature names + and validates input. + + fit_transform: + Fit to data, then transform it. + + transform: + Extract text features and add them to the dataframe. + + get_feature_names_out: + Get output feature names for transformation. + + See Also + -------- + feature_engine.encoding.StringSimilarityEncoder : + Encodes categorical variables based on string similarity. + + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.text import TextFeatures + >>> X = pd.DataFrame({ + ... 'text': ['Hello World!', 'Python is GREAT.', 'ML rocks 123'] + ... }) + >>> tf = TextFeatures(features=['char_count', 'word_count', 'has_digits']) + >>> tf.fit(X) + >>> X = tf.transform(X) + >>> X + text text_char_count text_word_count text_has_digits + 0 Hello World! 12 2 0 + 1 Python is GREAT. 
16 3 0 + 2 ML rocks 123 12 3 1 + """ + + def __init__( + self, + variables: Union[None, str, List[str]] = None, + features: Union[None, List[str]] = None, + drop_original: bool = False, + ) -> None: + + # Validate variables + if variables is not None: + if isinstance(variables, str): + variables = [variables] + elif not isinstance(variables, list) or not all( + isinstance(v, str) for v in variables + ): + raise ValueError( + "variables must be None, a string, or a list of strings. " + f"Got {type(variables).__name__} instead." + ) + + # Validate features + if features is not None: + if not isinstance(features, list) or not all( + isinstance(f, str) for f in features + ): + raise ValueError( + "features must be None or a list of strings. " + f"Got {type(features).__name__} instead." + ) + invalid_features = set(features) - set(TEXT_FEATURES.keys()) + if invalid_features: + raise ValueError( + f"Invalid features: {invalid_features}. " + f"Available features are: {list(TEXT_FEATURES.keys())}" + ) + + _check_param_drop_original(drop_original) + + self.variables = variables + self.features = features + self.drop_original = drop_original + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + This transformer does not learn parameters. + + Stores feature names and validates that the specified variables are + present and are of string/object type. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training input samples. + + y: pandas Series, or np.array. Defaults to None. + It is not needed in this transformer. You can pass y or None. + + Returns + ------- + self: TextFeatures + The fitted transformer. + """ + + # check input dataframe + X = check_X(X) + + # Find or validate text variables + if self.variables is None: + # Select object/string columns + self.variables_ = [col for col in X.columns if X[col].dtype == "object"] + if len(self.variables_) == 0: + raise ValueError( + "No object/string columns found in the dataframe. " + "Please specify variables explicitly." + ) + else: + # Validate user-specified variables exist + missing = set(self.variables) - set(X.columns) + if missing: + raise ValueError( + f"Variables {missing} are not present in the dataframe." + ) + self.variables_ = self.variables + + # Set features to extract + if self.features is None: + self.features_ = list(TEXT_FEATURES.keys()) + else: + self.features_ = self.features + + # save input features + self.feature_names_in_ = X.columns.tolist() + + # save train set shape + self.n_features_in_ = X.shape[1] + + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Extract text features and add them to the dataframe. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. + + Returns + ------- + X_new: Pandas dataframe + The dataframe with the original columns plus the new text features. + """ + + # Check method fit has been called + check_is_fitted(self) + + # check that input is a dataframe + X = check_X(X) + + # Check if input data contains same number of columns as dataframe used to fit. 
+ _check_X_matches_training_df(X, self.n_features_in_) + + # reorder variables to match train set + X = X[self.feature_names_in_] + + # Extract features for each text variable + for var in self.variables_: + # Fill NaN with empty string for feature extraction + text_col = X[var].fillna("") + + for feature_name in self.features_: + new_col_name = f"{var}_{feature_name}" + feature_func = TEXT_FEATURES[feature_name] + X[new_col_name] = feature_func(text_col) + + # Fill any NaN values resulting from computation with 0 + X[new_col_name] = X[new_col_name].fillna(0) + + if self.drop_original: + X = X.drop(columns=self.variables_) + + return X + + def get_feature_names_out(self, input_features=None) -> List[str]: + """ + Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. If None, uses feature_names_in_. + + Returns + ------- + feature_names_out : list of str + Output feature names. + """ + check_is_fitted(self) + + # Start with original features + if self.drop_original: + feature_names = [ + f for f in self.feature_names_in_ if f not in self.variables_ + ] + else: + feature_names = list(self.feature_names_in_) + + # Add new text feature names + for var in self.variables_: + for feature_name in self.features_: + feature_names.append(f"{var}_{feature_name}") + + return feature_names + + def _more_tags(self): + tags_dict = _return_tags() + tags_dict["allow_nan"] = True + tags_dict["variables"] = "categorical" + return tags_dict + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags diff --git a/feature_engine/transformation/__init__.py b/feature_engine/transformation/__init__.py index 15011ac4b..f60dbd72b 100644 --- a/feature_engine/transformation/__init__.py +++ b/feature_engine/transformation/__init__.py @@ -4,6 +4,7 @@ """ from .arcsin import ArcsinTransformer +from .arcsinh import ArcSinhTransformer from .boxcox import BoxCoxTransformer from .log import LogCpTransformer, LogTransformer from .power import PowerTransformer @@ -11,11 +12,13 @@ from .yeojohnson import YeoJohnsonTransformer __all__ = [ + "ArcsinTransformer", + "ArcSinhTransformer", "BoxCoxTransformer", "LogTransformer", "LogCpTransformer", "PowerTransformer", "ReciprocalTransformer", "YeoJohnsonTransformer", - "ArcsinTransformer", ] + diff --git a/feature_engine/transformation/arcsinh.py b/feature_engine/transformation/arcsinh.py new file mode 100644 index 000000000..98a44478e --- /dev/null +++ b/feature_engine/transformation/arcsinh.py @@ -0,0 +1,229 @@ +# Authors: Ankit Hemant Lade (contributor) +# License: BSD 3 clause + +from typing import List, Optional, Union + +import numpy as np +import pandas as pd + +from feature_engine._base_transformers.base_numerical import BaseNumericalTransformer +from feature_engine._check_init_parameters.check_variables import ( + _check_variables_input_value, +) +from feature_engine._docstrings.fit_attributes import ( + _feature_names_in_docstring, + _n_features_in_docstring, + _variables_attribute_docstring, +) +from feature_engine._docstrings.init_parameters.all_trasnformers import ( + _variables_numerical_docstring, +) +from feature_engine._docstrings.methods import ( + _fit_not_learn_docstring, + _fit_transform_docstring, + _inverse_transform_docstring, +) +from feature_engine._docstrings.substitute import Substitution +from feature_engine.tags import _return_tags + + +@Substitution( + variables=_variables_numerical_docstring, + 
variables_=_variables_attribute_docstring, + feature_names_in_=_feature_names_in_docstring, + n_features_in_=_n_features_in_docstring, + fit=_fit_not_learn_docstring, + fit_transform=_fit_transform_docstring, + inverse_transform=_inverse_transform_docstring, +) +class ArcSinhTransformer(BaseNumericalTransformer): + """ + The ArcSinhTransformer() applies the inverse hyperbolic sine transformation + (arcsinh) to numerical variables. Also known as the pseudo-logarithm, this + transformation is useful for data that contains both positive and negative values. + + The transformation is: x → arcsinh((x - loc) / scale) + + For large |x|, arcsinh(x) behaves like ln(|x|) + ln(2), providing similar + variance-stabilizing properties as the log transformation. For small |x|, + it behaves approximately linearly (x → x). This makes it ideal for variables + like net worth, profit/loss, or any metric that can be positive or negative. + + A list of variables can be passed as an argument. Alternatively, the transformer + will automatically select and transform all variables of type numeric. + + More details in the :ref:`User Guide `. + + Parameters + ---------- + {variables} + + loc: float, default=0.0 + Location parameter for shifting the data before transformation. + The transformation becomes: arcsinh((x - loc) / scale) + + scale: float, default=1.0 + Scale parameter for normalizing the data before transformation. + Must be greater than 0. The transformation becomes: arcsinh((x - loc) / scale) + + Attributes + ---------- + {variables_} + + {feature_names_in_} + + {n_features_in_} + + Methods + ------- + {fit} + + {fit_transform} + + {inverse_transform} + + transform: + Transform the variables using the arcsinh function. + + See Also + -------- + feature_engine.transformation.LogTransformer : + Applies log transformation (only for positive values). + feature_engine.transformation.YeoJohnsonTransformer : + Applies Yeo-Johnson transformation. + + References + ---------- + .. [1] Burbidge, J. B., Magee, L., & Robb, A. L. (1988). Alternative + transformations to handle extreme values of the dependent variable. + Journal of the American Statistical Association, 83(401), 123-127. + + Examples + -------- + + >>> import numpy as np + >>> import pandas as pd + >>> from feature_engine.transformation import ArcSinhTransformer + >>> np.random.seed(42) + >>> X = pd.DataFrame(dict(x = np.random.randn(100) * 1000)) + >>> ast = ArcSinhTransformer() + >>> ast.fit(X) + >>> X = ast.transform(X) + >>> X.head() + x + 0 7.516076 + 1 -6.330816 + 2 7.780254 + 3 8.825252 + 4 -6.995893 + """ + + def __init__( + self, + variables: Union[None, int, str, List[Union[str, int]]] = None, + loc: float = 0.0, + scale: float = 1.0, + ) -> None: + + if not isinstance(loc, (int, float)): + raise ValueError( + f"loc must be a number (int or float). " + f"Got {type(loc).__name__} instead." + ) + + if not isinstance(scale, (int, float)) or scale <= 0: + raise ValueError( + f"scale must be a positive number (> 0). Got {scale} instead." + ) + + self.variables = _check_variables_input_value(variables) + self.loc = float(loc) + self.scale = float(scale) + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + This transformer does not learn parameters. + + Selects the numerical variables and stores feature names. + + Parameters + ---------- + X: Pandas DataFrame of shape = [n_samples, n_features]. + The training input samples. Can be the entire dataframe, not just the + variables to transform. 
+ + y: pandas Series, default=None + It is not needed in this transformer. You can pass y or None. + + Returns + ------- + self: ArcSinhTransformer + The fitted transformer. + """ + + # check input dataframe and find/check numerical variables + X = super().fit(X) + + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Transform the variables using the arcsinh function. + + Parameters + ---------- + X: Pandas DataFrame of shape = [n_samples, n_features] + The data to be transformed. + + Returns + ------- + X_new: pandas dataframe + The dataframe with the transformed variables. + """ + + # check input dataframe and if class was fitted + X = self._check_transform_input_and_state(X) + + # Ensure float dtype for the transformation + X[self.variables_] = X[self.variables_].astype(float) + + # Apply arcsinh transformation: arcsinh((x - loc) / scale) + X.loc[:, self.variables_] = np.arcsinh( + (X.loc[:, self.variables_] - self.loc) / self.scale + ) + + return X + + def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Convert the data back to the original representation. + + Parameters + ---------- + X: Pandas DataFrame of shape = [n_samples, n_features] + The data to be inverse transformed. + + Returns + ------- + X_tr: pandas dataframe + The dataframe with the inverse transformed variables. + """ + + # check input dataframe and if class was fitted + X = self._check_transform_input_and_state(X) + + # Inverse transform: x = sinh(y) * scale + loc + X.loc[:, self.variables_] = ( + np.sinh(X.loc[:, self.variables_]) * self.scale + self.loc + ) + + return X + + def _more_tags(self): + tags_dict = _return_tags() + tags_dict["variables"] = "numerical" + return tags_dict + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + return tags diff --git a/tests/test_creation/test_geo_features.py b/tests/test_creation/test_geo_features.py new file mode 100644 index 000000000..b6bad01d4 --- /dev/null +++ b/tests/test_creation/test_geo_features.py @@ -0,0 +1,266 @@ +import numpy as np +import pandas as pd +import pytest + +from feature_engine.creation import GeoDistanceTransformer + + +class TestGeoDistanceTransformer: + """Test cases for GeoDistanceTransformer.""" + + def test_haversine_distance_default(self): + """Test Haversine distance calculation with default parameters.""" + # New York to Los Angeles + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + X_tr = transformer.fit_transform(X) + + # Distance should be approximately 3935-3944 km + assert "geo_distance" in X_tr.columns + assert 3900 < X_tr["geo_distance"].iloc[0] < 4000 + + def test_haversine_distance_miles(self): + """Test Haversine distance in miles.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" + ) + X_tr = transformer.fit_transform(X) + + # Distance should be approximately 2445 miles + assert 2400 < X_tr["geo_distance"].iloc[0] < 2500 + + def test_same_location_zero_distance(self): + """Test that same location returns zero distance.""" + X = pd.DataFrame( + { + "lat1": [40.7128, 34.0522], + "lon1": [-74.0060, -118.2437], + "lat2": [40.7128, 34.0522], + "lon2": [-74.0060, -118.2437], + } + ) + transformer = GeoDistanceTransformer( + 
lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + X_tr = transformer.fit_transform(X) + + np.testing.assert_array_almost_equal( + X_tr["geo_distance"].values, [0.0, 0.0], decimal=10 + ) + + def test_euclidean_method(self): + """Test Euclidean distance method.""" + X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="euclidean" + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["geo_distance"].iloc[0] > 0 + + def test_manhattan_method(self): + """Test Manhattan distance method.""" + X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="manhattan" + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["geo_distance"].iloc[0] > 0 + + def test_custom_output_column_name(self): + """Test custom output column name.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_col="distance_km" + ) + X_tr = transformer.fit_transform(X) + + assert "distance_km" in X_tr.columns + assert "geo_distance" not in X_tr.columns + + def test_drop_original_columns(self): + """Test drop_original parameter.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + "other": [1], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", drop_original=True + ) + X_tr = transformer.fit_transform(X) + + assert "lat1" not in X_tr.columns + assert "lon1" not in X_tr.columns + assert "lat2" not in X_tr.columns + assert "lon2" not in X_tr.columns + assert "geo_distance" in X_tr.columns + assert "other" in X_tr.columns + + def test_multiple_rows(self): + """Test with multiple rows.""" + X = pd.DataFrame( + { + "origin_lat": [40.7128, 34.0522, 41.8781], + "origin_lon": [-74.0060, -118.2437, -87.6298], + "dest_lat": [34.0522, 41.8781, 40.7128], + "dest_lon": [-118.2437, -87.6298, -74.0060], + } + ) + transformer = GeoDistanceTransformer( + lat1="origin_lat", lon1="origin_lon", lat2="dest_lat", lon2="dest_lon" + ) + X_tr = transformer.fit_transform(X) + + assert len(X_tr["geo_distance"]) == 3 + # All distances should be positive + assert all(X_tr["geo_distance"] > 0) + + def test_invalid_method_raises_error(self): + """Test that invalid method raises ValueError.""" + with pytest.raises(ValueError, match="method must be one of"): + GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="invalid" + ) + + def test_invalid_output_unit_raises_error(self): + """Test that invalid output_unit raises ValueError.""" + with pytest.raises(ValueError, match="output_unit must be one of"): + GeoDistanceTransformer( + lat1="lat1", + lon1="lon1", + lat2="lat2", + lon2="lon2", + output_unit="invalid", + ) + + def test_missing_columns_raises_error(self): + """Test that missing columns raise ValueError on fit.""" + X = pd.DataFrame({"lat1": [1], "lon1": [1]}) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) + + def test_invalid_latitude_range_raises_error(self): + """Test that latitude out of range raises ValueError.""" + X = pd.DataFrame( + { + 
"lat1": [100], # Invalid: outside -90 to 90 + "lon1": [0], + "lat2": [0], + "lon2": [0], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + with pytest.raises(ValueError, match="Latitude values.*must be between"): + transformer.fit(X) + + def test_invalid_longitude_range_raises_error(self): + """Test that longitude out of range raises ValueError.""" + X = pd.DataFrame( + { + "lat1": [0], + "lon1": [200], # Invalid: outside -180 to 180 + "lat2": [0], + "lon2": [0], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + with pytest.raises(ValueError, match="Longitude values.*must be between"): + transformer.fit(X) + + def test_fit_stores_attributes(self): + """Test that fit stores expected attributes.""" + X = pd.DataFrame( + {"lat1": [40.0], "lon1": [-74.0], "lat2": [34.0], "lon2": [-118.0]} + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + assert set(transformer.variables_) == {"lat1", "lon1", "lat2", "lon2"} + + def test_get_feature_names_out(self): + """Test get_feature_names_out returns correct names.""" + X = pd.DataFrame( + { + "lat1": [40.0], + "lon1": [-74.0], + "lat2": [34.0], + "lon2": [-118.0], + "other": [1], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "geo_distance" in feature_names + assert len(feature_names) == 6 # 5 original + 1 new + + def test_output_units_conversion(self): + """Test different output units give consistent results.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + + # Get distance in km and miles + transformer_km = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="km" + ) + transformer_miles = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" + ) + + dist_km = transformer_km.fit_transform(X.copy())["geo_distance"].iloc[0] + dist_miles = transformer_miles.fit_transform(X.copy())["geo_distance"].iloc[0] + + # 1 km ≈ 0.621371 miles + expected_miles = dist_km * 0.621371 + np.testing.assert_almost_equal(dist_miles, expected_miles, decimal=0) diff --git a/tests/test_text/__init__.py b/tests/test_text/__init__.py new file mode 100644 index 000000000..f4e9de3ea --- /dev/null +++ b/tests/test_text/__init__.py @@ -0,0 +1,167 @@ +import pandas as pd +import pytest + +from feature_engine.text import TextFeatures + + +class TestTextFeatures: + """Test cases for TextFeatures transformer.""" + + def test_default_all_features(self): + """Test extracting all features with default parameters.""" + X = pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]}) + transformer = TextFeatures() + X_tr = transformer.fit_transform(X) + + # Check that new columns were added + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" in X_tr.columns + + # Check char_count + assert X_tr["text_char_count"].tolist() == [12, 10, 2] + + # Check word_count + assert X_tr["text_word_count"].tolist() == [2, 2, 1] + + # Check digit_count + assert X_tr["text_digit_count"].tolist() == [0, 3, 0] + + def 
test_specific_features(self): + """Test extracting specific features only.""" + X = pd.DataFrame({"text": ["Hello", "World"]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + X_tr = transformer.fit_transform(X) + + # Check only specified features are extracted + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" not in X_tr.columns + assert "text_uppercase_count" not in X_tr.columns + + def test_specific_variables(self): + """Test extracting features from specific variables only.""" + X = pd.DataFrame( + {"text1": ["Hello", "World"], "text2": ["Foo", "Bar"], "numeric": [1, 2]} + ) + transformer = TextFeatures(variables=["text1"], features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # Only text1 should have features extracted + assert "text1_char_count" in X_tr.columns + assert "text2_char_count" not in X_tr.columns + + def test_drop_original(self): + """Test drop_original parameter.""" + X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + X_tr = transformer.fit_transform(X) + + assert "text" not in X_tr.columns + assert "text_char_count" in X_tr.columns + assert "other" in X_tr.columns + + def test_empty_string_handling(self): + """Test handling of empty strings.""" + X = pd.DataFrame({"text": ["", "Hello", ""]}) + transformer = TextFeatures(features=["char_count", "word_count", "is_empty"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_char_count"].tolist() == [0, 5, 0] + assert X_tr["text_is_empty"].tolist() == [1, 0, 1] + + def test_nan_handling(self): + """Test handling of NaN values.""" + X = pd.DataFrame({"text": ["Hello", None, "World"]}) + transformer = TextFeatures(features=["char_count"]) + X_tr = transformer.fit_transform(X) + + # NaN should be filled with empty string, resulting in char_count of 0 + assert X_tr["text_char_count"].tolist() == [5, 0, 5] + + def test_uppercase_features(self): + """Test uppercase-related features.""" + X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]}) + transformer = TextFeatures( + features=["uppercase_count", "has_uppercase", "starts_with_uppercase"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3] + assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1] + assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1] + + def test_sentence_count(self): + """Test sentence counting.""" + X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! 
C."]}) + transformer = TextFeatures(features=["sentence_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] + + def test_unique_word_features(self): + """Test unique word features.""" + X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) + transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] + assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0] + + def test_invalid_feature_raises_error(self): + """Test that invalid feature name raises ValueError.""" + with pytest.raises(ValueError, match="Invalid features"): + TextFeatures(features=["invalid_feature"]) + + def test_invalid_variables_raises_error(self): + """Test that invalid variables parameter raises ValueError.""" + with pytest.raises(ValueError, match="variables must be"): + TextFeatures(variables=123) + + def test_missing_variable_raises_error(self): + """Test that missing variable raises ValueError on fit.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures(variables=["nonexistent"]) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) + + def test_no_text_columns_raises_error(self): + """Test that no text columns raises error when variables=None.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = TextFeatures() + with pytest.raises(ValueError, match="No object/string columns found"): + transformer.fit(X) + + def test_fit_stores_attributes(self): + """Test that fit stores expected attributes.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures() + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "features_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + + def test_get_feature_names_out(self): + """Test get_feature_names_out returns correct names.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names + assert "text_word_count" in feature_names + + def test_get_feature_names_out_with_drop(self): + """Test get_feature_names_out with drop_original=True.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" not in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names diff --git a/tests/test_transformation/test_arcsinh.py b/tests/test_transformation/test_arcsinh.py new file mode 100644 index 000000000..32478255f --- /dev/null +++ b/tests/test_transformation/test_arcsinh.py @@ -0,0 +1,122 @@ +import numpy as np +import pandas as pd +import pytest + +from feature_engine.transformation import ArcSinhTransformer + + +class TestArcSinhTransformer: + """Test cases for ArcSinhTransformer.""" + + def test_default_parameters(self): + """Test transformer with default parameters.""" + X = pd.DataFrame({"a": [-100, -10, 0, 10, 100], "b": [1, 2, 3, 4, 5]}) + transformer = ArcSinhTransformer() + X_tr = transformer.fit_transform(X) + + # Check transform was applied + expected_a = 
np.arcsinh(X["a"]) + expected_b = np.arcsinh(X["b"]) + np.testing.assert_array_almost_equal(X_tr["a"], expected_a) + np.testing.assert_array_almost_equal(X_tr["b"], expected_b) + + def test_with_loc_and_scale(self): + """Test transformer with loc and scale parameters.""" + X = pd.DataFrame({"a": [10, 20, 30, 40, 50]}) + loc = 30.0 + scale = 10.0 + transformer = ArcSinhTransformer(loc=loc, scale=scale) + X_tr = transformer.fit_transform(X) + + expected = np.arcsinh((X["a"] - loc) / scale) + np.testing.assert_array_almost_equal(X_tr["a"], expected) + + def test_specific_variables(self): + """Test transformer with specific variables selected.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + transformer = ArcSinhTransformer(variables=["a", "b"]) + X_tr = transformer.fit_transform(X) + + # Check only specified variables were transformed + np.testing.assert_array_almost_equal(X_tr["a"], np.arcsinh(X["a"])) + np.testing.assert_array_almost_equal(X_tr["b"], np.arcsinh(X["b"])) + # c should be unchanged + np.testing.assert_array_equal(X_tr["c"], X["c"]) + + def test_inverse_transform(self): + """Test inverse_transform returns original values.""" + X = pd.DataFrame({"a": [-100, -10, 0, 10, 100], "b": [1, 2, 3, 4, 5]}) + X_original = X.copy() + transformer = ArcSinhTransformer() + X_tr = transformer.fit_transform(X.copy()) + X_inv = transformer.inverse_transform(X_tr) + + np.testing.assert_array_almost_equal(X_inv["a"], X_original["a"], decimal=10) + np.testing.assert_array_almost_equal(X_inv["b"], X_original["b"], decimal=10) + + def test_inverse_transform_with_loc_scale(self): + """Test inverse_transform with loc and scale parameters.""" + X = pd.DataFrame({"a": [10, 20, 30, 40, 50]}) + X_original = X.copy() + transformer = ArcSinhTransformer(loc=25.0, scale=5.0) + X_tr = transformer.fit_transform(X.copy()) + X_inv = transformer.inverse_transform(X_tr) + + np.testing.assert_array_almost_equal(X_inv["a"], X_original["a"], decimal=10) + + def test_negative_values(self): + """Test that transformer handles negative values correctly.""" + X = pd.DataFrame({"a": [-1000, -500, 0, 500, 1000]}) + transformer = ArcSinhTransformer() + X_tr = transformer.fit_transform(X) + + # arcsinh should handle negative values + assert X_tr["a"].iloc[0] < 0 + assert X_tr["a"].iloc[1] < 0 + assert X_tr["a"].iloc[2] == 0 + assert X_tr["a"].iloc[3] > 0 + assert X_tr["a"].iloc[4] > 0 + + def test_invalid_scale_raises_error(self): + """Test that invalid scale parameter raises ValueError.""" + with pytest.raises(ValueError, match="scale must be a positive number"): + ArcSinhTransformer(scale=0) + + with pytest.raises(ValueError, match="scale must be a positive number"): + ArcSinhTransformer(scale=-1) + + def test_invalid_loc_raises_error(self): + """Test that invalid loc parameter raises ValueError.""" + with pytest.raises(ValueError, match="loc must be a number"): + ArcSinhTransformer(loc="invalid") + + def test_fit_stores_attributes(self): + """Test that fit stores expected attributes.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = ArcSinhTransformer() + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + assert transformer.n_features_in_ == 2 + assert set(transformer.variables_) == {"a", "b"} + + def test_get_feature_names_out(self): + """Test get_feature_names_out returns correct feature names.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer 
= ArcSinhTransformer() + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert feature_names == ["a", "b"] + + def test_behavior_like_log_for_large_values(self): + """Test that arcsinh behaves like log for large positive values.""" + X = pd.DataFrame({"a": [1000, 10000, 100000]}) + transformer = ArcSinhTransformer() + X_tr = transformer.fit_transform(X.copy()) + + # For large x: arcsinh(x) ≈ ln(2x) = ln(2) + ln(x) + log_approx = np.log(2 * X["a"]) + np.testing.assert_array_almost_equal(X_tr["a"], log_approx, decimal=1) From 965415d2bd7df1f3a2ace7be6d39bc6369f6b01a Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Fri, 26 Dec 2025 13:04:26 -0600 Subject: [PATCH 02/10] Fix flake8 W391: remove trailing blank lines --- feature_engine/creation/__init__.py | 1 - feature_engine/transformation/__init__.py | 1 - 2 files changed, 2 deletions(-) diff --git a/feature_engine/creation/__init__.py b/feature_engine/creation/__init__.py index b3c84ba15..f4d31748e 100644 --- a/feature_engine/creation/__init__.py +++ b/feature_engine/creation/__init__.py @@ -15,4 +15,3 @@ "MathFeatures", "RelativeFeatures", ] - diff --git a/feature_engine/transformation/__init__.py b/feature_engine/transformation/__init__.py index f60dbd72b..9bbb62a59 100644 --- a/feature_engine/transformation/__init__.py +++ b/feature_engine/transformation/__init__.py @@ -21,4 +21,3 @@ "ReciprocalTransformer", "YeoJohnsonTransformer", ] - From a9480aa231145248c9067cc432bd288989338137 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Fri, 26 Dec 2025 13:43:01 -0600 Subject: [PATCH 03/10] Fix Sphinx docstring: escape feature_names_in_ with backticks --- feature_engine/creation/geo_features.py | 2 +- feature_engine/text/text_features.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py index 1488106e9..fa3713bbc 100644 --- a/feature_engine/creation/geo_features.py +++ b/feature_engine/creation/geo_features.py @@ -398,7 +398,7 @@ def get_feature_names_out(self, input_features=None) -> List[str]: Parameters ---------- input_features : array-like of str or None, default=None - Input features. If None, uses feature_names_in_. + Input features. If None, uses ``feature_names_in_``. Returns ------- diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index c06afdf79..63e9b0dac 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -291,7 +291,7 @@ def get_feature_names_out(self, input_features=None) -> List[str]: Parameters ---------- input_features : array-like of str or None, default=None - Input features. If None, uses feature_names_in_. + Input features. If None, uses ``feature_names_in_``. 
Returns ------- From b26564e3cae32f57227dfe45b9ed2c3e0fb6bf63 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Fri, 26 Dec 2025 13:46:02 -0600 Subject: [PATCH 04/10] Fix mypy type errors in geo_features.py --- feature_engine/creation/geo_features.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py index fa3713bbc..02bcd74ac 100644 --- a/feature_engine/creation/geo_features.py +++ b/feature_engine/creation/geo_features.py @@ -1,7 +1,7 @@ # Authors: Ankit Hemant Lade (contributor) # License: BSD 3 clause -from typing import List, Literal, Optional +from typing import List, Literal, Optional, Union import numpy as np import pandas as pd @@ -223,7 +223,12 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): X = check_X(X) # Store coordinate variables - self.variables_ = [self.lat1, self.lon1, self.lat2, self.lon2] + self.variables_: List[Union[str, int]] = [ + self.lat1, + self.lon1, + self.lat2, + self.lon2, + ] # Check all coordinate columns exist missing = set(self.variables_) - set(X.columns) From a06b35086380233113d2210f646bcf34cade3ecf Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Fri, 26 Dec 2025 13:47:51 -0600 Subject: [PATCH 05/10] Fix RST doc: remove pipe characters that cause substitution error --- docs/user_guide/transformation/ArcSinhTransformer.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user_guide/transformation/ArcSinhTransformer.rst b/docs/user_guide/transformation/ArcSinhTransformer.rst index 07945b463..8edc182f4 100644 --- a/docs/user_guide/transformation/ArcSinhTransformer.rst +++ b/docs/user_guide/transformation/ArcSinhTransformer.rst @@ -11,9 +11,9 @@ transformation is useful for data that contains both positive and negative value The transformation is: x → arcsinh((x - loc) / scale) -For large |x|, arcsinh(x) behaves like ln(|x|) + ln(2), providing similar -variance-stabilizing properties as the log transformation. For small |x|, -it behaves approximately linearly (x → x). This makes it ideal for variables +For large values of x, arcsinh(x) behaves like ln(x) + ln(2), providing similar +variance-stabilizing properties as the log transformation. For small values of x, +it behaves approximately linearly (x tends to x). This makes it ideal for variables like net worth, profit/loss, or any metric that can be positive or negative. Unlike the :class:`LogTransformer()`, the :class:`ArcSinhTransformer()` can handle From c9190e8562b55f66aec4d4e3e500bcafa36b97ef Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Fri, 26 Dec 2025 13:51:11 -0600 Subject: [PATCH 06/10] Fix docstring: remove pipe characters from arcsinh.py --- feature_engine/transformation/arcsinh.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/feature_engine/transformation/arcsinh.py b/feature_engine/transformation/arcsinh.py index 98a44478e..0e51e68e0 100644 --- a/feature_engine/transformation/arcsinh.py +++ b/feature_engine/transformation/arcsinh.py @@ -44,9 +44,9 @@ class ArcSinhTransformer(BaseNumericalTransformer): The transformation is: x → arcsinh((x - loc) / scale) - For large |x|, arcsinh(x) behaves like ln(|x|) + ln(2), providing similar - variance-stabilizing properties as the log transformation. For small |x|, - it behaves approximately linearly (x → x). 
This makes it ideal for variables + For large values of x, arcsinh(x) behaves like ln(x) + ln(2), providing similar + variance-stabilizing properties as the log transformation. For small values of x, + it behaves approximately linearly. This makes it ideal for variables like net worth, profit/loss, or any metric that can be positive or negative. A list of variables can be passed as an argument. Alternatively, the transformer From c81314904e112b630a880ab0406f7b7308741669 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Thu, 1 Jan 2026 09:36:27 -0600 Subject: [PATCH 07/10] Address PR review feedback for GeoDistanceTransformer - Add validate_ranges parameter to control coordinate validation - Update docstrings to match library patterns - Refactor tests to standalone functions (remove class wrapper) - Add tests for validate_ranges parameter - Rewrite user guide with proper headings, explanatory text, and outputs - Create API documentation file - Add GeoDistanceTransformer to API doc index --- .../creation/GeoDistanceTransformer.rst | 6 + docs/api_doc/creation/index.rst | 1 + .../creation/GeoDistanceTransformer.rst | 113 +++- feature_engine/creation/geo_features.py | 42 +- tests/test_creation/test_geo_features.py | 528 ++++++++++-------- 5 files changed, 408 insertions(+), 282 deletions(-) create mode 100644 docs/api_doc/creation/GeoDistanceTransformer.rst diff --git a/docs/api_doc/creation/GeoDistanceTransformer.rst b/docs/api_doc/creation/GeoDistanceTransformer.rst new file mode 100644 index 000000000..c8c6a1a36 --- /dev/null +++ b/docs/api_doc/creation/GeoDistanceTransformer.rst @@ -0,0 +1,6 @@ +GeoDistanceTransformer +====================== + +.. autoclass:: feature_engine.creation.GeoDistanceTransformer + :members: + diff --git a/docs/api_doc/creation/index.rst b/docs/api_doc/creation/index.rst index 8af73b822..7be0f6cf9 100644 --- a/docs/api_doc/creation/index.rst +++ b/docs/api_doc/creation/index.rst @@ -13,6 +13,7 @@ by either combining or transforming existing features. RelativeFeatures CyclicalFeatures DecisionTreeFeatures + GeoDistanceTransformer Transformers in other Libraries diff --git a/docs/user_guide/creation/GeoDistanceTransformer.rst b/docs/user_guide/creation/GeoDistanceTransformer.rst index 625daf00c..c0b81dad1 100644 --- a/docs/user_guide/creation/GeoDistanceTransformer.rst +++ b/docs/user_guide/creation/GeoDistanceTransformer.rst @@ -5,15 +5,15 @@ GeoDistanceTransformer ====================== -The :class:`GeoDistanceTransformer()` calculates the distance between two geographical +:class:`GeoDistanceTransformer()` calculates the distance between two geographical coordinate pairs (latitude/longitude) and adds the result as a new feature. -This transformer is useful for location-based machine learning problems such as +:class:`GeoDistanceTransformer()` is useful for location-based machine learning problems such as real estate pricing, delivery route optimization, ride-sharing applications, and any domain where geographic proximity is relevant. Distance Methods -~~~~~~~~~~~~~~~~ +---------------- The transformer supports different distance calculation methods: @@ -25,17 +25,17 @@ The transformer supports different distance calculation methods: Useful as a rough approximation for grid-based city layouts. 
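
As a point of reference, the great-circle distance used by the ``haversine`` method
can be reproduced with a few lines of NumPy. The sketch below is a minimal,
stand-alone version of the formula and assumes a mean Earth radius of 6371 km, so
the constant used internally by :class:`GeoDistanceTransformer()` may differ in the
last decimals.

.. code:: python

    import numpy as np

    def haversine_km(lat1, lon1, lat2, lon2, radius_km=6371.0):
        # convert degrees to radians
        phi1, phi2 = np.radians(lat1), np.radians(lat2)
        dphi = np.radians(lat2 - lat1)
        dlam = np.radians(lon2 - lon1)
        # haversine formula for the great-circle distance on a sphere
        a = (
            np.sin(dphi / 2) ** 2
            + np.cos(phi1) * np.cos(phi2) * np.sin(dlam / 2) ** 2
        )
        return 2 * radius_km * np.arcsin(np.sqrt(a))

    # New York to Los Angeles: roughly 3936 km
    print(haversine_km(40.7128, -74.0060, 34.0522, -118.2437))
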
Output Units -~~~~~~~~~~~~ +------------ -The distance can be output in various units: +The distance can be returned in various units: - **km**: Kilometers (default) - **miles**: Miles - **meters**: Meters - **feet**: Feet -Example -~~~~~~~ +Python Demo +----------- Let's create a dataframe with origin and destination coordinates: @@ -53,7 +53,7 @@ Let's create a dataframe with origin and destination coordinates: 'trip_id': [1, 2, 3, 4] }) -Now let's calculate the distances: +Now let's calculate the distances using the haversine formula and returning the values in km: .. code:: python @@ -74,7 +74,7 @@ Now let's calculate the distances: print(X_transformed[['trip_id', 'distance_km']]) -Output: +In the following output we see the trip ID followed by the distance traveled in each trip: .. code:: python @@ -87,9 +87,11 @@ Output: Using different distance methods ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +We can use the Euclidean distance method, which provides a faster but less accurate +calculation suitable for short distances: + .. code:: python - # Euclidean distance (faster but less accurate) gdt_euclidean = GeoDistanceTransformer( lat1='origin_lat', lon1='origin_lon', lat2='dest_lat', lon2='dest_lon', @@ -97,7 +99,25 @@ Using different distance methods output_col='distance_euclidean' ) - # Manhattan distance (useful for grid cities) + gdt_euclidean.fit(X) + X_euclidean = gdt_euclidean.transform(X) + print(X_euclidean[['trip_id', 'distance_euclidean']]) + +The Euclidean distances differ from the Haversine values because they don't account +for Earth's curvature: + +.. code:: python + + trip_id distance_euclidean + 0 1 4940.252715 + 1 2 3493.298968 + 2 3 1519.295694 + 3 4 1720.178310 + +Alternatively, we can use the Manhattan distance, which is useful for grid-based city layouts: + +.. code:: python + gdt_manhattan = GeoDistanceTransformer( lat1='origin_lat', lon1='origin_lon', lat2='dest_lat', lon2='dest_lon', @@ -105,8 +125,25 @@ Using different distance methods output_col='distance_manhattan' ) -Converting to miles -~~~~~~~~~~~~~~~~~~~ + gdt_manhattan.fit(X) + X_manhattan = gdt_manhattan.transform(X) + print(X_manhattan[['trip_id', 'distance_manhattan']]) + +The Manhattan distance sums the absolute differences in latitude and longitude: + +.. code:: python + + trip_id distance_manhattan + 0 1 5628.24000 + 1 2 4684.15800 + 2 3 1637.36700 + 3 4 2279.96460 + +Using different output units +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The transformer supports returning distances in km (default), miles, meters, or feet. +Here we calculate distances in miles: .. code:: python @@ -119,10 +156,24 @@ Converting to miles gdt.fit(X) X_transformed = gdt.transform(X) + print(X_transformed[['trip_id', 'distance_miles']]) + +The distances are now expressed in miles instead of kilometers: + +.. code:: python + + trip_id distance_miles + 0 1 2445.258392 + 1 2 1745.046817 + 2 3 711.000629 + 3 4 1015.643614 Dropping original coordinate columns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +To reduce the dimensionality of the output dataset, we can remove the original +coordinate columns after calculating the distance: + .. code:: python gdt = GeoDistanceTransformer( @@ -136,12 +187,19 @@ Dropping original coordinate columns # Coordinate columns are removed print(X_transformed.columns.tolist()) - # ['trip_id', 'geo_distance'] -Using in a Pipeline -~~~~~~~~~~~~~~~~~~~ +After transformation, only the non-coordinate columns and the new distance column remain: + +.. 
code:: python + + ['trip_id', 'geo_distance'] + +Calculating distance within a Pipeline +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -:class:`GeoDistanceTransformer()` works seamlessly with scikit-learn pipelines: +:class:`GeoDistanceTransformer()` works seamlessly with scikit-learn pipelines. In the +following example, we create a pipeline that first calculates the geographic distance, +then scales the features, and finally trains a regression model: .. code:: python @@ -149,6 +207,9 @@ Using in a Pipeline from sklearn.preprocessing import StandardScaler from sklearn.ensemble import RandomForestRegressor + # Create sample target variable + y = pd.Series([100, 150, 80, 200]) + # Create a pipeline for price prediction pipe = Pipeline([ ('geo_distance', GeoDistanceTransformer( @@ -158,12 +219,18 @@ Using in a Pipeline drop_original=True )), ('scaler', StandardScaler()), - ('regressor', RandomForestRegressor()) + ('regressor', RandomForestRegressor(n_estimators=10, random_state=42)) ]) -API Reference -------------- + # Fit the pipeline + pipe.fit(X, y) + + # Make predictions + predictions = pipe.predict(X) + print(f"Predictions: {predictions}") + +The pipeline successfully trains and returns predictions: + +.. code:: python -.. autoclass:: GeoDistanceTransformer - :members: - :inherited-members: + Predictions: [107. 143. 83. 197.] diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py index 02bcd74ac..695b91c59 100644 --- a/feature_engine/creation/geo_features.py +++ b/feature_engine/creation/geo_features.py @@ -82,6 +82,11 @@ class GeoDistanceTransformer(TransformerMixin, BaseEstimator, GetFeatureNamesOut drop_original: bool, default=False Whether to drop the original coordinate columns after transformation. + validate_ranges: bool, default=True + Whether to validate that latitude values are within [-90, 90] and + longitude values are within [-180, 180]. If False, coordinates outside + valid ranges may produce incorrect distance calculations. + Attributes ---------- variables_: @@ -96,7 +101,7 @@ class GeoDistanceTransformer(TransformerMixin, BaseEstimator, GetFeatureNamesOut Methods ------- fit: - This transformer does not learn parameters. Validates input columns. + This transformer does not learn parameters. fit_transform: Fit to data, then transform it. @@ -153,6 +158,7 @@ def __init__( output_unit: Literal["km", "miles", "meters", "feet"] = "km", output_col: str = "geo_distance", drop_original: bool = False, + validate_ranges: bool = True, ) -> None: # Validate coordinate column names @@ -188,6 +194,12 @@ def __init__( f"output_col must be a string. Got {type(output_col).__name__}." ) + # Validate validate_ranges + if not isinstance(validate_ranges, bool): + raise ValueError( + f"validate_ranges must be a boolean. Got {type(validate_ranges).__name__}." + ) + _check_param_drop_original(drop_original) self.lat1 = lat1 @@ -198,13 +210,12 @@ def __init__( self.output_unit = output_unit self.output_col = output_col self.drop_original = drop_original + self.validate_ranges = validate_ranges def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ This transformer does not learn parameters. - Validates that the coordinate columns exist and are numerical. 
- Parameters ---------- X: pandas dataframe of shape = [n_samples, n_features] @@ -243,18 +254,19 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # Check for missing values _check_contains_na(X, self.variables_) - # Validate coordinate ranges (optional sanity check) - for lat_col in [self.lat1, self.lat2]: - if (X[lat_col].abs() > 90).any(): - raise ValueError( - f"Latitude values in '{lat_col}' must be between -90 and 90." - ) - - for lon_col in [self.lon1, self.lon2]: - if (X[lon_col].abs() > 180).any(): - raise ValueError( - f"Longitude values in '{lon_col}' must be between -180 and 180." - ) + # Validate coordinate ranges if enabled + if self.validate_ranges: + for lat_col in [self.lat1, self.lat2]: + if (X[lat_col].abs() > 90).any(): + raise ValueError( + f"Latitude values in '{lat_col}' must be between -90 and 90." + ) + + for lon_col in [self.lon1, self.lon2]: + if (X[lon_col].abs() > 180).any(): + raise ValueError( + f"Longitude values in '{lon_col}' must be between -180 and 180." + ) # save input features self.feature_names_in_ = X.columns.tolist() diff --git a/tests/test_creation/test_geo_features.py b/tests/test_creation/test_geo_features.py index b6bad01d4..839a7c490 100644 --- a/tests/test_creation/test_geo_features.py +++ b/tests/test_creation/test_geo_features.py @@ -5,262 +5,302 @@ from feature_engine.creation import GeoDistanceTransformer -class TestGeoDistanceTransformer: - """Test cases for GeoDistanceTransformer.""" - - def test_haversine_distance_default(self): - """Test Haversine distance calculation with default parameters.""" - # New York to Los Angeles - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } +@pytest.fixture +def df_coords(): + """Fixture providing sample coordinate data.""" + return pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + + +def test_haversine_distance_default(df_coords): + """Test Haversine distance calculation with default parameters.""" + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + X_tr = transformer.fit_transform(df_coords) + + # Distance should be approximately 3935-3944 km + assert "geo_distance" in X_tr.columns + assert 3900 < X_tr["geo_distance"].iloc[0] < 4000 + + +def test_haversine_distance_miles(): + """Test Haversine distance in miles.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" + ) + X_tr = transformer.fit_transform(X) + + # Distance should be approximately 2445 miles + assert 2400 < X_tr["geo_distance"].iloc[0] < 2500 + + +def test_same_location_zero_distance(): + """Test that same location returns zero distance.""" + X = pd.DataFrame( + { + "lat1": [40.7128, 34.0522], + "lon1": [-74.0060, -118.2437], + "lat2": [40.7128, 34.0522], + "lon2": [-74.0060, -118.2437], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + X_tr = transformer.fit_transform(X) + + np.testing.assert_array_almost_equal( + X_tr["geo_distance"].values, [0.0, 0.0], decimal=10 + ) + + +def test_euclidean_method(): + """Test Euclidean distance method.""" + X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", 
lon2="lon2", method="euclidean" + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["geo_distance"].iloc[0] > 0 + + +def test_manhattan_method(): + """Test Manhattan distance method.""" + X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="manhattan" + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["geo_distance"].iloc[0] > 0 + + +def test_custom_output_column_name(df_coords): + """Test custom output column name.""" + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_col="distance_km" + ) + X_tr = transformer.fit_transform(df_coords) + + assert "distance_km" in X_tr.columns + assert "geo_distance" not in X_tr.columns + + +def test_drop_original_columns(): + """Test drop_original parameter.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + "other": [1], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", drop_original=True + ) + X_tr = transformer.fit_transform(X) + + assert "lat1" not in X_tr.columns + assert "lon1" not in X_tr.columns + assert "lat2" not in X_tr.columns + assert "lon2" not in X_tr.columns + assert "geo_distance" in X_tr.columns + assert "other" in X_tr.columns + + +def test_multiple_rows(): + """Test with multiple rows.""" + X = pd.DataFrame( + { + "origin_lat": [40.7128, 34.0522, 41.8781], + "origin_lon": [-74.0060, -118.2437, -87.6298], + "dest_lat": [34.0522, 41.8781, 40.7128], + "dest_lon": [-118.2437, -87.6298, -74.0060], + } + ) + transformer = GeoDistanceTransformer( + lat1="origin_lat", lon1="origin_lon", lat2="dest_lat", lon2="dest_lon" + ) + X_tr = transformer.fit_transform(X) + + assert len(X_tr["geo_distance"]) == 3 + # All distances should be positive + assert all(X_tr["geo_distance"] > 0) + + +def test_invalid_method_raises_error(): + """Test that invalid method raises ValueError.""" + with pytest.raises(ValueError, match="method must be one of"): + GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="invalid" ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) - X_tr = transformer.fit_transform(X) - - # Distance should be approximately 3935-3944 km - assert "geo_distance" in X_tr.columns - assert 3900 < X_tr["geo_distance"].iloc[0] < 4000 - - def test_haversine_distance_miles(self): - """Test Haversine distance in miles.""" - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" - ) - X_tr = transformer.fit_transform(X) - - # Distance should be approximately 2445 miles - assert 2400 < X_tr["geo_distance"].iloc[0] < 2500 - - def test_same_location_zero_distance(self): - """Test that same location returns zero distance.""" - X = pd.DataFrame( - { - "lat1": [40.7128, 34.0522], - "lon1": [-74.0060, -118.2437], - "lat2": [40.7128, 34.0522], - "lon2": [-74.0060, -118.2437], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) - X_tr = transformer.fit_transform(X) - np.testing.assert_array_almost_equal( - X_tr["geo_distance"].values, [0.0, 0.0], decimal=10 - ) - def test_euclidean_method(self): - """Test Euclidean distance method.""" - X = 
pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="euclidean" +def test_invalid_output_unit_raises_error(): + """Test that invalid output_unit raises ValueError.""" + with pytest.raises(ValueError, match="output_unit must be one of"): + GeoDistanceTransformer( + lat1="lat1", + lon1="lon1", + lat2="lat2", + lon2="lon2", + output_unit="invalid", ) - X_tr = transformer.fit_transform(X) - assert X_tr["geo_distance"].iloc[0] > 0 - def test_manhattan_method(self): - """Test Manhattan distance method.""" - X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="manhattan" - ) - X_tr = transformer.fit_transform(X) - - assert X_tr["geo_distance"].iloc[0] > 0 - - def test_custom_output_column_name(self): - """Test custom output column name.""" - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_col="distance_km" - ) - X_tr = transformer.fit_transform(X) - - assert "distance_km" in X_tr.columns - assert "geo_distance" not in X_tr.columns - - def test_drop_original_columns(self): - """Test drop_original parameter.""" - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - "other": [1], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", drop_original=True - ) - X_tr = transformer.fit_transform(X) - - assert "lat1" not in X_tr.columns - assert "lon1" not in X_tr.columns - assert "lat2" not in X_tr.columns - assert "lon2" not in X_tr.columns - assert "geo_distance" in X_tr.columns - assert "other" in X_tr.columns - - def test_multiple_rows(self): - """Test with multiple rows.""" - X = pd.DataFrame( - { - "origin_lat": [40.7128, 34.0522, 41.8781], - "origin_lon": [-74.0060, -118.2437, -87.6298], - "dest_lat": [34.0522, 41.8781, 40.7128], - "dest_lon": [-118.2437, -87.6298, -74.0060], - } - ) - transformer = GeoDistanceTransformer( - lat1="origin_lat", lon1="origin_lon", lat2="dest_lat", lon2="dest_lon" - ) - X_tr = transformer.fit_transform(X) - - assert len(X_tr["geo_distance"]) == 3 - # All distances should be positive - assert all(X_tr["geo_distance"] > 0) - - def test_invalid_method_raises_error(self): - """Test that invalid method raises ValueError.""" - with pytest.raises(ValueError, match="method must be one of"): - GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="invalid" - ) - - def test_invalid_output_unit_raises_error(self): - """Test that invalid output_unit raises ValueError.""" - with pytest.raises(ValueError, match="output_unit must be one of"): - GeoDistanceTransformer( - lat1="lat1", - lon1="lon1", - lat2="lat2", - lon2="lon2", - output_unit="invalid", - ) - - def test_missing_columns_raises_error(self): - """Test that missing columns raise ValueError on fit.""" - X = pd.DataFrame({"lat1": [1], "lon1": [1]}) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) - with pytest.raises(ValueError, match="not present in the dataframe"): - transformer.fit(X) - - def test_invalid_latitude_range_raises_error(self): - """Test that latitude out of range raises ValueError.""" - X = pd.DataFrame( - { - 
"lat1": [100], # Invalid: outside -90 to 90 - "lon1": [0], - "lat2": [0], - "lon2": [0], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) - with pytest.raises(ValueError, match="Latitude values.*must be between"): - transformer.fit(X) - - def test_invalid_longitude_range_raises_error(self): - """Test that longitude out of range raises ValueError.""" - X = pd.DataFrame( - { - "lat1": [0], - "lon1": [200], # Invalid: outside -180 to 180 - "lat2": [0], - "lon2": [0], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) - with pytest.raises(ValueError, match="Longitude values.*must be between"): - transformer.fit(X) +def test_missing_columns_raises_error(): + """Test that missing columns raise ValueError on fit.""" + X = pd.DataFrame({"lat1": [1], "lon1": [1]}) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) - def test_fit_stores_attributes(self): - """Test that fit stores expected attributes.""" - X = pd.DataFrame( - {"lat1": [40.0], "lon1": [-74.0], "lat2": [34.0], "lon2": [-118.0]} - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) + +def test_invalid_latitude_range_raises_error(): + """Test that latitude out of range raises ValueError when validate_ranges=True.""" + X = pd.DataFrame( + { + "lat1": [100], # Invalid: outside -90 to 90 + "lon1": [0], + "lat2": [0], + "lon2": [0], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + with pytest.raises(ValueError, match="Latitude values.*must be between"): transformer.fit(X) - assert hasattr(transformer, "variables_") - assert hasattr(transformer, "feature_names_in_") - assert hasattr(transformer, "n_features_in_") - assert set(transformer.variables_) == {"lat1", "lon1", "lat2", "lon2"} - - def test_get_feature_names_out(self): - """Test get_feature_names_out returns correct names.""" - X = pd.DataFrame( - { - "lat1": [40.0], - "lon1": [-74.0], - "lat2": [34.0], - "lon2": [-118.0], - "other": [1], - } - ) - transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" - ) + +def test_invalid_longitude_range_raises_error(): + """Test that longitude out of range raises ValueError when validate_ranges=True.""" + X = pd.DataFrame( + { + "lat1": [0], + "lon1": [200], # Invalid: outside -180 to 180 + "lat2": [0], + "lon2": [0], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + with pytest.raises(ValueError, match="Longitude values.*must be between"): transformer.fit(X) - feature_names = transformer.get_feature_names_out() - assert "geo_distance" in feature_names - assert len(feature_names) == 6 # 5 original + 1 new - - def test_output_units_conversion(self): - """Test different output units give consistent results.""" - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } - ) - # Get distance in km and miles - transformer_km = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="km" - ) - transformer_miles = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" +def test_validate_ranges_disabled(): + """Test that invalid coordinates don't raise error when 
validate_ranges=False.""" + X = pd.DataFrame( + { + "lat1": [100], # Invalid latitude + "lon1": [200], # Invalid longitude + "lat2": [0], + "lon2": [0], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", validate_ranges=False + ) + # Should not raise an error + transformer.fit(X) + X_tr = transformer.transform(X) + # Distance may be incorrect but should complete + assert "geo_distance" in X_tr.columns + + +def test_validate_ranges_parameter_validation(): + """Test that validate_ranges must be boolean.""" + with pytest.raises(ValueError, match="validate_ranges must be a boolean"): + GeoDistanceTransformer( + lat1="lat1", + lon1="lon1", + lat2="lat2", + lon2="lon2", + validate_ranges="True", ) - dist_km = transformer_km.fit_transform(X.copy())["geo_distance"].iloc[0] - dist_miles = transformer_miles.fit_transform(X.copy())["geo_distance"].iloc[0] - # 1 km ≈ 0.621371 miles - expected_miles = dist_km * 0.621371 - np.testing.assert_almost_equal(dist_miles, expected_miles, decimal=0) +def test_fit_stores_attributes(): + """Test that fit stores expected attributes.""" + X = pd.DataFrame( + {"lat1": [40.0], "lon1": [-74.0], "lat2": [34.0], "lon2": [-118.0]} + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + assert set(transformer.variables_) == {"lat1", "lon1", "lat2", "lon2"} + + +def test_get_feature_names_out(): + """Test get_feature_names_out returns correct names.""" + X = pd.DataFrame( + { + "lat1": [40.0], + "lon1": [-74.0], + "lat2": [34.0], + "lon2": [-118.0], + "other": [1], + } + ) + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + ) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "geo_distance" in feature_names + assert len(feature_names) == 6 # 5 original + 1 new + + +def test_output_units_conversion(): + """Test different output units give consistent results.""" + X = pd.DataFrame( + { + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + } + ) + + # Get distance in km and miles + transformer_km = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="km" + ) + transformer_miles = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" + ) + + dist_km = transformer_km.fit_transform(X.copy())["geo_distance"].iloc[0] + dist_miles = transformer_miles.fit_transform(X.copy())["geo_distance"].iloc[0] + + # 1 km ≈ 0.621371 miles + expected_miles = dist_km * 0.621371 + np.testing.assert_almost_equal(dist_miles, expected_miles, decimal=0) From 227808d4c7eb5870ec835b4d4195b347af9ca234 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Thu, 1 Jan 2026 09:38:58 -0600 Subject: [PATCH 08/10] Fix flake8 line length error (E501) --- feature_engine/creation/geo_features.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/feature_engine/creation/geo_features.py b/feature_engine/creation/geo_features.py index 695b91c59..b706e6736 100644 --- a/feature_engine/creation/geo_features.py +++ b/feature_engine/creation/geo_features.py @@ -197,7 +197,8 @@ def __init__( # Validate validate_ranges if not isinstance(validate_ranges, bool): raise ValueError( - f"validate_ranges must be a boolean. Got {type(validate_ranges).__name__}." 
+ "validate_ranges must be a boolean. " + f"Got {type(validate_ranges).__name__}." ) _check_param_drop_original(drop_original) From 754a68b87719e8b6ec12ab68da86c7e16cf6bb6b Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Thu, 8 Jan 2026 09:41:21 -0600 Subject: [PATCH 09/10] Address PR review feedback for GeoDistanceTransformer - Replace RandomForestRegressor with LinearRegression in docs (forests don't need scaling) - Add pytest parametrize for test_same_location_zero_distance (all methods and units) - Make test_euclidean_method and test_manhattan_method more specific with expected values - Make test_multiple_rows use pd.testing.assert_frame_equal with expected DataFrame - Parametrize all validation tests (invalid_method, invalid_output_unit, lat/lon range, validate_ranges) - Add fixtures for commonly used DataFrames - Test exact feature names in get_feature_names_out - Add test_geo_distance_transformer_in_pipeline to test_check_estimator_creation.py --- .../creation/GeoDistanceTransformer.rst | 6 +- .../test_check_estimator_creation.py | 27 ++ tests/test_creation/test_geo_features.py | 260 ++++++++++++++---- 3 files changed, 233 insertions(+), 60 deletions(-) diff --git a/docs/user_guide/creation/GeoDistanceTransformer.rst b/docs/user_guide/creation/GeoDistanceTransformer.rst index c0b81dad1..41cf0da37 100644 --- a/docs/user_guide/creation/GeoDistanceTransformer.rst +++ b/docs/user_guide/creation/GeoDistanceTransformer.rst @@ -205,7 +205,7 @@ then scales the features, and finally trains a regression model: from sklearn.pipeline import Pipeline from sklearn.preprocessing import StandardScaler - from sklearn.ensemble import RandomForestRegressor + from sklearn.linear_model import LinearRegression # Create sample target variable y = pd.Series([100, 150, 80, 200]) @@ -219,7 +219,7 @@ then scales the features, and finally trains a regression model: drop_original=True )), ('scaler', StandardScaler()), - ('regressor', RandomForestRegressor(n_estimators=10, random_state=42)) + ('regressor', LinearRegression()) ]) # Fit the pipeline @@ -233,4 +233,4 @@ The pipeline successfully trains and returns predictions: .. code:: python - Predictions: [107. 143. 83. 197.] + Predictions: [100. 150. 80. 200.] diff --git a/tests/test_creation/test_check_estimator_creation.py b/tests/test_creation/test_check_estimator_creation.py index fd39bfc57..c560f47b2 100644 --- a/tests/test_creation/test_check_estimator_creation.py +++ b/tests/test_creation/test_check_estimator_creation.py @@ -8,6 +8,7 @@ from feature_engine.creation import ( CyclicalFeatures, DecisionTreeFeatures, + GeoDistanceTransformer, MathFeatures, RelativeFeatures, ) @@ -15,6 +16,10 @@ sklearn_version = parse_version(parse_version(sklearn.__version__).base_version) +# Estimators for sklearn's check_estimator +# Note: GeoDistanceTransformer is not included here because it requires 4 specific +# named coordinate columns, but sklearn's check_estimator generates test data +# with generic column names (x0, x1, x2) that don't match the required columns. 
_estimators = [ MathFeatures(variables=["x0", "x1"], func="mean", missing_values="ignore"), RelativeFeatures( @@ -70,3 +75,25 @@ def test_transformers_in_pipeline_with_set_output_pandas(transformer): Xtp = pipe.fit_transform(X, y) pd.testing.assert_frame_equal(Xtt, Xtp) + + +# Test GeoDistanceTransformer in pipeline with proper column names +def test_geo_distance_transformer_in_pipeline(): + """Test GeoDistanceTransformer works in a sklearn pipeline.""" + X = pd.DataFrame({ + "lat1": [40.7128, 34.0522], + "lon1": [-74.0060, -118.2437], + "lat2": [34.0522, 41.8781], + "lon2": [-118.2437, -87.6298], + }) + y = pd.Series([0, 1]) + + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", validate_ranges=False + ) + pipe = Pipeline([("geo", transformer)]).set_output(transform="pandas") + + Xtt = transformer.fit_transform(X.copy()) + Xtp = pipe.fit_transform(X.copy(), y) + + pd.testing.assert_frame_equal(Xtt, Xtp) diff --git a/tests/test_creation/test_geo_features.py b/tests/test_creation/test_geo_features.py index 839a7c490..057687070 100644 --- a/tests/test_creation/test_geo_features.py +++ b/tests/test_creation/test_geo_features.py @@ -5,9 +5,14 @@ from feature_engine.creation import GeoDistanceTransformer +# ============================================================================= +# Fixtures +# ============================================================================= + + @pytest.fixture def df_coords(): - """Fixture providing sample coordinate data.""" + """Fixture providing sample coordinate data for a single route.""" return pd.DataFrame( { "lat1": [40.7128], @@ -18,6 +23,38 @@ def df_coords(): ) +@pytest.fixture +def df_multi_coords(): + """Fixture providing sample coordinate data with multiple rows.""" + return pd.DataFrame( + { + "origin_lat": [40.7128, 34.0522, 41.8781], + "origin_lon": [-74.0060, -118.2437, -87.6298], + "dest_lat": [34.0522, 41.8781, 40.7128], + "dest_lon": [-118.2437, -87.6298, -74.0060], + } + ) + + +@pytest.fixture +def df_with_extra(): + """Fixture for DataFrame with coordinates and extra columns.""" + return pd.DataFrame( + { + "lat1": [40.0], + "lon1": [-74.0], + "lat2": [34.0], + "lon2": [-118.0], + "other": [1], + } + ) + + +# ============================================================================= +# Test Haversine Distance +# ============================================================================= + + def test_haversine_distance_default(df_coords): """Test Haversine distance calculation with default parameters.""" transformer = GeoDistanceTransformer( @@ -25,13 +62,16 @@ def test_haversine_distance_default(df_coords): ) X_tr = transformer.fit_transform(df_coords) - # Distance should be approximately 3935-3944 km + # Distance from NYC to LA is approximately 3935-3944 km assert "geo_distance" in X_tr.columns assert 3900 < X_tr["geo_distance"].iloc[0] < 4000 def test_haversine_distance_miles(): - """Test Haversine distance in miles.""" + """Test Haversine distance in miles. + + Expected: NYC to LA is approximately 2445 miles. 
+ """ X = pd.DataFrame( { "lat1": [40.7128], @@ -49,8 +89,10 @@ def test_haversine_distance_miles(): assert 2400 < X_tr["geo_distance"].iloc[0] < 2500 -def test_same_location_zero_distance(): - """Test that same location returns zero distance.""" +@pytest.mark.parametrize("method", ["haversine", "euclidean", "manhattan"]) +@pytest.mark.parametrize("output_unit", ["km", "miles", "meters", "feet"]) +def test_same_location_zero_distance(method, output_unit): + """Test that same location returns zero distance for all methods and units.""" X = pd.DataFrame( { "lat1": [40.7128, 34.0522], @@ -60,7 +102,12 @@ def test_same_location_zero_distance(): } ) transformer = GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" + lat1="lat1", + lon1="lon1", + lat2="lat2", + lon2="lon2", + method=method, + output_unit=output_unit, ) X_tr = transformer.fit_transform(X) @@ -69,26 +116,56 @@ def test_same_location_zero_distance(): ) +# ============================================================================= +# Test Alternative Distance Methods +# ============================================================================= + + def test_euclidean_method(): - """Test Euclidean distance method.""" + """Test Euclidean distance method returns expected values. + + For coordinates (0,0) to (1,1): + - dlat = 1, dlon = 1 + - At equator: 1 degree ≈ 111 km + - Euclidean distance = sqrt((1*111)^2 + (1*111)^2) = sqrt(2) * 111 ≈ 157.0 km + """ X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="euclidean" ) X_tr = transformer.fit_transform(X) - assert X_tr["geo_distance"].iloc[0] > 0 + # Expected: sqrt(2) * 111 ≈ 157.0 km + expected_distance = np.sqrt(2) * 111.0 + np.testing.assert_almost_equal( + X_tr["geo_distance"].iloc[0], expected_distance, decimal=1 + ) def test_manhattan_method(): - """Test Manhattan distance method.""" + """Test Manhattan distance method returns expected values. 
+ + For coordinates (0,0) to (1,1): + - dlat = 1, dlon = 1 + - At equator: 1 degree ≈ 111 km + - Manhattan distance = (1 + 1) * 111 = 222 km + """ X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="manhattan" ) X_tr = transformer.fit_transform(X) - assert X_tr["geo_distance"].iloc[0] > 0 + # Expected: (1 + 1) * 111 = 222 km + expected_distance = 2 * 111.0 + np.testing.assert_almost_equal( + X_tr["geo_distance"].iloc[0], expected_distance, decimal=1 + ) + + +# ============================================================================= +# Test Output Configuration +# ============================================================================= def test_custom_output_column_name(df_coords): @@ -103,7 +180,7 @@ def test_custom_output_column_name(df_coords): def test_drop_original_columns(): - """Test drop_original parameter.""" + """Test drop_original parameter removes coordinate columns.""" X = pd.DataFrame( { "lat1": [40.7128], @@ -118,51 +195,76 @@ def test_drop_original_columns(): ) X_tr = transformer.fit_transform(X) + # Coordinate columns should be removed assert "lat1" not in X_tr.columns assert "lon1" not in X_tr.columns assert "lat2" not in X_tr.columns assert "lon2" not in X_tr.columns + # New distance column and other columns remain assert "geo_distance" in X_tr.columns assert "other" in X_tr.columns + # Check exact columns + assert list(X_tr.columns) == ["other", "geo_distance"] -def test_multiple_rows(): - """Test with multiple rows.""" - X = pd.DataFrame( - { - "origin_lat": [40.7128, 34.0522, 41.8781], - "origin_lon": [-74.0060, -118.2437, -87.6298], - "dest_lat": [34.0522, 41.8781, 40.7128], - "dest_lon": [-118.2437, -87.6298, -74.0060], - } - ) +# ============================================================================= +# Test Multiple Rows +# ============================================================================= + + +def test_multiple_rows(df_multi_coords): + """Test transformation with multiple rows returns expected distances. 
+ + Expected haversine distances in km: + - NYC to LA: ~3935.75 km + - LA to Chicago: ~2803.97 km + - Chicago to NYC: ~1144.29 km + """ transformer = GeoDistanceTransformer( lat1="origin_lat", lon1="origin_lon", lat2="dest_lat", lon2="dest_lon" ) - X_tr = transformer.fit_transform(X) + X_tr = transformer.fit_transform(df_multi_coords) + + # Build expected DataFrame + expected = df_multi_coords.copy() + expected["geo_distance"] = [ + 3935.746254609723, + 2803.971506975193, + 1144.2912739463475, + ] + + pd.testing.assert_frame_equal( + X_tr, + expected, + check_exact=False, + atol=0.001, # Allow very small tolerance for floating point + ) - assert len(X_tr["geo_distance"]) == 3 - # All distances should be positive - assert all(X_tr["geo_distance"] > 0) +# ============================================================================= +# Test Invalid Parameters +# ============================================================================= -def test_invalid_method_raises_error(): - """Test that invalid method raises ValueError.""" + +@pytest.mark.parametrize("invalid_method", ["invalid", True, 123]) +def test_invalid_method_raises_error(invalid_method): + """Test that invalid method values raise ValueError.""" with pytest.raises(ValueError, match="method must be one of"): GeoDistanceTransformer( - lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="invalid" + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method=invalid_method ) -def test_invalid_output_unit_raises_error(): - """Test that invalid output_unit raises ValueError.""" +@pytest.mark.parametrize("invalid_unit", ["invalid", True, 123]) +def test_invalid_output_unit_raises_error(invalid_unit): + """Test that invalid output_unit values raise ValueError.""" with pytest.raises(ValueError, match="output_unit must be one of"): GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", - output_unit="invalid", + output_unit=invalid_unit, ) @@ -176,11 +278,20 @@ def test_missing_columns_raises_error(): transformer.fit(X) -def test_invalid_latitude_range_raises_error(): - """Test that latitude out of range raises ValueError when validate_ranges=True.""" +# ============================================================================= +# Test Coordinate Range Validation +# ============================================================================= + + +@pytest.mark.parametrize("invalid_lat", [100, -100]) +def test_invalid_latitude_range_raises_error(invalid_lat): + """Test that latitude outside [-90, 90] raises ValueError. + + Only applies when validate_ranges=True. + """ X = pd.DataFrame( { - "lat1": [100], # Invalid: outside -90 to 90 + "lat1": [invalid_lat], "lon1": [0], "lat2": [0], "lon2": [0], @@ -193,12 +304,16 @@ def test_invalid_latitude_range_raises_error(): transformer.fit(X) -def test_invalid_longitude_range_raises_error(): - """Test that longitude out of range raises ValueError when validate_ranges=True.""" +@pytest.mark.parametrize("invalid_lon", [200, -200]) +def test_invalid_longitude_range_raises_error(invalid_lon): + """Test that longitude outside [-180, 180] raises ValueError. + + Only applies when validate_ranges=True. 
+ """ X = pd.DataFrame( { "lat1": [0], - "lon1": [200], # Invalid: outside -180 to 180 + "lon1": [invalid_lon], "lat2": [0], "lon2": [0], } @@ -226,24 +341,33 @@ def test_validate_ranges_disabled(): # Should not raise an error transformer.fit(X) X_tr = transformer.transform(X) - # Distance may be incorrect but should complete + # Distance may be incorrect but should complete without error assert "geo_distance" in X_tr.columns -def test_validate_ranges_parameter_validation(): - """Test that validate_ranges must be boolean.""" +@pytest.mark.parametrize("invalid_value", ["True", 123, 0.5]) +def test_validate_ranges_parameter_validation(invalid_value): + """Test that validate_ranges must be a boolean. + + Note: 1 and 0 are not tested because they are interpreted as booleans in Python. + """ with pytest.raises(ValueError, match="validate_ranges must be a boolean"): GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", - validate_ranges="True", + validate_ranges=invalid_value, ) +# ============================================================================= +# Test Fit Attributes +# ============================================================================= + + def test_fit_stores_attributes(): - """Test that fit stores expected attributes.""" + """Test that fit stores expected attributes with correct values.""" X = pd.DataFrame( {"lat1": [40.0], "lon1": [-74.0], "lat2": [34.0], "lon2": [-118.0]} ) @@ -252,35 +376,57 @@ def test_fit_stores_attributes(): ) transformer.fit(X) + # Check attributes exist assert hasattr(transformer, "variables_") assert hasattr(transformer, "feature_names_in_") assert hasattr(transformer, "n_features_in_") + + # Check attribute values assert set(transformer.variables_) == {"lat1", "lon1", "lat2", "lon2"} + assert transformer.feature_names_in_ == ["lat1", "lon1", "lat2", "lon2"] + assert transformer.n_features_in_ == 4 -def test_get_feature_names_out(): - """Test get_feature_names_out returns correct names.""" - X = pd.DataFrame( - { - "lat1": [40.0], - "lon1": [-74.0], - "lat2": [34.0], - "lon2": [-118.0], - "other": [1], - } - ) +# ============================================================================= +# Test get_feature_names_out +# ============================================================================= + + +def test_get_feature_names_out(df_with_extra): + """Test get_feature_names_out returns correct feature names.""" transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" ) - transformer.fit(X) + transformer.fit(df_with_extra) + + feature_names = transformer.get_feature_names_out() + + # Should return original columns + new distance column + expected_names = ["lat1", "lon1", "lat2", "lon2", "other", "geo_distance"] + assert feature_names == expected_names + + +def test_get_feature_names_out_with_drop_original(df_with_extra): + """Test get_feature_names_out when drop_original=True.""" + transformer = GeoDistanceTransformer( + lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", drop_original=True + ) + transformer.fit(df_with_extra) feature_names = transformer.get_feature_names_out() - assert "geo_distance" in feature_names - assert len(feature_names) == 6 # 5 original + 1 new + + # Coordinate columns should be excluded + expected_names = ["other", "geo_distance"] + assert feature_names == expected_names + + +# ============================================================================= +# Test Output Unit Conversion +# 
============================================================================= def test_output_units_conversion(): - """Test different output units give consistent results.""" + """Test different output units give consistent results with correct conversion.""" X = pd.DataFrame( { "lat1": [40.7128], From 7cfdc53b8be5d645658ec2d99a4b7c1cabeb23cf Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Thu, 8 Jan 2026 09:54:06 -0600 Subject: [PATCH 10/10] Clean up test file: remove section header comments for production quality --- tests/test_creation/test_geo_features.py | 259 ++++++----------------- 1 file changed, 70 insertions(+), 189 deletions(-) diff --git a/tests/test_creation/test_geo_features.py b/tests/test_creation/test_geo_features.py index 057687070..f7cbae966 100644 --- a/tests/test_creation/test_geo_features.py +++ b/tests/test_creation/test_geo_features.py @@ -5,54 +5,38 @@ from feature_engine.creation import GeoDistanceTransformer -# ============================================================================= -# Fixtures -# ============================================================================= - - @pytest.fixture def df_coords(): """Fixture providing sample coordinate data for a single route.""" - return pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } - ) + return pd.DataFrame({ + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + }) @pytest.fixture def df_multi_coords(): """Fixture providing sample coordinate data with multiple rows.""" - return pd.DataFrame( - { - "origin_lat": [40.7128, 34.0522, 41.8781], - "origin_lon": [-74.0060, -118.2437, -87.6298], - "dest_lat": [34.0522, 41.8781, 40.7128], - "dest_lon": [-118.2437, -87.6298, -74.0060], - } - ) + return pd.DataFrame({ + "origin_lat": [40.7128, 34.0522, 41.8781], + "origin_lon": [-74.0060, -118.2437, -87.6298], + "dest_lat": [34.0522, 41.8781, 40.7128], + "dest_lon": [-118.2437, -87.6298, -74.0060], + }) @pytest.fixture def df_with_extra(): """Fixture for DataFrame with coordinates and extra columns.""" - return pd.DataFrame( - { - "lat1": [40.0], - "lon1": [-74.0], - "lat2": [34.0], - "lon2": [-118.0], - "other": [1], - } - ) - - -# ============================================================================= -# Test Haversine Distance -# ============================================================================= + return pd.DataFrame({ + "lat1": [40.0], + "lon1": [-74.0], + "lat2": [34.0], + "lon2": [-118.0], + "other": [1], + }) def test_haversine_distance_default(df_coords): @@ -62,30 +46,23 @@ def test_haversine_distance_default(df_coords): ) X_tr = transformer.fit_transform(df_coords) - # Distance from NYC to LA is approximately 3935-3944 km assert "geo_distance" in X_tr.columns assert 3900 < X_tr["geo_distance"].iloc[0] < 4000 def test_haversine_distance_miles(): - """Test Haversine distance in miles. - - Expected: NYC to LA is approximately 2445 miles. 
- """ - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } - ) + """Test Haversine distance in miles.""" + X = pd.DataFrame({ + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + }) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="miles" ) X_tr = transformer.fit_transform(X) - # Distance should be approximately 2445 miles assert 2400 < X_tr["geo_distance"].iloc[0] < 2500 @@ -93,14 +70,12 @@ def test_haversine_distance_miles(): @pytest.mark.parametrize("output_unit", ["km", "miles", "meters", "feet"]) def test_same_location_zero_distance(method, output_unit): """Test that same location returns zero distance for all methods and units.""" - X = pd.DataFrame( - { - "lat1": [40.7128, 34.0522], - "lon1": [-74.0060, -118.2437], - "lat2": [40.7128, 34.0522], - "lon2": [-74.0060, -118.2437], - } - ) + X = pd.DataFrame({ + "lat1": [40.7128, 34.0522], + "lon1": [-74.0060, -118.2437], + "lat2": [40.7128, 34.0522], + "lon2": [-74.0060, -118.2437], + }) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", @@ -116,26 +91,14 @@ def test_same_location_zero_distance(method, output_unit): ) -# ============================================================================= -# Test Alternative Distance Methods -# ============================================================================= - - def test_euclidean_method(): - """Test Euclidean distance method returns expected values. - - For coordinates (0,0) to (1,1): - - dlat = 1, dlon = 1 - - At equator: 1 degree ≈ 111 km - - Euclidean distance = sqrt((1*111)^2 + (1*111)^2) = sqrt(2) * 111 ≈ 157.0 km - """ + """Test Euclidean distance method returns expected values.""" X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="euclidean" ) X_tr = transformer.fit_transform(X) - # Expected: sqrt(2) * 111 ≈ 157.0 km expected_distance = np.sqrt(2) * 111.0 np.testing.assert_almost_equal( X_tr["geo_distance"].iloc[0], expected_distance, decimal=1 @@ -143,31 +106,19 @@ def test_euclidean_method(): def test_manhattan_method(): - """Test Manhattan distance method returns expected values. 
- - For coordinates (0,0) to (1,1): - - dlat = 1, dlon = 1 - - At equator: 1 degree ≈ 111 km - - Manhattan distance = (1 + 1) * 111 = 222 km - """ + """Test Manhattan distance method returns expected values.""" X = pd.DataFrame({"lat1": [0.0], "lon1": [0.0], "lat2": [1.0], "lon2": [1.0]}) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", method="manhattan" ) X_tr = transformer.fit_transform(X) - # Expected: (1 + 1) * 111 = 222 km expected_distance = 2 * 111.0 np.testing.assert_almost_equal( X_tr["geo_distance"].iloc[0], expected_distance, decimal=1 ) -# ============================================================================= -# Test Output Configuration -# ============================================================================= - - def test_custom_output_column_name(df_coords): """Test custom output column name.""" transformer = GeoDistanceTransformer( @@ -181,51 +132,34 @@ def test_custom_output_column_name(df_coords): def test_drop_original_columns(): """Test drop_original parameter removes coordinate columns.""" - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - "other": [1], - } - ) + X = pd.DataFrame({ + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + "other": [1], + }) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", drop_original=True ) X_tr = transformer.fit_transform(X) - # Coordinate columns should be removed assert "lat1" not in X_tr.columns assert "lon1" not in X_tr.columns assert "lat2" not in X_tr.columns assert "lon2" not in X_tr.columns - # New distance column and other columns remain assert "geo_distance" in X_tr.columns assert "other" in X_tr.columns - # Check exact columns assert list(X_tr.columns) == ["other", "geo_distance"] -# ============================================================================= -# Test Multiple Rows -# ============================================================================= - - def test_multiple_rows(df_multi_coords): - """Test transformation with multiple rows returns expected distances. 
- - Expected haversine distances in km: - - NYC to LA: ~3935.75 km - - LA to Chicago: ~2803.97 km - - Chicago to NYC: ~1144.29 km - """ + """Test transformation with multiple rows returns expected distances.""" transformer = GeoDistanceTransformer( lat1="origin_lat", lon1="origin_lon", lat2="dest_lat", lon2="dest_lon" ) X_tr = transformer.fit_transform(df_multi_coords) - # Build expected DataFrame expected = df_multi_coords.copy() expected["geo_distance"] = [ 3935.746254609723, @@ -237,15 +171,10 @@ def test_multiple_rows(df_multi_coords): X_tr, expected, check_exact=False, - atol=0.001, # Allow very small tolerance for floating point + atol=0.001, ) -# ============================================================================= -# Test Invalid Parameters -# ============================================================================= - - @pytest.mark.parametrize("invalid_method", ["invalid", True, 123]) def test_invalid_method_raises_error(invalid_method): """Test that invalid method values raise ValueError.""" @@ -278,25 +207,15 @@ def test_missing_columns_raises_error(): transformer.fit(X) -# ============================================================================= -# Test Coordinate Range Validation -# ============================================================================= - - @pytest.mark.parametrize("invalid_lat", [100, -100]) def test_invalid_latitude_range_raises_error(invalid_lat): - """Test that latitude outside [-90, 90] raises ValueError. - - Only applies when validate_ranges=True. - """ - X = pd.DataFrame( - { - "lat1": [invalid_lat], - "lon1": [0], - "lat2": [0], - "lon2": [0], - } - ) + """Test that latitude outside [-90, 90] raises ValueError.""" + X = pd.DataFrame({ + "lat1": [invalid_lat], + "lon1": [0], + "lat2": [0], + "lon2": [0], + }) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" ) @@ -306,18 +225,13 @@ def test_invalid_latitude_range_raises_error(invalid_lat): @pytest.mark.parametrize("invalid_lon", [200, -200]) def test_invalid_longitude_range_raises_error(invalid_lon): - """Test that longitude outside [-180, 180] raises ValueError. - - Only applies when validate_ranges=True. - """ - X = pd.DataFrame( - { - "lat1": [0], - "lon1": [invalid_lon], - "lat2": [0], - "lon2": [0], - } - ) + """Test that longitude outside [-180, 180] raises ValueError.""" + X = pd.DataFrame({ + "lat1": [0], + "lon1": [invalid_lon], + "lat2": [0], + "lon2": [0], + }) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2" ) @@ -327,30 +241,23 @@ def test_invalid_longitude_range_raises_error(invalid_lon): def test_validate_ranges_disabled(): """Test that invalid coordinates don't raise error when validate_ranges=False.""" - X = pd.DataFrame( - { - "lat1": [100], # Invalid latitude - "lon1": [200], # Invalid longitude - "lat2": [0], - "lon2": [0], - } - ) + X = pd.DataFrame({ + "lat1": [100], + "lon1": [200], + "lat2": [0], + "lon2": [0], + }) transformer = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", validate_ranges=False ) - # Should not raise an error transformer.fit(X) X_tr = transformer.transform(X) - # Distance may be incorrect but should complete without error assert "geo_distance" in X_tr.columns @pytest.mark.parametrize("invalid_value", ["True", 123, 0.5]) def test_validate_ranges_parameter_validation(invalid_value): - """Test that validate_ranges must be a boolean. - - Note: 1 and 0 are not tested because they are interpreted as booleans in Python. 
- """ + """Test that validate_ranges must be a boolean.""" with pytest.raises(ValueError, match="validate_ranges must be a boolean"): GeoDistanceTransformer( lat1="lat1", @@ -361,11 +268,6 @@ def test_validate_ranges_parameter_validation(invalid_value): ) -# ============================================================================= -# Test Fit Attributes -# ============================================================================= - - def test_fit_stores_attributes(): """Test that fit stores expected attributes with correct values.""" X = pd.DataFrame( @@ -376,22 +278,14 @@ def test_fit_stores_attributes(): ) transformer.fit(X) - # Check attributes exist assert hasattr(transformer, "variables_") assert hasattr(transformer, "feature_names_in_") assert hasattr(transformer, "n_features_in_") - - # Check attribute values assert set(transformer.variables_) == {"lat1", "lon1", "lat2", "lon2"} assert transformer.feature_names_in_ == ["lat1", "lon1", "lat2", "lon2"] assert transformer.n_features_in_ == 4 -# ============================================================================= -# Test get_feature_names_out -# ============================================================================= - - def test_get_feature_names_out(df_with_extra): """Test get_feature_names_out returns correct feature names.""" transformer = GeoDistanceTransformer( @@ -400,8 +294,6 @@ def test_get_feature_names_out(df_with_extra): transformer.fit(df_with_extra) feature_names = transformer.get_feature_names_out() - - # Should return original columns + new distance column expected_names = ["lat1", "lon1", "lat2", "lon2", "other", "geo_distance"] assert feature_names == expected_names @@ -414,29 +306,19 @@ def test_get_feature_names_out_with_drop_original(df_with_extra): transformer.fit(df_with_extra) feature_names = transformer.get_feature_names_out() - - # Coordinate columns should be excluded expected_names = ["other", "geo_distance"] assert feature_names == expected_names -# ============================================================================= -# Test Output Unit Conversion -# ============================================================================= - - def test_output_units_conversion(): """Test different output units give consistent results with correct conversion.""" - X = pd.DataFrame( - { - "lat1": [40.7128], - "lon1": [-74.0060], - "lat2": [34.0522], - "lon2": [-118.2437], - } - ) + X = pd.DataFrame({ + "lat1": [40.7128], + "lon1": [-74.0060], + "lat2": [34.0522], + "lon2": [-118.2437], + }) - # Get distance in km and miles transformer_km = GeoDistanceTransformer( lat1="lat1", lon1="lon1", lat2="lat2", lon2="lon2", output_unit="km" ) @@ -447,6 +329,5 @@ def test_output_units_conversion(): dist_km = transformer_km.fit_transform(X.copy())["geo_distance"].iloc[0] dist_miles = transformer_miles.fit_transform(X.copy())["geo_distance"].iloc[0] - # 1 km ≈ 0.621371 miles expected_miles = dist_km * 0.621371 np.testing.assert_almost_equal(dist_miles, expected_miles, decimal=0)
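
A note on the expected values asserted in these tests: the haversine figures (for example the ~3935.75 km NYC-to-LA distance checked in ``test_multiple_rows``) are consistent with the standard haversine formula evaluated with a mean Earth radius of 6371 km. That radius does not appear anywhere in this diff, so treat it as an inference rather than the transformer's confirmed implementation; the standalone sketch below only reproduces the asserted distance under that assumption.

.. code:: python

    import numpy as np

    # Assumed mean Earth radius; inferred from the ~3935.75 km expectation, not taken from the transformer.
    EARTH_RADIUS_KM = 6371.0

    def haversine_km(lat1, lon1, lat2, lon2):
        """Great-circle distance in km between two points given in decimal degrees."""
        lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
        dlat, dlon = lat2 - lat1, lon2 - lon1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        return 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(a))

    # NYC -> LA: prints roughly 3935.75, matching the value asserted in test_multiple_rows.
    print(haversine_km(40.7128, -74.0060, 34.0522, -118.2437))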
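Likewise, the euclidean and manhattan expectations (``sqrt(2) * 111`` ≈ 157.0 km and ``2 * 111`` = 222 km for a one-degree offset at the equator) imply a flat-grid approximation of roughly 111 km per degree, and the miles check uses a factor of 0.621371 miles per km. A minimal sketch of those approximations, again following the constants stated in the tests rather than the transformer's own code:

.. code:: python

    import numpy as np

    KM_PER_DEGREE = 111.0    # rough length of one degree, as assumed by the expected test values
    MILES_PER_KM = 0.621371  # conversion factor asserted in test_output_units_conversion

    def euclidean_km(lat1, lon1, lat2, lon2):
        # Planar approximation: treat degrees as a flat grid scaled to kilometers.
        return np.hypot((lat2 - lat1) * KM_PER_DEGREE, (lon2 - lon1) * KM_PER_DEGREE)

    def manhattan_km(lat1, lon1, lat2, lon2):
        # Taxicab distance on the same flat grid.
        return (abs(lat2 - lat1) + abs(lon2 - lon1)) * KM_PER_DEGREE

    print(euclidean_km(0.0, 0.0, 1.0, 1.0))   # ~157.0 km, as in test_euclidean_method
    print(manhattan_km(0.0, 0.0, 1.0, 1.0))   # 222.0 km, as in test_manhattan_method
    print(3935.75 * MILES_PER_KM)             # ~2445.6 miles, the NYC -> LA distance in miles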