From e1353747895d5b987211e20a882d2c7db3072a02 Mon Sep 17 00:00:00 2001 From: kdt523 Date: Fri, 17 Oct 2025 23:09:09 +0530 Subject: [PATCH 1/3] Add KNN Manhattan and Minkowski distances with tests --- machine_learning/k_nearest_neighbours.py | 105 ++++++++++++++++-- .../tests/test_k_nearest_neighbours.py | 75 +++++++++++++ 2 files changed, 171 insertions(+), 9 deletions(-) create mode 100644 machine_learning/tests/test_k_nearest_neighbours.py diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index fbc1b8bd227e..c5ae442644e3 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -6,8 +6,8 @@ of the given point. In effect, the label of the given point is decided by a majority vote. -This implementation uses the commonly used Euclidean distance metric, but other -distance metrics can also be used. +This implementation uses the Euclidean distance metric by default, and also +supports Manhattan (L1) and Minkowski (Lp) distances. Reference: https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm """ @@ -16,8 +16,6 @@ from heapq import nsmallest import numpy as np -from sklearn import datasets -from sklearn.model_selection import train_test_split class KNN: @@ -26,12 +24,42 @@ def __init__( train_data: np.ndarray[float], train_target: np.ndarray[int], class_labels: list[str], + *, + distance_metric: str = "euclidean", + p: float = 2.0, ) -> None: """ Create a kNN classifier using the given training data and class labels + + Parameters + ---------- + train_data : np.ndarray[float] + Training features. + train_target : np.ndarray[int] + Training labels as integer indices. + class_labels : list[str] + Mapping from label index to label name. + distance_metric : {"euclidean", "manhattan", "minkowski"} + Distance to use for neighbour search. Defaults to "euclidean". + p : float + Power parameter for Minkowski distance (Lp norm). Must be >= 1 when + distance_metric is "minkowski". Defaults to 2.0. """ - self.data = zip(train_data, train_target) + # Store a reusable copy; zip() returns an iterator that would be + # exhausted after one classification otherwise. + self.data = list(zip(train_data, train_target)) self.labels = class_labels + self.distance_metric = distance_metric.lower() + self.p = float(p) + + if self.distance_metric not in {"euclidean", "manhattan", "minkowski"}: + msg = ( + "distance_metric must be one of {'euclidean', 'manhattan', 'minkowski'}" + ) + raise ValueError(msg) + if self.distance_metric == "minkowski" and self.p < 1: + msg = "For Minkowski distance, p must be >= 1" + raise ValueError(msg) @staticmethod def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float: @@ -44,6 +72,30 @@ def _euclidean_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float: """ return float(np.linalg.norm(a - b)) + @staticmethod + def _manhattan_distance(a: np.ndarray[float], b: np.ndarray[float]) -> float: + """ + Calculate the Manhattan (L1) distance between two points + >>> KNN._manhattan_distance(np.array([0, 0]), np.array([3, 4])) + 7.0 + >>> KNN._manhattan_distance(np.array([1, 2, 3]), np.array([1, 8, 11])) + 14.0 + """ + return float(np.linalg.norm(a - b, ord=1)) + + @staticmethod + def _minkowski_distance( + a: np.ndarray[float], b: np.ndarray[float], p: float + ) -> float: + """ + Calculate the Minkowski (Lp) distance between two points + >>> KNN._minkowski_distance(np.array([0, 0]), np.array([3, 4]), 2) + 5.0 + >>> KNN._minkowski_distance(np.array([0, 0]), np.array([3, 4]), 1) + 7.0 + """ + return float(np.linalg.norm(a - b, ord=p)) + def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: """ Classify a given point using the kNN algorithm @@ -56,12 +108,42 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: >>> point = np.array([1.2, 1.2]) >>> knn.classify(point) 'A' + >>> # Manhattan distance yields the same class here + >>> knn_l1 = KNN(train_X, train_y, classes, distance_metric='manhattan') + >>> knn_l1.classify(point) + 'A' + >>> # Minkowski with p=2 equals Euclidean + >>> knn_lp = KNN(train_X, train_y, classes, distance_metric='minkowski', p=2) + >>> knn_lp.classify(point) + 'A' + >>> # Invalid distance metric + >>> try: + ... _ = KNN(train_X, train_y, classes, distance_metric='chebyshev') + ... except ValueError as e: + ... 'distance_metric' in str(e) + True + >>> # Invalid Minkowski power + >>> try: + ... _ = KNN(train_X, train_y, classes, distance_metric='minkowski', p=0.5) + ... except ValueError as e: + ... 'p must be >=' in str(e) + True """ + # Choose the distance function once + if self.distance_metric == "euclidean": + def dist_fn(a: np.ndarray[float]) -> float: + return self._euclidean_distance(a, pred_point) + elif self.distance_metric == "manhattan": + def dist_fn(a: np.ndarray[float]) -> float: + return self._manhattan_distance(a, pred_point) + else: # minkowski + p = self.p + + def dist_fn(a: np.ndarray[float]) -> float: + return self._minkowski_distance(a, pred_point, p) + # Distances of all points from the point to be classified - distances = ( - (self._euclidean_distance(data_point[0], pred_point), data_point[1]) - for data_point in self.data - ) + distances = ((dist_fn(dp), lbl) for dp, lbl in self.data) # Choosing k points with the shortest distances votes = (i[1] for i in nsmallest(k, distances)) @@ -76,6 +158,11 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: doctest.testmod() + # Optional demo using scikit-learn's iris dataset. Kept under __main__ to + # avoid making scikit-learn a hard dependency for importing this module. + from sklearn import datasets + from sklearn.model_selection import train_test_split + iris = datasets.load_iris() X = np.array(iris["data"]) diff --git a/machine_learning/tests/test_k_nearest_neighbours.py b/machine_learning/tests/test_k_nearest_neighbours.py new file mode 100644 index 000000000000..59f51ae4d6f2 --- /dev/null +++ b/machine_learning/tests/test_k_nearest_neighbours.py @@ -0,0 +1,75 @@ +import numpy as np +import pytest + +from machine_learning.k_nearest_neighbours import KNN + + +def test_distance_functions(): + a = np.array([0, 0]) + b = np.array([3, 4]) + assert KNN._euclidean_distance(a, b) == 5.0 + assert KNN._manhattan_distance(a, b) == 7.0 + assert KNN._minkowski_distance(a, b, 2) == 5.0 + assert KNN._minkowski_distance(a, b, 1) == 7.0 + + +@pytest.mark.parametrize( + ("distance_metric", "p"), + [ + ("euclidean", None), + ("manhattan", None), + ("minkowski", 2), # p=2 -> Euclidean + ("minkowski", 3), # another valid p + ], +) +def test_classify_with_different_metrics(distance_metric: str, p: float | None): + train_X = np.array( + [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]] + ) + train_y = np.array([0, 0, 0, 0, 1, 1, 1]) + classes = ["A", "B"] + + kwargs: dict[str, object] = {"distance_metric": distance_metric} + if p is not None: + kwargs["p"] = float(p) + + knn = KNN(train_X, train_y, classes, **kwargs) + point = np.array([1.2, 1.2]) + # For this dataset/point, the class should be 'A' regardless of metric + assert knn.classify(point) == "A" + + +def test_invalid_distance_metric_raises(): + X = np.array([[0.0, 0.0]]) + y = np.array([0]) + labels = ["A"] + with pytest.raises(ValueError): + KNN(X, y, labels, distance_metric="chebyshev") + + +def test_invalid_minkowski_p_raises(): + X = np.array([[0.0, 0.0]]) + y = np.array([0]) + labels = ["A"] + with pytest.raises(ValueError): + KNN(X, y, labels, distance_metric="minkowski", p=0.5) + + +def test_multiple_classify_calls_with_same_instance(): + train_X = np.array([[0, 0], [1, 1], [2, 2]]) + train_y = np.array([0, 0, 1]) + classes = ["A", "B"] + knn = KNN(train_X, train_y, classes) + + p1 = np.array([0.1, 0.2]) + p2 = np.array([1.9, 2.0]) + + # Ensure we can call classify multiple times (zip exhaustion bug regression) + assert knn.classify(p1) == "A" + assert knn.classify(p2) in {"A", "B"} + + +if __name__ == "__main__": + import pytest as _pytest + + _pytest.main([__file__]) From 0ab81c1d252a501bbbe292a0dac122635ace0dcd Mon Sep 17 00:00:00 2001 From: kdt523 Date: Fri, 17 Oct 2025 23:19:16 +0530 Subject: [PATCH 2/3] Remove pytest file to split code/tests per PR template --- .../tests/test_k_nearest_neighbours.py | 75 ------------------- 1 file changed, 75 deletions(-) delete mode 100644 machine_learning/tests/test_k_nearest_neighbours.py diff --git a/machine_learning/tests/test_k_nearest_neighbours.py b/machine_learning/tests/test_k_nearest_neighbours.py deleted file mode 100644 index 59f51ae4d6f2..000000000000 --- a/machine_learning/tests/test_k_nearest_neighbours.py +++ /dev/null @@ -1,75 +0,0 @@ -import numpy as np -import pytest - -from machine_learning.k_nearest_neighbours import KNN - - -def test_distance_functions(): - a = np.array([0, 0]) - b = np.array([3, 4]) - assert KNN._euclidean_distance(a, b) == 5.0 - assert KNN._manhattan_distance(a, b) == 7.0 - assert KNN._minkowski_distance(a, b, 2) == 5.0 - assert KNN._minkowski_distance(a, b, 1) == 7.0 - - -@pytest.mark.parametrize( - ("distance_metric", "p"), - [ - ("euclidean", None), - ("manhattan", None), - ("minkowski", 2), # p=2 -> Euclidean - ("minkowski", 3), # another valid p - ], -) -def test_classify_with_different_metrics(distance_metric: str, p: float | None): - train_X = np.array( - [[0, 0], [1, 0], [0, 1], [0.5, 0.5], [3, 3], [2, 3], [3, 2]] - ) - train_y = np.array([0, 0, 0, 0, 1, 1, 1]) - classes = ["A", "B"] - - kwargs: dict[str, object] = {"distance_metric": distance_metric} - if p is not None: - kwargs["p"] = float(p) - - knn = KNN(train_X, train_y, classes, **kwargs) - point = np.array([1.2, 1.2]) - # For this dataset/point, the class should be 'A' regardless of metric - assert knn.classify(point) == "A" - - -def test_invalid_distance_metric_raises(): - X = np.array([[0.0, 0.0]]) - y = np.array([0]) - labels = ["A"] - with pytest.raises(ValueError): - KNN(X, y, labels, distance_metric="chebyshev") - - -def test_invalid_minkowski_p_raises(): - X = np.array([[0.0, 0.0]]) - y = np.array([0]) - labels = ["A"] - with pytest.raises(ValueError): - KNN(X, y, labels, distance_metric="minkowski", p=0.5) - - -def test_multiple_classify_calls_with_same_instance(): - train_X = np.array([[0, 0], [1, 1], [2, 2]]) - train_y = np.array([0, 0, 1]) - classes = ["A", "B"] - knn = KNN(train_X, train_y, classes) - - p1 = np.array([0.1, 0.2]) - p2 = np.array([1.9, 2.0]) - - # Ensure we can call classify multiple times (zip exhaustion bug regression) - assert knn.classify(p1) == "A" - assert knn.classify(p2) in {"A", "B"} - - -if __name__ == "__main__": - import pytest as _pytest - - _pytest.main([__file__]) From 003422bdac2d30a789c4d0b4ef086cb0c556f46e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 17 Oct 2025 17:57:29 +0000 Subject: [PATCH 3/3] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/k_nearest_neighbours.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/machine_learning/k_nearest_neighbours.py b/machine_learning/k_nearest_neighbours.py index c5ae442644e3..9c514ca91116 100644 --- a/machine_learning/k_nearest_neighbours.py +++ b/machine_learning/k_nearest_neighbours.py @@ -131,9 +131,11 @@ def classify(self, pred_point: np.ndarray[float], k: int = 5) -> str: """ # Choose the distance function once if self.distance_metric == "euclidean": + def dist_fn(a: np.ndarray[float]) -> float: return self._euclidean_distance(a, pred_point) elif self.distance_metric == "manhattan": + def dist_fn(a: np.ndarray[float]) -> float: return self._manhattan_distance(a, pred_point) else: # minkowski