Skip to content

Commit a1ab9e5

Browse files
committed
Added k_medoids algorithm
1 parent 709c18e commit a1ab9e5

File tree

1 file changed

+204
-0
lines changed

1 file changed

+204
-0
lines changed

machine_learning/k_medoids.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
"""
2+
k-Medoids Clustering Algorithm
3+
4+
For more details, see:
5+
https://en.wikipedia.org/wiki/K-medoids
6+
"""
7+
8+
import doctest
9+
10+
import numpy as np
11+
from numpy import ndarray
12+
from sklearn.datasets import load_iris
13+
14+
15+
def _get_data() -> tuple[ndarray, ndarray]:
    """
    Fetch the Iris dataset as a pair of NumPy arrays.

    Returns:
        tuple[ndarray, ndarray]: The feature matrix followed by the target
        labels, each wrapped in ``np.array``.

    >>> features, labels = _get_data()
    >>> features.shape
    (150, 4)
    >>> labels.shape
    (150,)
    """
    dataset = load_iris()
    feature_matrix = np.array(dataset.data)
    target_labels = np.array(dataset.target)
    return feature_matrix, target_labels
30+
31+
32+
def _compute_distances(data_matrix: ndarray, medoids: ndarray) -> ndarray:
    """
    Measure the Euclidean distance from every sample to every medoid.

    Args:
        data_matrix: Input dataset, one sample per row.
        medoids: Row indices into ``data_matrix`` of the current medoids.

    Returns:
        ndarray: Distance matrix of shape (n_samples, n_clusters); entry
        ``[i, j]`` is the distance from sample ``i`` to medoid ``j``.

    >>> x = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
    >>> d = _compute_distances(x, np.array([0, 2]))
    >>> d.shape
    (3, 2)
    """
    medoid_points = data_matrix[medoids]
    # Insert a middle axis so each sample is broadcast against all medoids.
    offsets = data_matrix[:, np.newaxis] - medoid_points
    return np.linalg.norm(offsets, axis=2)
49+
50+
51+
def _assign_clusters(distances: ndarray) -> ndarray:
    """
    Label each sample with the column index of its smallest distance.

    Args:
        distances: Distance matrix with one row per sample and one column
            per medoid.

    Returns:
        ndarray: Integer cluster label for every row of ``distances``.

    >>> d = np.array([[0.1, 0.4], [0.2, 0.3], [0.9, 0.1]])
    >>> _assign_clusters(d)
    array([0, 0, 1])
    """
    nearest_medoid = np.argmin(distances, axis=1)
    return nearest_medoid.astype(int)
66+
67+
68+
def _initialize_medoids(
69+
n_samples: int, n_clusters: int, random_state: int | None = None
70+
) -> ndarray:
71+
"""
72+
Randomly select initial medoids.
73+
74+
Args:
75+
n_samples: Total number of samples.
76+
n_clusters: Number of clusters.
77+
random_state: Optional random seed.
78+
79+
Returns:
80+
ndarray: Indices of initial medoids.
81+
82+
>>> np.random.seed(42)
83+
>>> _initialize_medoids(10, 3).shape
84+
(3,)
85+
"""
86+
rng = np.random.default_rng(random_state)
87+
return rng.choice(n_samples, n_clusters, replace=False)
88+
89+
90+
def _update_medoids(
    data_matrix: ndarray, clusters: ndarray, n_clusters: int
) -> ndarray:
    """
    Update medoids by minimizing intra-cluster distances.

    For every non-empty cluster the new medoid is the member whose summed
    Euclidean distance to the other members is smallest (first one on ties).

    A cluster with no members is reseeded with the sample that lies farthest
    from the medoid of its own cluster and is not already a medoid. Without
    this, an empty cluster would silently receive sample index 0, which may
    duplicate another medoid.

    Args:
        data_matrix: Dataset, one sample per row.
        clusters: Cluster label of each sample.
        n_clusters: Number of clusters.

    Returns:
        ndarray: Updated medoid indices, one per cluster.

    >>> x = np.array([[0.0, 0.0], [1.0, 0.0], [5.0, 0.0]])
    >>> clusters = np.array([0, 0, 1])
    >>> _update_medoids(x, clusters, 2).shape
    (2,)
    >>> _update_medoids(x, clusters, 2).tolist()
    [0, 2]
    >>> _update_medoids(x, np.array([0, 0, 0]), 2).tolist()
    [1, 2]
    """
    n_samples = data_matrix.shape[0]
    new_medoids = np.zeros(n_clusters, dtype=int)
    # Distance from each sample to the medoid picked for its own cluster;
    # used only to choose replacement medoids for empty clusters.
    cost_to_medoid = np.zeros(n_samples)
    is_medoid = np.zeros(n_samples, dtype=bool)
    empty_clusters = []

    for k in range(n_clusters):
        members = np.flatnonzero(clusters == k)
        if members.size == 0:
            empty_clusters.append(k)
            continue

        points = data_matrix[members]
        pairwise = np.linalg.norm(points[:, np.newaxis] - points, axis=2)
        best = np.argmin(pairwise.sum(axis=1))
        new_medoids[k] = members[best]
        cost_to_medoid[members] = pairwise[:, best]
        is_medoid[members[best]] = True

    for k in empty_clusters:
        candidates = np.flatnonzero(~is_medoid)
        if candidates.size == 0:
            # More clusters than samples: nothing left to reseed with, so the
            # remaining entries keep their zero placeholder.
            break
        chosen = candidates[np.argmax(cost_to_medoid[candidates])]
        new_medoids[k] = chosen
        is_medoid[chosen] = True

    return new_medoids
126+
127+
128+
def apply_k_medoids(
129+
data_matrix: ndarray,
130+
n_clusters: int = 3,
131+
max_iter: int = 100,
132+
random_state: int | None = None,
133+
) -> tuple[ndarray, ndarray]:
134+
"""
135+
Apply k-Medoids clustering to a dataset.
136+
137+
Args:
138+
data_matrix: Input dataset.
139+
n_clusters: Number of clusters.
140+
max_iter: Maximum iterations.
141+
random_state: Optional random seed.
142+
143+
Returns:
144+
tuple[ndarray, ndarray]: Final medoids and cluster assignments.
145+
146+
>>> features, _ = _get_data()
147+
>>> medoids, clusters = apply_k_medoids(features, n_clusters=3, max_iter=10)
148+
>>> len(medoids)
149+
3
150+
"""
151+
if n_clusters < 1 or max_iter < 1:
152+
raise ValueError("n_clusters and max_iter must be >= 1")
153+
154+
n_samples = data_matrix.shape[0]
155+
medoids = _initialize_medoids(n_samples, n_clusters, random_state)
156+
157+
for _ in range(max_iter):
158+
distances = _compute_distances(data_matrix, medoids)
159+
clusters = _assign_clusters(distances)
160+
new_medoids = _update_medoids(data_matrix, clusters, n_clusters)
161+
162+
if np.array_equal(medoids, new_medoids):
163+
break
164+
medoids = new_medoids
165+
166+
return medoids, clusters
167+
168+
169+
def main() -> None:
    """
    Cluster the Iris dataset with k-Medoids and print a sample of the labels.

    >>> main()  # doctest: +ELLIPSIS
    k-Medoids clustering (first 10 assignments):
    [...]
    """
    iris_features, _ = _get_data()
    result = apply_k_medoids(
        iris_features, n_clusters=3, max_iter=50, random_state=42
    )
    assignments = result[1]

    if not isinstance(assignments, np.ndarray):
        raise TypeError("Cluster assignments must be an ndarray")

    print("k-Medoids clustering (first 10 assignments):")
    print(assignments[:10])

    # Optional visualization (needs matplotlib; take the medoid indices from
    # result[0] first, since only the assignments are kept above):
    # import matplotlib.pyplot as plt
    # medoids = result[0]
    # plt.scatter(
    #     iris_features[:, 0], iris_features[:, 1],
    #     c=assignments, cmap="viridis", s=30,
    # )
    # plt.scatter(
    #     iris_features[medoids, 0],
    #     iris_features[medoids, 1],
    #     c="red",
    #     marker="x",
    #     s=100,
    # )
    # plt.title("k-Medoids Clustering (Iris Dataset)")
    # plt.xlabel("Feature 1")
    # plt.ylabel("Feature 2")
    # plt.show()
200+
201+
202+
if __name__ == "__main__":
    # Check the docstring examples in this module first, then run the demo.
    doctest.testmod()
    main()

0 commit comments

Comments
 (0)