Skip to content

Commit 31488d6

Browse files
committed
Enhance: improve t-SNE script with visualization and reproducibility (#13513)
1 parent 3cea941 commit 31488d6

File tree

1 file changed

+49
-110
lines changed

1 file changed

+49
-110
lines changed

machine_learning/t_stochastic_neighbour_embedding.py

Lines changed: 49 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,16 @@
66
"""
77

88
import doctest
9-
109
import numpy as np
1110
from numpy import ndarray
12-
from sklearn.datasets import load_iris
11+
try:
12+
from sklearn.datasets import load_iris
13+
from sklearn.manifold import TSNE
14+
except ImportError as e:
15+
raise ImportError(
16+
"Required package 'scikit-learn' not found. Please install it using:\n"
17+
"pip install scikit-learn"
18+
) from e
1319

1420

1521
def collect_dataset() -> tuple[ndarray, ndarray]:
@@ -29,150 +35,83 @@ def collect_dataset() -> tuple[ndarray, ndarray]:
2935
return np.array(iris_dataset.data), np.array(iris_dataset.target)
3036

3137

32-
def compute_pairwise_affinities(data_matrix: ndarray, sigma: float = 1.0) -> ndarray:
    """
    Compute high-dimensional affinities (P matrix) using a Gaussian kernel.

    The kernel weights are normalised by their global sum, symmetrised and
    divided by ``2 * n_samples``.

    Args:
        data_matrix: Input data of shape (n_samples, n_features).
        sigma: Gaussian kernel bandwidth; must be strictly positive.

    Returns:
        ndarray: Symmetric (n_samples, n_samples) affinity matrix with a zero
        diagonal. An all-zero matrix is returned when every off-diagonal kernel
        weight is zero (for example a single sample), instead of NaN values.

    Raises:
        ValueError: If ``data_matrix`` is not 2-dimensional or ``sigma`` is not
            strictly positive.

    >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> probabilities = compute_pairwise_affinities(x)
    >>> float(round(probabilities[0, 1], 3))
    0.25
    >>> compute_pairwise_affinities(np.array([[1.0, 2.0]])).tolist()
    [[0.0]]
    >>> compute_pairwise_affinities(x, sigma=0.0)
    Traceback (most recent call last):
        ...
    ValueError: sigma must be > 0
    """
    if data_matrix.ndim != 2:
        raise ValueError("data_matrix must be a 2-D array")
    if not sigma > 0:  # written this way so that NaN is rejected as well
        raise ValueError("sigma must be > 0")

    n_samples = data_matrix.shape[0]
    squared_sum = np.sum(np.square(data_matrix), axis=1)
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b, evaluated for every pair at once.
    squared_distance = (
        squared_sum[:, np.newaxis]
        + squared_sum[np.newaxis, :]
        - 2 * np.dot(data_matrix, data_matrix.T)
    )
    # Floating-point cancellation can leave tiny negative values; clamp them
    # so the kernel weight never exceeds 1.
    squared_distance = np.maximum(squared_distance, 0.0)

    affinity_matrix = np.exp(-squared_distance / (2 * sigma**2))
    np.fill_diagonal(affinity_matrix, 0)

    total = np.sum(affinity_matrix)
    if total == 0:
        # Nothing to normalise (single sample, empty input, or every weight
        # underflowed): avoid the 0 / 0 division that would produce NaNs.
        return np.zeros_like(affinity_matrix)

    affinity_matrix /= total
    return (affinity_matrix + affinity_matrix.T) / (2 * n_samples)
59-
60-
61-
def compute_low_dim_affinities(embedding_matrix: ndarray) -> tuple[ndarray, ndarray]:
    """
    Compute low-dimensional affinities (Q matrix) using a Student-t distribution.

    Each pair of points gets the weight ``1 / (1 + squared_distance)``; the
    diagonal is zeroed and the weights are normalised by their global sum.

    Args:
        embedding_matrix: Low-dimensional embedding of shape (n_samples, n_components).

    Returns:
        tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix).
        When the numerators sum to zero (for example a single point) Q is
        returned as all zeros instead of NaN values.

    >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
    >>> q_matrix, numerators = compute_low_dim_affinities(y)
    >>> q_matrix.shape
    (2, 2)
    >>> float(q_matrix[0, 1])
    0.5
    >>> compute_low_dim_affinities(np.array([[3.0, 4.0]]))[0].tolist()
    [[0.0]]
    """
    squared_sum = np.sum(np.square(embedding_matrix), axis=1)
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * a.b for every pair of rows.
    squared_distance = (
        squared_sum[:, np.newaxis]
        + squared_sum[np.newaxis, :]
        - 2 * np.dot(embedding_matrix, embedding_matrix.T)
    )
    # Clamp round-off negatives so the Student-t weight stays within (0, 1].
    squared_distance = np.maximum(squared_distance, 0.0)

    numerator_matrix = 1 / (1 + squared_distance)
    np.fill_diagonal(numerator_matrix, 0)

    total = np.sum(numerator_matrix)
    if total == 0:
        # Single point or empty input: skip the 0 / 0 division.
        return np.zeros_like(numerator_matrix), numerator_matrix

    q_matrix = numerator_matrix / total
    return q_matrix, numerator_matrix
88-
89-
9038
def apply_tsne(
    data_matrix: ndarray,
    n_components: int = 2,
    perplexity: float = 30.0,
    learning_rate: float = 200.0,
    max_iter: int = 1000,
    random_state: int = 42,
) -> ndarray:
    """
    Apply t-SNE for dimensionality reduction using scikit-learn's implementation.

    The arguments are checked up front and then forwarded unchanged to
    ``sklearn.manifold.TSNE`` (imported at module level) with ``init="random"``.

    Args:
        data_matrix: Original dataset (features), shape (n_samples, n_features).
        n_components: Target dimension (2D or 3D); must be >= 1.
        perplexity: Controls balance between local/global aspects; must be
            positive and smaller than the number of samples.
        learning_rate: Step size for optimization.
        max_iter: Number of iterations for optimization; must be >= 1.
            NOTE(review): scikit-learn may enforce a higher minimum and only
            newer releases accept the ``max_iter`` keyword - confirm against
            the installed version.
        random_state: Ensures reproducibility.

    Returns:
        ndarray: Low-dimensional embedding of the data.

    Raises:
        ValueError: If ``data_matrix`` is not 2-dimensional, if ``n_components``
            or ``max_iter`` is below 1, or if ``perplexity`` is not in the open
            interval (0, n_samples).

    >>> features, _ = collect_dataset()
    >>> embedding = apply_tsne(features, n_components=2, max_iter=250)
    >>> embedding.shape
    (150, 2)
    >>> apply_tsne(np.zeros((5, 2)), n_components=0)
    Traceback (most recent call last):
        ...
    ValueError: n_components and max_iter must be >= 1
    """
    samples = np.asarray(data_matrix)
    if samples.ndim != 2:
        raise ValueError("data_matrix must be a 2-D array")
    if n_components < 1 or max_iter < 1:
        raise ValueError("n_components and max_iter must be >= 1")
    # Fail early with a clear message: the default perplexity of 30 is too
    # large for small datasets.
    if not 0 < perplexity < samples.shape[0]:
        raise ValueError("perplexity must be > 0 and less than the number of samples")

    tsne = TSNE(
        n_components=n_components,
        perplexity=perplexity,
        learning_rate=learning_rate,
        max_iter=max_iter,
        random_state=random_state,
        init="random",
    )
    return tsne.fit_transform(data_matrix)
14774

14875

14976
def main(*, show_plot: bool = True) -> None:
    """
    Run t-SNE on the Iris dataset, print embeddings, and visualize results.

    Args:
        show_plot: When True (the default) a scatter plot of the embedding is
            displayed with matplotlib, if it is installed. Pass False to skip
            plotting entirely, e.g. in doctests or other non-interactive runs
            where ``plt.show()`` could block.

    Raises:
        TypeError: If the embedding returned by ``apply_tsne`` is not an ndarray.

    >>> main(show_plot=False)  # doctest: +ELLIPSIS
    t-SNE embedding (first 5 points):
    [[...
    """
    features, labels = collect_dataset()
    embedding = apply_tsne(
        features,
        n_components=2,
        perplexity=40.0,
        learning_rate=150.0,
        max_iter=1000,
        random_state=42,
    )

    if not isinstance(embedding, np.ndarray):
        raise TypeError("t-SNE embedding must be an ndarray")

    print("t-SNE embedding (first 5 points):")
    print(embedding[:5])

    if not show_plot:
        return

    # Keep the try body to the import alone so that only a missing matplotlib
    # is reported as such; errors raised while plotting propagate normally.
    try:
        import matplotlib.pyplot as plt
    except ImportError:
        print("matplotlib not found. Install it with: pip install matplotlib")
        return

    plt.figure(figsize=(7, 5))
    scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis")
    plt.title("t-SNE Visualization of the Iris Dataset")
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
    plt.colorbar(scatter, label="Class Label")
    plt.tight_layout()
    plt.show()
174113

175114

176115
if __name__ == "__main__":
    # Check the embedded doctests first, then run the Iris demonstration.
    doctest.testmod()
    main()

0 commit comments

Comments
 (0)