Commit 2331f8f

improve tsne
1 parent e2a78d4 commit 2331f8f

1 file changed (+168 −33)

machine_learning/t_stochastic_neighbour_embedding.py

Lines changed: 168 additions & 33 deletions
@@ -1,5 +1,15 @@
 """
-t-distributed stochastic neighbor embedding (t-SNE)
+t-distributed Stochastic Neighbor Embedding (t-SNE)
+
+t-SNE is a nonlinear dimensionality reduction technique particularly well-suited
+for visualizing high-dimensional datasets in 2D or 3D space. It works by:
+1. Computing pairwise similarities in high-dimensional space using Gaussian kernels
+2. Computing pairwise similarities in low-dimensional space using Student-t distribution
+3. Minimizing the Kullback-Leibler divergence between these two distributions
+
+Reference:
+    van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE.
+    Journal of Machine Learning Research, 9(86), 2579-2605.
 
 For more details, see:
 https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding
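
Step 3's objective is the Kullback-Leibler divergence C = KL(P || Q) = sum_ij p_ij * log(p_ij / q_ij). A minimal sketch of evaluating it for given P and Q matrices (assuming numpy as np; the helper name kl_divergence is hypothetical, not part of this file):

import numpy as np

def kl_divergence(p_matrix: np.ndarray, q_matrix: np.ndarray) -> float:
    # C = sum_ij p_ij * log(p_ij / q_ij); epsilon guards against log(0)
    eps = 1e-12
    return float(np.sum(p_matrix * np.log((p_matrix + eps) / (q_matrix + eps))))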
@@ -29,104 +39,225 @@ def collect_dataset() -> tuple[ndarray, ndarray]:
     return np.array(iris_dataset.data), np.array(iris_dataset.target)
 
 
-def compute_pairwise_affinities(data_matrix: ndarray, sigma: float = 1.0) -> ndarray:
+def compute_perplexity_based_sigma(
+    distances: ndarray,
+    target_perplexity: float,
+    tolerance: float = 1e-5,
+    max_iter: int = 50,
+) -> float:
+    """
+    Compute the optimal sigma (bandwidth) for a given perplexity using binary search.
+
+    Perplexity is a measure of the effective number of neighbors. This function
+    finds the Gaussian kernel bandwidth that produces the target perplexity.
+
+    Args:
+        distances: Squared distances from a point to all other points.
+        target_perplexity: Desired perplexity value (typically 5-50).
+        tolerance: Convergence tolerance for binary search.
+        max_iter: Maximum number of binary search iterations.
+
+    Returns:
+        float: Optimal sigma value for the target perplexity.
+
+    >>> distances = np.array([0.0, 1.0, 4.0, 9.0])
+    >>> sigma = compute_perplexity_based_sigma(distances, target_perplexity=2.0)
+    >>> 0.5 < sigma < 2.0
+    True
+    """
+    # Binary search bounds for sigma
+    sigma_min, sigma_max = 1e-20, 1e20
+    sigma = 1.0
+
+    for _ in range(max_iter):
+        # Compute probabilities with current sigma
+        probabilities = np.exp(-distances / (2 * sigma**2))
+        probabilities[probabilities == np.inf] = 0
+        sum_probs = np.sum(probabilities)
+
+        if sum_probs == 0:
+            probabilities = np.ones_like(probabilities) / len(probabilities)
+            sum_probs = 1.0
+
+        probabilities /= sum_probs
+
+        # Compute Shannon entropy and perplexity
+        # H = -sum(p * log2(p)), Perplexity = 2^H
+        entropy = -np.sum(probabilities * np.log2(probabilities + 1e-12))
+        perplexity = 2**entropy
+
+        # Adjust sigma based on perplexity difference
+        perplexity_diff = perplexity - target_perplexity
+        if abs(perplexity_diff) < tolerance:
+            break
+
+        if perplexity_diff > 0:
+            sigma_max = sigma
+            sigma = (sigma + sigma_min) / 2
+        else:
+            sigma_min = sigma
+            sigma = (sigma + sigma_max) / 2
+
+    return sigma
+
+
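
The binary search above exploits the relation Perplexity = 2^H, where H is the Shannon entropy in bits: a uniform distribution over k neighbors has perplexity exactly k. A minimal worked check of that identity (assuming numpy as np):

import numpy as np

probabilities = np.full(4, 0.25)  # uniform over 4 neighbors
entropy = -np.sum(probabilities * np.log2(probabilities))  # H = 2.0 bits
print(2**entropy)  # 4.0 -> four effective neighbors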
+def compute_pairwise_affinities(
+    data_matrix: ndarray, perplexity: float = 30.0
+) -> ndarray:
     """
     Compute high-dimensional affinities (P matrix) using a Gaussian kernel.
 
+    This function computes pairwise conditional probabilities p_j|i that represent
+    the similarity between points in the high-dimensional space. The bandwidth (sigma)
+    for each point is chosen to achieve the target perplexity.
+
     Args:
         data_matrix: Input data of shape (n_samples, n_features).
-        sigma: Gaussian kernel bandwidth.
+        perplexity: Target perplexity, controls effective number of neighbors
+            (typically 5-50).
 
     Returns:
-        ndarray: Symmetrized probability matrix.
-
-    >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
-    >>> probabilities = compute_pairwise_affinities(x)
-    >>> float(round(probabilities[0, 1], 3))
-    0.25
+        ndarray: Symmetrized probability matrix P where P_ij = (p_j|i + p_i|j) / (2n).
+
+    >>> x = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
+    >>> probabilities = compute_pairwise_affinities(x, perplexity=2.0)
+    >>> probabilities.shape
+    (3, 3)
+    >>> np.allclose(probabilities, probabilities.T)  # Check symmetry
+    True
     """
     n_samples = data_matrix.shape[0]
+    # Compute pairwise squared Euclidean distances
     squared_sum = np.sum(np.square(data_matrix), axis=1)
-    squared_distance = np.add(
+    squared_distances = np.add(
         np.add(-2 * np.dot(data_matrix, data_matrix.T), squared_sum).T, squared_sum
     )
 
-    affinity_matrix = np.exp(-squared_distance / (2 * sigma**2))
-    np.fill_diagonal(affinity_matrix, 0)
+    # Compute conditional probabilities p_j|i for each point i
+    affinity_matrix = np.zeros((n_samples, n_samples))
+    for i in range(n_samples):
+        # Find optimal sigma for this point to achieve target perplexity
+        distances_i = squared_distances[i].copy()
+        distances_i[i] = 0  # Distance to self is 0
+        sigma_i = compute_perplexity_based_sigma(distances_i, perplexity)
+
+        # Compute conditional probabilities for point i
+        affinity_matrix[i] = np.exp(-squared_distances[i] / (2 * sigma_i**2))
+        affinity_matrix[i, i] = 0  # Set diagonal to 0
+
+        # Normalize to get probabilities
+        sum_affinities = np.sum(affinity_matrix[i])
+        if sum_affinities > 0:
+            affinity_matrix[i] /= sum_affinities
 
-    affinity_matrix /= np.sum(affinity_matrix)
-    return (affinity_matrix + affinity_matrix.T) / (2 * n_samples)
+    # Symmetrize: P_ij = (p_j|i + p_i|j) / (2n)
+    affinity_matrix = (affinity_matrix + affinity_matrix.T) / (2 * n_samples)
+    return affinity_matrix
 
 
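
Because each conditional row p_·|i sums to 1, the symmetrized joint matrix P = (P_cond + P_cond.T) / (2n) sums to 1 overall, which is what makes P directly comparable with Q. A minimal check, assuming the functions in this file are importable (illustrative usage only):

import numpy as np

x = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
p_matrix = compute_pairwise_affinities(x, perplexity=2.0)
print(np.isclose(p_matrix.sum(), 1.0))  # True: joint probabilities sum to 1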
 def compute_low_dim_affinities(embedding_matrix: ndarray) -> tuple[ndarray, ndarray]:
     """
     Compute low-dimensional affinities (Q matrix) using a Student-t distribution.
 
+    In the low-dimensional space, t-SNE uses a Student-t distribution (with 1 degree
+    of freedom) instead of a Gaussian. This heavy-tailed distribution helps alleviate
+    the "crowding problem" by allowing dissimilar points to be farther apart.
+
+    The probability q_ij is proportional to (1 + ||y_i - y_j||^2)^(-1).
+
     Args:
         embedding_matrix: Low-dimensional embedding of shape (n_samples, n_components).
 
     Returns:
-        tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix).
+        tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix for gradient).
 
     >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
     >>> q_matrix, numerators = compute_low_dim_affinities(y)
     >>> q_matrix.shape
     (2, 2)
+    >>> np.allclose(q_matrix.sum(), 1.0)  # Probabilities sum to 1
+    True
     """
+    # Compute pairwise squared Euclidean distances in low-dimensional space
     squared_sum = np.sum(np.square(embedding_matrix), axis=1)
-    numerator_matrix = 1 / (
-        1
-        + np.add(
-            np.add(-2 * np.dot(embedding_matrix, embedding_matrix.T), squared_sum).T,
-            squared_sum,
-        )
+    squared_distances = np.add(
+        np.add(-2 * np.dot(embedding_matrix, embedding_matrix.T), squared_sum).T,
+        squared_sum,
     )
-    np.fill_diagonal(numerator_matrix, 0)
 
+    # Student-t distribution with 1 degree of freedom: (1 + d^2)^(-1)
+    numerator_matrix = 1 / (1 + squared_distances)
+    np.fill_diagonal(numerator_matrix, 0)  # Set diagonal to 0
+
+    # Normalize to get probability distribution Q
     q_matrix = numerator_matrix / np.sum(numerator_matrix)
     return q_matrix, numerator_matrix
 
 
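
The heavy-tail effect is easy to see numerically: at the same squared distance d^2, the Student-t kernel (1 + d^2)^(-1) decays polynomially while a unit-bandwidth Gaussian exp(-d^2 / 2) decays exponentially, so moderately distant pairs retain non-negligible q_ij. A minimal sketch (assuming numpy as np):

import numpy as np

squared_distances = np.array([1.0, 4.0, 16.0])
student_t = 1 / (1 + squared_distances)    # [0.5, 0.2, ~0.0588]
gaussian = np.exp(-squared_distances / 2)  # [~0.6065, ~0.1353, ~0.000335]
print(student_t / gaussian)  # the ratio grows rapidly with distance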
 def apply_tsne(
     data_matrix: ndarray,
     n_components: int = 2,
+    perplexity: float = 30.0,
     learning_rate: float = 200.0,
     n_iter: int = 500,
 ) -> ndarray:
     """
     Apply t-SNE for dimensionality reduction.
 
+    t-SNE minimizes the Kullback-Leibler divergence between the high-dimensional
+    probability distribution P and the low-dimensional distribution Q using
+    gradient descent with momentum.
+
     Args:
-        data_matrix: Original dataset (features).
-        n_components: Target dimension (2D or 3D).
-        learning_rate: Step size for gradient descent.
-        n_iter: Number of iterations.
+        data_matrix: Original dataset of shape (n_samples, n_features).
+        n_components: Target dimension (typically 2 or 3 for visualization).
+        perplexity: Perplexity parameter, controls effective number of neighbors.
+            Typical values are between 5 and 50. Larger datasets usually
+            require larger perplexity values.
+        learning_rate: Step size for gradient descent (typically 10-1000).
+        n_iter: Number of gradient descent iterations (typically 250-1000).
 
     Returns:
-        ndarray: Low-dimensional embedding of the data.
+        ndarray: Low-dimensional embedding of shape (n_samples, n_components).
+
+    Raises:
+        ValueError: If n_components or n_iter is less than 1, or if perplexity
+            is not in valid range.
 
     >>> features, _ = collect_dataset()
-    >>> embedding = apply_tsne(features, n_components=2, n_iter=50)
+    >>> embedding = apply_tsne(features, n_components=2, perplexity=30.0, n_iter=50)
     >>> embedding.shape
     (150, 2)
     """
+    # Validate input parameters
     if n_components < 1 or n_iter < 1:
         raise ValueError("n_components and n_iter must be >= 1")
+    if perplexity < 1 or perplexity >= data_matrix.shape[0]:
+        raise ValueError("perplexity must be between 1 and n_samples - 1")
 
     n_samples = data_matrix.shape[0]
+
+    # Initialize embedding with small random values from Gaussian distribution
     rng = np.random.default_rng()
     embedding = rng.standard_normal((n_samples, n_components)) * 1e-4
 
-    high_dim_affinities = compute_pairwise_affinities(data_matrix)
-    high_dim_affinities = np.maximum(high_dim_affinities, 1e-12)
+    # Compute high-dimensional affinities (P matrix) based on perplexity
+    high_dim_affinities = compute_pairwise_affinities(data_matrix, perplexity)
+    high_dim_affinities = np.maximum(high_dim_affinities, 1e-12)  # Avoid log(0)
 
+    # Initialize momentum-based gradient descent
     embedding_increment = np.zeros_like(embedding)
-    momentum = 0.5
+    momentum = 0.5  # Initial momentum value
 
+    # Gradient descent optimization loop
     for iteration in range(n_iter):
+        # Compute low-dimensional affinities (Q matrix) using Student-t distribution
         low_dim_affinities, numerator_matrix = compute_low_dim_affinities(embedding)
-        low_dim_affinities = np.maximum(low_dim_affinities, 1e-12)
+        low_dim_affinities = np.maximum(low_dim_affinities, 1e-12)  # Avoid log(0)
 
+        # Compute gradient of KL divergence: dC/dy_i
+        # The gradient has an attractive force (P_ij > Q_ij) and
+        # a repulsive force (P_ij < Q_ij)
         affinity_diff = high_dim_affinities - low_dim_affinities
 
         gradient = 4 * (
@@ -137,9 +268,11 @@ def apply_tsne(
             )
         )
 
+        # Update embedding using momentum-based gradient descent
         embedding_increment = momentum * embedding_increment - learning_rate * gradient
         embedding += embedding_increment
 
+        # Increase momentum after initial iterations for faster convergence
         if iteration == int(n_iter / 4):
             momentum = 0.8
 
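
For reference, the quantity assembled here is the standard t-SNE gradient from van der Maaten & Hinton (2008): dC/dy_i = 4 * sum_j (p_ij - q_ij) * (y_i - y_j) * (1 + ||y_i - y_j||^2)^(-1). A vectorized sketch of the same formula (assuming numpy as np; the elided middle of the expression above is not reproduced here, so this is an equivalent form rather than the commit's exact code):

import numpy as np

def tsne_gradient(p_matrix: np.ndarray, q_matrix: np.ndarray,
                  embedding: np.ndarray, numerator_matrix: np.ndarray) -> np.ndarray:
    # weights_ij = (p_ij - q_ij) * (1 + ||y_i - y_j||^2)^(-1)
    weights = (p_matrix - q_matrix) * numerator_matrix
    # Row i equals 4 * sum_j weights_ij * (y_i - y_j), via a graph-Laplacian form
    return 4 * (np.diag(weights.sum(axis=1)) - weights) @ embedding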

@@ -155,7 +288,9 @@ def main() -> None:
     [[...
     """
     features, _labels = collect_dataset()
-    embedding = apply_tsne(features, n_components=2, n_iter=300)
+    embedding = apply_tsne(
+        features, n_components=2, perplexity=30.0, learning_rate=200.0, n_iter=300
+    )
 
     if not isinstance(embedding, np.ndarray):
         raise TypeError("t-SNE embedding must be an ndarray")
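
A possible way to visualize the returned embedding (illustrative usage; matplotlib is an extra dependency not used by this file):

import matplotlib.pyplot as plt

features, labels = collect_dataset()
embedding = apply_tsne(features, n_components=2, perplexity=30.0, n_iter=300)
plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, s=10)
plt.title("t-SNE embedding of the iris dataset")
plt.show()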
