11"""
2- t-distributed stochastic neighbor embedding (t-SNE)
2+ t-distributed Stochastic Neighbor Embedding (t-SNE)
3+
4+ t-SNE is a nonlinear dimensionality reduction technique particularly well-suited
5+ for visualizing high-dimensional datasets in 2D or 3D space. It works by:
6+ 1. Computing pairwise similarities in high-dimensional space using Gaussian kernels
7+ 2. Computing pairwise similarities in low-dimensional space using Student-t distribution
8+ 3. Minimizing the Kullback-Leibler divergence between these two distributions
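+
+Concretely, the cost in step 3 is
+C = KL(P || Q) = sum over i != j of p_ij * log(p_ij / q_ij).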
+
+Reference:
+van der Maaten, L., & Hinton, G. (2008). Visualizing data using t-SNE.
+Journal of Machine Learning Research, 9(86), 2579-2605.
 
 For more details, see:
 https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding
@@ -29,104 +39,225 @@ def collect_dataset() -> tuple[ndarray, ndarray]:
     return np.array(iris_dataset.data), np.array(iris_dataset.target)
 
 
-def compute_pairwise_affinities(data_matrix: ndarray, sigma: float = 1.0) -> ndarray:
+def compute_perplexity_based_sigma(
+    distances: ndarray,
+    target_perplexity: float,
+    tolerance: float = 1e-5,
+    max_iter: int = 50,
+) -> float:
+    """
+    Compute the optimal sigma (bandwidth) for a given perplexity using binary search.
+
+    Perplexity is a measure of the effective number of neighbors. This function
+    finds the Gaussian kernel bandwidth that produces the target perplexity.
+
+    Args:
+        distances: Squared distances from a point to all other points.
+        target_perplexity: Desired perplexity value (typically 5-50).
+        tolerance: Convergence tolerance for binary search.
+        max_iter: Maximum number of binary search iterations.
+
+    Returns:
+        float: Optimal sigma value for the target perplexity.
+
+    >>> distances = np.array([0.0, 1.0, 4.0, 9.0])
+    >>> sigma = compute_perplexity_based_sigma(distances, target_perplexity=2.0)
+    >>> 0.5 < sigma < 2.0
+    True
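+
+    An extra illustrative check of the perplexity definition itself: a uniform
+    distribution over k points has entropy log2(k), hence perplexity exactly k.
+
+    >>> probs = np.ones(4) / 4
+    >>> float(2 ** -np.sum(probs * np.log2(probs)))
+    4.0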
67+ """
68+ # Binary search bounds for sigma
69+ sigma_min , sigma_max = 1e-20 , 1e20
70+ sigma = 1.0
71+
72+ for _ in range (max_iter ):
73+ # Compute probabilities with current sigma
74+ probabilities = np .exp (- distances / (2 * sigma ** 2 ))
75+ probabilities [probabilities == np .inf ] = 0
76+ sum_probs = np .sum (probabilities )
77+
78+ if sum_probs == 0 :
79+ probabilities = np .ones_like (probabilities ) / len (probabilities )
80+ sum_probs = 1.0
81+
82+ probabilities /= sum_probs
83+
84+ # Compute Shannon entropy and perplexity
85+ # H = -sum(p * log2(p)), Perplexity = 2^H
86+ entropy = - np .sum (probabilities * np .log2 (probabilities + 1e-12 ))
87+ perplexity = 2 ** entropy
88+
89+ # Adjust sigma based on perplexity difference
90+ perplexity_diff = perplexity - target_perplexity
91+ if abs (perplexity_diff ) < tolerance :
92+ break
93+
94+ if perplexity_diff > 0 :
95+ sigma_max = sigma
96+ sigma = (sigma + sigma_min ) / 2
97+ else :
98+ sigma_min = sigma
99+ sigma = (sigma + sigma_max ) / 2
100+
101+ return sigma
102+
103+
+def compute_pairwise_affinities(
+    data_matrix: ndarray, perplexity: float = 30.0
+) -> ndarray:
     """
     Compute high-dimensional affinities (P matrix) using a Gaussian kernel.
 
+    This function computes pairwise conditional probabilities p_j|i that represent
+    the similarity between points in the high-dimensional space. The bandwidth
+    (sigma) for each point is chosen to achieve the target perplexity.
+
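+    Concretely, following the reference, with sigma_i chosen per point:
+    p_j|i = exp(-||x_i - x_j||^2 / (2 * sigma_i^2))
+            / sum_{k != i} exp(-||x_i - x_k||^2 / (2 * sigma_i^2))
+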
     Args:
         data_matrix: Input data of shape (n_samples, n_features).
-        sigma: Gaussian kernel bandwidth.
+        perplexity: Target perplexity; controls the effective number of neighbors
+            (typically 5-50).
 
     Returns:
-        ndarray: Symmetrized probability matrix.
-
-    >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
-    >>> probabilities = compute_pairwise_affinities(x)
-    >>> float(round(probabilities[0, 1], 3))
-    0.25
+        ndarray: Symmetrized probability matrix P where P_ij = (p_j|i + p_i|j) / (2n).
+
+    >>> x = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
+    >>> probabilities = compute_pairwise_affinities(x, perplexity=2.0)
+    >>> probabilities.shape
+    (3, 3)
+    >>> np.allclose(probabilities, probabilities.T)  # Check symmetry
+    True
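+
+    As a further illustrative check, the symmetrized P sums to one over all pairs:
+
+    >>> float(round(probabilities.sum(), 6))
+    1.0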
47128 """
48129 n_samples = data_matrix .shape [0 ]
130+ # Compute pairwise squared Euclidean distances
49131 squared_sum = np .sum (np .square (data_matrix ), axis = 1 )
50- squared_distance = np .add (
132+ squared_distances = np .add (
51133 np .add (- 2 * np .dot (data_matrix , data_matrix .T ), squared_sum ).T , squared_sum
52134 )
53135
54- affinity_matrix = np .exp (- squared_distance / (2 * sigma ** 2 ))
55- np .fill_diagonal (affinity_matrix , 0 )
136+ # Compute conditional probabilities p_j|i for each point i
137+ affinity_matrix = np .zeros ((n_samples , n_samples ))
138+ for i in range (n_samples ):
139+ # Find optimal sigma for this point to achieve target perplexity
140+ distances_i = squared_distances [i ].copy ()
141+ distances_i [i ] = 0 # Distance to self is 0
142+ sigma_i = compute_perplexity_based_sigma (distances_i , perplexity )
143+
144+ # Compute conditional probabilities for point i
145+ affinity_matrix [i ] = np .exp (- squared_distances [i ] / (2 * sigma_i ** 2 ))
146+ affinity_matrix [i , i ] = 0 # Set diagonal to 0
147+
148+ # Normalize to get probabilities
149+ sum_affinities = np .sum (affinity_matrix [i ])
150+ if sum_affinities > 0 :
151+ affinity_matrix [i ] /= sum_affinities
56152
57- affinity_matrix /= np .sum (affinity_matrix )
58- return (affinity_matrix + affinity_matrix .T ) / (2 * n_samples )
153+ # Symmetrize: P_ij = (p_j|i + p_i|j) / (2n)
154+ affinity_matrix = (affinity_matrix + affinity_matrix .T ) / (2 * n_samples )
155+ return affinity_matrix
 
 
 def compute_low_dim_affinities(embedding_matrix: ndarray) -> tuple[ndarray, ndarray]:
     """
     Compute low-dimensional affinities (Q matrix) using a Student-t distribution.
 
+    In the low-dimensional space, t-SNE uses a Student-t distribution (with one
+    degree of freedom) instead of a Gaussian. This heavy-tailed distribution helps
+    alleviate the "crowding problem" by allowing dissimilar points to be placed
+    farther apart.
+
+    The probability q_ij is proportional to (1 + ||y_i - y_j||^2)^(-1).
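+    With the normalization applied below, as in the reference,
+    q_ij = (1 + ||y_i - y_j||^2)^(-1) / sum_{k != l} (1 + ||y_k - y_l||^2)^(-1).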
+
     Args:
         embedding_matrix: Low-dimensional embedding of shape (n_samples, n_components).
 
     Returns:
-        tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix).
+        tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix for gradient).
 
     >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
     >>> q_matrix, numerators = compute_low_dim_affinities(y)
     >>> q_matrix.shape
     (2, 2)
+    >>> np.allclose(q_matrix.sum(), 1.0)  # Probabilities sum to 1
+    True
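+
+    For these two points the squared distance is 1, so the Student-t numerator
+    is 1 / (1 + 1) (an illustrative spot check):
+
+    >>> float(numerators[0, 1])
+    0.5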
75180 """
181+ # Compute pairwise squared Euclidean distances in low-dimensional space
76182 squared_sum = np .sum (np .square (embedding_matrix ), axis = 1 )
77- numerator_matrix = 1 / (
78- 1
79- + np .add (
80- np .add (- 2 * np .dot (embedding_matrix , embedding_matrix .T ), squared_sum ).T ,
81- squared_sum ,
82- )
183+ squared_distances = np .add (
184+ np .add (- 2 * np .dot (embedding_matrix , embedding_matrix .T ), squared_sum ).T ,
185+ squared_sum ,
83186 )
84- np .fill_diagonal (numerator_matrix , 0 )
85187
188+ # Student-t distribution with 1 degree of freedom: (1 + d^2)^(-1)
189+ numerator_matrix = 1 / (1 + squared_distances )
190+ np .fill_diagonal (numerator_matrix , 0 ) # Set diagonal to 0
191+
192+ # Normalize to get probability distribution Q
86193 q_matrix = numerator_matrix / np .sum (numerator_matrix )
87194 return q_matrix , numerator_matrix
88195
89196
 def apply_tsne(
     data_matrix: ndarray,
     n_components: int = 2,
+    perplexity: float = 30.0,
     learning_rate: float = 200.0,
     n_iter: int = 500,
 ) -> ndarray:
     """
     Apply t-SNE for dimensionality reduction.
 
+    t-SNE minimizes the Kullback-Leibler divergence between the high-dimensional
+    probability distribution P and the low-dimensional distribution Q using
+    gradient descent with momentum.
+
     Args:
-        data_matrix: Original dataset (features).
-        n_components: Target dimension (2D or 3D).
-        learning_rate: Step size for gradient descent.
-        n_iter: Number of iterations.
+        data_matrix: Original dataset of shape (n_samples, n_features).
+        n_components: Target dimension (typically 2 or 3 for visualization).
+        perplexity: Perplexity parameter; controls the effective number of
+            neighbors. Typical values are between 5 and 50, and larger datasets
+            usually require larger perplexity values.
+        learning_rate: Step size for gradient descent (typically 10-1000).
+        n_iter: Number of gradient descent iterations (typically 250-1000).
 
     Returns:
-        ndarray: Low-dimensional embedding of the data.
+        ndarray: Low-dimensional embedding of shape (n_samples, n_components).
+
+    Raises:
+        ValueError: If n_components or n_iter is less than 1, or if perplexity
+            is outside the valid range.
 
     >>> features, _ = collect_dataset()
-    >>> embedding = apply_tsne(features, n_components=2, n_iter=50)
+    >>> embedding = apply_tsne(features, n_components=2, perplexity=30.0, n_iter=50)
     >>> embedding.shape
     (150, 2)
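+
+    Invalid parameters are rejected (an illustrative check of the guard below):
+
+    >>> apply_tsne(features, n_components=0)
+    Traceback (most recent call last):
+        ...
+    ValueError: n_components and n_iter must be >= 1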
112231 """
232+ # Validate input parameters
113233 if n_components < 1 or n_iter < 1 :
114234 raise ValueError ("n_components and n_iter must be >= 1" )
235+ if perplexity < 1 or perplexity >= data_matrix .shape [0 ]:
236+ raise ValueError ("perplexity must be between 1 and n_samples - 1" )
115237
116238 n_samples = data_matrix .shape [0 ]
239+
240+ # Initialize embedding with small random values from Gaussian distribution
117241 rng = np .random .default_rng ()
118242 embedding = rng .standard_normal ((n_samples , n_components )) * 1e-4
119243
120- high_dim_affinities = compute_pairwise_affinities (data_matrix )
121- high_dim_affinities = np .maximum (high_dim_affinities , 1e-12 )
244+ # Compute high-dimensional affinities (P matrix) based on perplexity
245+ high_dim_affinities = compute_pairwise_affinities (data_matrix , perplexity )
246+ high_dim_affinities = np .maximum (high_dim_affinities , 1e-12 ) # Avoid log(0)
 
+    # Initialize momentum-based gradient descent
     embedding_increment = np.zeros_like(embedding)
-    momentum = 0.5
+    momentum = 0.5  # Initial momentum value
 
+    # Gradient descent optimization loop
     for iteration in range(n_iter):
+        # Compute low-dimensional affinities (Q matrix) via the Student-t kernel
         low_dim_affinities, numerator_matrix = compute_low_dim_affinities(embedding)
-        low_dim_affinities = np.maximum(low_dim_affinities, 1e-12)
+        low_dim_affinities = np.maximum(low_dim_affinities, 1e-12)  # Avoid log(0)
 
+        # Compute the gradient of the KL divergence, dC/dy_i: it combines an
+        # attractive term (where P_ij > Q_ij) with a repulsive term
+        # (where P_ij < Q_ij)
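+        # In the reference paper the gradient is
+        # dC/dy_i = 4 * sum_j (p_ij - q_ij) * (y_i - y_j) * (1 + ||y_i - y_j||^2)^(-1)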
         affinity_diff = high_dim_affinities - low_dim_affinities
 
         gradient = 4 * (
@@ -137,9 +268,11 @@ def apply_tsne(
             )
         )
 
+        # Update the embedding using momentum-based gradient descent
         embedding_increment = momentum * embedding_increment - learning_rate * gradient
         embedding += embedding_increment
 
+        # Increase momentum after the initial iterations for faster convergence
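+        # (van der Maaten & Hinton likewise start at momentum 0.5 and later
+        # raise it to 0.8)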
         if iteration == int(n_iter / 4):
             momentum = 0.8
 
@@ -155,7 +288,9 @@ def main() -> None:
     [[...
     """
     features, _labels = collect_dataset()
-    embedding = apply_tsne(features, n_components=2, n_iter=300)
+    embedding = apply_tsne(
+        features, n_components=2, perplexity=30.0, learning_rate=200.0, n_iter=300
+    )
 
     if not isinstance(embedding, np.ndarray):
         raise TypeError("t-SNE embedding must be an ndarray")