66"""
77
88import doctest
9-
109import numpy as np
1110from numpy import ndarray
12- from sklearn .datasets import load_iris
11+ try :
12+ from sklearn .datasets import load_iris
13+ from sklearn .manifold import TSNE
14+ except ImportError as e :
15+ raise ImportError (
16+ "Required package 'scikit-learn' not found. Please install it using:\n "
17+ "pip install scikit-learn"
18+ ) from e
1319
1420
1521def collect_dataset () -> tuple [ndarray , ndarray ]:
@@ -29,150 +35,83 @@ def collect_dataset() -> tuple[ndarray, ndarray]:
2935 return np .array (iris_dataset .data ), np .array (iris_dataset .target )
3036
3137
32- def compute_pairwise_affinities (data_matrix : ndarray , sigma : float = 1.0 ) -> ndarray :
33- """
34- Compute high-dimensional affinities (P matrix) using a Gaussian kernel.
35-
36- Args:
37- data_matrix: Input data of shape (n_samples, n_features).
38- sigma: Gaussian kernel bandwidth.
39-
40- Returns:
41- ndarray: Symmetrized probability matrix.
42-
43- >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
44- >>> probabilities = compute_pairwise_affinities(x)
45- >>> float(round(probabilities[0, 1], 3))
46- 0.25
47- """
48- n_samples = data_matrix .shape [0 ]
49- squared_sum = np .sum (np .square (data_matrix ), axis = 1 )
50- squared_distance = np .add (
51- np .add (- 2 * np .dot (data_matrix , data_matrix .T ), squared_sum ).T , squared_sum
52- )
53-
54- affinity_matrix = np .exp (- squared_distance / (2 * sigma ** 2 ))
55- np .fill_diagonal (affinity_matrix , 0 )
56-
57- affinity_matrix /= np .sum (affinity_matrix )
58- return (affinity_matrix + affinity_matrix .T ) / (2 * n_samples )
59-
60-
61- def compute_low_dim_affinities (embedding_matrix : ndarray ) -> tuple [ndarray , ndarray ]:
62- """
63- Compute low-dimensional affinities (Q matrix) using a Student-t distribution.
64-
65- Args:
66- embedding_matrix: Low-dimensional embedding of shape (n_samples, n_components).
67-
68- Returns:
69- tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix).
70-
71- >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
72- >>> q_matrix, numerators = compute_low_dim_affinities(y)
73- >>> q_matrix.shape
74- (2, 2)
75- """
76- squared_sum = np .sum (np .square (embedding_matrix ), axis = 1 )
77- numerator_matrix = 1 / (
78- 1
79- + np .add (
80- np .add (- 2 * np .dot (embedding_matrix , embedding_matrix .T ), squared_sum ).T ,
81- squared_sum ,
82- )
83- )
84- np .fill_diagonal (numerator_matrix , 0 )
85-
86- q_matrix = numerator_matrix / np .sum (numerator_matrix )
87- return q_matrix , numerator_matrix
88-
89-
9038def apply_tsne (
9139 data_matrix : ndarray ,
9240 n_components : int = 2 ,
41+ perplexity : float = 30.0 ,
9342 learning_rate : float = 200.0 ,
94- n_iter : int = 500 ,
43+ max_iter : int = 1000 ,
44+ random_state : int = 42 ,
9545) -> ndarray :
9646 """
97- Apply t-SNE for dimensionality reduction.
47+ Apply t-SNE for dimensionality reduction using scikit-learn's implementation .
9848
9949 Args:
10050 data_matrix: Original dataset (features).
10151 n_components: Target dimension (2D or 3D).
102- learning_rate: Step size for gradient descent.
103- n_iter: Number of iterations.
52+ perplexity: Controls balance between local/global aspects.
53+ learning_rate: Step size for optimization.
54+ max_iter: Number of iterations for optimization.
55+ random_state: Ensures reproducibility.
10456
10557 Returns:
10658 ndarray: Low-dimensional embedding of the data.
10759
10860 >>> features, _ = collect_dataset()
109- >>> embedding = apply_tsne(features, n_components=2, n_iter=50 )
61+ >>> embedding = apply_tsne(features, n_components=2, max_iter=250 )
11062 >>> embedding.shape
11163 (150, 2)
11264 """
113- if n_components < 1 or n_iter < 1 :
114- raise ValueError ("n_components and n_iter must be >= 1" )
115-
116- n_samples = data_matrix .shape [0 ]
117- rng = np .random .default_rng ()
118- embedding = rng .standard_normal ((n_samples , n_components )) * 1e-4
119-
120- high_dim_affinities = compute_pairwise_affinities (data_matrix )
121- high_dim_affinities = np .maximum (high_dim_affinities , 1e-12 )
122-
123- embedding_increment = np .zeros_like (embedding )
124- momentum = 0.5
125-
126- for iteration in range (n_iter ):
127- low_dim_affinities , numerator_matrix = compute_low_dim_affinities (embedding )
128- low_dim_affinities = np .maximum (low_dim_affinities , 1e-12 )
129-
130- affinity_diff = high_dim_affinities - low_dim_affinities
131-
132- gradient = 4 * (
133- np .dot ((affinity_diff * numerator_matrix ), embedding )
134- - np .multiply (
135- np .sum (affinity_diff * numerator_matrix , axis = 1 )[:, np .newaxis ],
136- embedding ,
137- )
138- )
139-
140- embedding_increment = momentum * embedding_increment - learning_rate * gradient
141- embedding += embedding_increment
142-
143- if iteration == int (n_iter / 4 ):
144- momentum = 0.8
145-
146- return embedding
65+ tsne = TSNE (
66+ n_components = n_components ,
67+ perplexity = perplexity ,
68+ learning_rate = learning_rate ,
69+ max_iter = max_iter ,
70+ random_state = random_state ,
71+ init = "random" ,
72+ )
73+ return tsne .fit_transform (data_matrix )
14774
14875
14976def main () -> None :
15077 """
151- Run t-SNE on the Iris dataset and display the first 5 embeddings .
78+ Run t-SNE on the Iris dataset, print embeddings, and visualize results .
15279
15380 >>> main() # doctest: +ELLIPSIS
15481 t-SNE embedding (first 5 points):
15582 [[...
15683 """
157- features , _labels = collect_dataset ()
158- embedding = apply_tsne (features , n_components = 2 , n_iter = 300 )
84+ features , labels = collect_dataset ()
85+ embedding = apply_tsne (
86+ features ,
87+ n_components = 2 ,
88+ perplexity = 40.0 ,
89+ learning_rate = 150.0 ,
90+ max_iter = 1000 ,
91+ random_state = 42 ,
92+ )
15993
16094 if not isinstance (embedding , np .ndarray ):
16195 raise TypeError ("t-SNE embedding must be an ndarray" )
16296
16397 print ("t-SNE embedding (first 5 points):" )
16498 print (embedding [:5 ])
16599
166- # Optional visualization (Ruff/mypy compliant)
100+ try :
101+ import matplotlib .pyplot as plt
167102
168- # import matplotlib.pyplot as plt
169- # plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis")
170- # plt.title("t-SNE Visualization of the Iris Dataset")
171- # plt.xlabel("Dimension 1")
172- # plt.ylabel("Dimension 2")
173- # plt.show()
103+ plt .figure (figsize = (7 , 5 ))
104+ scatter = plt .scatter (embedding [:, 0 ], embedding [:, 1 ], c = labels , cmap = "viridis" )
105+ plt .title ("t-SNE Visualization of the Iris Dataset" )
106+ plt .xlabel ("Dimension 1" )
107+ plt .ylabel ("Dimension 2" )
108+ plt .colorbar (scatter , label = "Class Label" )
109+ plt .tight_layout ()
110+ plt .show ()
111+ except ImportError :
112+ print ("matplotlib not found. Install it with: pip install matplotlib" )
174113
175114
176115if __name__ == "__main__" :
177116 doctest .testmod ()
178- main ()
117+ main ()
0 commit comments