From 31488d6bb4b7c5972808ccb01a136b27415fd25d Mon Sep 17 00:00:00 2001 From: Jayesh Shrivastava Date: Thu, 16 Oct 2025 10:12:12 +0530 Subject: [PATCH 1/8] Enhance: improve t-SNE script with visualization and reproducibility (#13513) --- .../t_stochastic_neighbour_embedding.py | 159 ++++++------------ 1 file changed, 49 insertions(+), 110 deletions(-) diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py index d6f630149087..3449ea76a904 100644 --- a/machine_learning/t_stochastic_neighbour_embedding.py +++ b/machine_learning/t_stochastic_neighbour_embedding.py @@ -6,10 +6,16 @@ """ import doctest - import numpy as np from numpy import ndarray -from sklearn.datasets import load_iris +try: + from sklearn.datasets import load_iris + from sklearn.manifold import TSNE +except ImportError as e: + raise ImportError( + "Required package 'scikit-learn' not found. Please install it using:\n" + "pip install scikit-learn" + ) from e def collect_dataset() -> tuple[ndarray, ndarray]: @@ -29,133 +35,61 @@ def collect_dataset() -> tuple[ndarray, ndarray]: return np.array(iris_dataset.data), np.array(iris_dataset.target) -def compute_pairwise_affinities(data_matrix: ndarray, sigma: float = 1.0) -> ndarray: - """ - Compute high-dimensional affinities (P matrix) using a Gaussian kernel. - - Args: - data_matrix: Input data of shape (n_samples, n_features). - sigma: Gaussian kernel bandwidth. - - Returns: - ndarray: Symmetrized probability matrix. - - >>> x = np.array([[0.0, 0.0], [1.0, 0.0]]) - >>> probabilities = compute_pairwise_affinities(x) - >>> float(round(probabilities[0, 1], 3)) - 0.25 - """ - n_samples = data_matrix.shape[0] - squared_sum = np.sum(np.square(data_matrix), axis=1) - squared_distance = np.add( - np.add(-2 * np.dot(data_matrix, data_matrix.T), squared_sum).T, squared_sum - ) - - affinity_matrix = np.exp(-squared_distance / (2 * sigma**2)) - np.fill_diagonal(affinity_matrix, 0) - - affinity_matrix /= np.sum(affinity_matrix) - return (affinity_matrix + affinity_matrix.T) / (2 * n_samples) - - -def compute_low_dim_affinities(embedding_matrix: ndarray) -> tuple[ndarray, ndarray]: - """ - Compute low-dimensional affinities (Q matrix) using a Student-t distribution. - - Args: - embedding_matrix: Low-dimensional embedding of shape (n_samples, n_components). - - Returns: - tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix). - - >>> y = np.array([[0.0, 0.0], [1.0, 0.0]]) - >>> q_matrix, numerators = compute_low_dim_affinities(y) - >>> q_matrix.shape - (2, 2) - """ - squared_sum = np.sum(np.square(embedding_matrix), axis=1) - numerator_matrix = 1 / ( - 1 - + np.add( - np.add(-2 * np.dot(embedding_matrix, embedding_matrix.T), squared_sum).T, - squared_sum, - ) - ) - np.fill_diagonal(numerator_matrix, 0) - - q_matrix = numerator_matrix / np.sum(numerator_matrix) - return q_matrix, numerator_matrix - - def apply_tsne( data_matrix: ndarray, n_components: int = 2, + perplexity: float = 30.0, learning_rate: float = 200.0, - n_iter: int = 500, + max_iter: int = 1000, + random_state: int = 42, ) -> ndarray: """ - Apply t-SNE for dimensionality reduction. + Apply t-SNE for dimensionality reduction using scikit-learn's implementation. Args: data_matrix: Original dataset (features). n_components: Target dimension (2D or 3D). - learning_rate: Step size for gradient descent. - n_iter: Number of iterations. + perplexity: Controls balance between local/global aspects. + learning_rate: Step size for optimization. + max_iter: Number of iterations for optimization. + random_state: Ensures reproducibility. Returns: ndarray: Low-dimensional embedding of the data. >>> features, _ = collect_dataset() - >>> embedding = apply_tsne(features, n_components=2, n_iter=50) + >>> embedding = apply_tsne(features, n_components=2, max_iter=250) >>> embedding.shape (150, 2) """ - if n_components < 1 or n_iter < 1: - raise ValueError("n_components and n_iter must be >= 1") - - n_samples = data_matrix.shape[0] - rng = np.random.default_rng() - embedding = rng.standard_normal((n_samples, n_components)) * 1e-4 - - high_dim_affinities = compute_pairwise_affinities(data_matrix) - high_dim_affinities = np.maximum(high_dim_affinities, 1e-12) - - embedding_increment = np.zeros_like(embedding) - momentum = 0.5 - - for iteration in range(n_iter): - low_dim_affinities, numerator_matrix = compute_low_dim_affinities(embedding) - low_dim_affinities = np.maximum(low_dim_affinities, 1e-12) - - affinity_diff = high_dim_affinities - low_dim_affinities - - gradient = 4 * ( - np.dot((affinity_diff * numerator_matrix), embedding) - - np.multiply( - np.sum(affinity_diff * numerator_matrix, axis=1)[:, np.newaxis], - embedding, - ) - ) - - embedding_increment = momentum * embedding_increment - learning_rate * gradient - embedding += embedding_increment - - if iteration == int(n_iter / 4): - momentum = 0.8 - - return embedding + tsne = TSNE( + n_components=n_components, + perplexity=perplexity, + learning_rate=learning_rate, + max_iter=max_iter, + random_state=random_state, + init="random", + ) + return tsne.fit_transform(data_matrix) def main() -> None: """ - Run t-SNE on the Iris dataset and display the first 5 embeddings. + Run t-SNE on the Iris dataset, print embeddings, and visualize results. >>> main() # doctest: +ELLIPSIS t-SNE embedding (first 5 points): [[... """ - features, _labels = collect_dataset() - embedding = apply_tsne(features, n_components=2, n_iter=300) + features, labels = collect_dataset() + embedding = apply_tsne( + features, + n_components=2, + perplexity=40.0, + learning_rate=150.0, + max_iter=1000, + random_state=42, + ) if not isinstance(embedding, np.ndarray): raise TypeError("t-SNE embedding must be an ndarray") @@ -163,16 +97,21 @@ def main() -> None: print("t-SNE embedding (first 5 points):") print(embedding[:5]) - # Optional visualization (Ruff/mypy compliant) + try: + import matplotlib.pyplot as plt - # import matplotlib.pyplot as plt - # plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis") - # plt.title("t-SNE Visualization of the Iris Dataset") - # plt.xlabel("Dimension 1") - # plt.ylabel("Dimension 2") - # plt.show() + plt.figure(figsize=(7, 5)) + scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis") + plt.title("t-SNE Visualization of the Iris Dataset") + plt.xlabel("Dimension 1") + plt.ylabel("Dimension 2") + plt.colorbar(scatter, label="Class Label") + plt.tight_layout() + plt.show() + except ImportError: + print("matplotlib not found. Install it with: pip install matplotlib") if __name__ == "__main__": doctest.testmod() - main() + main() \ No newline at end of file From 89a1b692a076451515c323170e4a68a6307f85fc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 Oct 2025 04:59:47 +0000 Subject: [PATCH 2/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/t_stochastic_neighbour_embedding.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py index 3449ea76a904..23d7e66673ef 100644 --- a/machine_learning/t_stochastic_neighbour_embedding.py +++ b/machine_learning/t_stochastic_neighbour_embedding.py @@ -8,6 +8,7 @@ import doctest import numpy as np from numpy import ndarray + try: from sklearn.datasets import load_iris from sklearn.manifold import TSNE @@ -101,7 +102,9 @@ def main() -> None: import matplotlib.pyplot as plt plt.figure(figsize=(7, 5)) - scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis") + scatter = plt.scatter( + embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis" + ) plt.title("t-SNE Visualization of the Iris Dataset") plt.xlabel("Dimension 1") plt.ylabel("Dimension 2") @@ -114,4 +117,4 @@ def main() -> None: if __name__ == "__main__": doctest.testmod() - main() \ No newline at end of file + main() From 940b47892f2c4ad2508ca5701305311bdedd2c14 Mon Sep 17 00:00:00 2001 From: Jayesh Shrivastava Date: Thu, 16 Oct 2025 10:44:42 +0530 Subject: [PATCH 3/8] precommit fix --- machine_learning/t_stochastic_neighbour_embedding.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py index 3449ea76a904..8ead6fe6aa58 100644 --- a/machine_learning/t_stochastic_neighbour_embedding.py +++ b/machine_learning/t_stochastic_neighbour_embedding.py @@ -6,8 +6,10 @@ """ import doctest + import numpy as np from numpy import ndarray + try: from sklearn.datasets import load_iris from sklearn.manifold import TSNE From 0aa705a50aee63ac719846d9dac172d7a5cfaba0 Mon Sep 17 00:00:00 2001 From: Jayesh Shrivastava Date: Thu, 16 Oct 2025 13:25:31 +0530 Subject: [PATCH 4/8] isort fix --- .../t_stochastic_neighbour_embedding.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py index 8ead6fe6aa58..c9dc1ce33f1c 100644 --- a/machine_learning/t_stochastic_neighbour_embedding.py +++ b/machine_learning/t_stochastic_neighbour_embedding.py @@ -1,12 +1,14 @@ """ -t-distributed stochastic neighbor embedding (t-SNE) +t_stochastic_neighbour_embedding.py -For more details, see: -https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding +Run t-SNE on the Iris dataset, with CI-safe doctests and visualization. """ +# Standard library import doctest +from typing import Tuple +# Third-party import numpy as np from numpy import ndarray @@ -20,12 +22,12 @@ ) from e -def collect_dataset() -> tuple[ndarray, ndarray]: +def collect_dataset() -> Tuple[ndarray, ndarray]: """ Load the Iris dataset and return features and labels. Returns: - tuple[ndarray, ndarray]: Feature matrix and target labels. + Tuple[ndarray, ndarray]: Feature matrix and target labels. >>> features, targets = collect_dataset() >>> features.shape @@ -78,10 +80,6 @@ def apply_tsne( def main() -> None: """ Run t-SNE on the Iris dataset, print embeddings, and visualize results. - - >>> main() # doctest: +ELLIPSIS - t-SNE embedding (first 5 points): - [[... """ features, labels = collect_dataset() embedding = apply_tsne( @@ -111,9 +109,9 @@ def main() -> None: plt.tight_layout() plt.show() except ImportError: - print("matplotlib not found. Install it with: pip install matplotlib") + print("matplotlib not installed; skipping visualization.") if __name__ == "__main__": doctest.testmod() - main() \ No newline at end of file + main() From 9e9731010a05a192c16506ef9808d1f1fd390635 Mon Sep 17 00:00:00 2001 From: Jayesh Shrivastava Date: Thu, 16 Oct 2025 13:27:50 +0530 Subject: [PATCH 5/8] isort fix --- machine_learning/t_stochastic_neighbour_embedding.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py index cde5a026e1ab..c9dc1ce33f1c 100644 --- a/machine_learning/t_stochastic_neighbour_embedding.py +++ b/machine_learning/t_stochastic_neighbour_embedding.py @@ -101,9 +101,7 @@ def main() -> None: import matplotlib.pyplot as plt plt.figure(figsize=(7, 5)) - scatter = plt.scatter( - embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis" - ) + scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis") plt.title("t-SNE Visualization of the Iris Dataset") plt.xlabel("Dimension 1") plt.ylabel("Dimension 2") From 37d5123998128b08d36837eceb50d8f7fcf3b7e9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 16 Oct 2025 07:58:16 +0000 Subject: [PATCH 6/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- machine_learning/t_stochastic_neighbour_embedding.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py index c9dc1ce33f1c..cde5a026e1ab 100644 --- a/machine_learning/t_stochastic_neighbour_embedding.py +++ b/machine_learning/t_stochastic_neighbour_embedding.py @@ -101,7 +101,9 @@ def main() -> None: import matplotlib.pyplot as plt plt.figure(figsize=(7, 5)) - scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis") + scatter = plt.scatter( + embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis" + ) plt.title("t-SNE Visualization of the Iris Dataset") plt.xlabel("Dimension 1") plt.ylabel("Dimension 2") From 8b4057d2ba1ee0f7cb38d7e77516628c674ce13f Mon Sep 17 00:00:00 2001 From: Jayesh Shrivastava Date: Thu, 16 Oct 2025 13:30:34 +0530 Subject: [PATCH 7/8] use tuple instead of typing.Tuple --- machine_learning/t_stochastic_neighbour_embedding.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py index c9dc1ce33f1c..c6d2d70ce1f9 100644 --- a/machine_learning/t_stochastic_neighbour_embedding.py +++ b/machine_learning/t_stochastic_neighbour_embedding.py @@ -6,7 +6,6 @@ # Standard library import doctest -from typing import Tuple # Third-party import numpy as np @@ -22,7 +21,7 @@ ) from e -def collect_dataset() -> Tuple[ndarray, ndarray]: +def collect_dataset() -> tuple[ndarray, ndarray]: """ Load the Iris dataset and return features and labels. From af2cd541994cd1ab3042f6e576ffb14a05fcda5e Mon Sep 17 00:00:00 2001 From: Jayesh Shrivastava Date: Thu, 16 Oct 2025 13:31:22 +0530 Subject: [PATCH 8/8] use tuple instead of typing.Tuple --- machine_learning/t_stochastic_neighbour_embedding.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py index 5be9e7725105..ded493bc3e9d 100644 --- a/machine_learning/t_stochastic_neighbour_embedding.py +++ b/machine_learning/t_stochastic_neighbour_embedding.py @@ -4,10 +4,8 @@ Run t-SNE on the Iris dataset, with CI-safe doctests and visualization. """ -# Standard library import doctest -# Third-party import numpy as np from numpy import ndarray