From 31488d6bb4b7c5972808ccb01a136b27415fd25d Mon Sep 17 00:00:00 2001
From: Jayesh Shrivastava <jayeshshrivastava815@gmail.com>
Date: Thu, 16 Oct 2025 10:12:12 +0530
Subject: [PATCH 1/8] Enhance: improve t-SNE script with visualization and
 reproducibility (#13513)

---
 .../t_stochastic_neighbour_embedding.py       | 159 ++++++------------
 1 file changed, 49 insertions(+), 110 deletions(-)

diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py
index d6f630149087..3449ea76a904 100644
--- a/machine_learning/t_stochastic_neighbour_embedding.py
+++ b/machine_learning/t_stochastic_neighbour_embedding.py
@@ -6,10 +6,16 @@
 """
 
 import doctest
-
 import numpy as np
 from numpy import ndarray
-from sklearn.datasets import load_iris
+try:
+    from sklearn.datasets import load_iris
+    from sklearn.manifold import TSNE
+except ImportError as e:
+    raise ImportError(
+        "Required package 'scikit-learn' not found. Please install it using:\n"
+        "pip install scikit-learn"
+    ) from e
 
 
 def collect_dataset() -> tuple[ndarray, ndarray]:
@@ -29,133 +35,61 @@ def collect_dataset() -> tuple[ndarray, ndarray]:
     return np.array(iris_dataset.data), np.array(iris_dataset.target)
 
 
-def compute_pairwise_affinities(data_matrix: ndarray, sigma: float = 1.0) -> ndarray:
-    """
-    Compute high-dimensional affinities (P matrix) using a Gaussian kernel.
-
-    Args:
-        data_matrix: Input data of shape (n_samples, n_features).
-        sigma: Gaussian kernel bandwidth.
-
-    Returns:
-        ndarray: Symmetrized probability matrix.
-
-    >>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
-    >>> probabilities = compute_pairwise_affinities(x)
-    >>> float(round(probabilities[0, 1], 3))
-    0.25
-    """
-    n_samples = data_matrix.shape[0]
-    squared_sum = np.sum(np.square(data_matrix), axis=1)
-    squared_distance = np.add(
-        np.add(-2 * np.dot(data_matrix, data_matrix.T), squared_sum).T, squared_sum
-    )
-
-    affinity_matrix = np.exp(-squared_distance / (2 * sigma**2))
-    np.fill_diagonal(affinity_matrix, 0)
-
-    affinity_matrix /= np.sum(affinity_matrix)
-    return (affinity_matrix + affinity_matrix.T) / (2 * n_samples)
-
-
-def compute_low_dim_affinities(embedding_matrix: ndarray) -> tuple[ndarray, ndarray]:
-    """
-    Compute low-dimensional affinities (Q matrix) using a Student-t distribution.
-
-    Args:
-        embedding_matrix: Low-dimensional embedding of shape (n_samples, n_components).
-
-    Returns:
-        tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix).
-
-    >>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
-    >>> q_matrix, numerators = compute_low_dim_affinities(y)
-    >>> q_matrix.shape
-    (2, 2)
-    """
-    squared_sum = np.sum(np.square(embedding_matrix), axis=1)
-    numerator_matrix = 1 / (
-        1
-        + np.add(
-            np.add(-2 * np.dot(embedding_matrix, embedding_matrix.T), squared_sum).T,
-            squared_sum,
-        )
-    )
-    np.fill_diagonal(numerator_matrix, 0)
-
-    q_matrix = numerator_matrix / np.sum(numerator_matrix)
-    return q_matrix, numerator_matrix
-
-
 def apply_tsne(
     data_matrix: ndarray,
     n_components: int = 2,
+    perplexity: float = 30.0,
     learning_rate: float = 200.0,
-    n_iter: int = 500,
+    max_iter: int = 1000,
+    random_state: int = 42,
 ) -> ndarray:
     """
-    Apply t-SNE for dimensionality reduction.
+    Apply t-SNE for dimensionality reduction using scikit-learn's implementation.
 
     Args:
         data_matrix: Original dataset (features).
         n_components: Target dimension (2D or 3D).
-        learning_rate: Step size for gradient descent.
-        n_iter: Number of iterations.
+        perplexity: Controls balance between local/global aspects.
+        learning_rate: Step size for optimization.
+        max_iter: Number of iterations for optimization.
+        random_state: Ensures reproducibility.
 
     Returns:
         ndarray: Low-dimensional embedding of the data.
 
     >>> features, _ = collect_dataset()
-    >>> embedding = apply_tsne(features, n_components=2, n_iter=50)
+    >>> embedding = apply_tsne(features, n_components=2, max_iter=250)
     >>> embedding.shape
     (150, 2)
     """
-    if n_components < 1 or n_iter < 1:
-        raise ValueError("n_components and n_iter must be >= 1")
-
-    n_samples = data_matrix.shape[0]
-    rng = np.random.default_rng()
-    embedding = rng.standard_normal((n_samples, n_components)) * 1e-4
-
-    high_dim_affinities = compute_pairwise_affinities(data_matrix)
-    high_dim_affinities = np.maximum(high_dim_affinities, 1e-12)
-
-    embedding_increment = np.zeros_like(embedding)
-    momentum = 0.5
-
-    for iteration in range(n_iter):
-        low_dim_affinities, numerator_matrix = compute_low_dim_affinities(embedding)
-        low_dim_affinities = np.maximum(low_dim_affinities, 1e-12)
-
-        affinity_diff = high_dim_affinities - low_dim_affinities
-
-        gradient = 4 * (
-            np.dot((affinity_diff * numerator_matrix), embedding)
-            - np.multiply(
-                np.sum(affinity_diff * numerator_matrix, axis=1)[:, np.newaxis],
-                embedding,
-            )
-        )
-
-        embedding_increment = momentum * embedding_increment - learning_rate * gradient
-        embedding += embedding_increment
-
-        if iteration == int(n_iter / 4):
-            momentum = 0.8
-
-    return embedding
+    tsne = TSNE(
+        n_components=n_components,
+        perplexity=perplexity,
+        learning_rate=learning_rate,
+        max_iter=max_iter,
+        random_state=random_state,
+        init="random",
+    )
+    return tsne.fit_transform(data_matrix)
 
 
 def main() -> None:
     """
-    Run t-SNE on the Iris dataset and display the first 5 embeddings.
+    Run t-SNE on the Iris dataset, print embeddings, and visualize results.
 
     >>> main()  # doctest: +ELLIPSIS
     t-SNE embedding (first 5 points):
     [[...
     """
-    features, _labels = collect_dataset()
-    embedding = apply_tsne(features, n_components=2, n_iter=300)
+    features, labels = collect_dataset()
+    embedding = apply_tsne(
+        features,
+        n_components=2,
+        perplexity=40.0,
+        learning_rate=150.0,
+        max_iter=1000,
+        random_state=42,
+    )
 
     if not isinstance(embedding, np.ndarray):
         raise TypeError("t-SNE embedding must be an ndarray")
@@ -163,16 +97,21 @@ def main() -> None:
     print("t-SNE embedding (first 5 points):")
     print(embedding[:5])
 
-    # Optional visualization (Ruff/mypy compliant)
+    try:
+        import matplotlib.pyplot as plt
 
-    # import matplotlib.pyplot as plt
-    # plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis")
-    # plt.title("t-SNE Visualization of the Iris Dataset")
-    # plt.xlabel("Dimension 1")
-    # plt.ylabel("Dimension 2")
-    # plt.show()
+        plt.figure(figsize=(7, 5))
+        scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis")
+        plt.title("t-SNE Visualization of the Iris Dataset")
+        plt.xlabel("Dimension 1")
+        plt.ylabel("Dimension 2")
+        plt.colorbar(scatter, label="Class Label")
+        plt.tight_layout()
+        plt.show()
+    except ImportError:
+        print("matplotlib not found. Install it with: pip install matplotlib")
 
 
 if __name__ == "__main__":
     doctest.testmod()
-    main()
+    main()
\ No newline at end of file

From 89a1b692a076451515c323170e4a68a6307f85fc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 16 Oct 2025 04:59:47 +0000
Subject: [PATCH 2/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/t_stochastic_neighbour_embedding.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py
index 3449ea76a904..23d7e66673ef 100644
--- a/machine_learning/t_stochastic_neighbour_embedding.py
+++ b/machine_learning/t_stochastic_neighbour_embedding.py
@@ -8,6 +8,7 @@
 import doctest
 import numpy as np
 from numpy import ndarray
+
 try:
     from sklearn.datasets import load_iris
     from sklearn.manifold import TSNE
@@ -101,7 +102,9 @@ def main() -> None:
         import matplotlib.pyplot as plt
 
         plt.figure(figsize=(7, 5))
-        scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis")
+        scatter = plt.scatter(
+            embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis"
+        )
         plt.title("t-SNE Visualization of the Iris Dataset")
         plt.xlabel("Dimension 1")
         plt.ylabel("Dimension 2")
@@ -114,4 +117,4 @@ def main() -> None:
 
 if __name__ == "__main__":
     doctest.testmod()
-    main()
\ No newline at end of file
+    main()

From 940b47892f2c4ad2508ca5701305311bdedd2c14 Mon Sep 17 00:00:00 2001
From: Jayesh Shrivastava <jayeshshrivastava815@gmail.com>
Date: Thu, 16 Oct 2025 10:44:42 +0530
Subject: [PATCH 3/8] pre-commit fix: add blank lines around import groups

---
 machine_learning/t_stochastic_neighbour_embedding.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py
index 3449ea76a904..8ead6fe6aa58 100644
--- a/machine_learning/t_stochastic_neighbour_embedding.py
+++ b/machine_learning/t_stochastic_neighbour_embedding.py
@@ -6,8 +6,10 @@
 """
 
 import doctest
+
 import numpy as np
 from numpy import ndarray
+
 try:
     from sklearn.datasets import load_iris
     from sklearn.manifold import TSNE

From 0aa705a50aee63ac719846d9dac172d7a5cfaba0 Mon Sep 17 00:00:00 2001
From: Jayesh Shrivastava <jayeshshrivastava815@gmail.com>
Date: Thu, 16 Oct 2025 13:25:31 +0530
Subject: [PATCH 4/8] Rework docstring and imports; drop main() doctest

---
 .../t_stochastic_neighbour_embedding.py       | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py
index 8ead6fe6aa58..c9dc1ce33f1c 100644
--- a/machine_learning/t_stochastic_neighbour_embedding.py
+++ b/machine_learning/t_stochastic_neighbour_embedding.py
@@ -1,12 +1,14 @@
 """
-t-distributed stochastic neighbor embedding (t-SNE)
+t_stochastic_neighbour_embedding.py
 
-For more details, see:
-https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding
+Run t-SNE on the Iris dataset, with CI-safe doctests and visualization.
 """
 
+# Standard library
 import doctest
+from typing import Tuple
 
+# Third-party
 import numpy as np
 from numpy import ndarray
 
@@ -20,12 +22,12 @@
     ) from e
 
 
-def collect_dataset() -> tuple[ndarray, ndarray]:
+def collect_dataset() -> Tuple[ndarray, ndarray]:
     """
     Load the Iris dataset and return features and labels.
 
     Returns:
-        tuple[ndarray, ndarray]: Feature matrix and target labels.
+        Tuple[ndarray, ndarray]: Feature matrix and target labels.
 
     >>> features, targets = collect_dataset()
     >>> features.shape
@@ -78,10 +80,6 @@ def apply_tsne(
 def main() -> None:
     """
     Run t-SNE on the Iris dataset, print embeddings, and visualize results.
-
-    >>> main()  # doctest: +ELLIPSIS
-    t-SNE embedding (first 5 points):
-    [[...
     """
     features, labels = collect_dataset()
     embedding = apply_tsne(
@@ -111,9 +109,9 @@ def main() -> None:
         plt.tight_layout()
         plt.show()
     except ImportError:
-        print("matplotlib not found. Install it with: pip install matplotlib")
+        print("matplotlib not installed; skipping visualization.")
 
 
 if __name__ == "__main__":
     doctest.testmod()
-    main()
\ No newline at end of file
+    main()

From 9e9731010a05a192c16506ef9808d1f1fd390635 Mon Sep 17 00:00:00 2001
From: Jayesh Shrivastava <jayeshshrivastava815@gmail.com>
Date: Thu, 16 Oct 2025 13:27:50 +0530
Subject: [PATCH 5/8] Collapse plt.scatter call onto a single line

---
 machine_learning/t_stochastic_neighbour_embedding.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py
index cde5a026e1ab..c9dc1ce33f1c 100644
--- a/machine_learning/t_stochastic_neighbour_embedding.py
+++ b/machine_learning/t_stochastic_neighbour_embedding.py
@@ -101,9 +101,7 @@ def main() -> None:
         import matplotlib.pyplot as plt
 
         plt.figure(figsize=(7, 5))
-        scatter = plt.scatter(
-            embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis"
-        )
+        scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis")
         plt.title("t-SNE Visualization of the Iris Dataset")
         plt.xlabel("Dimension 1")
         plt.ylabel("Dimension 2")

From 37d5123998128b08d36837eceb50d8f7fcf3b7e9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 16 Oct 2025 07:58:16 +0000
Subject: [PATCH 6/8] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 machine_learning/t_stochastic_neighbour_embedding.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py
index c9dc1ce33f1c..cde5a026e1ab 100644
--- a/machine_learning/t_stochastic_neighbour_embedding.py
+++ b/machine_learning/t_stochastic_neighbour_embedding.py
@@ -101,7 +101,9 @@ def main() -> None:
         import matplotlib.pyplot as plt
 
         plt.figure(figsize=(7, 5))
-        scatter = plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis")
+        scatter = plt.scatter(
+            embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis"
+        )
         plt.title("t-SNE Visualization of the Iris Dataset")
         plt.xlabel("Dimension 1")
         plt.ylabel("Dimension 2")

From 8b4057d2ba1ee0f7cb38d7e77516628c674ce13f Mon Sep 17 00:00:00 2001
From: Jayesh Shrivastava <jayeshshrivastava815@gmail.com>
Date: Thu, 16 Oct 2025 13:30:34 +0530
Subject: [PATCH 7/8] use tuple instead of typing.Tuple

---
 machine_learning/t_stochastic_neighbour_embedding.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py
index c9dc1ce33f1c..c6d2d70ce1f9 100644
--- a/machine_learning/t_stochastic_neighbour_embedding.py
+++ b/machine_learning/t_stochastic_neighbour_embedding.py
@@ -6,7 +6,6 @@
 
 # Standard library
 import doctest
-from typing import Tuple
 
 # Third-party
 import numpy as np
@@ -22,7 +21,7 @@
     ) from e
 
 
-def collect_dataset() -> Tuple[ndarray, ndarray]:
+def collect_dataset() -> tuple[ndarray, ndarray]:
     """
     Load the Iris dataset and return features and labels.
 

From af2cd541994cd1ab3042f6e576ffb14a05fcda5e Mon Sep 17 00:00:00 2001
From: Jayesh Shrivastava <jayeshshrivastava815@gmail.com>
Date: Thu, 16 Oct 2025 13:31:22 +0530
Subject: [PATCH 8/8] Remove import group comments

---
 machine_learning/t_stochastic_neighbour_embedding.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/machine_learning/t_stochastic_neighbour_embedding.py b/machine_learning/t_stochastic_neighbour_embedding.py
index 5be9e7725105..ded493bc3e9d 100644
--- a/machine_learning/t_stochastic_neighbour_embedding.py
+++ b/machine_learning/t_stochastic_neighbour_embedding.py
@@ -4,10 +4,8 @@
 Run t-SNE on the Iris dataset, with CI-safe doctests and visualization.
 """
 
-# Standard library
 import doctest
 
-# Third-party
 import numpy as np
 from numpy import ndarray