"""
Gaussian Naive Bayes Classifier

Naive Bayes is a probabilistic classifier based on Bayes' theorem with the
"naive" assumption of conditional independence between features.

Gaussian Naive Bayes assumes that, within each class, every feature follows a
normal (Gaussian) distribution.

For each class, we calculate:
- Mean (μ) and variance (σ²) of each feature
- Prior probability P(class)

For prediction, we use Bayes' theorem:
P(class|X) ∝ P(X|class) * P(class)

(the evidence P(X) is the same for every class, so it can be dropped when
ranking classes), where P(X|class) is calculated using the Gaussian
probability density function:
P(x|class) = (1 / sqrt(2 * pi * sigma^2)) * exp(-(x - mu)^2 / (2 * sigma^2))
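For example, with mu = 0 and sigma^2 = 1, the density at x = 0 is
1 / sqrt(2 * pi) ≈ 0.3989.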

Reference: https://en.wikipedia.org/wiki/Naive_Bayes_classifier
"""

import numpy as np


class GaussianNaiveBayes:
    """
    Gaussian Naive Bayes Classifier

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        The unique class labels
    class_priors_ : ndarray of shape (n_classes,)
        Prior probability of each class, P(class)
    mean_ : ndarray of shape (n_classes, n_features)
        Mean of each feature per class
    var_ : ndarray of shape (n_classes, n_features)
        Variance of each feature per class

    Examples
    --------
    >>> import numpy as np
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([0, 0, 0, 1, 1, 1])
    >>> clf = GaussianNaiveBayes()
    >>> _ = clf.fit(X, y)
    >>> clf.predict(np.array([[-0.8, -1]]))
    array([0])
    >>> clf.predict(np.array([[3, 2]]))
    array([1])
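
    The fitted priors and the probability estimates can be inspected as well
    (the values below follow from the six training samples above, three per
    class):

    >>> clf.class_priors_
    array([0.5, 0.5])
    >>> clf.predict_proba(np.array([[-0.8, -1]])).shape
    (1, 2)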
    """

    def __init__(self) -> None:
        self.classes_: np.ndarray = np.array([])
        self.class_priors_: np.ndarray = np.array([])
        self.mean_: np.ndarray = np.array([])
        self.var_: np.ndarray = np.array([])

    def fit(self, X: np.ndarray, y: np.ndarray) -> "GaussianNaiveBayes":  # noqa: N803
        """
        Fit the Gaussian Naive Bayes classifier

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data
        y : ndarray of shape (n_samples,)
            Target values

        Returns
        -------
        self : object
            Returns self
        """
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        n_features = X.shape[1]

        # Initialize arrays for mean, variance, and priors
        self.mean_ = np.zeros((n_classes, n_features))
        self.var_ = np.zeros((n_classes, n_features))
        self.class_priors_ = np.zeros(n_classes)

        # Calculate mean, variance, and prior for each class.
        # A small epsilon is added to the variance so that a feature that is
        # constant within a class does not cause division by zero later.
        for idx, c in enumerate(self.classes_):
            X_c = X[y == c]  # noqa: N806
            self.mean_[idx] = X_c.mean(axis=0)
            self.var_[idx] = X_c.var(axis=0) + 1e-9
            self.class_priors_[idx] = X_c.shape[0] / X.shape[0]

        return self

    def _calculate_log_likelihood(self, class_idx: int, x: np.ndarray) -> float:
        """
        Calculate the log of the Gaussian likelihood, log P(x|class),
        summed over all features

        Parameters
        ----------
        class_idx : int
            Index of the class
        x : ndarray of shape (n_features,)
            Input sample

        Returns
        -------
        log_likelihood : float
            Sum of the per-feature log likelihoods; working in log space
            avoids the numerical underflow a raw product of densities
            would suffer
        """
        mean = self.mean_[class_idx]
        var = self.var_[class_idx]

        # Log of the Gaussian probability density function:
        # log P(x|class) = -0.5 * log(2 * pi * sigma^2) - (x - mu)^2 / (2 * sigma^2)
        log_density = -0.5 * np.log(2 * np.pi * var) - ((x - mean) ** 2) / (2 * var)

        # Summing log densities over features is equivalent to multiplying
        # the densities themselves
        return float(np.sum(log_density))

    def _calculate_posterior(self, x: np.ndarray) -> np.ndarray:
        """
        Calculate the unnormalized log posterior for each class:
        log P(class|x) = log P(x|class) + log P(class) + constant

        Parameters
        ----------
        x : ndarray of shape (n_features,)
            Input sample

        Returns
        -------
        posteriors : ndarray of shape (n_classes,)
            Unnormalized log posterior for each class
        """
        posteriors = []
        for idx in range(len(self.classes_)):
            log_prior = np.log(self.class_priors_[idx])
            log_likelihood = self._calculate_log_likelihood(idx, x)
            posteriors.append(log_prior + log_likelihood)

        return np.array(posteriors)

    def predict(self, X: np.ndarray) -> np.ndarray:  # noqa: N803
        """
        Perform classification on an array of test vectors X

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Test data

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted target values for X
        """
        y_pred = [self._predict_single(x) for x in X]
        return np.array(y_pred)

    def _predict_single(self, x: np.ndarray) -> int:
        """
        Predict the class for a single sample

        Parameters
        ----------
        x : ndarray of shape (n_features,)
            Input sample

        Returns
        -------
        prediction : int
            Predicted class label
        """
        posteriors = self._calculate_posterior(x)
        return self.classes_[np.argmax(posteriors)]

    def predict_proba(self, X: np.ndarray) -> np.ndarray:  # noqa: N803
        """
        Return probability estimates for the test vectors X

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Test data

        Returns
        -------
        probabilities : ndarray of shape (n_samples, n_classes)
            The probability of each sample belonging to each class
        """
        probabilities = []
        for x in X:
            posteriors = self._calculate_posterior(x)
            # Convert log posteriors to probabilities; subtracting the
            # maximum first keeps np.exp from underflowing to all zeros
            probs = np.exp(posteriors - np.max(posteriors))
            # Normalize so the probabilities sum to 1
            probs = probs / np.sum(probs)
            probabilities.append(probs)

        return np.array(probabilities)


if __name__ == "__main__":
    import doctest

    doctest.testmod()

    # Example with the Iris dataset
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.model_selection import train_test_split

    # Load the dataset
    iris = load_iris()
    X, y = iris.data, iris.target

    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Train the classifier
    clf = GaussianNaiveBayes()
    clf.fit(X_train, y_train)

    # Make predictions
    y_pred = clf.predict(X_test)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2%}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=iris.target_names))

    # Show probability predictions for the first 5 samples
    probas = clf.predict_proba(X_test[:5])
    print("\nProbability predictions for first 5 samples:")
    for i, proba in enumerate(probas):
        print(f"Sample {i + 1}: {proba}")
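
    # Optional sanity check against scikit-learn's GaussianNB (scikit-learn
    # is already a dependency above for the dataset). Exact agreement is not
    # guaranteed, since sklearn scales its variance smoothing to the data,
    # but the two classifiers should agree on virtually all Iris test samples.
    from sklearn.naive_bayes import GaussianNB

    sk_pred = GaussianNB().fit(X_train, y_train).predict(X_test)
    print(f"\nAgreement with sklearn GaussianNB: {np.mean(sk_pred == y_pred):.2%}")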