From ff2f17ab923d373cf0d2a0ccf1450c8bb0c67e39 Mon Sep 17 00:00:00 2001
From: "codeflash-ai[bot]"
 <148906541+codeflash-ai[bot]@users.noreply.github.com>
Date: Fri, 23 Jan 2026 10:50:14 +0000
Subject: [PATCH] Optimize UnoptimizedNeuralNet.forward
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The optimized code achieves a **379x speedup** (from 900ms to 2.37ms) by replacing nested Python loops with PyTorch's highly optimized vectorized operations.

**Key Changes:**

1. **Matrix multiplication via `F.linear()`**: The original code manually computes fully-connected layers using triple-nested loops (batch × hidden × input dimensions), performing ~23,552 individual scalar operations for the first layer alone. The line profiler shows this taking **71.3% of total runtime**. The optimized version replaces this with a single `F.linear()` call that uses BLAS/LAPACK or CUDA kernels for matrix multiplication, reducing this to just **3.1% of runtime**.

2. **ReLU activation with `torch.clamp()`**: The original code loops through every element to manually apply `max(0, x)`, taking **2.2%** of runtime. The optimized version uses `torch.clamp(hidden, min=0.0)`, a vectorized C/CUDA operation that processes the entire tensor in parallel.

3. **Softmax via `torch.softmax()`**: The original implementation manually computes max, exponentials, sum, and division across nested loops (~**5.2%** of runtime combined). The optimized version uses PyTorch's numerically stable `torch.softmax()`, which is both faster and prevents numerical overflow/underflow issues.

4. **Eliminated temporary tensor allocations**: The original code creates many small tensors (`torch.tensor(0.0)`, `temp_values`, etc.) inside loops, causing significant memory allocation overhead. The optimized version operates on entire tensors at once, drastically reducing memory churn.

**Why This Matters:**

- **Python loop overhead**: Each loop iteration in Python involves significant interpreter overhead. The original code had ~26,438 inner loop iterations per forward pass. Vectorized operations execute in compiled C/CUDA with minimal Python overhead.

- **Hardware acceleration**: `F.linear()` and other PyTorch ops leverage CPU SIMD instructions or GPU parallelism, processing thousands of elements simultaneously rather than sequentially.

- **Memory efficiency**: Vectorized operations have better cache locality and avoid the memory allocator being called thousands of times per forward pass.

**Impact:** This optimization is critical for any workload using neural networks, especially during training (thousands of forward passes) or real-time inference. The 379x speedup transforms this from impractical to production-ready code.
---
 code_to_optimize/unoptimized_neural_net.py | 59 +++-------------------
 1 file changed, 6 insertions(+), 53 deletions(-)

diff --git a/code_to_optimize/unoptimized_neural_net.py b/code_to_optimize/unoptimized_neural_net.py
index acd7f0a26..b5ea732b3 100644
--- a/code_to_optimize/unoptimized_neural_net.py
+++ b/code_to_optimize/unoptimized_neural_net.py
@@ -1,5 +1,6 @@
 import torch
-import torch.nn as nn
+import torch.nn.functional as F
+from torch import nn
 
 
 class UnoptimizedNeuralNet(nn.Module):
@@ -20,57 +21,9 @@ def forward(self, x):
         batch_size = x.shape[0]
         x = x.view(batch_size, -1)
 
-        hidden = torch.zeros(batch_size, self.hidden_size, dtype=x.dtype, device=x.device)
-        for b in range(batch_size):
-            for i in range(self.hidden_size):
-                neuron_sum = torch.tensor(0.0, dtype=x.dtype, device=x.device)
-                for j in range(self.input_size):
-                    neuron_sum = neuron_sum + x[b, j] * self.fc1_weight[i, j]
-                neuron_sum = neuron_sum + self.fc1_bias[i]
-                hidden[b, i] = neuron_sum
-
-        activated = torch.zeros_like(hidden)
-        for b in range(batch_size):
-            for i in range(self.hidden_size):
-                val = hidden[b, i]
-                if val > 0:
-                    activated[b, i] = val
-                else:
-                    activated[b, i] = 0.0
-
-        output = torch.zeros(batch_size, self.num_classes, dtype=x.dtype, device=x.device)
-        for b in range(batch_size):
-            for i in range(self.num_classes):
-                neuron_sum = torch.tensor(0.0, dtype=x.dtype, device=x.device)
-                temp_values = torch.zeros(self.hidden_size, dtype=x.dtype, device=x.device)
-                for j in range(self.hidden_size):
-                    temp_values[j] = activated[b, j]
-
-                for j in range(self.hidden_size):
-                    neuron_sum = neuron_sum + temp_values[j] * self.fc2_weight[i, j]
-
-                bias_value = self.fc2_bias[i]
-                neuron_sum = neuron_sum + bias_value
-
-                output[b, i] = neuron_sum
-
-        softmax_output = torch.zeros_like(output)
-        for b in range(batch_size):
-            max_val = output[b, 0].clone()
-            for i in range(1, self.num_classes):
-                if output[b, i] > max_val:
-                    max_val = output[b, i].clone()
-
-            exp_values = torch.zeros(self.num_classes, dtype=x.dtype, device=x.device)
-            for i in range(self.num_classes):
-                exp_val = torch.exp(output[b, i] - max_val)
-                exp_values[i] = exp_val
-
-            sum_exp = torch.tensor(0.0, dtype=x.dtype, device=x.device)
-            for i in range(self.num_classes):
-                sum_exp = sum_exp + exp_values[i]
-
-            for i in range(self.num_classes):
-                softmax_output[b, i] = exp_values[i] / sum_exp
+        hidden = F.linear(x, self.fc1_weight, self.fc1_bias)
+        activated = torch.clamp(hidden, min=0.0)
+        output = F.linear(activated, self.fc2_weight, self.fc2_bias)
+        softmax_output = torch.softmax(output, dim=1)
 
         return softmax_output.detach()