From ff2f17ab923d373cf0d2a0ccf1450c8bb0c67e39 Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Fri, 23 Jan 2026 10:50:14 +0000 Subject: [PATCH] Optimize UnoptimizedNeuralNet.forward MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimized code achieves a **379x speedup** (from 900ms to 2.37ms) by replacing nested Python loops with PyTorch's highly optimized vectorized operations. **Key Changes:** 1. **Matrix multiplication via `F.linear()`**: The original code manually computes fully-connected layers using triple-nested loops (batch × hidden × input dimensions), performing ~23,552 individual scalar operations for the first layer alone. The line profiler shows this taking **71.3% of total runtime**. The optimized version replaces this with a single `F.linear()` call that uses BLAS/LAPACK or CUDA kernels for matrix multiplication, reducing this to just **3.1% of runtime**. 2. **ReLU activation with `torch.clamp()`**: The original code loops through every element to manually apply `max(0, x)`, taking **2.2%** of runtime. The optimized version uses `torch.clamp(hidden, min=0.0)`, a vectorized C/CUDA operation that processes the entire tensor in parallel. 3. **Softmax via `torch.softmax()`**: The original implementation manually computes max, exponentials, sum, and division across nested loops (~**5.2%** of runtime combined). The optimized version uses PyTorch's numerically stable `torch.softmax()`, which is both faster and prevents numerical overflow/underflow issues. 4. **Eliminated temporary tensor allocations**: The original code creates many small tensors (`torch.tensor(0.0)`, `temp_values`, etc.) inside loops, causing significant memory allocation overhead. The optimized version operates on entire tensors at once, drastically reducing memory churn. **Why This Matters:** - **Python loop overhead**: Each loop iteration in Python involves significant interpreter overhead. The original code had ~26,438 inner loop iterations per forward pass. Vectorized operations execute in compiled C/CUDA with minimal Python overhead. - **Hardware acceleration**: `F.linear()` and other PyTorch ops leverage CPU SIMD instructions or GPU parallelism, processing thousands of elements simultaneously rather than sequentially. - **Memory efficiency**: Vectorized operations have better cache locality and avoid the memory allocator being called thousands of times per forward pass. **Impact:** This optimization is critical for any workload using neural networks, especially during training (thousands of forward passes) or real-time inference. The 379x speedup transforms this from impractical to production-ready code. --- code_to_optimize/unoptimized_neural_net.py | 59 +++------------------- 1 file changed, 6 insertions(+), 53 deletions(-) diff --git a/code_to_optimize/unoptimized_neural_net.py b/code_to_optimize/unoptimized_neural_net.py index acd7f0a26..b5ea732b3 100644 --- a/code_to_optimize/unoptimized_neural_net.py +++ b/code_to_optimize/unoptimized_neural_net.py @@ -1,5 +1,6 @@ import torch -import torch.nn as nn +import torch.nn.functional as F +from torch import nn class UnoptimizedNeuralNet(nn.Module): @@ -20,57 +21,9 @@ def forward(self, x): batch_size = x.shape[0] x = x.view(batch_size, -1) - hidden = torch.zeros(batch_size, self.hidden_size, dtype=x.dtype, device=x.device) - for b in range(batch_size): - for i in range(self.hidden_size): - neuron_sum = torch.tensor(0.0, dtype=x.dtype, device=x.device) - for j in range(self.input_size): - neuron_sum = neuron_sum + x[b, j] * self.fc1_weight[i, j] - neuron_sum = neuron_sum + self.fc1_bias[i] - hidden[b, i] = neuron_sum - - activated = torch.zeros_like(hidden) - for b in range(batch_size): - for i in range(self.hidden_size): - val = hidden[b, i] - if val > 0: - activated[b, i] = val - else: - activated[b, i] = 0.0 - - output = torch.zeros(batch_size, self.num_classes, dtype=x.dtype, device=x.device) - for b in range(batch_size): - for i in range(self.num_classes): - neuron_sum = torch.tensor(0.0, dtype=x.dtype, device=x.device) - temp_values = torch.zeros(self.hidden_size, dtype=x.dtype, device=x.device) - for j in range(self.hidden_size): - temp_values[j] = activated[b, j] - - for j in range(self.hidden_size): - neuron_sum = neuron_sum + temp_values[j] * self.fc2_weight[i, j] - - bias_value = self.fc2_bias[i] - neuron_sum = neuron_sum + bias_value - - output[b, i] = neuron_sum - - softmax_output = torch.zeros_like(output) - for b in range(batch_size): - max_val = output[b, 0].clone() - for i in range(1, self.num_classes): - if output[b, i] > max_val: - max_val = output[b, i].clone() - - exp_values = torch.zeros(self.num_classes, dtype=x.dtype, device=x.device) - for i in range(self.num_classes): - exp_val = torch.exp(output[b, i] - max_val) - exp_values[i] = exp_val - - sum_exp = torch.tensor(0.0, dtype=x.dtype, device=x.device) - for i in range(self.num_classes): - sum_exp = sum_exp + exp_values[i] - - for i in range(self.num_classes): - softmax_output[b, i] = exp_values[i] / sum_exp + hidden = F.linear(x, self.fc1_weight, self.fc1_bias) + activated = torch.clamp(hidden, min=0.0) + output = F.linear(activated, self.fc2_weight, self.fc2_bias) + softmax_output = torch.softmax(output, dim=1) return softmax_output.detach()