diff --git a/NAM/convnet.cpp b/NAM/convnet.cpp
index 3b8b18f..8bbcded 100644
--- a/NAM/convnet.cpp
+++ b/NAM/convnet.cpp
@@ -129,39 +129,69 @@ long nam::convnet::ConvNetBlock::get_out_channels() const
   return this->conv.get_out_channels();
 }
 
-nam::convnet::_Head::_Head(const int channels, std::vector<float>::iterator& weights)
+nam::convnet::_Head::_Head(const int in_channels, const int out_channels, std::vector<float>::iterator& weights)
 {
-  this->_weight.resize(channels);
-  for (int i = 0; i < channels; i++)
-    this->_weight[i] = *(weights++);
-  this->_bias = *(weights++);
+  // Weights are stored row-major: first row (output 0), then row 1 (output 1), etc.
+  // For each output channel: [w0, w1, ..., w_{in_channels-1}]
+  // Then biases: [bias0, bias1, ..., bias_{out_channels-1}]
+  this->_weight.resize(out_channels, in_channels);
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < in_channels; in_ch++)
+    {
+      this->_weight(out_ch, in_ch) = *(weights++);
+    }
+  }
+
+  // Biases for each output channel
+  this->_bias.resize(out_channels);
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    this->_bias(out_ch) = *(weights++);
+  }
 }
 
-void nam::convnet::_Head::process_(const Eigen::MatrixXf& input, Eigen::VectorXf& output, const long i_start,
+void nam::convnet::_Head::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start,
                                    const long i_end) const
 {
   const long length = i_end - i_start;
-  output.resize(length);
-  for (long i = 0, j = i_start; i < length; i++, j++)
-    output(i) = this->_bias + input.col(j).dot(this->_weight);
+  const long out_channels = this->_weight.rows();
+
+  // Resize output to (out_channels x length)
+  output.resize(out_channels, length);
+
+  // Extract input slice: (in_channels x length)
+  Eigen::MatrixXf input_slice = input.middleCols(i_start, length);
+
+  // Compute output = weight * input_slice:
+  // (out_channels x in_channels) * (in_channels x length) = (out_channels x length)
+  output.noalias() = this->_weight * input_slice;
+
+  // Add bias to each column: output is (out_channels x length) and bias is
+  // (out_channels x 1), so colwise() += broadcasts the bias over frames
+  output.colwise() += this->_bias;
 }
 
-nam::convnet::ConvNet::ConvNet(const int channels, const std::vector<int>& dilations, const bool batchnorm,
-                               const std::string activation, std::vector<float>& weights,
-                               const double expected_sample_rate, const int groups)
-: Buffer(*std::max_element(dilations.begin(), dilations.end()), expected_sample_rate)
+nam::convnet::ConvNet::ConvNet(const int in_channels, const int out_channels, const int channels,
+                               const std::vector<int>& dilations, const bool batchnorm, const std::string activation,
+                               std::vector<float>& weights, const double expected_sample_rate, const int groups)
+: Buffer(in_channels, out_channels, *std::max_element(dilations.begin(), dilations.end()), expected_sample_rate)
 {
   this->_verify_weights(channels, dilations, batchnorm, weights.size());
   this->_blocks.resize(dilations.size());
   std::vector<float>::iterator it = weights.begin();
+  // First block takes in_channels input, subsequent blocks take channels input
   for (size_t i = 0; i < dilations.size(); i++)
-    this->_blocks[i].set_weights_(i == 0 ? 1 : channels, channels, dilations[i], batchnorm, activation, groups, it);
+    this->_blocks[i].set_weights_(
+      i == 0 ? in_channels : channels, channels, dilations[i], batchnorm, activation, groups, it);
 
   // Only need _block_vals for the head (one entry)
   // Conv1D layers manage their own buffers now
   this->_block_vals.resize(1);
   this->_block_vals[0].setZero();
-  std::fill(this->_input_buffer.begin(), this->_input_buffer.end(), 0.0f);
-  this->_head = _Head(channels, it);
+
+  // Create a single head that maps the last block's channels to all output channels
+  this->_head = _Head(channels, out_channels, it);
+
   if (it != weights.end())
     throw std::runtime_error("Didn't touch all the weights when initializing ConvNet");
@@ -171,18 +201,25 @@ nam::convnet::ConvNet::ConvNet(const int channels, const std::vector<int>& dilat
 }
 
-void nam::convnet::ConvNet::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames)
+void nam::convnet::ConvNet::process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames)
 {
   this->_update_buffers_(input, num_frames);
-  // Main computation!
-  const long i_start = this->_input_buffer_offset;
-  const long i_end = i_start + num_frames;
+  const int in_channels = NumInputChannels();
+  const int out_channels = NumOutputChannels();
+
+  // For multi-channel, the input channels are stacked into the first layer's
+  // input matrix, and the head maps the last block's channels to every output
+  // channel. This can be extended later for more sophisticated cross-channel
+  // processing.
 
-  // Convert input buffer to matrix for first layer
-  Eigen::MatrixXf input_matrix(1, num_frames);
-  for (int i = 0; i < num_frames; i++)
-    input_matrix(0, i) = this->_input_buffer[i_start + i];
+  // Convert input buffers to matrix for first layer (stack input channels)
+  Eigen::MatrixXf input_matrix(in_channels, num_frames);
+  const long i_start = this->_input_buffer_offset;
+  for (int ch = 0; ch < in_channels; ch++)
+  {
+    for (int i = 0; i < num_frames; i++)
+      input_matrix(ch, i) = this->_input_buffers[ch][i_start + i];
+  }
 
   // Process through ConvNetBlock layers
   // Each block now uses Conv1D's internal buffers via Process() and GetOutput()
@@ -206,23 +243,33 @@ void nam::convnet::ConvNet::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const
     this->_blocks[i].Process(block_input, num_frames);
   }
 
-  // Process head with output from last Conv1D
-  // Head still needs the old interface, so we need to provide it via a matrix
-  // We still need _block_vals[0] for the head interface
+  // Process head for all output channels at once
+  // We need _block_vals[0] for the head interface
+  const long buffer_size = (long)this->_input_buffers[0].size();
   if (this->_block_vals[0].rows() != this->_blocks.back().get_out_channels()
-      || this->_block_vals[0].cols() != (long)this->_input_buffer.size())
+      || this->_block_vals[0].cols() != buffer_size)
   {
-    this->_block_vals[0].resize(this->_blocks.back().get_out_channels(), this->_input_buffer.size());
+    this->_block_vals[0].resize(this->_blocks.back().get_out_channels(), buffer_size);
   }
 
+  // Copy last block output to _block_vals for the head
   auto last_output = this->_blocks.back().GetOutput(num_frames);
-  this->_block_vals[0].middleCols(i_start, num_frames) = last_output;
-
-  this->_head.process_(this->_block_vals[0], this->_head_output, i_start, i_end);
-
-  // Copy to required output array
-  for (int s = 0; s < num_frames; s++)
-    output[s] = this->_head_output(s);
+  const long buffer_offset = this->_input_buffer_offset;
+  const long buffer_i_end = buffer_offset + num_frames;
+  // last_output is (channels x num_frames), _block_vals[0] is (channels x buffer_size)
+  // Copy to the correct location in _block_vals
+  this->_block_vals[0].block(0, buffer_offset, last_output.rows(), num_frames) = last_output;
+
+  // Process head - outputs all channels at once
+  // Head will resize _head_output internally
+  this->_head.process_(this->_block_vals[0], this->_head_output, buffer_offset, buffer_i_end);
+
+  // Copy to output arrays for each channel
+  for (int ch = 0; ch < out_channels; ch++)
+  {
+    for (int s = 0; s < num_frames; s++)
+      output[ch][s] = this->_head_output(ch, s);
+  }
 
   // Prepare for next call:
   nam::Buffer::_advance_input_buffer_(num_frames);
@@ -245,11 +292,12 @@ void nam::convnet::ConvNet::SetMaxBufferSize(const int maxBufferSize)
   }
 }
 
-void nam::convnet::ConvNet::_update_buffers_(NAM_SAMPLE* input, const int num_frames)
+void nam::convnet::ConvNet::_update_buffers_(NAM_SAMPLE** input, const int num_frames)
 {
   this->Buffer::_update_buffers_(input, num_frames);
 
-  const long buffer_size = (long)this->_input_buffer.size();
+  // All channels use the same buffer size
+  const long buffer_size = (long)this->_input_buffers[0].size();
 
   // Only need _block_vals[0] for the head
   // Conv1D layers manage their own buffers now
@@ -281,8 +329,11 @@ std::unique_ptr<DSP> nam::convnet::Factory(const nlohmann::json& config, st
 {
   const bool batchnorm = config["batchnorm"];
   const std::string activation = config["activation"];
   const int groups = config.value("groups", 1); // defaults to 1
+  // Default to 1 channel in/out for backward compatibility
+  const int in_channels = config.value("in_channels", 1);
+  const int out_channels = config.value("out_channels", 1);
   return std::make_unique<ConvNet>(
-    channels, dilations, batchnorm, activation, weights, expectedSampleRate, groups);
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expectedSampleRate, groups);
 }
 
 namespace
diff --git a/NAM/convnet.h b/NAM/convnet.h
index ccc1edb..d1e846c 100644
--- a/NAM/convnet.h
+++ b/NAM/convnet.h
@@ -66,32 +66,33 @@ class _Head
 {
 public:
   _Head() {};
-  _Head(const int channels, std::vector<float>::iterator& weights);
-  void process_(const Eigen::MatrixXf& input, Eigen::VectorXf& output, const long i_start, const long i_end) const;
+  _Head(const int in_channels, const int out_channels, std::vector<float>::iterator& weights);
+  void process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long i_end) const;
 
 private:
-  Eigen::VectorXf _weight;
-  float _bias = 0.0f;
+  Eigen::MatrixXf _weight; // (out_channels, in_channels)
+  Eigen::VectorXf _bias;   // (out_channels,)
 };
 
 class ConvNet : public Buffer
 {
 public:
-  ConvNet(const int channels, const std::vector<int>& dilations, const bool batchnorm, const std::string activation,
-          std::vector<float>& weights, const double expected_sample_rate = -1.0, const int groups = 1);
+  ConvNet(const int in_channels, const int out_channels, const int channels, const std::vector<int>& dilations,
+          const bool batchnorm, const std::string activation, std::vector<float>& weights,
+          const double expected_sample_rate = -1.0, const int groups = 1);
   ~ConvNet() = default;
-  void process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames) override;
+  void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames) override;
   void SetMaxBufferSize(const int maxBufferSize) override;
 
 protected:
   std::vector<ConvNetBlock> _blocks;
   std::vector<Eigen::MatrixXf> _block_vals;
-  Eigen::VectorXf _head_output;
+  Eigen::MatrixXf _head_output; // (out_channels, num_frames)
   _Head _head;
   void _verify_weights(const int channels, const std::vector<int>& dilations, const bool batchnorm,
                        const size_t actual_weights);
-  void _update_buffers_(NAM_SAMPLE* input, const int num_frames) override;
+  void _update_buffers_(NAM_SAMPLE** input, const int num_frames) override;
   void _rewind_buffers_() override;
 
   int mPrewarmSamples = 0; // Pre-compute during initialization
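// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): how a flat weight vector maps
// onto the matrix-valued _Head above. The sizes (in_channels = 3,
// out_channels = 2, 4 frames) and the literal weight values are made up for
// the example; the loops mirror _Head's constructor, and the GEMM-plus-bias
// mirrors process_().
#include <Eigen/Dense>
#include <vector>

int main()
{
  const int in_channels = 3, out_channels = 2;
  std::vector<float> flat = {/* row for output 0 */ 1.0f, 2.0f, 3.0f,
                             /* row for output 1 */ 4.0f, 5.0f, 6.0f,
                             /* biases */ 0.1f, 0.2f};
  auto it = flat.begin();
  Eigen::MatrixXf W(out_channels, in_channels);
  for (int o = 0; o < out_channels; o++)
    for (int i = 0; i < in_channels; i++)
      W(o, i) = *(it++); // row-major, matching the constructor
  Eigen::VectorXf b(out_channels);
  for (int o = 0; o < out_channels; o++)
    b(o) = *(it++);

  // process_() is then a single matrix product plus a per-column bias:
  Eigen::MatrixXf input = Eigen::MatrixXf::Ones(in_channels, 4); // 4 frames
  Eigen::MatrixXf output = W * input;                            // (2 x 4)
  output.colwise() += b;                                         // broadcast bias over frames
  return 0;
}
// ---------------------------------------------------------------------------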
diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index dc46891..023c42a 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -15,9 +15,15 @@
 constexpr const long _INPUT_BUFFER_SAFETY_FACTOR = 32;
 
-nam::DSP::DSP(const double expected_sample_rate)
+nam::DSP::DSP(const int in_channels, const int out_channels, const double expected_sample_rate)
 : mExpectedSampleRate(expected_sample_rate)
+, mInChannels(in_channels)
+, mOutChannels(out_channels)
 {
+  if (in_channels <= 0 || out_channels <= 0)
+  {
+    throw std::runtime_error("Channel counts must be positive");
+  }
 }
 
 void nam::DSP::prewarm()
@@ -31,29 +37,47 @@ void nam::DSP::prewarm()
     return;
 
   const size_t bufferSize = std::max(mMaxBufferSize, 1);
-  std::vector<NAM_SAMPLE> inputBuffer, outputBuffer;
-  inputBuffer.resize(bufferSize);
-  outputBuffer.resize(bufferSize);
-  for (auto it = inputBuffer.begin(); it != inputBuffer.end(); ++it)
+  // Allocate buffers for all channels
+  std::vector<std::vector<NAM_SAMPLE>> inputBuffers(mInChannels);
+  std::vector<std::vector<NAM_SAMPLE>> outputBuffers(mOutChannels);
+  std::vector<NAM_SAMPLE*> inputPtrs(mInChannels);
+  std::vector<NAM_SAMPLE*> outputPtrs(mOutChannels);
+
+  for (int ch = 0; ch < mInChannels; ch++)
   {
-    (*it) = (NAM_SAMPLE)0.0;
+    inputBuffers[ch].resize(bufferSize, (NAM_SAMPLE)0.0);
+    inputPtrs[ch] = inputBuffers[ch].data();
+  }
+  for (int ch = 0; ch < mOutChannels; ch++)
+  {
+    outputBuffers[ch].resize(bufferSize, (NAM_SAMPLE)0.0);
+    outputPtrs[ch] = outputBuffers[ch].data();
   }
-  NAM_SAMPLE* inputPtr = inputBuffer.data();
-  NAM_SAMPLE* outputPtr = outputBuffer.data();
 
   int samplesProcessed = 0;
   while (samplesProcessed < prewarmSamples)
   {
-    this->process(inputPtr, outputPtr, bufferSize);
+    this->process(inputPtrs.data(), outputPtrs.data(), bufferSize);
     samplesProcessed += bufferSize;
   }
 }
 
-void nam::DSP::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames)
+void nam::DSP::process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames)
 {
-  // Default implementation is the null operation
-  for (int i = 0; i < num_frames; i++)
-    output[i] = input[i];
+  // Default implementation is the null operation: copy input to output
+  // For now, assume 1:1 channel mapping (first min(in_channels, out_channels) channels)
+  const int channelsToProcess = std::min(mInChannels, mOutChannels);
+  for (int ch = 0; ch < channelsToProcess; ch++)
+  {
+    for (int i = 0; i < num_frames; i++)
+      output[ch][i] = input[ch][i];
+  }
+  // Zero out any extra output channels
+  for (int ch = channelsToProcess; ch < mOutChannels; ch++)
+  {
+    for (int i = 0; i < num_frames; i++)
+      output[ch][i] = (NAM_SAMPLE)0.0;
+  }
 }
 
 double nam::DSP::GetLoudness() const
@@ -87,10 +111,43 @@ void nam::DSP::SetMaxBufferSize(const int maxBufferSize)
 {
   mMaxBufferSize = maxBufferSize;
 }
 
+double nam::DSP::GetInputLevel()
+{
+  return mInputLevel.level;
+}
+
+double nam::DSP::GetOutputLevel()
+{
+  return mOutputLevel.level;
+}
+
+bool nam::DSP::HasInputLevel()
+{
+  return mInputLevel.haveLevel;
+}
+
+bool nam::DSP::HasOutputLevel()
+{
+  return mOutputLevel.haveLevel;
+}
+
+void nam::DSP::SetInputLevel(const double inputLevel)
+{
+  mInputLevel.haveLevel = true;
+  mInputLevel.level = inputLevel;
+}
+
+void nam::DSP::SetOutputLevel(const double outputLevel)
+{
+  mOutputLevel.haveLevel = true;
+  mOutputLevel.level = outputLevel;
+}
+
 // Buffer =====================================================================
-nam::Buffer::Buffer(const int receptive_field, const double expected_sample_rate)
-: nam::DSP(expected_sample_rate)
+nam::Buffer::Buffer(const int in_channels, const int out_channels, const int receptive_field,
+                    const double expected_sample_rate)
+: nam::DSP(in_channels, out_channels, expected_sample_rate)
 {
   this->_set_receptive_field(receptive_field);
 }
@@ -103,45 +160,77 @@ void nam::Buffer::_set_receptive_field(const int new_receptive_field)
 
 void nam::Buffer::_set_receptive_field(const int new_receptive_field, const int input_buffer_size)
 {
   this->_receptive_field = new_receptive_field;
-  this->_input_buffer.resize(input_buffer_size);
-  std::fill(this->_input_buffer.begin(), this->_input_buffer.end(), 0.0f);
+  const int in_channels = NumInputChannels();
+  const int out_channels = NumOutputChannels();
+
+  // Resize buffers for all input channels
+  _input_buffers.resize(in_channels);
+  for (int ch = 0; ch < in_channels; ch++)
+  {
+    _input_buffers[ch].resize(input_buffer_size);
+    std::fill(_input_buffers[ch].begin(), _input_buffers[ch].end(), 0.0f);
+  }
+
+  // Resize output buffers (though they'll be resized per call in _update_buffers_)
+  _output_buffers.resize(out_channels);
+
   this->_reset_input_buffer();
 }
 
-void nam::Buffer::_update_buffers_(NAM_SAMPLE* input, const int num_frames)
+void nam::Buffer::_update_buffers_(NAM_SAMPLE** input, const int num_frames)
 {
-  // Make sure that the buffer is big enough for the receptive field and the
-  // frames needed!
+  const int in_channels = NumInputChannels();
+  const int out_channels = NumOutputChannels();
+
+  // Make sure that the buffers are big enough for the receptive field and the
+  // frames needed. All channels use the same buffer size.
+  const long minimum_input_buffer_size = (long)this->_receptive_field + _INPUT_BUFFER_SAFETY_FACTOR * num_frames;
+
+  for (int ch = 0; ch < in_channels; ch++)
   {
-    const long minimum_input_buffer_size = (long)this->_receptive_field + _INPUT_BUFFER_SAFETY_FACTOR * num_frames;
-    if ((long)this->_input_buffer.size() < minimum_input_buffer_size)
+    if ((long)this->_input_buffers[ch].size() < minimum_input_buffer_size)
     {
       long new_buffer_size = 2;
       while (new_buffer_size < minimum_input_buffer_size)
         new_buffer_size *= 2;
-      this->_input_buffer.resize(new_buffer_size);
-      std::fill(this->_input_buffer.begin(), this->_input_buffer.end(), 0.0f);
+      this->_input_buffers[ch].resize(new_buffer_size);
+      std::fill(this->_input_buffers[ch].begin(), this->_input_buffers[ch].end(), 0.0f);
     }
   }
 
   // If we'd run off the end of the input buffer, then we need to move the data
-  // back to the start of the buffer and start again.
-  if (this->_input_buffer_offset + num_frames > (long)this->_input_buffer.size())
+  // back to the start of the buffer and start again. All channels move together.
+  const long buffer_size = (long)this->_input_buffers[0].size();
+  if (this->_input_buffer_offset + num_frames > buffer_size)
     this->_rewind_buffers_();
-  // Put the new samples into the input buffer
-  for (long i = this->_input_buffer_offset, j = 0; j < num_frames; i++, j++)
-    this->_input_buffer[i] = input[j];
-  // And resize the output buffer:
-  this->_output_buffer.resize(num_frames);
-  std::fill(this->_output_buffer.begin(), this->_output_buffer.end(), 0.0f);
+
+  // Put the new samples into the input buffer for each channel
+  for (int ch = 0; ch < in_channels; ch++)
+  {
+    for (long i = this->_input_buffer_offset, j = 0; j < num_frames; i++, j++)
+      this->_input_buffers[ch][i] = (float)input[ch][j];
+  }
+
+  // Resize output buffers for all output channels
+  for (int ch = 0; ch < out_channels; ch++)
+  {
+    this->_output_buffers[ch].resize(num_frames);
+    std::fill(this->_output_buffers[ch].begin(), this->_output_buffers[ch].end(), 0.0f);
+  }
 }
 
 void nam::Buffer::_rewind_buffers_()
 {
-  // Copy the input buffer back
-  // RF-1 samples because we've got at least one new one inbound.
-  for (long i = 0, j = this->_input_buffer_offset - this->_receptive_field; i < this->_receptive_field; i++, j++)
-    this->_input_buffer[i] = this->_input_buffer[j];
+  const int in_channels = NumInputChannels();
+
+  // Rewind buffers for all input channels (they all move together)
+  for (int ch = 0; ch < in_channels; ch++)
+  {
+    // Copy the input buffer back
+    // RF-1 samples because we've got at least one new one inbound.
+    for (long i = 0, j = this->_input_buffer_offset - this->_receptive_field; i < this->_receptive_field; i++, j++)
+      this->_input_buffers[ch][i] = this->_input_buffers[ch][j];
+  }
   // And reset the offset.
   // Even though we could be stingy about that one sample that we won't be using
   // (because a new set is incoming) it's probably not worth the
@@ -162,9 +251,9 @@ void nam::Buffer::_advance_input_buffer_(const int num_frames)
 
 // Linear =====================================================================
 
-nam::Linear::Linear(const int receptive_field, const bool _bias, const std::vector<float>& weights,
-                    const double expected_sample_rate)
-: nam::Buffer(receptive_field, expected_sample_rate)
+nam::Linear::Linear(const int in_channels, const int out_channels, const int receptive_field, const bool _bias,
+                    const std::vector<float>& weights, const double expected_sample_rate)
+: nam::Buffer(in_channels, out_channels, receptive_field, expected_sample_rate)
 {
   if ((int)weights.size() != (receptive_field + (_bias ? 1 : 0)))
     throw std::runtime_error(
@@ -178,16 +267,33 @@ nam::Linear::Linear(const int receptive_field, const bool _bias, const std::vect
   this->_bias = _bias ? weights[receptive_field] : (float)0.0;
 }
 
-void nam::Linear::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames)
+void nam::Linear::process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames)
 {
   this->nam::Buffer::_update_buffers_(input, num_frames);
 
+  const int in_channels = NumInputChannels();
+  const int out_channels = NumOutputChannels();
+
+  // For now, Linear processes each input channel independently to the corresponding output channel
+  // This is a simple implementation - can be extended later for cross-channel mixing
+  const int channelsToProcess = std::min(in_channels, out_channels);
+
   // Main computation!
-  for (int i = 0; i < num_frames; i++)
+  for (int ch = 0; ch < channelsToProcess; ch++)
+  {
+    for (int i = 0; i < num_frames; i++)
+    {
+      const long offset = this->_input_buffer_offset - this->_weight.size() + i + 1;
+      auto input_vec = Eigen::Map<const Eigen::VectorXf>(&this->_input_buffers[ch][offset], this->_receptive_field);
+      output[ch][i] = this->_bias + this->_weight.dot(input_vec);
+    }
+  }
+
+  // Zero out any extra output channels
+  for (int ch = channelsToProcess; ch < out_channels; ch++)
   {
-    const long offset = this->_input_buffer_offset - this->_weight.size() + i + 1;
-    auto input = Eigen::Map<const Eigen::VectorXf>(&this->_input_buffer[offset], this->_receptive_field);
-    output[i] = this->_bias + this->_weight.dot(input);
+    for (int i = 0; i < num_frames; i++)
+      output[ch][i] = (NAM_SAMPLE)0.0;
   }
 
   // Prepare for next call:
@@ -200,7 +306,10 @@ std::unique_ptr<DSP> nam::linear::Factory(const nlohmann::json& config, std
 {
   const int receptive_field = config["receptive_field"];
   const bool bias = config["bias"];
-  return std::make_unique<Linear>(receptive_field, bias, weights, expectedSampleRate);
+  // Default to 1 channel in/out for backward compatibility
+  const int in_channels = config.value("in_channels", 1);
+  const int out_channels = config.value("out_channels", 1);
+  return std::make_unique<Linear>(in_channels, out_channels, receptive_field, bias, weights, expectedSampleRate);
 }
 
 // NN modules =================================================================
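// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the ring-buffer rewind
// arithmetic used by Buffer::_rewind_buffers_() above, shown for one channel
// with made-up numbers (receptive_field = 4, write offset = 10, buffer of 16).
#include <cassert>
#include <vector>

int main()
{
  const long receptive_field = 4;
  long offset = 10;
  std::vector<float> buf(16);
  for (size_t i = 0; i < buf.size(); i++)
    buf[i] = (float)i; // pretend sample history
  // Copy the last receptive_field samples (indices 6..9) to the front:
  for (long i = 0, j = offset - receptive_field; i < receptive_field; i++, j++)
    buf[i] = buf[j];
  offset = receptive_field; // the next frames land right after the retained history
  assert(buf[0] == 6.0f && buf[3] == 9.0f && offset == 4);
  return 0;
}
// ---------------------------------------------------------------------------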
diff --git a/NAM/dsp.h b/NAM/dsp.h
index f359a68..5787212 100644
--- a/NAM/dsp.h
+++ b/NAM/dsp.h
@@ -40,7 +40,7 @@ class DSP
   // Older models won't know, but newer ones will come with a loudness from the training based on their response to a
   // standardized input.
   // We may choose to have the models figure out for themselves how loud they are in here in the future.
-  DSP(const double expected_sample_rate);
+  DSP(const int in_channels, const int out_channels, const double expected_sample_rate);
   virtual ~DSP() = default;
   // prewarm() does any required intial work required to "settle" model initial conditions
   // it can be somewhat expensive, so should not be called during realtime audio processing
@@ -54,25 +54,36 @@ class DSP
   // 1. The core DSP algorithm is run (This is what should probably be
   //    overridden in subclasses).
   // 2. The output level is applied and the result stored to `output`.
-  virtual void process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames);
+  // `input` and `output` are double pointers where the first pointer indexes channels
+  // and the second indexes frames: input[channel][frame]
+  virtual void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames);
   // Expected sample rate, in Hz.
   // TODO throw if it doesn't know.
   double GetExpectedSampleRate() const { return mExpectedSampleRate; };
+  // Number of input channels
+  int NumInputChannels() const { return mInChannels; };
+  // Number of output channels
+  int NumOutputChannels() const { return mOutChannels; };
   // Input Level, in dBu, corresponding to 0 dBFS for a sine wave
   // You should call HasInputLevel() first to be safe.
-  double GetInputLevel() { return mInputLevel.level; };
+  // Note: input level is assumed global over all inputs.
+  double GetInputLevel();
   // Get how loud this model is, in dB.
   // Throws a std::runtime_error if the model doesn't know how loud it is.
+  // Note: loudness is assumed global over all outputs.
   double GetLoudness() const;
   // Output Level, in dBu, corresponding to 0 dBFS for a sine wave
   // You should call HasOutputLevel() first to be safe.
-  double GetOutputLevel() { return mOutputLevel.level; };
-  // Does this model know its output level?
-  bool HasInputLevel() { return mInputLevel.haveLevel; };
+  // Note: output level is assumed global over all outputs.
+  double GetOutputLevel();
+  // Does this model know its input level?
+  // Note: input level is assumed global over all inputs.
+  bool HasInputLevel();
   // Get whether the model knows how loud it is.
   bool HasLoudness() const { return mHasLoudness; };
   // Does this model know its output level?
-  bool HasOutputLevel() { return mOutputLevel.haveLevel; };
+  // Note: output level is assumed global over all outputs.
+  bool HasOutputLevel();
   // General function for resetting the DSP unit.
   // This doesn't call prewarm(). If you want to do that, then you might want to use ResetAndPrewarm().
   // See https://github.com/sdatkinson/NeuralAmpModelerCore/issues/96 for the reasoning.
@@ -83,20 +94,13 @@ class DSP
     Reset(sampleRate, maxBufferSize);
     prewarm();
   }
-  void SetInputLevel(const double inputLevel)
-  {
-    mInputLevel.haveLevel = true;
-    mInputLevel.level = inputLevel;
-  };
+  void SetInputLevel(const double inputLevel);
   // Set the loudness, in dB.
   // This is usually defined to be the loudness to a standardized input. The trainer has its own, but you can always
   // use this to define it a different way if you like yours better.
+  // Note: loudness is assumed global over all outputs.
   void SetLoudness(const double loudness);
-  void SetOutputLevel(const double outputLevel)
-  {
-    mOutputLevel.haveLevel = true;
-    mOutputLevel.level = outputLevel;
-  };
+  void SetOutputLevel(const double outputLevel);
 
 protected:
   bool mHasLoudness = false;
@@ -117,11 +121,14 @@ class DSP
   int GetMaxBufferSize() const { return mMaxBufferSize; };
 
 private:
+  const int mInChannels;
+  const int mOutChannels;
   struct Level
   {
     bool haveLevel = false;
     float level = 0.0;
   };
+  // Note: input/output levels are assumed global over all inputs/outputs
   Level mInputLevel;
   Level mOutputLevel;
 };
@@ -132,23 +139,23 @@ class DSP
 class Buffer : public DSP
 {
 public:
-  Buffer(const int receptive_field, const double expected_sample_rate = -1.0);
+  Buffer(const int in_channels, const int out_channels, const int receptive_field,
+         const double expected_sample_rate = -1.0);
 
 protected:
-  // Input buffer
-  const int _input_buffer_channels = 1; // Mono
   int _receptive_field;
-  // First location where we add new samples from the input
+  // First location where we add new samples from the input (same for all channels)
   long _input_buffer_offset;
-  std::vector<float> _input_buffer;
-  std::vector<float> _output_buffer;
+  // Per-channel input buffers
+  std::vector<std::vector<float>> _input_buffers;
+  std::vector<std::vector<float>> _output_buffers;
   void _advance_input_buffer_(const int num_frames);
   void _set_receptive_field(const int new_receptive_field, const int input_buffer_size);
   void _set_receptive_field(const int new_receptive_field);
   void _reset_input_buffer();
   // Use this->_input_post_gain
-  virtual void _update_buffers_(NAM_SAMPLE* input, int num_frames);
+  virtual void _update_buffers_(NAM_SAMPLE** input, int num_frames);
   virtual void _rewind_buffers_();
 };
 
@@ -156,9 +163,9 @@ class Buffer : public DSP
 class Linear : public Buffer
 {
 public:
-  Linear(const int receptive_field, const bool _bias, const std::vector<float>& weights,
-         const double expected_sample_rate = -1.0);
-  void process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames) override;
+  Linear(const int in_channels, const int out_channels, const int receptive_field, const bool _bias,
+         const std::vector<float>& weights, const double expected_sample_rate = -1.0);
+  void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames) override;
 
 protected:
   Eigen::VectorXf _weight;
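// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): adapting interleaved audio to
// the input[channel][frame] convention documented in dsp.h above. The helper
// name run_interleaved() is hypothetical, and it assumes the model's input and
// output channel counts both equal `channels`; a realtime caller would
// preallocate the scratch buffers instead of allocating per call.
#include <vector>

#include "NAM/dsp.h"

void run_interleaved(nam::DSP& model, const NAM_SAMPLE* in, NAM_SAMPLE* out, const int channels, const int frames)
{
  std::vector<std::vector<NAM_SAMPLE>> inBufs(channels), outBufs(channels);
  std::vector<NAM_SAMPLE*> inPtrs(channels), outPtrs(channels);
  for (int ch = 0; ch < channels; ch++)
  {
    inBufs[ch].resize(frames);
    outBufs[ch].resize(frames);
    for (int i = 0; i < frames; i++)
      inBufs[ch][i] = in[i * channels + ch]; // deinterleave [L R L R ...]
    inPtrs[ch] = inBufs[ch].data();
    outPtrs[ch] = outBufs[ch].data();
  }
  model.process(inPtrs.data(), outPtrs.data(), frames);
  for (int ch = 0; ch < channels; ch++)
    for (int i = 0; i < frames; i++)
      out[i * channels + ch] = outBufs[ch][i]; // reinterleave
}
// ---------------------------------------------------------------------------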
diff --git a/NAM/lstm.cpp b/NAM/lstm.cpp
index 6fa33a2..d162d55 100644
--- a/NAM/lstm.cpp
+++ b/NAM/lstm.cpp
@@ -65,25 +65,61 @@ void nam::lstm::LSTMCell::process_(const Eigen::VectorXf& x)
   }
 }
 
-nam::lstm::LSTM::LSTM(const int num_layers, const int input_size, const int hidden_size, std::vector<float>& weights,
-                      const double expected_sample_rate)
-: DSP(expected_sample_rate)
+nam::lstm::LSTM::LSTM(const int in_channels, const int out_channels, const int num_layers, const int input_size,
+                      const int hidden_size, std::vector<float>& weights, const double expected_sample_rate)
+: DSP(in_channels, out_channels, expected_sample_rate)
 {
-  this->_input.resize(1);
+  // Allocate input and output vectors
+  this->_input.resize(input_size);
+  this->_output.resize(out_channels);
+
   std::vector<float>::iterator it = weights.begin();
   for (int i = 0; i < num_layers; i++)
     this->_layers.push_back(LSTMCell(i == 0 ? input_size : hidden_size, hidden_size, it));
-  this->_head_weight.resize(hidden_size);
-  for (int i = 0; i < hidden_size; i++)
-    this->_head_weight[i] = *(it++);
-  this->_head_bias = *(it++);
+
+  // Load head weight as matrix (out_channels x hidden_size)
+  // Weights are stored row-major: first row (output 0), then row 1 (output 1), etc.
+  this->_head_weight.resize(out_channels, hidden_size);
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    for (int h = 0; h < hidden_size; h++)
+    {
+      this->_head_weight(out_ch, h) = *(it++);
+    }
+  }
+
+  // Load head bias as vector (out_channels)
+  this->_head_bias.resize(out_channels);
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    this->_head_bias(out_ch) = *(it++);
+  }
+
   assert(it == weights.end());
 }
 
-void nam::lstm::LSTM::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames)
+void nam::lstm::LSTM::process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames)
 {
+  const int in_channels = NumInputChannels();
+  const int out_channels = NumOutputChannels();
+
   for (int i = 0; i < num_frames; i++)
-    output[i] = this->_process_sample(input[i]);
+  {
+    // Copy multi-channel input to _input vector
+    for (int ch = 0; ch < in_channels; ch++)
+    {
+      this->_input(ch) = input[ch][i];
+    }
+
+    // Process sample (stores result in _output)
+    this->_process_sample();
+
+    // Copy multi-channel output from _output to output arrays
+    for (int ch = 0; ch < out_channels; ch++)
+    {
+      output[ch][i] = this->_output(ch);
+    }
+  }
 }
 
 int nam::lstm::LSTM::PrewarmSamples()
@@ -94,15 +130,37 @@ int nam::lstm::LSTM::PrewarmSamples()
   return result <= 0 ? 1 : result;
 }
 
-float nam::lstm::LSTM::_process_sample(const float x)
+void nam::lstm::LSTM::_process_sample()
 {
+  const int in_channels = NumInputChannels();
+  const int out_channels = NumOutputChannels();
+
   if (this->_layers.size() == 0)
-    return x;
-  this->_input(0) = x;
+  {
+    // No layers - pass input through to output (using first in_channels of output)
+    const int channels_to_copy = std::min(in_channels, out_channels);
+    for (int ch = 0; ch < channels_to_copy; ch++)
+      this->_output(ch) = this->_input(ch);
+    // Zero-fill remaining output channels if in_channels < out_channels
+    for (int ch = channels_to_copy; ch < out_channels; ch++)
+      this->_output(ch) = 0.0f;
+    return;
+  }
+
   this->_layers[0].process_(this->_input);
   for (size_t i = 1; i < this->_layers.size(); i++)
     this->_layers[i].process_(this->_layers[i - 1].get_hidden_state());
-  return this->_head_weight.dot(this->_layers[this->_layers.size() - 1].get_hidden_state()) + this->_head_bias;
+
+  // Compute output using head weight matrix and bias vector
+  // _output = _head_weight * hidden_state + _head_bias
+  const Eigen::VectorXf& hidden_state = this->_layers[this->_layers.size() - 1].get_hidden_state();
+
+  // Compute matrix-vector product: (out_channels x hidden_size) * (hidden_size) = (out_channels)
+  // Store directly in _output (which is already sized correctly in constructor)
+  this->_output.noalias() = this->_head_weight * hidden_state;
+
+  // Add bias: (out_channels) += (out_channels)
+  this->_output.noalias() += this->_head_bias;
 }
 
 // Factory to instantiate from nlohmann json
@@ -112,7 +170,11 @@ std::unique_ptr<DSP> nam::lstm::Factory(const nlohmann::json& config, std::
 {
   const int num_layers = config["num_layers"];
   const int input_size = config["input_size"];
   const int hidden_size = config["hidden_size"];
-  return std::make_unique<LSTM>(num_layers, input_size, hidden_size, weights, expectedSampleRate);
+  // Default to 1 channel in/out for backward compatibility
+  const int in_channels = config.value("in_channels", 1);
+  const int out_channels = config.value("out_channels", 1);
+  return std::make_unique<LSTM>(
+    in_channels, out_channels, num_layers, input_size, hidden_size, weights, expectedSampleRate);
 }
 
 // Register the factory
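// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the per-sample head computation
// _process_sample() now performs, out = W_head * h + b_head, with made-up
// sizes (hidden_size = 4, out_channels = 2) and constant values.
#include <Eigen/Dense>

int main()
{
  const int hidden_size = 4, out_channels = 2;
  Eigen::MatrixXf W = Eigen::MatrixXf::Constant(out_channels, hidden_size, 0.1f);
  Eigen::VectorXf b = Eigen::VectorXf::Zero(out_channels);
  Eigen::VectorXf h = Eigen::VectorXf::Ones(hidden_size); // last layer's hidden state
  Eigen::VectorXf out(out_channels);
  out.noalias() = W * h; // (2 x 4) * (4,) = (2,)
  out += b;              // one bias per output channel
  return 0;
}
// ---------------------------------------------------------------------------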
diff --git a/NAM/lstm.h b/NAM/lstm.h
index 17d0ada..5c03853 100644
--- a/NAM/lstm.h
+++ b/NAM/lstm.h
@@ -51,24 +51,26 @@ class LSTMCell
 class LSTM : public DSP
 {
 public:
-  LSTM(const int num_layers, const int input_size, const int hidden_size, std::vector<float>& weights,
-       const double expected_sample_rate = -1.0);
+  LSTM(const int in_channels, const int out_channels, const int num_layers, const int input_size,
+       const int hidden_size, std::vector<float>& weights, const double expected_sample_rate = -1.0);
   ~LSTM() = default;
+  void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames) override;
 
 protected:
   // Hacky, but a half-second seems to work for most models.
   int PrewarmSamples() override;
 
-  Eigen::VectorXf _head_weight;
-  float _head_bias;
-  void process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames) override;
+  Eigen::MatrixXf _head_weight; // (out_channels x hidden_size)
+  Eigen::VectorXf _head_bias;   // (out_channels)
   std::vector<LSTMCell> _layers;
 
-  float _process_sample(const float x);
+  void _process_sample();
 
   // Input to the LSTM.
-  // Since this is assumed to not be a parametric model, its shape should be (1,)
+  // Since this is assumed to not be a parametric model, its shape should be (in_channels,)
   Eigen::VectorXf _input;
+  // Output from _process_sample - multi-channel output vector (size out_channels)
+  Eigen::VectorXf _output;
 };
 
 // Factory to instantiate from nlohmann json
diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp
index 285ea69..6686f93 100644
--- a/NAM/wavenet.cpp
+++ b/NAM/wavenet.cpp
@@ -192,12 +192,18 @@ long nam::wavenet::_LayerArray::_get_channels() const
 
 // WaveNet ====================================================================
 
-nam::wavenet::WaveNet::WaveNet(const std::vector<LayerArrayParams>& layer_array_params,
+nam::wavenet::WaveNet::WaveNet(const int in_channels,
+                               const std::vector<LayerArrayParams>& layer_array_params,
                                const float head_scale, const bool with_head, std::vector<float> weights,
                                const double expected_sample_rate)
-: DSP(expected_sample_rate)
+: DSP(in_channels,
+      layer_array_params.empty() ? throw std::runtime_error("WaveNet requires at least one layer array")
+                                 : layer_array_params.back().head_size,
+      expected_sample_rate)
 , _head_scale(head_scale)
 {
+  if (layer_array_params.empty())
+    throw std::runtime_error("WaveNet requires at least one layer array");
   if (with_head)
     throw std::runtime_error("Head not implemented!");
   for (size_t i = 0; i < layer_array_params.size(); i++)
@@ -251,17 +257,24 @@ void nam::wavenet::WaveNet::SetMaxBufferSize(const int maxBufferSize)
     this->_layer_arrays[i].SetMaxBufferSize(maxBufferSize);
 }
 
-void nam::wavenet::WaveNet::_set_condition_array(NAM_SAMPLE* input, const int num_frames)
+void nam::wavenet::WaveNet::_set_condition_array(NAM_SAMPLE** input, const int num_frames)
 {
-  for (int j = 0; j < num_frames; j++)
+  const int in_channels = NumInputChannels();
+  // Fill condition array with input channels
+  for (int ch = 0; ch < in_channels; ch++)
   {
-    this->_condition(0, j) = input[j];
+    for (int j = 0; j < num_frames; j++)
+    {
+      this->_condition(ch, j) = input[ch][j];
+    }
   }
 }
 
-void nam::wavenet::WaveNet::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames)
+void nam::wavenet::WaveNet::process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames)
 {
   assert(num_frames <= mMaxBufferSize);
+  const int out_channels = NumOutputChannels();
+
   this->_set_condition_array(input, num_frames);
 
   // Main layer arrays:
@@ -287,11 +300,15 @@ void nam::wavenet::WaveNet::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const
   // (Head not implemented)
   auto& final_head_outputs = this->_layer_arrays.back().GetHeadOutputs();
-  assert(final_head_outputs.rows() == 1);
-  for (int s = 0; s < num_frames; s++)
+  assert(final_head_outputs.rows() == out_channels);
+
+  for (int ch = 0; ch < out_channels; ch++)
   {
-    const float out = this->_head_scale * final_head_outputs(0, s);
-    output[s] = out;
+    for (int s = 0; s < num_frames; s++)
+    {
+      const float out = this->_head_scale * final_head_outputs(ch, s);
+      output[ch][s] = out;
+    }
   }
 }
 
@@ -314,8 +331,16 @@
   }
   const bool with_head = !config["head"].is_null();
   const float head_scale = config["head_scale"];
+
+  if (layer_array_params.empty())
+    throw std::runtime_error("WaveNet config requires at least one layer array");
+
+  // Backward compatibility: assume 1 input channel
+  const int in_channels = config.value("in_channels", 1);
+
+  // out_channels is determined from the last layer array's head_size
   return std::make_unique<WaveNet>(
-    layer_array_params, head_scale, with_head, weights, expectedSampleRate);
+    in_channels, layer_array_params, head_scale, with_head, weights, expectedSampleRate);
 }
 
 // Register the factory
diff --git a/NAM/wavenet.h b/NAM/wavenet.h
index 832673b..2e99256 100644
--- a/NAM/wavenet.h
+++ b/NAM/wavenet.h
@@ -174,10 +174,10 @@ class _LayerArray
 class WaveNet : public DSP
 {
 public:
-  WaveNet(const std::vector<LayerArrayParams>& layer_array_params, const float head_scale, const bool with_head,
-          std::vector<float> weights, const double expected_sample_rate = -1.0);
+  WaveNet(const int in_channels, const std::vector<LayerArrayParams>& layer_array_params, const float head_scale,
+          const bool with_head, std::vector<float> weights, const double expected_sample_rate = -1.0);
   ~WaveNet() = default;
-  void process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames) override;
+  void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames) override;
   void set_weights_(std::vector<float>& weights);
 
 protected:
@@ -186,10 +186,10 @@ class WaveNet : public DSP
   void SetMaxBufferSize(const int maxBufferSize) override;
 
   // Fill in the "condition" array that's fed into the various parts of the net.
-  virtual void _set_condition_array(NAM_SAMPLE* input, const int num_frames);
+  virtual void _set_condition_array(NAM_SAMPLE** input, const int num_frames);
   // How many conditioning inputs are there.
-  // Just one--the audio.
-  virtual int _get_condition_dim() const { return 1; };
+  // One per input audio channel.
+  virtual int _get_condition_dim() const { return NumInputChannels(); };
 
 private:
   std::vector<_LayerArray> _layer_arrays;
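// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the backward-compatibility
// pattern the factories above rely on. nlohmann's json::value() returns the
// stored value when the key exists and the supplied default otherwise, so
// pre-multichannel configs silently come out as 1-in/1-out. The config
// contents here are made up.
#include <cassert>

#include <nlohmann/json.hpp>

int main()
{
  nlohmann::json old_config = {{"num_layers", 1}, {"input_size", 1}, {"hidden_size", 16}};
  nlohmann::json new_config = {
    {"num_layers", 1}, {"input_size", 3}, {"hidden_size", 16}, {"in_channels", 3}, {"out_channels", 2}};
  assert(old_config.value("in_channels", 1) == 1); // key absent -> default
  assert(new_config.value("in_channels", 1) == 3); // key present -> stored value
  assert(new_config.value("out_channels", 1) == 2);
  return 0;
}
// ---------------------------------------------------------------------------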
diff --git a/tools/benchmodel.cpp b/tools/benchmodel.cpp
index 5c3d60c..d8a1690 100644
--- a/tools/benchmodel.cpp
+++ b/tools/benchmodel.cpp
@@ -40,18 +40,31 @@ int main(int argc, char* argv[])
   model->Reset(model->GetExpectedSampleRate(), bufferSize);
   size_t numBuffers = (48000 / bufferSize) * 2;
 
-  // Fill input buffer with zeroes.
-  // Output buffer doesn't matter.
-  for (int i = 0; i < AUDIO_BUFFER_SIZE; i++)
+  // Allocate multi-channel buffers
+  const int in_channels = model->NumInputChannels();
+  const int out_channels = model->NumOutputChannels();
+
+  std::vector<std::vector<NAM_SAMPLE>> inputBuffers(in_channels);
+  std::vector<std::vector<NAM_SAMPLE>> outputBuffers(out_channels);
+  std::vector<NAM_SAMPLE*> inputPtrs(in_channels);
+  std::vector<NAM_SAMPLE*> outputPtrs(out_channels);
+
+  for (int ch = 0; ch < in_channels; ch++)
+  {
+    inputBuffers[ch].resize(AUDIO_BUFFER_SIZE, 0.0);
+    inputPtrs[ch] = inputBuffers[ch].data();
+  }
+  for (int ch = 0; ch < out_channels; ch++)
   {
-    inputBuffer[i] = 0.0;
+    outputBuffers[ch].resize(AUDIO_BUFFER_SIZE, 0.0);
+    outputPtrs[ch] = outputBuffers[ch].data();
   }
 
   std::cout << "Running benchmark\n";
   auto t1 = high_resolution_clock::now();
   for (size_t i = 0; i < numBuffers; i++)
   {
-    model->process(inputBuffer, outputBuffer, AUDIO_BUFFER_SIZE);
+    model->process(inputPtrs.data(), outputPtrs.data(), AUDIO_BUFFER_SIZE);
   }
   auto t2 = high_resolution_clock::now();
   std::cout << "Finished\n";
diff --git a/tools/run_tests.cpp b/tools/run_tests.cpp
index 33c4d45..2a50c77 100644
--- a/tools/run_tests.cpp
+++ b/tools/run_tests.cpp
@@ -18,6 +18,7 @@
 #include "test/test_wavenet_gating_compatibility.cpp"
 #include "test/test_blending_detailed.cpp"
 #include "test/test_input_buffer_verification.cpp"
+#include "test/test_lstm.cpp"
 
 int main()
 {
@@ -124,6 +125,7 @@ int main()
   test_wavenet::test_layer_grouped_process_realtime_safe();
   test_wavenet::test_layer_array_process_realtime_safe();
   test_wavenet::test_process_realtime_safe();
+  test_wavenet::test_process_3in_2out_realtime_safe();
 
   test_convnet::test_convnet_basic();
   test_convnet::test_convnet_batchnorm();
@@ -133,6 +135,19 @@ int main()
   test_convnet::test_convnet_prewarm();
   test_convnet::test_convnet_multiple_calls();
 
+  // LSTM tests
+  test_lstm::test_lstm_basic();
+  test_lstm::test_lstm_multiple_layers();
+  test_lstm::test_lstm_zero_input();
+  test_lstm::test_lstm_different_buffer_sizes();
+  test_lstm::test_lstm_prewarm();
+  test_lstm::test_lstm_multiple_calls();
+  test_lstm::test_lstm_multichannel();
+  test_lstm::test_lstm_large_hidden_size();
+  test_lstm::test_lstm_different_input_size();
+  test_lstm::test_lstm_state_evolution();
+  test_lstm::test_lstm_no_layers();
+
   // Gating activations tests
   test_gating_activations::TestGatingActivation::test_basic_functionality();
   test_gating_activations::TestGatingActivation::test_with_custom_activations();
diff --git a/tools/test/test_convnet.cpp b/tools/test/test_convnet.cpp
index ff11074..56bd5ec 100644
--- a/tools/test/test_convnet.cpp
+++ b/tools/test/test_convnet.cpp
@@ -13,6 +13,8 @@ namespace test_convnet
 // Test basic ConvNet construction and processing
 void test_convnet_basic()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 2;
   const std::vector<int> dilations{1, 2};
   const bool batchnorm = false;
@@ -32,7 +34,8 @@ void test_convnet_basic()
   // Head weights (2 weights + 1 bias)
   weights.insert(weights.end(), {1.0f, 1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   const int numFrames = 4;
   const int maxBufferSize = 64;
@@ -40,8 +43,10 @@ void test_convnet_basic()
 
   std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
   std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
 
-  convnet.process(input.data(), output.data(), numFrames);
+  convnet.process(inputPtrs, outputPtrs, numFrames);
 
   // Verify output dimensions
   assert(output.size() == numFrames);
@@ -55,6 +60,8 @@ void test_convnet_basic()
 // Test ConvNet with batchnorm
 void test_convnet_batchnorm()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 1;
   const std::vector<int> dilations{1};
   const bool batchnorm = true;
@@ -74,7 +81,8 @@ void test_convnet_batchnorm()
   // Head weights (1 weight + 1 bias)
   weights.insert(weights.end(), {1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   const int numFrames = 4;
   const int maxBufferSize = 64;
@@ -82,8 +90,10 @@ void test_convnet_batchnorm()
 
   std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
   std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
 
-  convnet.process(input.data(), output.data(), numFrames);
+  convnet.process(inputPtrs, outputPtrs, numFrames);
 
   assert(output.size() == numFrames);
   for (int i = 0; i < numFrames; i++)
@@ -95,6 +105,8 @@ void test_convnet_batchnorm()
 // Test ConvNet with multiple blocks
 void test_convnet_multiple_blocks()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 2;
   const std::vector<int> dilations{1, 2, 4};
   const bool batchnorm = false;
@@ -117,7 +129,8 @@ void test_convnet_multiple_blocks()
   // Head weights
   weights.insert(weights.end(), {1.0f, 1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   const int numFrames = 8;
   const int maxBufferSize = 64;
@@ -125,8 +138,10 @@ void test_convnet_multiple_blocks()
 
   std::vector<NAM_SAMPLE> input(numFrames, 0.5f);
   std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
 
-  convnet.process(input.data(), output.data(), numFrames);
+  convnet.process(inputPtrs, outputPtrs, numFrames);
 
   assert(output.size() == numFrames);
   for (int i = 0; i < numFrames; i++)
@@ -138,6 +153,8 @@ void test_convnet_multiple_blocks()
 // Test ConvNet with zero input
 void test_convnet_zero_input()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 1;
   const std::vector<int> dilations{1};
   const bool batchnorm = false;
@@ -150,15 +167,18 @@ void test_convnet_zero_input()
   // Head weights (1 weight + 1 bias)
   weights.insert(weights.end(), {1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   const int numFrames = 4;
   convnet.Reset(expected_sample_rate, numFrames);
 
   std::vector<NAM_SAMPLE> input(numFrames, 0.0f);
   std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
 
-  convnet.process(input.data(), output.data(), numFrames);
+  convnet.process(inputPtrs, outputPtrs, numFrames);
 
   // With zero input, output should be finite (may be zero or non-zero depending on bias)
   for (int i = 0; i < numFrames; i++)
@@ -170,6 +190,8 @@ void test_convnet_zero_input()
 // Test ConvNet with different buffer sizes
 void test_convnet_different_buffer_sizes()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 1;
   const std::vector<int> dilations{1};
   const bool batchnorm = false;
@@ -182,18 +204,23 @@ void test_convnet_different_buffer_sizes()
   // Head weights (1 weight + 1 bias)
   weights.insert(weights.end(), {1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   // Test with different buffer sizes
   convnet.Reset(expected_sample_rate, 64);
   std::vector<NAM_SAMPLE> input1(32, 1.0f);
   std::vector<NAM_SAMPLE> output1(32, 0.0f);
-  convnet.process(input1.data(), output1.data(), 32);
+  NAM_SAMPLE* inputPtrs1[] = {input1.data()};
+  NAM_SAMPLE* outputPtrs1[] = {output1.data()};
+  convnet.process(inputPtrs1, outputPtrs1, 32);
 
   convnet.Reset(expected_sample_rate, 128);
   std::vector<NAM_SAMPLE> input2(64, 1.0f);
   std::vector<NAM_SAMPLE> output2(64, 0.0f);
-  convnet.process(input2.data(), output2.data(), 64);
+  NAM_SAMPLE* inputPtrs2[] = {input2.data()};
+  NAM_SAMPLE* outputPtrs2[] = {output2.data()};
+  convnet.process(inputPtrs2, outputPtrs2, 64);
 
   // Both should work without errors
   assert(output1.size() == 32);
@@ -203,6 +230,8 @@ void test_convnet_different_buffer_sizes()
 // Test ConvNet prewarm functionality
 void test_convnet_prewarm()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 2;
   const std::vector<int> dilations{1, 2, 4};
   const bool batchnorm = false;
@@ -219,7 +248,8 @@ void test_convnet_prewarm()
   // Head weights (2 weights + 1 bias)
   weights.insert(weights.end(), {1.0f, 1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   // Test that prewarm can be called without errors
   convnet.Reset(expected_sample_rate, 64);
@@ -229,7 +259,9 @@ void test_convnet_prewarm()
   const int numFrames = 4;
   std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
   std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
-  convnet.process(input.data(), output.data(), numFrames);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+  convnet.process(inputPtrs, outputPtrs, numFrames);
 
   // Output should be finite
   for (int i = 0; i < numFrames; i++)
@@ -241,6 +273,8 @@ void test_convnet_prewarm()
 // Test multiple process() calls (ring buffer functionality)
 void test_convnet_multiple_calls()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 1;
   const std::vector<int> dilations{1};
   const bool batchnorm = false;
@@ -253,7 +287,8 @@ void test_convnet_multiple_calls()
   // Head weights (1 weight + 1 bias)
   weights.insert(weights.end(), {1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   const int numFrames = 2;
   convnet.Reset(expected_sample_rate, numFrames);
@@ -263,7 +298,9 @@ void test_convnet_multiple_calls()
   {
     std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
     std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
-    convnet.process(input.data(), output.data(), numFrames);
+    NAM_SAMPLE* inputPtrs[] = {input.data()};
+    NAM_SAMPLE* outputPtrs[] = {output.data()};
+    convnet.process(inputPtrs, outputPtrs, numFrames);
 
     // Output should be finite
    for (int j = 0; j < numFrames; j++)
diff --git a/tools/test/test_dsp.cpp b/tools/test/test_dsp.cpp
index bbdee63..d019a87 100644
--- a/tools/test/test_dsp.cpp
+++ b/tools/test/test_dsp.cpp
@@ -1,18 +1,32 @@
 // Tests for dsp
 
 #include "NAM/dsp.h"
+#include <algorithm>
 
 namespace test_dsp
 {
 // Simplest test: can I construct something!
 void test_construct()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 1;
+  const int out_channels = 1;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
+}
+
+void test_channels()
+{
+  const int in_channels = 2;
+  const int out_channels = 3;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
+  assert(myDsp.NumInputChannels() == in_channels);
+  assert(myDsp.NumOutputChannels() == out_channels);
 }
 
 void test_get_input_level()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 2;
+  const int out_channels = 1;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
   const double expected = 19.0;
   myDsp.SetInputLevel(expected);
   assert(myDsp.HasInputLevel());
@@ -23,7 +37,9 @@ void test_get_input_level()
 
 void test_get_output_level()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 1;
+  const int out_channels = 2;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
   const double expected = 12.0;
   myDsp.SetOutputLevel(expected);
   assert(myDsp.HasOutputLevel());
@@ -35,32 +51,89 @@ void test_get_output_level()
 
 // Test correct function of DSP::HasInputLevel()
 void test_has_input_level()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 2;
+  const int out_channels = 1;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
   assert(!myDsp.HasInputLevel());
 
-  myDsp.SetInputLevel(19.0);
+  const double level = 19.0;
+  myDsp.SetInputLevel(level);
   assert(myDsp.HasInputLevel());
 }
 
 void test_has_output_level()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 1;
+  const int out_channels = 2;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
+
   assert(!myDsp.HasOutputLevel());
 
-  myDsp.SetOutputLevel(12.0);
+  const double level = 12.0;
+  myDsp.SetOutputLevel(level);
   assert(myDsp.HasOutputLevel());
 }
 
 // Test correct function of DSP::HasInputLevel()
 void test_set_input_level()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 2;
+  const int out_channels = 1;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
   myDsp.SetInputLevel(19.0);
 }
 
 void test_set_output_level()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 1;
+  const int out_channels = 2;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
   myDsp.SetOutputLevel(19.0);
 }
+
+void test_process_multi_channel()
+{
+  const int in_channels = 2;
+  const int out_channels = 2;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
+  const int num_frames = 64;
+
+  // Allocate buffers
+  std::vector<std::vector<NAM_SAMPLE>> inputBuffers(in_channels);
+  std::vector<std::vector<NAM_SAMPLE>> outputBuffers(out_channels);
+  std::vector<NAM_SAMPLE*> inputPtrs(in_channels);
+  std::vector<NAM_SAMPLE*> outputPtrs(out_channels);
+
+  for (int ch = 0; ch < in_channels; ch++)
+  {
+    inputBuffers[ch].resize(num_frames);
+    inputPtrs[ch] = inputBuffers[ch].data();
+
+    // Fill input with test data
+    for (int i = 0; i < num_frames; i++)
+    {
+      inputBuffers[ch][i] = (ch + 1) * 0.5 + i * 0.01;
+    }
+  }
+  for (int ch = 0; ch < out_channels; ch++)
+  {
+    outputBuffers[ch].resize(num_frames);
+    outputPtrs[ch] = outputBuffers[ch].data();
+  }
+
+  // Process
+  myDsp.process(inputPtrs.data(), outputPtrs.data(), num_frames);
+
+  // Check that default implementation copied input to output
+  const int channelsToCheck = std::min(in_channels, out_channels);
+  for (int ch = 0; ch < channelsToCheck; ch++)
+  {
+    for (int i = 0; i < num_frames; i++)
+    {
+      assert(outputBuffers[ch][i] == inputBuffers[ch][i]);
+    }
+  }
+}
 }; // namespace test_dsp
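// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a quick check of the weight
// count implied by the create_lstm_weights() helper in test_lstm.cpp below.
// Per cell: 4*hidden*(layer_input + hidden) matrix entries, 4*hidden biases,
// and 2*hidden initial states; the head adds out_channels*(hidden + 1).
// For num_layers = 1, input_size = 1, hidden_size = 4, out_channels = 1:
// 80 + 16 + 8 + 5 = 109 floats.
#include <cassert>

int main()
{
  const int num_layers = 1, input_size = 1, hidden_size = 4, out_channels = 1;
  int count = 0;
  for (int layer = 0; layer < num_layers; layer++)
  {
    const int layer_input = (layer == 0) ? input_size : hidden_size;
    count += 4 * hidden_size * (layer_input + hidden_size); // weight matrix
    count += 4 * hidden_size;                               // bias
    count += 2 * hidden_size;                               // initial h and c
  }
  count += out_channels * hidden_size + out_channels;       // head
  assert(count == 109);
  return 0;
}
// ---------------------------------------------------------------------------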
diff --git a/tools/test/test_lstm.cpp b/tools/test/test_lstm.cpp
new file mode 100644
index 0000000..8c655b9
--- /dev/null
+++ b/tools/test/test_lstm.cpp
@@ -0,0 +1,451 @@
+// Tests for LSTM
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <vector>
+
+#include "NAM/lstm.h"
+
+namespace test_lstm
+{
+// Helper function to calculate weights needed for LSTM
+// For each LSTMCell:
+// - Weight matrix: (4 * hidden_size) x (input_size + hidden_size) in row-major order
+// - Bias: 4 * hidden_size
+// - Initial hidden state: hidden_size (stored in second half of _xh)
+// - Initial cell state: hidden_size
+// For the LSTM:
+// - Head weight matrix: out_channels x hidden_size in row-major order
+// - Head bias: out_channels
+std::vector<float> create_lstm_weights(int num_layers, int input_size, int hidden_size, int out_channels)
+{
+  std::vector<float> weights;
+
+  for (int layer = 0; layer < num_layers; layer++)
+  {
+    int layer_input_size = (layer == 0) ? input_size : hidden_size;
+    int w_rows = 4 * hidden_size;
+    int w_cols = layer_input_size + hidden_size;
+
+    // Weight matrix (row-major)
+    for (int i = 0; i < w_rows * w_cols; i++)
+    {
+      weights.push_back(0.1f); // Small weights for stability
+    }
+
+    // Bias vector
+    for (int i = 0; i < 4 * hidden_size; i++)
+    {
+      weights.push_back(0.0f);
+    }
+
+    // Initial hidden state (stored in _xh)
+    for (int i = 0; i < hidden_size; i++)
+    {
+      weights.push_back(0.0f);
+    }
+
+    // Initial cell state
+    for (int i = 0; i < hidden_size; i++)
+    {
+      weights.push_back(0.0f);
+    }
+  }
+
+  // Head weight matrix (row-major: out_channels x hidden_size)
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    for (int h = 0; h < hidden_size; h++)
+    {
+      weights.push_back(0.1f);
+    }
+  }
+
+  // Head bias
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    weights.push_back(0.0f);
+  }
+
+  return weights;
+}
+
+// Test basic LSTM construction and processing
+void test_lstm_basic()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  const int maxBufferSize = 64;
+  lstm.Reset(expected_sample_rate, maxBufferSize);
+
+  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // Verify output dimensions
+  assert(output.size() == numFrames);
+  // Output should be non-zero and finite
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test LSTM with multiple layers
+void test_lstm_multiple_layers()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 2;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 8;
+  const int maxBufferSize = 64;
+  lstm.Reset(expected_sample_rate, maxBufferSize);
+
+  std::vector<NAM_SAMPLE> input(numFrames, 0.5f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  assert(output.size() == numFrames);
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test LSTM with zero input
+void test_lstm_zero_input()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  lstm.Reset(expected_sample_rate, numFrames);
+
+  std::vector<NAM_SAMPLE> input(numFrames, 0.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // With zero input, output should be finite (may be zero or non-zero depending on bias)
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test LSTM with different buffer sizes
+void test_lstm_different_buffer_sizes()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  // Test with different buffer sizes
+  lstm.Reset(expected_sample_rate, 64);
+  std::vector<NAM_SAMPLE> input1(32, 1.0f);
+  std::vector<NAM_SAMPLE> output1(32, 0.0f);
+  NAM_SAMPLE* inputPtrs1[] = {input1.data()};
+  NAM_SAMPLE* outputPtrs1[] = {output1.data()};
+  lstm.process(inputPtrs1, outputPtrs1, 32);
+
+  lstm.Reset(expected_sample_rate, 128);
+  std::vector<NAM_SAMPLE> input2(64, 1.0f);
+  std::vector<NAM_SAMPLE> output2(64, 0.0f);
+  NAM_SAMPLE* inputPtrs2[] = {input2.data()};
+  NAM_SAMPLE* outputPtrs2[] = {output2.data()};
+  lstm.process(inputPtrs2, outputPtrs2, 64);
+
+  // Both should work without errors
+  assert(output1.size() == 32);
+  assert(output2.size() == 64);
+}
+
+// Test LSTM prewarm functionality
+void test_lstm_prewarm()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  // Test that prewarm can be called without errors
+  lstm.Reset(expected_sample_rate, 64);
+  lstm.prewarm();
+
+  // After prewarm, processing should work
+  const int numFrames = 4;
+  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // Output should be finite
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+
+// Test LSTM prewarm functionality
+void test_lstm_prewarm()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  // Test that prewarm can be called without errors
+  lstm.Reset(expected_sample_rate, 64);
+  lstm.prewarm();
+
+  // After prewarm, processing should work
+  const int numFrames = 4;
+  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // Output should be finite
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test multiple process() calls (state persistence)
+void test_lstm_multiple_calls()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 2;
+  lstm.Reset(expected_sample_rate, numFrames);
+
+  // Multiple calls should work correctly with state persistence
+  for (int i = 0; i < 5; i++)
+  {
+    std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+    std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+    NAM_SAMPLE* inputPtrs[] = {input.data()};
+    NAM_SAMPLE* outputPtrs[] = {output.data()};
+    lstm.process(inputPtrs, outputPtrs, numFrames);
+
+    // Output should be finite
+    for (int j = 0; j < numFrames; j++)
+    {
+      assert(std::isfinite(output[j]));
+    }
+  }
+}
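+
+// A minimal sketch (not asserted as part of the suite above) of how state
+// persistence could be observed directly: with nonzero weights and a constant
+// input, the first sample of a second process() call generally differs from
+// the first call's, because the hidden/cell state carries over. No inequality
+// is asserted, since some weight configurations settle immediately.
+void sketch_lstm_state_persistence()
+{
+  const double expected_sample_rate = 48000.0;
+  std::vector<float> weights = create_lstm_weights(1, 1, 4, 1);
+  nam::lstm::LSTM lstm(1, 1, 1, 1, 4, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  lstm.Reset(expected_sample_rate, numFrames);
+
+  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+  const NAM_SAMPLE firstCall = output[0];
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+  // firstCall and output[0] are both finite; they typically differ.
+  assert(std::isfinite(firstCall) && std::isfinite(output[0]));
+}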
+
+// Test LSTM with multi-channel input/output
+void test_lstm_multichannel()
+{
+  const int in_channels = 2;
+  const int out_channels = 2;
+  const int num_layers = 1;
+  const int input_size = 2;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  lstm.Reset(expected_sample_rate, 64);
+
+  std::vector<NAM_SAMPLE> input1(numFrames, 0.5f);
+  std::vector<NAM_SAMPLE> input2(numFrames, 0.3f);
+  std::vector<NAM_SAMPLE> output1(numFrames, 0.0f);
+  std::vector<NAM_SAMPLE> output2(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input1.data(), input2.data()};
+  NAM_SAMPLE* outputPtrs[] = {output1.data(), output2.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // Verify both output channels are finite
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output1[i]));
+    assert(std::isfinite(output2[i]));
+  }
+}
+
+// Test LSTM with larger hidden size
+void test_lstm_large_hidden_size()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 16;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  lstm.Reset(expected_sample_rate, 64);
+
+  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test LSTM with different input sizes
+void test_lstm_different_input_size()
+{
+  const int in_channels = 3;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 3;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  lstm.Reset(expected_sample_rate, 64);
+
+  std::vector<NAM_SAMPLE> input1(numFrames, 0.1f);
+  std::vector<NAM_SAMPLE> input2(numFrames, 0.2f);
+  std::vector<NAM_SAMPLE> input3(numFrames, 0.3f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input1.data(), input2.data(), input3.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test LSTM state evolution over time
+void test_lstm_state_evolution()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 10;
+  lstm.Reset(expected_sample_rate, 64);
+
+  // Create a sine wave input
+  std::vector<NAM_SAMPLE> input(numFrames);
+  for (int i = 0; i < numFrames; i++)
+  {
+    input[i] = 0.5f * std::sin(2.0f * M_PI * i / numFrames);
+  }
+
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // Output should be finite and potentially show some variation due to state
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test LSTM with no layers (edge case)
+void test_lstm_no_layers()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 0;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  // With no layers, we still need head weights
+  std::vector<float> weights;
+  // Head weight matrix (row-major: out_channels x hidden_size)
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    for (int h = 0; h < hidden_size; h++)
+    {
+      weights.push_back(0.0f); // Zero head weights
+    }
+  }
+  // Head bias
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    weights.push_back(0.0f);
+  }
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  lstm.Reset(expected_sample_rate, 64);
+
+  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // With zero head weights and bias, the output should be all zeros; at minimum it must be finite
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
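+
+// Sketch of a top-level driver for this suite (hypothetical; the actual test
+// harness wiring lives outside this file and may register tests differently):
+void run_all()
+{
+  test_lstm_basic();
+  test_lstm_multiple_layers();
+  test_lstm_zero_input();
+  test_lstm_different_buffer_sizes();
+  test_lstm_prewarm();
+  test_lstm_multiple_calls();
+  test_lstm_multichannel();
+  test_lstm_large_hidden_size();
+  test_lstm_different_input_size();
+  test_lstm_state_evolution();
+  test_lstm_no_layers();
+}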
+
+}; // namespace test_lstm
diff --git a/tools/test/test_wavenet/test_full.cpp b/tools/test/test_wavenet/test_full.cpp
index d75ae1c..122ea0b 100644
--- a/tools/test/test_wavenet/test_full.cpp
+++ b/tools/test/test_wavenet/test_full.cpp
@@ -47,7 +47,8 @@ void test_wavenet_model()
  weights.push_back(1.0f); // Head rechannel
  weights.push_back(head_scale); // Head scale

-  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(layer_array_params, head_scale, with_head, weights, 48000.0);
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head, weights, 48000.0);

  const int numFrames = 4;
  const int maxBufferSize = 64;
@@ -55,8 +56,10 @@
  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};

-  wavenet->process(input.data(), output.data(), numFrames);
+  wavenet->process(inputPtrs, outputPtrs, numFrames);

  // Verify output dimensions
  assert(output.size() == numFrames);
@@ -89,13 +92,13 @@ void test_wavenet_multiple_arrays()
  const int bottleneck = channels;
  const int groups_1x1 = 1;
  layer_array_params.push_back(nam::wavenet::LayerArrayParams(input_size, condition_size, head_size, channels,
-                                                              bottleneck, kernel_size, std::move(dilations1), activation,
-                                                              gated, head_bias, groups, groups_1x1));
+                                                              bottleneck, kernel_size, std::move(dilations1),
+                                                              activation, gated, head_bias, groups, groups_1x1));
  // Second array (head_size of first must match channels of second)
  std::vector<int> dilations2{1};
  layer_array_params.push_back(nam::wavenet::LayerArrayParams(head_size, condition_size, head_size, channels,
-                                                              bottleneck, kernel_size, std::move(dilations2), activation,
-                                                              gated, head_bias, groups, groups_1x1));
+                                                              bottleneck, kernel_size, std::move(dilations2),
+                                                              activation, gated, head_bias, groups, groups_1x1));

  std::vector<float> weights;
  // Array 0: rechannel, layer, head_rechannel
@@ -104,7 +107,8 @@
  weights.insert(weights.end(), {1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f});
  weights.push_back(head_scale);

-  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(layer_array_params, head_scale, with_head, weights, 48000.0);
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head, weights, 48000.0);

  const int numFrames = 4;
  const int maxBufferSize = 64;
@@ -112,8 +116,10 @@
  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};

-  wavenet->process(input.data(), output.data(), numFrames);
+  wavenet->process(inputPtrs, outputPtrs, numFrames);

  assert(output.size() == numFrames);
  for (int i = 0; i < numFrames; i++)
@@ -147,15 +153,18 @@ void test_wavenet_zero_input()

  std::vector<float> weights{1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f, head_scale};

-  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(layer_array_params, head_scale, with_head, weights, 48000.0);
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head, weights, 48000.0);

  const int numFrames = 4;
  wavenet->Reset(48000.0, numFrames);

  std::vector<NAM_SAMPLE> input(numFrames, 0.0f);
  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};

-  wavenet->process(input.data(), output.data(), numFrames);
+  wavenet->process(inputPtrs, outputPtrs, numFrames);

  // With zero input, output should be finite (may be zero or non-zero depending on bias)
  for (int i = 0; i < numFrames; i++)
@@ -189,18 +198,23 @@ void test_wavenet_different_buffer_sizes()

  std::vector<float> weights{1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f, head_scale};

-  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(layer_array_params, head_scale, with_head, weights, 48000.0);
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head, weights, 48000.0);

  // Test with different buffer sizes
  wavenet->Reset(48000.0, 64);
  std::vector<NAM_SAMPLE> input1(32, 1.0f);
  std::vector<NAM_SAMPLE> output1(32, 0.0f);
-  wavenet->process(input1.data(), output1.data(), 32);
+  NAM_SAMPLE* inputPtrs1[] = {input1.data()};
+  NAM_SAMPLE* outputPtrs1[] = {output1.data()};
+  wavenet->process(inputPtrs1, outputPtrs1, 32);

  wavenet->Reset(48000.0, 128);
  std::vector<NAM_SAMPLE> input2(64, 1.0f);
  std::vector<NAM_SAMPLE> output2(64, 0.0f);
-  wavenet->process(input2.data(), output2.data(), 64);
+  NAM_SAMPLE* inputPtrs2[] = {input2.data()};
+  NAM_SAMPLE* outputPtrs2[] = {output2.data()};
+  wavenet->process(inputPtrs2, outputPtrs2, 64);

  // Both should work without errors
  assert(output1.size() == 32);
@@ -251,7 +265,8 @@ void test_wavenet_prewarm()
  weights.push_back(1.0f);
  weights.push_back(head_scale);

-  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(layer_array_params, head_scale, with_head, weights, 48000.0);
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head, weights, 48000.0);

  // Test that prewarm can be called without errors
  wavenet->Reset(48000.0, 64);
@@ -261,7 +276,9 @@
  const int numFrames = 4;
  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
-  wavenet->process(input.data(), output.data(), numFrames);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+  wavenet->process(inputPtrs, outputPtrs, numFrames);

  // Output should be finite
  for (int i = 0; i < numFrames; i++)
diff --git a/tools/test/test_wavenet/test_real_time_safe.cpp b/tools/test/test_wavenet/test_real_time_safe.cpp
index 91d8628..0a57539 100644
--- a/tools/test/test_wavenet/test_real_time_safe.cpp
+++ b/tools/test/test_wavenet/test_real_time_safe.cpp
@@ -437,8 +437,8 @@ void test_layer_process_realtime_safe()
  const int groups_input = 1;
  const int groups_1x1 = 1;

-  auto layer = nam::wavenet::_Layer(condition_size, channels, bottleneck, kernel_size, dilation, activation, gated,
-                                    groups_input, groups_1x1);
+  auto layer = nam::wavenet::_Layer(
+    condition_size, channels, bottleneck, kernel_size, dilation, activation, gated, groups_input, groups_1x1);

  // Set weights
  std::vector<float> weights{1.0f, 0.0f, // Conv (weight, bias)
@@ -492,8 +492,8 @@ void test_layer_bottleneck_process_realtime_safe()
  const int groups_input = 1;
  const int groups_1x1 = 1;

-  auto layer = nam::wavenet::_Layer(condition_size, channels, bottleneck, kernel_size, dilation, activation, gated,
-                                    groups_input, groups_1x1);
+  auto layer = nam::wavenet::_Layer(
+    condition_size, channels, bottleneck, kernel_size, dilation, activation, gated, groups_input, groups_1x1);

  // Set weights for bottleneck != channels
  // Conv: (channels, bottleneck, kernelSize=1) = (4, 2, 1) + bias
@@ -544,8 +544,8 @@
    input.setConstant(0.5f);
    condition.setConstant(0.5f);

-    std::string test_name = "Layer Process (bottleneck=" + std::to_string(bottleneck) + ", channels=" +
-                            std::to_string(channels) + ") - Buffer size " + std::to_string(buffer_size);
+    std::string test_name = "Layer Process (bottleneck=" + std::to_string(bottleneck) + ", channels="
+                            + std::to_string(channels) + ") - Buffer size " + std::to_string(buffer_size);
    run_allocation_test_no_allocations(
      nullptr, // No setup needed
      [&]() {
@@ -577,8 +577,8 @@ void test_layer_grouped_process_realtime_safe()
  const int groups_input = 2; // groups_input > 1
  const int groups_1x1 = 2; // 1x1 is also grouped

-  auto layer = nam::wavenet::_Layer(condition_size, channels, bottleneck, kernel_size, dilation, activation, gated,
-                                    groups_input, groups_1x1);
+  auto layer = nam::wavenet::_Layer(
+    condition_size, channels, bottleneck, kernel_size, dilation, activation, gated, groups_input, groups_1x1);

  // Set weights for grouped convolution
  // With groups_input=2, channels=4: each group has 2 in_channels and 2 out_channels
@@ -757,13 +757,13 @@ void test_process_realtime_safe()
  const int bottleneck = channels;
  const int groups_1x1 = 1;
  layer_array_params.push_back(nam::wavenet::LayerArrayParams(input_size, condition_size, head_size, channels,
-                                                              bottleneck, kernel_size, std::move(dilations1), activation,
-                                                              gated, head_bias, groups, groups_1x1));
+                                                              bottleneck, kernel_size, std::move(dilations1),
+                                                              activation, gated, head_bias, groups, groups_1x1));
  // Second layer array (head_size of first must match channels of second)
  std::vector<int> dilations2{1};
  layer_array_params.push_back(nam::wavenet::LayerArrayParams(head_size, condition_size, head_size, channels,
-                                                              bottleneck, kernel_size, std::move(dilations2), activation,
-                                                              gated, head_bias, groups, groups_1x1));
+                                                              bottleneck, kernel_size, std::move(dilations2),
+                                                              activation, gated, head_bias, groups, groups_1x1));

  // Weights: Array 0: rechannel(1), layer(conv:1+1, input_mixin:1, 1x1:1+1), head_rechannel(1)
  // Array 1: same structure
@@ -775,7 +775,8 @@
  weights.insert(weights.end(), {1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f});
  weights.push_back(head_scale);

-  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(layer_array_params, head_scale, with_head, weights, 48000.0);
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head, weights, 48000.0);

  const int maxBufferSize = 256;
  wavenet->Reset(48000.0, maxBufferSize);
@@ -794,7 +795,9 @@
      nullptr, // No setup needed
      [&]() {
        // Call process() - this should not allocate or free
-        wavenet->process(input.data(), output.data(), buffer_size);
+        NAM_SAMPLE* inputPtrs[] = {input.data()};
+        NAM_SAMPLE* outputPtrs[] = {output.data()};
+        wavenet->process(inputPtrs, outputPtrs, buffer_size);
      },
      nullptr, // No teardown needed
      test_name.c_str());
@@ -806,4 +809,127 @@
    }
  }
}
+
+// Test that WaveNet::process() with 3 input channels and 2 output channels does not allocate or free memory
+void test_process_3in_2out_realtime_safe()
+{
+  // Setup: Create WaveNet with 3 input channels and 2 output channels
+  const int input_size = 3; // 3 input channels
+  const int condition_size = 3; // condition matches input channels
+  const int head_size = 2; // 2 output channels
+  const int channels = 4; // internal channels
+  const int bottleneck = 2; // bottleneck (will be used for head)
+  const int kernel_size = 1;
+  const std::string activation = "ReLU";
+  const bool gated = false;
+  const bool head_bias = false;
+  const float head_scale = 1.0f;
+  const bool with_head = false;
+  const int groups = 1;
+  const int groups_1x1 = 1;
+
+  std::vector<nam::wavenet::LayerArrayParams> layer_array_params;
+  std::vector<int> dilations1{1};
+  layer_array_params.push_back(nam::wavenet::LayerArrayParams(input_size, condition_size, head_size, channels,
+                                                              bottleneck, kernel_size, std::move(dilations1),
+                                                              activation, gated, head_bias, groups, groups_1x1));
+
+  // Calculate weights:
+  // _rechannel: Conv1x1(3, 4, bias=false) = 3*4 = 12 weights
+  // Layer:
+  //   _conv: Conv1D(4, 2, kernel_size=1, bias=true) = 1*(2*4) + 2 = 10 weights
+  //   _input_mixin: Conv1x1(3, 2, bias=false) = 3*2 = 6 weights
+  //   _1x1: Conv1x1(2, 4, bias=true) = 2*4 + 4 = 12 weights
+  // _head_rechannel: Conv1x1(2, 2, bias=false) = 2*2 = 4 weights
+  // Total: 12 + 10 + 6 + 12 + 4 = 44 weights
+  std::vector<float> weights;
+  // _rechannel weights (3->4): identity-like pattern
+  for (int out_ch = 0; out_ch < 4; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < 3; in_ch++)
+    {
+      weights.push_back((out_ch < 3 && out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // Layer: _conv weights (4->2, kernel_size=1, with bias)
+  // Weight layout: for each kernel position k, for each out_channel, for each in_channel
+  for (int out_ch = 0; out_ch < 2; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < 4; in_ch++)
+    {
+      weights.push_back((out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // _conv bias (2 values)
+  weights.insert(weights.end(), {0.0f, 0.0f});
+  // _input_mixin weights (3->2)
+  for (int out_ch = 0; out_ch < 2; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < 3; in_ch++)
+    {
+      weights.push_back((out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // _1x1 weights (2->4, with bias)
+  for (int out_ch = 0; out_ch < 4; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < 2; in_ch++)
+    {
+      weights.push_back((out_ch < 2 && out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // _1x1 bias (4 values)
+  weights.insert(weights.end(), {0.0f, 0.0f, 0.0f, 0.0f});
+  // _head_rechannel weights (2->2)
+  for (int out_ch = 0; out_ch < 2; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < 2; in_ch++)
+    {
+      weights.push_back((out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  weights.push_back(head_scale);
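+
+  // Sanity-check the count derived in the comment above: 44 layer-array
+  // weights plus the head_scale value
+  assert(weights.size() == 45);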
+
+  const int in_channels = 3;
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(in_channels, layer_array_params, head_scale, with_head, weights, 48000.0);
+
+  const int maxBufferSize = 256;
+  wavenet->Reset(48000.0, maxBufferSize);
+
+  // Test with several different buffer sizes
+  std::vector<int> buffer_sizes{1, 8, 16, 32, 64, 128, 256};
+
+  for (int buffer_size : buffer_sizes)
+  {
+    // Prepare input/output buffers for 3 input channels and 2 output channels (allocate before tracking)
+    std::vector<std::vector<NAM_SAMPLE>> input(3, std::vector<NAM_SAMPLE>(buffer_size, 0.5f));
+    std::vector<std::vector<NAM_SAMPLE>> output(2, std::vector<NAM_SAMPLE>(buffer_size, 0.0f));
+    std::vector<NAM_SAMPLE*> inputPtrs(3);
+    std::vector<NAM_SAMPLE*> outputPtrs(2);
+    for (int ch = 0; ch < 3; ch++)
+      inputPtrs[ch] = input[ch].data();
+    for (int ch = 0; ch < 2; ch++)
+      outputPtrs[ch] = output[ch].data();
+
+    std::string test_name = "WaveNet process (3in, 2out) - Buffer size " + std::to_string(buffer_size);
+    run_allocation_test_no_allocations(
+      nullptr, // No setup needed
+      [&]() {
+        // Call process() - this should not allocate or free
+        wavenet->process(inputPtrs.data(), outputPtrs.data(), buffer_size);
+      },
+      nullptr, // No teardown needed
+      test_name.c_str());
+
+    // Verify output is valid
+    for (int ch = 0; ch < 2; ch++)
+    {
+      for (int i = 0; i < buffer_size; i++)
+      {
+        assert(std::isfinite(output[ch][i]));
+      }
+    }
+  }
+}
} // namespace test_wavenet