diff --git a/NAM/convnet.cpp b/NAM/convnet.cpp
index 3b8b18f..8bbcded 100644
--- a/NAM/convnet.cpp
+++ b/NAM/convnet.cpp
@@ -129,39 +129,69 @@ long nam::convnet::ConvNetBlock::get_out_channels() const
   return this->conv.get_out_channels();
 }
 
-nam::convnet::_Head::_Head(const int channels, std::vector<float>::iterator& weights)
+nam::convnet::_Head::_Head(const int in_channels, const int out_channels, std::vector<float>::iterator& weights)
 {
-  this->_weight.resize(channels);
-  for (int i = 0; i < channels; i++)
-    this->_weight[i] = *(weights++);
-  this->_bias = *(weights++);
+  // Weights are stored row-major: first row (output 0), then row 1 (output 1), etc.
+  // For each output channel: [w0, w1, ..., w_{in_channels-1}]
+  // Then biases: [bias0, bias1, ..., bias_{out_channels-1}]
+  this->_weight.resize(out_channels, in_channels);
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < in_channels; in_ch++)
+    {
+      this->_weight(out_ch, in_ch) = *(weights++);
+    }
+  }
+
+  // Biases for each output channel
+  this->_bias.resize(out_channels);
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    this->_bias(out_ch) = *(weights++);
+  }
 }
 
-void nam::convnet::_Head::process_(const Eigen::MatrixXf& input, Eigen::VectorXf& output, const long i_start,
+void nam::convnet::_Head::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start,
                                    const long i_end) const
 {
   const long length = i_end - i_start;
-  output.resize(length);
-  for (long i = 0, j = i_start; i < length; i++, j++)
-    output(i) = this->_bias + input.col(j).dot(this->_weight);
+  const long out_channels = this->_weight.rows();
+
+  // Resize output to (out_channels x length)
+  output.resize(out_channels, length);
+
+  // Extract input slice: (in_channels x length)
+  Eigen::MatrixXf input_slice = input.middleCols(i_start, length);
+
+  // Compute output = weight * input_slice:
+  // (out_channels x in_channels) * (in_channels x length) = (out_channels x length)
+  output.noalias() = this->_weight * input_slice;
+
+  // Add bias to each column: output is (out_channels x length) and bias is
+  // (out_channels x 1), so colwise() += broadcasts the bias over frames
+  output.colwise() += this->_bias;
 }
 
-nam::convnet::ConvNet::ConvNet(const int channels, const std::vector<int>& dilations, const bool batchnorm,
-                               const std::string activation, std::vector<float>& weights,
-                               const double expected_sample_rate, const int groups)
-: Buffer(*std::max_element(dilations.begin(), dilations.end()), expected_sample_rate)
+nam::convnet::ConvNet::ConvNet(const int in_channels, const int out_channels, const int channels,
+                               const std::vector<int>& dilations, const bool batchnorm, const std::string activation,
+                               std::vector<float>& weights, const double expected_sample_rate, const int groups)
+: Buffer(in_channels, out_channels, *std::max_element(dilations.begin(), dilations.end()), expected_sample_rate)
 {
   this->_verify_weights(channels, dilations, batchnorm, weights.size());
   this->_blocks.resize(dilations.size());
   std::vector<float>::iterator it = weights.begin();
+  // First block takes in_channels input, subsequent blocks take channels input
   for (size_t i = 0; i < dilations.size(); i++)
-    this->_blocks[i].set_weights_(i == 0 ? 1 : channels, channels, dilations[i], batchnorm, activation, groups, it);
+    this->_blocks[i].set_weights_(
+      i == 0 ? in_channels : channels, channels, dilations[i], batchnorm, activation, groups, it);
 
   // Only need _block_vals for the head (one entry)
   // Conv1D layers manage their own buffers now
   this->_block_vals.resize(1);
   this->_block_vals[0].setZero();
-  std::fill(this->_input_buffer.begin(), this->_input_buffer.end(), 0.0f);
-  this->_head = _Head(channels, it);
+
+  // Create a single head that maps the last block's channels to all output channels
+  this->_head = _Head(channels, out_channels, it);
+
   if (it != weights.end())
     throw std::runtime_error("Didn't touch all the weights when initializing ConvNet");
@@ -171,18 +201,25 @@ nam::convnet::ConvNet::ConvNet(const int channels, const std::vector<int>& dilat
 }
 
-void nam::convnet::ConvNet::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames)
+void nam::convnet::ConvNet::process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames)
 {
   this->_update_buffers_(input, num_frames);
-  // Main computation!
-  const long i_start = this->_input_buffer_offset;
-  const long i_end = i_start + num_frames;
+  const int in_channels = NumInputChannels();
+  const int out_channels = NumOutputChannels();
+
+  // For multi-channel, the input channels are stacked into the first layer's
+  // input matrix, and the head maps the last block's channels to every output
+  // channel. This can be extended later for more sophisticated cross-channel
+  // processing.
 
-  // Convert input buffer to matrix for first layer
-  Eigen::MatrixXf input_matrix(1, num_frames);
-  for (int i = 0; i < num_frames; i++)
-    input_matrix(0, i) = this->_input_buffer[i_start + i];
+  // Convert input buffers to matrix for first layer (stack input channels)
+  Eigen::MatrixXf input_matrix(in_channels, num_frames);
+  const long i_start = this->_input_buffer_offset;
+  for (int ch = 0; ch < in_channels; ch++)
+  {
+    for (int i = 0; i < num_frames; i++)
+      input_matrix(ch, i) = this->_input_buffers[ch][i_start + i];
+  }
 
   // Process through ConvNetBlock layers
   // Each block now uses Conv1D's internal buffers via Process() and GetOutput()
@@ -206,23 +243,33 @@ void nam::convnet::ConvNet::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const
     this->_blocks[i].Process(block_input, num_frames);
   }
 
-  // Process head with output from last Conv1D
-  // Head still needs the old interface, so we need to provide it via a matrix
-  // We still need _block_vals[0] for the head interface
+  // Process head for all output channels at once
+  // We need _block_vals[0] for the head interface
+  const long buffer_size = (long)this->_input_buffers[0].size();
   if (this->_block_vals[0].rows() != this->_blocks.back().get_out_channels()
-      || this->_block_vals[0].cols() != (long)this->_input_buffer.size())
+      || this->_block_vals[0].cols() != buffer_size)
   {
-    this->_block_vals[0].resize(this->_blocks.back().get_out_channels(), this->_input_buffer.size());
+    this->_block_vals[0].resize(this->_blocks.back().get_out_channels(), buffer_size);
   }
 
+  // Copy last block output to _block_vals for the head
   auto last_output = this->_blocks.back().GetOutput(num_frames);
-  this->_block_vals[0].middleCols(i_start, num_frames) = last_output;
-
-  this->_head.process_(this->_block_vals[0], this->_head_output, i_start, i_end);
-
-  // Copy to required output array
-  for (int s = 0; s < num_frames; s++)
-    output[s] = this->_head_output(s);
+  const long buffer_offset = this->_input_buffer_offset;
+  const long buffer_i_end = buffer_offset + num_frames;
+  // last_output is (channels x num_frames), _block_vals[0] is (channels x buffer_size)
+  // Copy to the correct location in _block_vals
+  this->_block_vals[0].block(0, buffer_offset, last_output.rows(), num_frames) = last_output;
+
+  // Process head - outputs all channels at once
+  // Head will resize _head_output internally
+  this->_head.process_(this->_block_vals[0], this->_head_output, buffer_offset, buffer_i_end);
+
+  // Copy to output arrays for each channel
+  for (int ch = 0; ch < out_channels; ch++)
+  {
+    for (int s = 0; s < num_frames; s++)
+      output[ch][s] = this->_head_output(ch, s);
+  }
 
   // Prepare for next call:
   nam::Buffer::_advance_input_buffer_(num_frames);
@@ -245,11 +292,12 @@ void nam::convnet::ConvNet::SetMaxBufferSize(const int maxBufferSize)
   }
 }
 
-void nam::convnet::ConvNet::_update_buffers_(NAM_SAMPLE* input, const int num_frames)
+void nam::convnet::ConvNet::_update_buffers_(NAM_SAMPLE** input, const int num_frames)
 {
   this->Buffer::_update_buffers_(input, num_frames);
 
-  const long buffer_size = (long)this->_input_buffer.size();
+  // All channels use the same buffer size
+  const long buffer_size = (long)this->_input_buffers[0].size();
 
   // Only need _block_vals[0] for the head
   // Conv1D layers manage their own buffers now
@@ -281,8 +329,11 @@ std::unique_ptr<DSP> nam::convnet::Factory(const nlohmann::json& config, st
 {
   const bool batchnorm = config["batchnorm"];
   const std::string activation = config["activation"];
   const int groups = config.value("groups", 1); // defaults to 1
+  // Default to 1 channel in/out for backward compatibility
+  const int in_channels = config.value("in_channels", 1);
+  const int out_channels = config.value("out_channels", 1);
   return std::make_unique<ConvNet>(
-    channels, dilations, batchnorm, activation, weights, expectedSampleRate, groups);
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expectedSampleRate, groups);
 }
 
 namespace
diff --git a/NAM/convnet.h b/NAM/convnet.h
index ccc1edb..d1e846c 100644
--- a/NAM/convnet.h
+++ b/NAM/convnet.h
@@ -66,32 +66,33 @@ class _Head
 {
 public:
   _Head() {};
-  _Head(const int channels, std::vector<float>::iterator& weights);
-  void process_(const Eigen::MatrixXf& input, Eigen::VectorXf& output, const long i_start, const long i_end) const;
+  _Head(const int in_channels, const int out_channels, std::vector<float>::iterator& weights);
+  void process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long i_end) const;
 
 private:
-  Eigen::VectorXf _weight;
-  float _bias = 0.0f;
+  Eigen::MatrixXf _weight; // (out_channels, in_channels)
+  Eigen::VectorXf _bias;   // (out_channels,)
 };
 
 class ConvNet : public Buffer
 {
 public:
-  ConvNet(const int channels, const std::vector<int>& dilations, const bool batchnorm, const std::string activation,
-          std::vector<float>& weights, const double expected_sample_rate = -1.0, const int groups = 1);
+  ConvNet(const int in_channels, const int out_channels, const int channels, const std::vector<int>& dilations,
+          const bool batchnorm, const std::string activation, std::vector<float>& weights,
+          const double expected_sample_rate = -1.0, const int groups = 1);
   ~ConvNet() = default;
-  void process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames) override;
+  void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames) override;
   void SetMaxBufferSize(const int maxBufferSize) override;
 
 protected:
   std::vector<ConvNetBlock> _blocks;
   std::vector<Eigen::MatrixXf> _block_vals;
-  Eigen::VectorXf _head_output;
+  Eigen::MatrixXf _head_output; // (out_channels, num_frames)
   _Head _head;
   void _verify_weights(const int channels, const std::vector<int>& dilations, const bool batchnorm,
                        const size_t actual_weights);
-  void _update_buffers_(NAM_SAMPLE* input, const int num_frames) override;
+  void _update_buffers_(NAM_SAMPLE** input, const int num_frames) override;
   void _rewind_buffers_() override;
 
   int mPrewarmSamples = 0; // Pre-compute during initialization
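// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): how a flat weight vector maps
// onto the matrix-valued _Head above. The sizes (in_channels = 3,
// out_channels = 2, 4 frames) and the literal weight values are made up for
// the example; the loops mirror _Head's constructor, and the GEMM-plus-bias
// mirrors process_().
#include <Eigen/Dense>
#include <vector>

int main()
{
  const int in_channels = 3, out_channels = 2;
  std::vector<float> flat = {/* row for output 0 */ 1.0f, 2.0f, 3.0f,
                             /* row for output 1 */ 4.0f, 5.0f, 6.0f,
                             /* biases */ 0.1f, 0.2f};
  auto it = flat.begin();
  Eigen::MatrixXf W(out_channels, in_channels);
  for (int o = 0; o < out_channels; o++)
    for (int i = 0; i < in_channels; i++)
      W(o, i) = *(it++); // row-major, matching the constructor
  Eigen::VectorXf b(out_channels);
  for (int o = 0; o < out_channels; o++)
    b(o) = *(it++);

  // process_() is then a single matrix product plus a per-column bias:
  Eigen::MatrixXf input = Eigen::MatrixXf::Ones(in_channels, 4); // 4 frames
  Eigen::MatrixXf output = W * input;                            // (2 x 4)
  output.colwise() += b;                                         // broadcast bias over frames
  return 0;
}
// ---------------------------------------------------------------------------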
diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index dc46891..023c42a 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -15,9 +15,15 @@
 constexpr const long _INPUT_BUFFER_SAFETY_FACTOR = 32;
 
-nam::DSP::DSP(const double expected_sample_rate)
+nam::DSP::DSP(const int in_channels, const int out_channels, const double expected_sample_rate)
 : mExpectedSampleRate(expected_sample_rate)
+, mInChannels(in_channels)
+, mOutChannels(out_channels)
 {
+  if (in_channels <= 0 || out_channels <= 0)
+  {
+    throw std::runtime_error("Channel counts must be positive");
+  }
 }
 
 void nam::DSP::prewarm()
@@ -31,29 +37,47 @@ void nam::DSP::prewarm()
     return;
 
   const size_t bufferSize = std::max(mMaxBufferSize, 1);
-  std::vector<NAM_SAMPLE> inputBuffer, outputBuffer;
-  inputBuffer.resize(bufferSize);
-  outputBuffer.resize(bufferSize);
-  for (auto it = inputBuffer.begin(); it != inputBuffer.end(); ++it)
+  // Allocate buffers for all channels
+  std::vector<std::vector<NAM_SAMPLE>> inputBuffers(mInChannels);
+  std::vector<std::vector<NAM_SAMPLE>> outputBuffers(mOutChannels);
+  std::vector<NAM_SAMPLE*> inputPtrs(mInChannels);
+  std::vector<NAM_SAMPLE*> outputPtrs(mOutChannels);
+
+  for (int ch = 0; ch < mInChannels; ch++)
   {
-    (*it) = (NAM_SAMPLE)0.0;
+    inputBuffers[ch].resize(bufferSize, (NAM_SAMPLE)0.0);
+    inputPtrs[ch] = inputBuffers[ch].data();
+  }
+  for (int ch = 0; ch < mOutChannels; ch++)
+  {
+    outputBuffers[ch].resize(bufferSize, (NAM_SAMPLE)0.0);
+    outputPtrs[ch] = outputBuffers[ch].data();
   }
-  NAM_SAMPLE* inputPtr = inputBuffer.data();
-  NAM_SAMPLE* outputPtr = outputBuffer.data();
 
   int samplesProcessed = 0;
   while (samplesProcessed < prewarmSamples)
   {
-    this->process(inputPtr, outputPtr, bufferSize);
+    this->process(inputPtrs.data(), outputPtrs.data(), bufferSize);
     samplesProcessed += bufferSize;
   }
 }
 
-void nam::DSP::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames)
+void nam::DSP::process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames)
 {
-  // Default implementation is the null operation
-  for (int i = 0; i < num_frames; i++)
-    output[i] = input[i];
+  // Default implementation is the null operation: copy input to output
+  // For now, assume 1:1 channel mapping (first min(in_channels, out_channels) channels)
+  const int channelsToProcess = std::min(mInChannels, mOutChannels);
+  for (int ch = 0; ch < channelsToProcess; ch++)
+  {
+    for (int i = 0; i < num_frames; i++)
+      output[ch][i] = input[ch][i];
+  }
+  // Zero out any extra output channels
+  for (int ch = channelsToProcess; ch < mOutChannels; ch++)
+  {
+    for (int i = 0; i < num_frames; i++)
+      output[ch][i] = (NAM_SAMPLE)0.0;
+  }
 }
 
 double nam::DSP::GetLoudness() const
@@ -87,10 +111,43 @@ void nam::DSP::SetMaxBufferSize(const int maxBufferSize)
 {
   mMaxBufferSize = maxBufferSize;
 }
 
+double nam::DSP::GetInputLevel()
+{
+  return mInputLevel.level;
+}
+
+double nam::DSP::GetOutputLevel()
+{
+  return mOutputLevel.level;
+}
+
+bool nam::DSP::HasInputLevel()
+{
+  return mInputLevel.haveLevel;
+}
+
+bool nam::DSP::HasOutputLevel()
+{
+  return mOutputLevel.haveLevel;
+}
+
+void nam::DSP::SetInputLevel(const double inputLevel)
+{
+  mInputLevel.haveLevel = true;
+  mInputLevel.level = inputLevel;
+}
+
+void nam::DSP::SetOutputLevel(const double outputLevel)
+{
+  mOutputLevel.haveLevel = true;
+  mOutputLevel.level = outputLevel;
+}
+
 // Buffer =====================================================================
-nam::Buffer::Buffer(const int receptive_field, const double expected_sample_rate)
-: nam::DSP(expected_sample_rate)
+nam::Buffer::Buffer(const int in_channels, const int out_channels, const int receptive_field,
+                    const double expected_sample_rate)
+: nam::DSP(in_channels, out_channels, expected_sample_rate)
 {
   this->_set_receptive_field(receptive_field);
 }
@@ -103,45 +160,77 @@ void nam::Buffer::_set_receptive_field(const int new_receptive_field)
 
 void nam::Buffer::_set_receptive_field(const int new_receptive_field, const int input_buffer_size)
 {
   this->_receptive_field = new_receptive_field;
-  this->_input_buffer.resize(input_buffer_size);
-  std::fill(this->_input_buffer.begin(), this->_input_buffer.end(), 0.0f);
+  const int in_channels = NumInputChannels();
+  const int out_channels = NumOutputChannels();
+
+  // Resize buffers for all input channels
+  _input_buffers.resize(in_channels);
+  for (int ch = 0; ch < in_channels; ch++)
+  {
+    _input_buffers[ch].resize(input_buffer_size);
+    std::fill(_input_buffers[ch].begin(), _input_buffers[ch].end(), 0.0f);
+  }
+
+  // Resize output buffers (though they'll be resized per call in _update_buffers_)
+  _output_buffers.resize(out_channels);
+
   this->_reset_input_buffer();
 }
 
-void nam::Buffer::_update_buffers_(NAM_SAMPLE* input, const int num_frames)
+void nam::Buffer::_update_buffers_(NAM_SAMPLE** input, const int num_frames)
 {
-  // Make sure that the buffer is big enough for the receptive field and the
-  // frames needed!
+  const int in_channels = NumInputChannels();
+  const int out_channels = NumOutputChannels();
+
+  // Make sure that the buffers are big enough for the receptive field and the
+  // frames needed. All channels use the same buffer size.
+  const long minimum_input_buffer_size = (long)this->_receptive_field + _INPUT_BUFFER_SAFETY_FACTOR * num_frames;
+
+  for (int ch = 0; ch < in_channels; ch++)
   {
-    const long minimum_input_buffer_size = (long)this->_receptive_field + _INPUT_BUFFER_SAFETY_FACTOR * num_frames;
-    if ((long)this->_input_buffer.size() < minimum_input_buffer_size)
+    if ((long)this->_input_buffers[ch].size() < minimum_input_buffer_size)
     {
       long new_buffer_size = 2;
       while (new_buffer_size < minimum_input_buffer_size)
         new_buffer_size *= 2;
-      this->_input_buffer.resize(new_buffer_size);
-      std::fill(this->_input_buffer.begin(), this->_input_buffer.end(), 0.0f);
+      this->_input_buffers[ch].resize(new_buffer_size);
+      std::fill(this->_input_buffers[ch].begin(), this->_input_buffers[ch].end(), 0.0f);
     }
   }
 
   // If we'd run off the end of the input buffer, then we need to move the data
-  // back to the start of the buffer and start again.
-  if (this->_input_buffer_offset + num_frames > (long)this->_input_buffer.size())
+  // back to the start of the buffer and start again. All channels move together.
+  const long buffer_size = (long)this->_input_buffers[0].size();
+  if (this->_input_buffer_offset + num_frames > buffer_size)
     this->_rewind_buffers_();
-  // Put the new samples into the input buffer
-  for (long i = this->_input_buffer_offset, j = 0; j < num_frames; i++, j++)
-    this->_input_buffer[i] = input[j];
-  // And resize the output buffer:
-  this->_output_buffer.resize(num_frames);
-  std::fill(this->_output_buffer.begin(), this->_output_buffer.end(), 0.0f);
+
+  // Put the new samples into the input buffer for each channel
+  for (int ch = 0; ch < in_channels; ch++)
+  {
+    for (long i = this->_input_buffer_offset, j = 0; j < num_frames; i++, j++)
+      this->_input_buffers[ch][i] = (float)input[ch][j];
+  }
+
+  // Resize output buffers for all output channels
+  for (int ch = 0; ch < out_channels; ch++)
+  {
+    this->_output_buffers[ch].resize(num_frames);
+    std::fill(this->_output_buffers[ch].begin(), this->_output_buffers[ch].end(), 0.0f);
+  }
 }
 
 void nam::Buffer::_rewind_buffers_()
 {
-  // Copy the input buffer back
-  // RF-1 samples because we've got at least one new one inbound.
-  for (long i = 0, j = this->_input_buffer_offset - this->_receptive_field; i < this->_receptive_field; i++, j++)
-    this->_input_buffer[i] = this->_input_buffer[j];
+  const int in_channels = NumInputChannels();
+
+  // Rewind buffers for all input channels (they all move together)
+  for (int ch = 0; ch < in_channels; ch++)
+  {
+    // Copy the input buffer back
+    // RF-1 samples because we've got at least one new one inbound.
+    for (long i = 0, j = this->_input_buffer_offset - this->_receptive_field; i < this->_receptive_field; i++, j++)
+      this->_input_buffers[ch][i] = this->_input_buffers[ch][j];
+  }
   // And reset the offset.
   // Even though we could be stingy about that one sample that we won't be using
   // (because a new set is incoming) it's probably not worth the
@@ -162,9 +251,9 @@ void nam::Buffer::_advance_input_buffer_(const int num_frames)
 
 // Linear =====================================================================
 
-nam::Linear::Linear(const int receptive_field, const bool _bias, const std::vector<float>& weights,
-                    const double expected_sample_rate)
-: nam::Buffer(receptive_field, expected_sample_rate)
+nam::Linear::Linear(const int in_channels, const int out_channels, const int receptive_field, const bool _bias,
+                    const std::vector<float>& weights, const double expected_sample_rate)
+: nam::Buffer(in_channels, out_channels, receptive_field, expected_sample_rate)
 {
   if ((int)weights.size() != (receptive_field + (_bias ? 1 : 0)))
     throw std::runtime_error(
@@ -178,16 +267,33 @@ nam::Linear::Linear(const int receptive_field, const bool _bias, const std::vect
   this->_bias = _bias ? weights[receptive_field] : (float)0.0;
 }
 
-void nam::Linear::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames)
+void nam::Linear::process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames)
 {
   this->nam::Buffer::_update_buffers_(input, num_frames);
 
+  const int in_channels = NumInputChannels();
+  const int out_channels = NumOutputChannels();
+
+  // For now, Linear processes each input channel independently to the corresponding output channel
+  // This is a simple implementation - can be extended later for cross-channel mixing
+  const int channelsToProcess = std::min(in_channels, out_channels);
+
   // Main computation!
-  for (int i = 0; i < num_frames; i++)
+  for (int ch = 0; ch < channelsToProcess; ch++)
+  {
+    for (int i = 0; i < num_frames; i++)
+    {
+      const long offset = this->_input_buffer_offset - this->_weight.size() + i + 1;
+      auto input_vec = Eigen::Map<const Eigen::VectorXf>(&this->_input_buffers[ch][offset], this->_receptive_field);
+      output[ch][i] = this->_bias + this->_weight.dot(input_vec);
+    }
+  }
+
+  // Zero out any extra output channels
+  for (int ch = channelsToProcess; ch < out_channels; ch++)
   {
-    const long offset = this->_input_buffer_offset - this->_weight.size() + i + 1;
-    auto input = Eigen::Map<const Eigen::VectorXf>(&this->_input_buffer[offset], this->_receptive_field);
-    output[i] = this->_bias + this->_weight.dot(input);
+    for (int i = 0; i < num_frames; i++)
+      output[ch][i] = (NAM_SAMPLE)0.0;
   }
 
   // Prepare for next call:
@@ -200,7 +306,10 @@ std::unique_ptr<DSP> nam::linear::Factory(const nlohmann::json& config, std
 {
   const int receptive_field = config["receptive_field"];
   const bool bias = config["bias"];
-  return std::make_unique<Linear>(receptive_field, bias, weights, expectedSampleRate);
+  // Default to 1 channel in/out for backward compatibility
+  const int in_channels = config.value("in_channels", 1);
+  const int out_channels = config.value("out_channels", 1);
+  return std::make_unique<Linear>(in_channels, out_channels, receptive_field, bias, weights, expectedSampleRate);
 }
 
 // NN modules =================================================================
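// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the ring-buffer rewind
// arithmetic used by Buffer::_rewind_buffers_() above, shown for one channel
// with made-up numbers (receptive_field = 4, write offset = 10, buffer of 16).
#include <cassert>
#include <vector>

int main()
{
  const long receptive_field = 4;
  long offset = 10;
  std::vector<float> buf(16);
  for (size_t i = 0; i < buf.size(); i++)
    buf[i] = (float)i; // pretend sample history
  // Copy the last receptive_field samples (indices 6..9) to the front:
  for (long i = 0, j = offset - receptive_field; i < receptive_field; i++, j++)
    buf[i] = buf[j];
  offset = receptive_field; // the next frames land right after the retained history
  assert(buf[0] == 6.0f && buf[3] == 9.0f && offset == 4);
  return 0;
}
// ---------------------------------------------------------------------------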
diff --git a/NAM/dsp.h b/NAM/dsp.h
index f359a68..5787212 100644
--- a/NAM/dsp.h
+++ b/NAM/dsp.h
@@ -40,7 +40,7 @@ class DSP
   // Older models won't know, but newer ones will come with a loudness from the training based on their response to a
   // standardized input.
   // We may choose to have the models figure out for themselves how loud they are in here in the future.
-  DSP(const double expected_sample_rate);
+  DSP(const int in_channels, const int out_channels, const double expected_sample_rate);
   virtual ~DSP() = default;
   // prewarm() does any required intial work required to "settle" model initial conditions
   // it can be somewhat expensive, so should not be called during realtime audio processing
@@ -54,25 +54,36 @@ class DSP
   // 1. The core DSP algorithm is run (This is what should probably be
   //    overridden in subclasses).
   // 2. The output level is applied and the result stored to `output`.
-  virtual void process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames);
+  // `input` and `output` are double pointers where the first pointer indexes channels
+  // and the second indexes frames: input[channel][frame]
+  virtual void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames);
   // Expected sample rate, in Hz.
   // TODO throw if it doesn't know.
   double GetExpectedSampleRate() const { return mExpectedSampleRate; };
+  // Number of input channels
+  int NumInputChannels() const { return mInChannels; };
+  // Number of output channels
+  int NumOutputChannels() const { return mOutChannels; };
   // Input Level, in dBu, corresponding to 0 dBFS for a sine wave
   // You should call HasInputLevel() first to be safe.
-  double GetInputLevel() { return mInputLevel.level; };
+  // Note: input level is assumed global over all inputs.
+  double GetInputLevel();
   // Get how loud this model is, in dB.
   // Throws a std::runtime_error if the model doesn't know how loud it is.
+  // Note: loudness is assumed global over all outputs.
   double GetLoudness() const;
   // Output Level, in dBu, corresponding to 0 dBFS for a sine wave
   // You should call HasOutputLevel() first to be safe.
-  double GetOutputLevel() { return mOutputLevel.level; };
-  // Does this model know its output level?
-  bool HasInputLevel() { return mInputLevel.haveLevel; };
+  // Note: output level is assumed global over all outputs.
+  double GetOutputLevel();
+  // Does this model know its input level?
+  // Note: input level is assumed global over all inputs.
+  bool HasInputLevel();
   // Get whether the model knows how loud it is.
   bool HasLoudness() const { return mHasLoudness; };
   // Does this model know its output level?
-  bool HasOutputLevel() { return mOutputLevel.haveLevel; };
+  // Note: output level is assumed global over all outputs.
+  bool HasOutputLevel();
   // General function for resetting the DSP unit.
   // This doesn't call prewarm(). If you want to do that, then you might want to use ResetAndPrewarm().
   // See https://github.com/sdatkinson/NeuralAmpModelerCore/issues/96 for the reasoning.
@@ -83,20 +94,13 @@ class DSP
     Reset(sampleRate, maxBufferSize);
     prewarm();
   }
-  void SetInputLevel(const double inputLevel)
-  {
-    mInputLevel.haveLevel = true;
-    mInputLevel.level = inputLevel;
-  };
+  void SetInputLevel(const double inputLevel);
   // Set the loudness, in dB.
   // This is usually defined to be the loudness to a standardized input. The trainer has its own, but you can always
   // use this to define it a different way if you like yours better.
+  // Note: loudness is assumed global over all outputs.
   void SetLoudness(const double loudness);
-  void SetOutputLevel(const double outputLevel)
-  {
-    mOutputLevel.haveLevel = true;
-    mOutputLevel.level = outputLevel;
-  };
+  void SetOutputLevel(const double outputLevel);
 
 protected:
   bool mHasLoudness = false;
@@ -117,11 +121,14 @@ class DSP
   int GetMaxBufferSize() const { return mMaxBufferSize; };
 
 private:
+  const int mInChannels;
+  const int mOutChannels;
   struct Level
   {
     bool haveLevel = false;
     float level = 0.0;
   };
+  // Note: input/output levels are assumed global over all inputs/outputs
   Level mInputLevel;
   Level mOutputLevel;
 };
@@ -132,23 +139,23 @@ class DSP
 class Buffer : public DSP
 {
 public:
-  Buffer(const int receptive_field, const double expected_sample_rate = -1.0);
+  Buffer(const int in_channels, const int out_channels, const int receptive_field,
+         const double expected_sample_rate = -1.0);
 
 protected:
-  // Input buffer
-  const int _input_buffer_channels = 1; // Mono
   int _receptive_field;
-  // First location where we add new samples from the input
+  // First location where we add new samples from the input (same for all channels)
   long _input_buffer_offset;
-  std::vector<float> _input_buffer;
-  std::vector<float> _output_buffer;
+  // Per-channel input buffers
+  std::vector<std::vector<float>> _input_buffers;
+  std::vector<std::vector<float>> _output_buffers;
   void _advance_input_buffer_(const int num_frames);
   void _set_receptive_field(const int new_receptive_field, const int input_buffer_size);
   void _set_receptive_field(const int new_receptive_field);
   void _reset_input_buffer();
   // Use this->_input_post_gain
-  virtual void _update_buffers_(NAM_SAMPLE* input, int num_frames);
+  virtual void _update_buffers_(NAM_SAMPLE** input, int num_frames);
   virtual void _rewind_buffers_();
 };
 
@@ -156,9 +163,9 @@ class Buffer : public DSP
 class Linear : public Buffer
 {
 public:
-  Linear(const int receptive_field, const bool _bias, const std::vector<float>& weights,
-         const double expected_sample_rate = -1.0);
-  void process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames) override;
+  Linear(const int in_channels, const int out_channels, const int receptive_field, const bool _bias,
+         const std::vector<float>& weights, const double expected_sample_rate = -1.0);
+  void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames) override;
 
 protected:
   Eigen::VectorXf _weight;
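// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): adapting interleaved audio to
// the input[channel][frame] convention documented in dsp.h above. The helper
// name run_interleaved() is hypothetical, and it assumes the model's input and
// output channel counts both equal `channels`; a realtime caller would
// preallocate the scratch buffers instead of allocating per call.
#include <vector>

#include "NAM/dsp.h"

void run_interleaved(nam::DSP& model, const NAM_SAMPLE* in, NAM_SAMPLE* out, const int channels, const int frames)
{
  std::vector<std::vector<NAM_SAMPLE>> inBufs(channels), outBufs(channels);
  std::vector<NAM_SAMPLE*> inPtrs(channels), outPtrs(channels);
  for (int ch = 0; ch < channels; ch++)
  {
    inBufs[ch].resize(frames);
    outBufs[ch].resize(frames);
    for (int i = 0; i < frames; i++)
      inBufs[ch][i] = in[i * channels + ch]; // deinterleave [L R L R ...]
    inPtrs[ch] = inBufs[ch].data();
    outPtrs[ch] = outBufs[ch].data();
  }
  model.process(inPtrs.data(), outPtrs.data(), frames);
  for (int ch = 0; ch < channels; ch++)
    for (int i = 0; i < frames; i++)
      out[i * channels + ch] = outBufs[ch][i]; // reinterleave
}
// ---------------------------------------------------------------------------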
diff --git a/NAM/lstm.cpp b/NAM/lstm.cpp
index 6fa33a2..d162d55 100644
--- a/NAM/lstm.cpp
+++ b/NAM/lstm.cpp
@@ -65,25 +65,61 @@ void nam::lstm::LSTMCell::process_(const Eigen::VectorXf& x)
   }
 }
 
-nam::lstm::LSTM::LSTM(const int num_layers, const int input_size, const int hidden_size, std::vector<float>& weights,
-                      const double expected_sample_rate)
-: DSP(expected_sample_rate)
+nam::lstm::LSTM::LSTM(const int in_channels, const int out_channels, const int num_layers, const int input_size,
+                      const int hidden_size, std::vector<float>& weights, const double expected_sample_rate)
+: DSP(in_channels, out_channels, expected_sample_rate)
 {
-  this->_input.resize(1);
+  // Allocate input and output vectors
+  this->_input.resize(input_size);
+  this->_output.resize(out_channels);
+
   std::vector<float>::iterator it = weights.begin();
   for (int i = 0; i < num_layers; i++)
     this->_layers.push_back(LSTMCell(i == 0 ? input_size : hidden_size, hidden_size, it));
-  this->_head_weight.resize(hidden_size);
-  for (int i = 0; i < hidden_size; i++)
-    this->_head_weight[i] = *(it++);
-  this->_head_bias = *(it++);
+
+  // Load head weight as matrix (out_channels x hidden_size)
+  // Weights are stored row-major: first row (output 0), then row 1 (output 1), etc.
+  this->_head_weight.resize(out_channels, hidden_size);
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    for (int h = 0; h < hidden_size; h++)
+    {
+      this->_head_weight(out_ch, h) = *(it++);
+    }
+  }
+
+  // Load head bias as vector (out_channels)
+  this->_head_bias.resize(out_channels);
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    this->_head_bias(out_ch) = *(it++);
+  }
+
   assert(it == weights.end());
 }
 
-void nam::lstm::LSTM::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames)
+void nam::lstm::LSTM::process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames)
 {
+  const int in_channels = NumInputChannels();
+  const int out_channels = NumOutputChannels();
+
   for (int i = 0; i < num_frames; i++)
-    output[i] = this->_process_sample(input[i]);
+  {
+    // Copy multi-channel input to _input vector
+    for (int ch = 0; ch < in_channels; ch++)
+    {
+      this->_input(ch) = input[ch][i];
+    }
+
+    // Process sample (stores result in _output)
+    this->_process_sample();
+
+    // Copy multi-channel output from _output to output arrays
+    for (int ch = 0; ch < out_channels; ch++)
+    {
+      output[ch][i] = this->_output(ch);
+    }
+  }
 }
 
 int nam::lstm::LSTM::PrewarmSamples()
@@ -94,15 +130,37 @@ int nam::lstm::LSTM::PrewarmSamples()
   return result <= 0 ? 1 : result;
 }
 
-float nam::lstm::LSTM::_process_sample(const float x)
+void nam::lstm::LSTM::_process_sample()
 {
+  const int in_channels = NumInputChannels();
+  const int out_channels = NumOutputChannels();
+
   if (this->_layers.size() == 0)
-    return x;
-  this->_input(0) = x;
+  {
+    // No layers - pass input through to output (using first in_channels of output)
+    const int channels_to_copy = std::min(in_channels, out_channels);
+    for (int ch = 0; ch < channels_to_copy; ch++)
+      this->_output(ch) = this->_input(ch);
+    // Zero-fill remaining output channels if in_channels < out_channels
+    for (int ch = channels_to_copy; ch < out_channels; ch++)
+      this->_output(ch) = 0.0f;
+    return;
+  }
+
   this->_layers[0].process_(this->_input);
   for (size_t i = 1; i < this->_layers.size(); i++)
     this->_layers[i].process_(this->_layers[i - 1].get_hidden_state());
-  return this->_head_weight.dot(this->_layers[this->_layers.size() - 1].get_hidden_state()) + this->_head_bias;
+
+  // Compute output using head weight matrix and bias vector
+  // _output = _head_weight * hidden_state + _head_bias
+  const Eigen::VectorXf& hidden_state = this->_layers[this->_layers.size() - 1].get_hidden_state();
+
+  // Compute matrix-vector product: (out_channels x hidden_size) * (hidden_size) = (out_channels)
+  // Store directly in _output (which is already sized correctly in constructor)
+  this->_output.noalias() = this->_head_weight * hidden_state;
+
+  // Add bias: (out_channels) += (out_channels)
+  this->_output.noalias() += this->_head_bias;
 }
 
 // Factory to instantiate from nlohmann json
@@ -112,7 +170,11 @@ std::unique_ptr<DSP> nam::lstm::Factory(const nlohmann::json& config, std::
 {
   const int num_layers = config["num_layers"];
   const int input_size = config["input_size"];
   const int hidden_size = config["hidden_size"];
-  return std::make_unique<LSTM>(num_layers, input_size, hidden_size, weights, expectedSampleRate);
+  // Default to 1 channel in/out for backward compatibility
+  const int in_channels = config.value("in_channels", 1);
+  const int out_channels = config.value("out_channels", 1);
+  return std::make_unique<LSTM>(
+    in_channels, out_channels, num_layers, input_size, hidden_size, weights, expectedSampleRate);
 }
 
 // Register the factory
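// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the per-sample head computation
// _process_sample() now performs, out = W_head * h + b_head, with made-up
// sizes (hidden_size = 4, out_channels = 2) and constant values.
#include <Eigen/Dense>

int main()
{
  const int hidden_size = 4, out_channels = 2;
  Eigen::MatrixXf W = Eigen::MatrixXf::Constant(out_channels, hidden_size, 0.1f);
  Eigen::VectorXf b = Eigen::VectorXf::Zero(out_channels);
  Eigen::VectorXf h = Eigen::VectorXf::Ones(hidden_size); // last layer's hidden state
  Eigen::VectorXf out(out_channels);
  out.noalias() = W * h; // (2 x 4) * (4,) = (2,)
  out += b;              // one bias per output channel
  return 0;
}
// ---------------------------------------------------------------------------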
diff --git a/NAM/lstm.h b/NAM/lstm.h
index 17d0ada..5c03853 100644
--- a/NAM/lstm.h
+++ b/NAM/lstm.h
@@ -51,24 +51,26 @@ class LSTMCell
 class LSTM : public DSP
 {
 public:
-  LSTM(const int num_layers, const int input_size, const int hidden_size, std::vector<float>& weights,
-       const double expected_sample_rate = -1.0);
+  LSTM(const int in_channels, const int out_channels, const int num_layers, const int input_size,
+       const int hidden_size, std::vector<float>& weights, const double expected_sample_rate = -1.0);
   ~LSTM() = default;
+  void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames) override;
 
 protected:
   // Hacky, but a half-second seems to work for most models.
   int PrewarmSamples() override;
 
-  Eigen::VectorXf _head_weight;
-  float _head_bias;
-  void process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames) override;
+  Eigen::MatrixXf _head_weight; // (out_channels x hidden_size)
+  Eigen::VectorXf _head_bias;   // (out_channels)
   std::vector<LSTMCell> _layers;
 
-  float _process_sample(const float x);
+  void _process_sample();
 
   // Input to the LSTM.
-  // Since this is assumed to not be a parametric model, its shape should be (1,)
+  // Since this is assumed to not be a parametric model, its shape should be (in_channels,)
   Eigen::VectorXf _input;
+  // Output from _process_sample - multi-channel output vector (size out_channels)
+  Eigen::VectorXf _output;
 };
 
 // Factory to instantiate from nlohmann json
diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp
index 285ea69..6686f93 100644
--- a/NAM/wavenet.cpp
+++ b/NAM/wavenet.cpp
@@ -192,12 +192,18 @@ long nam::wavenet::_LayerArray::_get_channels() const
 
 // WaveNet ====================================================================
 
-nam::wavenet::WaveNet::WaveNet(const std::vector<LayerArrayParams>& layer_array_params,
+nam::wavenet::WaveNet::WaveNet(const int in_channels,
+                               const std::vector<LayerArrayParams>& layer_array_params,
                                const float head_scale, const bool with_head, std::vector<float> weights,
                                const double expected_sample_rate)
-: DSP(expected_sample_rate)
+: DSP(in_channels,
+      layer_array_params.empty() ? throw std::runtime_error("WaveNet requires at least one layer array")
+                                 : layer_array_params.back().head_size,
+      expected_sample_rate)
 , _head_scale(head_scale)
 {
+  if (layer_array_params.empty())
+    throw std::runtime_error("WaveNet requires at least one layer array");
   if (with_head)
     throw std::runtime_error("Head not implemented!");
   for (size_t i = 0; i < layer_array_params.size(); i++)
@@ -251,17 +257,24 @@ void nam::wavenet::WaveNet::SetMaxBufferSize(const int maxBufferSize)
     this->_layer_arrays[i].SetMaxBufferSize(maxBufferSize);
 }
 
-void nam::wavenet::WaveNet::_set_condition_array(NAM_SAMPLE* input, const int num_frames)
+void nam::wavenet::WaveNet::_set_condition_array(NAM_SAMPLE** input, const int num_frames)
 {
-  for (int j = 0; j < num_frames; j++)
+  const int in_channels = NumInputChannels();
+  // Fill condition array with input channels
+  for (int ch = 0; ch < in_channels; ch++)
   {
-    this->_condition(0, j) = input[j];
+    for (int j = 0; j < num_frames; j++)
+    {
+      this->_condition(ch, j) = input[ch][j];
+    }
   }
 }
 
-void nam::wavenet::WaveNet::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames)
+void nam::wavenet::WaveNet::process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames)
 {
   assert(num_frames <= mMaxBufferSize);
+  const int out_channels = NumOutputChannels();
+
   this->_set_condition_array(input, num_frames);
 
   // Main layer arrays:
@@ -287,11 +300,15 @@ void nam::wavenet::WaveNet::process(NAM_SAMPLE* input, NAM_SAMPLE* output, const
   // (Head not implemented)
   auto& final_head_outputs = this->_layer_arrays.back().GetHeadOutputs();
-  assert(final_head_outputs.rows() == 1);
-  for (int s = 0; s < num_frames; s++)
+  assert(final_head_outputs.rows() == out_channels);
+
+  for (int ch = 0; ch < out_channels; ch++)
   {
-    const float out = this->_head_scale * final_head_outputs(0, s);
-    output[s] = out;
+    for (int s = 0; s < num_frames; s++)
+    {
+      const float out = this->_head_scale * final_head_outputs(ch, s);
+      output[ch][s] = out;
+    }
   }
 }
 
@@ -314,8 +331,16 @@
   }
   const bool with_head = !config["head"].is_null();
   const float head_scale = config["head_scale"];
+
+  if (layer_array_params.empty())
+    throw std::runtime_error("WaveNet config requires at least one layer array");
+
+  // Backward compatibility: assume 1 input channel
+  const int in_channels = config.value("in_channels", 1);
+
+  // out_channels is determined from the last layer array's head_size
   return std::make_unique<WaveNet>(
-    layer_array_params, head_scale, with_head, weights, expectedSampleRate);
+    in_channels, layer_array_params, head_scale, with_head, weights, expectedSampleRate);
 }
 
 // Register the factory
diff --git a/NAM/wavenet.h b/NAM/wavenet.h
index 832673b..2e99256 100644
--- a/NAM/wavenet.h
+++ b/NAM/wavenet.h
@@ -174,10 +174,10 @@ class _LayerArray
 class WaveNet : public DSP
 {
 public:
-  WaveNet(const std::vector<LayerArrayParams>& layer_array_params, const float head_scale, const bool with_head,
-          std::vector<float> weights, const double expected_sample_rate = -1.0);
+  WaveNet(const int in_channels, const std::vector<LayerArrayParams>& layer_array_params, const float head_scale,
+          const bool with_head, std::vector<float> weights, const double expected_sample_rate = -1.0);
   ~WaveNet() = default;
-  void process(NAM_SAMPLE* input, NAM_SAMPLE* output, const int num_frames) override;
+  void process(NAM_SAMPLE** input, NAM_SAMPLE** output, const int num_frames) override;
   void set_weights_(std::vector<float>& weights);
 
 protected:
@@ -186,10 +186,10 @@ class WaveNet : public DSP
   void SetMaxBufferSize(const int maxBufferSize) override;
 
   // Fill in the "condition" array that's fed into the various parts of the net.
-  virtual void _set_condition_array(NAM_SAMPLE* input, const int num_frames);
+  virtual void _set_condition_array(NAM_SAMPLE** input, const int num_frames);
   // How many conditioning inputs are there.
-  // Just one--the audio.
-  virtual int _get_condition_dim() const { return 1; };
+  // One per input audio channel.
+  virtual int _get_condition_dim() const { return NumInputChannels(); };
 
 private:
   std::vector<_LayerArray> _layer_arrays;
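// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the backward-compatibility
// pattern the factories above rely on. nlohmann's json::value() returns the
// stored value when the key exists and the supplied default otherwise, so
// pre-multichannel configs silently come out as 1-in/1-out. The config
// contents here are made up.
#include <cassert>

#include <nlohmann/json.hpp>

int main()
{
  nlohmann::json old_config = {{"num_layers", 1}, {"input_size", 1}, {"hidden_size", 16}};
  nlohmann::json new_config = {
    {"num_layers", 1}, {"input_size", 3}, {"hidden_size", 16}, {"in_channels", 3}, {"out_channels", 2}};
  assert(old_config.value("in_channels", 1) == 1); // key absent -> default
  assert(new_config.value("in_channels", 1) == 3); // key present -> stored value
  assert(new_config.value("out_channels", 1) == 2);
  return 0;
}
// ---------------------------------------------------------------------------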
diff --git a/tools/benchmodel.cpp b/tools/benchmodel.cpp
index 5c3d60c..d8a1690 100644
--- a/tools/benchmodel.cpp
+++ b/tools/benchmodel.cpp
@@ -40,18 +40,31 @@ int main(int argc, char* argv[])
   model->Reset(model->GetExpectedSampleRate(), bufferSize);
   size_t numBuffers = (48000 / bufferSize) * 2;
 
-  // Fill input buffer with zeroes.
-  // Output buffer doesn't matter.
-  for (int i = 0; i < AUDIO_BUFFER_SIZE; i++)
+  // Allocate multi-channel buffers
+  const int in_channels = model->NumInputChannels();
+  const int out_channels = model->NumOutputChannels();
+
+  std::vector<std::vector<NAM_SAMPLE>> inputBuffers(in_channels);
+  std::vector<std::vector<NAM_SAMPLE>> outputBuffers(out_channels);
+  std::vector<NAM_SAMPLE*> inputPtrs(in_channels);
+  std::vector<NAM_SAMPLE*> outputPtrs(out_channels);
+
+  for (int ch = 0; ch < in_channels; ch++)
+  {
+    inputBuffers[ch].resize(AUDIO_BUFFER_SIZE, 0.0);
+    inputPtrs[ch] = inputBuffers[ch].data();
+  }
+  for (int ch = 0; ch < out_channels; ch++)
   {
-    inputBuffer[i] = 0.0;
+    outputBuffers[ch].resize(AUDIO_BUFFER_SIZE, 0.0);
+    outputPtrs[ch] = outputBuffers[ch].data();
   }
 
   std::cout << "Running benchmark\n";
   auto t1 = high_resolution_clock::now();
   for (size_t i = 0; i < numBuffers; i++)
   {
-    model->process(inputBuffer, outputBuffer, AUDIO_BUFFER_SIZE);
+    model->process(inputPtrs.data(), outputPtrs.data(), AUDIO_BUFFER_SIZE);
   }
   auto t2 = high_resolution_clock::now();
   std::cout << "Finished\n";
diff --git a/tools/run_tests.cpp b/tools/run_tests.cpp
index 33c4d45..2a50c77 100644
--- a/tools/run_tests.cpp
+++ b/tools/run_tests.cpp
@@ -18,6 +18,7 @@
 #include "test/test_wavenet_gating_compatibility.cpp"
 #include "test/test_blending_detailed.cpp"
 #include "test/test_input_buffer_verification.cpp"
+#include "test/test_lstm.cpp"
 
 int main()
 {
@@ -124,6 +125,7 @@ int main()
   test_wavenet::test_layer_grouped_process_realtime_safe();
   test_wavenet::test_layer_array_process_realtime_safe();
   test_wavenet::test_process_realtime_safe();
+  test_wavenet::test_process_3in_2out_realtime_safe();
 
   test_convnet::test_convnet_basic();
   test_convnet::test_convnet_batchnorm();
@@ -133,6 +135,19 @@ int main()
   test_convnet::test_convnet_prewarm();
   test_convnet::test_convnet_multiple_calls();
 
+  // LSTM tests
+  test_lstm::test_lstm_basic();
+  test_lstm::test_lstm_multiple_layers();
+  test_lstm::test_lstm_zero_input();
+  test_lstm::test_lstm_different_buffer_sizes();
+  test_lstm::test_lstm_prewarm();
+  test_lstm::test_lstm_multiple_calls();
+  test_lstm::test_lstm_multichannel();
+  test_lstm::test_lstm_large_hidden_size();
+  test_lstm::test_lstm_different_input_size();
+  test_lstm::test_lstm_state_evolution();
+  test_lstm::test_lstm_no_layers();
+
   // Gating activations tests
   test_gating_activations::TestGatingActivation::test_basic_functionality();
   test_gating_activations::TestGatingActivation::test_with_custom_activations();
diff --git a/tools/test/test_convnet.cpp b/tools/test/test_convnet.cpp
index ff11074..56bd5ec 100644
--- a/tools/test/test_convnet.cpp
+++ b/tools/test/test_convnet.cpp
@@ -13,6 +13,8 @@ namespace test_convnet
 // Test basic ConvNet construction and processing
 void test_convnet_basic()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 2;
   const std::vector<int> dilations{1, 2};
   const bool batchnorm = false;
@@ -32,7 +34,8 @@ void test_convnet_basic()
   // Head weights (2 weights + 1 bias)
   weights.insert(weights.end(), {1.0f, 1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   const int numFrames = 4;
   const int maxBufferSize = 64;
@@ -40,8 +43,10 @@ void test_convnet_basic()
 
   std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
   std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
 
-  convnet.process(input.data(), output.data(), numFrames);
+  convnet.process(inputPtrs, outputPtrs, numFrames);
 
   // Verify output dimensions
   assert(output.size() == numFrames);
@@ -55,6 +60,8 @@ void test_convnet_basic()
 // Test ConvNet with batchnorm
 void test_convnet_batchnorm()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 1;
   const std::vector<int> dilations{1};
   const bool batchnorm = true;
@@ -74,7 +81,8 @@ void test_convnet_batchnorm()
   // Head weights (1 weight + 1 bias)
   weights.insert(weights.end(), {1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   const int numFrames = 4;
   const int maxBufferSize = 64;
@@ -82,8 +90,10 @@ void test_convnet_batchnorm()
 
   std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
   std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
 
-  convnet.process(input.data(), output.data(), numFrames);
+  convnet.process(inputPtrs, outputPtrs, numFrames);
 
   assert(output.size() == numFrames);
   for (int i = 0; i < numFrames; i++)
@@ -95,6 +105,8 @@ void test_convnet_batchnorm()
 // Test ConvNet with multiple blocks
 void test_convnet_multiple_blocks()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 2;
   const std::vector<int> dilations{1, 2, 4};
   const bool batchnorm = false;
@@ -117,7 +129,8 @@ void test_convnet_multiple_blocks()
   // Head weights
   weights.insert(weights.end(), {1.0f, 1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   const int numFrames = 8;
   const int maxBufferSize = 64;
@@ -125,8 +138,10 @@ void test_convnet_multiple_blocks()
 
   std::vector<NAM_SAMPLE> input(numFrames, 0.5f);
   std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
 
-  convnet.process(input.data(), output.data(), numFrames);
+  convnet.process(inputPtrs, outputPtrs, numFrames);
 
   assert(output.size() == numFrames);
   for (int i = 0; i < numFrames; i++)
@@ -138,6 +153,8 @@ void test_convnet_multiple_blocks()
 // Test ConvNet with zero input
 void test_convnet_zero_input()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 1;
   const std::vector<int> dilations{1};
   const bool batchnorm = false;
@@ -150,15 +167,18 @@ void test_convnet_zero_input()
   // Head weights (1 weight + 1 bias)
   weights.insert(weights.end(), {1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   const int numFrames = 4;
   convnet.Reset(expected_sample_rate, numFrames);
 
   std::vector<NAM_SAMPLE> input(numFrames, 0.0f);
   std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
 
-  convnet.process(input.data(), output.data(), numFrames);
+  convnet.process(inputPtrs, outputPtrs, numFrames);
 
   // With zero input, output should be finite (may be zero or non-zero depending on bias)
   for (int i = 0; i < numFrames; i++)
@@ -170,6 +190,8 @@ void test_convnet_zero_input()
 // Test ConvNet with different buffer sizes
 void test_convnet_different_buffer_sizes()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 1;
   const std::vector<int> dilations{1};
   const bool batchnorm = false;
@@ -182,18 +204,23 @@ void test_convnet_different_buffer_sizes()
   // Head weights (1 weight + 1 bias)
   weights.insert(weights.end(), {1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   // Test with different buffer sizes
   convnet.Reset(expected_sample_rate, 64);
   std::vector<NAM_SAMPLE> input1(32, 1.0f);
   std::vector<NAM_SAMPLE> output1(32, 0.0f);
-  convnet.process(input1.data(), output1.data(), 32);
+  NAM_SAMPLE* inputPtrs1[] = {input1.data()};
+  NAM_SAMPLE* outputPtrs1[] = {output1.data()};
+  convnet.process(inputPtrs1, outputPtrs1, 32);
 
   convnet.Reset(expected_sample_rate, 128);
   std::vector<NAM_SAMPLE> input2(64, 1.0f);
   std::vector<NAM_SAMPLE> output2(64, 0.0f);
-  convnet.process(input2.data(), output2.data(), 64);
+  NAM_SAMPLE* inputPtrs2[] = {input2.data()};
+  NAM_SAMPLE* outputPtrs2[] = {output2.data()};
+  convnet.process(inputPtrs2, outputPtrs2, 64);
 
   // Both should work without errors
   assert(output1.size() == 32);
@@ -203,6 +230,8 @@ void test_convnet_different_buffer_sizes()
 // Test ConvNet prewarm functionality
 void test_convnet_prewarm()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 2;
   const std::vector<int> dilations{1, 2, 4};
   const bool batchnorm = false;
@@ -219,7 +248,8 @@ void test_convnet_prewarm()
   // Head weights (2 weights + 1 bias)
   weights.insert(weights.end(), {1.0f, 1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   // Test that prewarm can be called without errors
   convnet.Reset(expected_sample_rate, 64);
@@ -229,7 +259,9 @@ void test_convnet_prewarm()
   const int numFrames = 4;
   std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
   std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
-  convnet.process(input.data(), output.data(), numFrames);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+  convnet.process(inputPtrs, outputPtrs, numFrames);
 
   // Output should be finite
   for (int i = 0; i < numFrames; i++)
@@ -241,6 +273,8 @@ void test_convnet_prewarm()
 // Test multiple process() calls (ring buffer functionality)
 void test_convnet_multiple_calls()
 {
+  const int in_channels = 1;
+  const int out_channels = 1;
   const int channels = 1;
   const std::vector<int> dilations{1};
   const bool batchnorm = false;
@@ -253,7 +287,8 @@ void test_convnet_multiple_calls()
   // Head weights (1 weight + 1 bias)
   weights.insert(weights.end(), {1.0f, 0.0f});
 
-  nam::convnet::ConvNet convnet(channels, dilations, batchnorm, activation, weights, expected_sample_rate);
+  nam::convnet::ConvNet convnet(
+    in_channels, out_channels, channels, dilations, batchnorm, activation, weights, expected_sample_rate);
 
   const int numFrames = 2;
   convnet.Reset(expected_sample_rate, numFrames);
@@ -263,7 +298,9 @@ void test_convnet_multiple_calls()
   {
     std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
     std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
-    convnet.process(input.data(), output.data(), numFrames);
+    NAM_SAMPLE* inputPtrs[] = {input.data()};
+    NAM_SAMPLE* outputPtrs[] = {output.data()};
+    convnet.process(inputPtrs, outputPtrs, numFrames);
 
     // Output should be finite
    for (int j = 0; j < numFrames; j++)
diff --git a/tools/test/test_dsp.cpp b/tools/test/test_dsp.cpp
index bbdee63..d019a87 100644
--- a/tools/test/test_dsp.cpp
+++ b/tools/test/test_dsp.cpp
@@ -1,18 +1,32 @@
 // Tests for dsp
 
 #include "NAM/dsp.h"
+#include <algorithm>
 
 namespace test_dsp
 {
 // Simplest test: can I construct something!
 void test_construct()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 1;
+  const int out_channels = 1;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
+}
+
+void test_channels()
+{
+  const int in_channels = 2;
+  const int out_channels = 3;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
+  assert(myDsp.NumInputChannels() == in_channels);
+  assert(myDsp.NumOutputChannels() == out_channels);
 }
 
 void test_get_input_level()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 2;
+  const int out_channels = 1;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
   const double expected = 19.0;
   myDsp.SetInputLevel(expected);
   assert(myDsp.HasInputLevel());
@@ -23,7 +37,9 @@ void test_get_input_level()
 
 void test_get_output_level()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 1;
+  const int out_channels = 2;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
   const double expected = 12.0;
   myDsp.SetOutputLevel(expected);
   assert(myDsp.HasOutputLevel());
@@ -35,32 +51,89 @@ void test_get_output_level()
 
 // Test correct function of DSP::HasInputLevel()
 void test_has_input_level()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 2;
+  const int out_channels = 1;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
   assert(!myDsp.HasInputLevel());
 
-  myDsp.SetInputLevel(19.0);
+  const double level = 19.0;
+  myDsp.SetInputLevel(level);
   assert(myDsp.HasInputLevel());
 }
 
 void test_has_output_level()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 1;
+  const int out_channels = 2;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
+
   assert(!myDsp.HasOutputLevel());
 
-  myDsp.SetOutputLevel(12.0);
+  const double level = 12.0;
+  myDsp.SetOutputLevel(level);
   assert(myDsp.HasOutputLevel());
 }
 
 // Test correct function of DSP::HasInputLevel()
 void test_set_input_level()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 2;
+  const int out_channels = 1;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
   myDsp.SetInputLevel(19.0);
 }
 
 void test_set_output_level()
 {
-  nam::DSP myDsp(48000.0);
+  const int in_channels = 1;
+  const int out_channels = 2;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
   myDsp.SetOutputLevel(19.0);
 }
+
+void test_process_multi_channel()
+{
+  const int in_channels = 2;
+  const int out_channels = 2;
+  nam::DSP myDsp(in_channels, out_channels, 48000.0);
+  const int num_frames = 64;
+
+  // Allocate buffers
+  std::vector<std::vector<NAM_SAMPLE>> inputBuffers(in_channels);
+  std::vector<std::vector<NAM_SAMPLE>> outputBuffers(out_channels);
+  std::vector<NAM_SAMPLE*> inputPtrs(in_channels);
+  std::vector<NAM_SAMPLE*> outputPtrs(out_channels);
+
+  for (int ch = 0; ch < in_channels; ch++)
+  {
+    inputBuffers[ch].resize(num_frames);
+    inputPtrs[ch] = inputBuffers[ch].data();
+
+    // Fill input with test data
+    for (int i = 0; i < num_frames; i++)
+    {
+      inputBuffers[ch][i] = (ch + 1) * 0.5 + i * 0.01;
+    }
+  }
+  for (int ch = 0; ch < out_channels; ch++)
+  {
+    outputBuffers[ch].resize(num_frames);
+    outputPtrs[ch] = outputBuffers[ch].data();
+  }
+
+  // Process
+  myDsp.process(inputPtrs.data(), outputPtrs.data(), num_frames);
+
+  // Check that default implementation copied input to output
+  const int channelsToCheck = std::min(in_channels, out_channels);
+  for (int ch = 0; ch < channelsToCheck; ch++)
+  {
+    for (int i = 0; i < num_frames; i++)
+    {
+      assert(outputBuffers[ch][i] == inputBuffers[ch][i]);
+    }
+  }
+}
 }; // namespace test_dsp
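// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a quick check of the weight
// count implied by the create_lstm_weights() helper in test_lstm.cpp below.
// Per cell: 4*hidden*(layer_input + hidden) matrix entries, 4*hidden biases,
// and 2*hidden initial states; the head adds out_channels*(hidden + 1).
// For num_layers = 1, input_size = 1, hidden_size = 4, out_channels = 1:
// 80 + 16 + 8 + 5 = 109 floats.
#include <cassert>

int main()
{
  const int num_layers = 1, input_size = 1, hidden_size = 4, out_channels = 1;
  int count = 0;
  for (int layer = 0; layer < num_layers; layer++)
  {
    const int layer_input = (layer == 0) ? input_size : hidden_size;
    count += 4 * hidden_size * (layer_input + hidden_size); // weight matrix
    count += 4 * hidden_size;                               // bias
    count += 2 * hidden_size;                               // initial h and c
  }
  count += out_channels * hidden_size + out_channels;       // head
  assert(count == 109);
  return 0;
}
// ---------------------------------------------------------------------------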
diff --git a/tools/test/test_lstm.cpp b/tools/test/test_lstm.cpp
new file mode 100644
index 0000000..8c655b9
--- /dev/null
+++ b/tools/test/test_lstm.cpp
@@ -0,0 +1,451 @@
+// Tests for LSTM
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <vector>
+
+#include "NAM/lstm.h"
+
+namespace test_lstm
+{
+// Helper function to calculate weights needed for LSTM
+// For each LSTMCell:
+// - Weight matrix: (4 * hidden_size) x (input_size + hidden_size) in row-major order
+// - Bias: 4 * hidden_size
+// - Initial hidden state: hidden_size (stored in second half of _xh)
+// - Initial cell state: hidden_size
+// For the LSTM:
+// - Head weight matrix: out_channels x hidden_size in row-major order
+// - Head bias: out_channels
+std::vector<float> create_lstm_weights(int num_layers, int input_size, int hidden_size, int out_channels)
+{
+  std::vector<float> weights;
+
+  for (int layer = 0; layer < num_layers; layer++)
+  {
+    int layer_input_size = (layer == 0) ? input_size : hidden_size;
+    int w_rows = 4 * hidden_size;
+    int w_cols = layer_input_size + hidden_size;
+
+    // Weight matrix (row-major)
+    for (int i = 0; i < w_rows * w_cols; i++)
+    {
+      weights.push_back(0.1f); // Small weights for stability
+    }
+
+    // Bias vector
+    for (int i = 0; i < 4 * hidden_size; i++)
+    {
+      weights.push_back(0.0f);
+    }
+
+    // Initial hidden state (stored in _xh)
+    for (int i = 0; i < hidden_size; i++)
+    {
+      weights.push_back(0.0f);
+    }
+
+    // Initial cell state
+    for (int i = 0; i < hidden_size; i++)
+    {
+      weights.push_back(0.0f);
+    }
+  }
+
+  // Head weight matrix (row-major: out_channels x hidden_size)
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    for (int h = 0; h < hidden_size; h++)
+    {
+      weights.push_back(0.1f);
+    }
+  }
+
+  // Head bias
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    weights.push_back(0.0f);
+  }
+
+  return weights;
+}
+
+// Test basic LSTM construction and processing
+void test_lstm_basic()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  const int maxBufferSize = 64;
+  lstm.Reset(expected_sample_rate, maxBufferSize);
+
+  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // Verify output dimensions
+  assert(output.size() == numFrames);
+  // Output should be non-zero and finite
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test LSTM with multiple layers
+void test_lstm_multiple_layers()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 2;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 8;
+  const int maxBufferSize = 64;
+  lstm.Reset(expected_sample_rate, maxBufferSize);
+
+  std::vector<NAM_SAMPLE> input(numFrames, 0.5f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  assert(output.size() == numFrames);
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test LSTM with zero input
+void test_lstm_zero_input()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  lstm.Reset(expected_sample_rate, numFrames);
+
+  std::vector<NAM_SAMPLE> input(numFrames, 0.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // With zero input, output should be finite (may be zero or non-zero depending on bias)
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test LSTM with different buffer sizes
+void test_lstm_different_buffer_sizes()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  // Test with different buffer sizes
+  lstm.Reset(expected_sample_rate, 64);
+  std::vector<NAM_SAMPLE> input1(32, 1.0f);
+  std::vector<NAM_SAMPLE> output1(32, 0.0f);
+  NAM_SAMPLE* inputPtrs1[] = {input1.data()};
+  NAM_SAMPLE* outputPtrs1[] = {output1.data()};
+  lstm.process(inputPtrs1, outputPtrs1, 32);
+
+  lstm.Reset(expected_sample_rate, 128);
+  std::vector<NAM_SAMPLE> input2(64, 1.0f);
+  std::vector<NAM_SAMPLE> output2(64, 0.0f);
+  NAM_SAMPLE* inputPtrs2[] = {input2.data()};
+  NAM_SAMPLE* outputPtrs2[] = {output2.data()};
+  lstm.process(inputPtrs2, outputPtrs2, 64);
+
+  // Both should work without errors
+  assert(output1.size() == 32);
+  assert(output2.size() == 64);
+}
+
+// Test LSTM prewarm functionality
+void test_lstm_prewarm()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  // Test that prewarm can be called without errors
+  lstm.Reset(expected_sample_rate, 64);
+  lstm.prewarm();
+
+  // After prewarm, processing should work
+  const int numFrames = 4;
+  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // Output should be finite
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+
+// Test LSTM prewarm functionality
+void test_lstm_prewarm()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  // Test that prewarm can be called without errors
+  lstm.Reset(expected_sample_rate, 64);
+  lstm.prewarm();
+
+  // After prewarm, processing should work
+  const int numFrames = 4;
+  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // Output should be finite
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test multiple process() calls (state persistence)
+void test_lstm_multiple_calls()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 2;
+  lstm.Reset(expected_sample_rate, numFrames);
+
+  // Multiple calls should work correctly with state persistence
+  for (int i = 0; i < 5; i++)
+  {
+    std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+    std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+    NAM_SAMPLE* inputPtrs[] = {input.data()};
+    NAM_SAMPLE* outputPtrs[] = {output.data()};
+    lstm.process(inputPtrs, outputPtrs, numFrames);
+
+    // Output should be finite
+    for (int j = 0; j < numFrames; j++)
+    {
+      assert(std::isfinite(output[j]));
+    }
+  }
+}
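+
+// A minimal sketch (not asserted as part of the suite above) of how state
+// persistence could be observed directly: with nonzero weights and a constant
+// input, the first sample of a second process() call generally differs from
+// the first call's, because the hidden/cell state carries over. No inequality
+// is asserted, since some weight configurations settle immediately.
+void sketch_lstm_state_persistence()
+{
+  const double expected_sample_rate = 48000.0;
+  std::vector<float> weights = create_lstm_weights(1, 1, 4, 1);
+  nam::lstm::LSTM lstm(1, 1, 1, 1, 4, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  lstm.Reset(expected_sample_rate, numFrames);
+
+  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+  const NAM_SAMPLE firstCall = output[0];
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+  // firstCall and output[0] are both finite; they typically differ.
+  assert(std::isfinite(firstCall) && std::isfinite(output[0]));
+}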
+
+// Test LSTM with multi-channel input/output
+void test_lstm_multichannel()
+{
+  const int in_channels = 2;
+  const int out_channels = 2;
+  const int num_layers = 1;
+  const int input_size = 2;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  lstm.Reset(expected_sample_rate, 64);
+
+  std::vector<NAM_SAMPLE> input1(numFrames, 0.5f);
+  std::vector<NAM_SAMPLE> input2(numFrames, 0.3f);
+  std::vector<NAM_SAMPLE> output1(numFrames, 0.0f);
+  std::vector<NAM_SAMPLE> output2(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input1.data(), input2.data()};
+  NAM_SAMPLE* outputPtrs[] = {output1.data(), output2.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // Verify both output channels are finite
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output1[i]));
+    assert(std::isfinite(output2[i]));
+  }
+}
+
+// Test LSTM with larger hidden size
+void test_lstm_large_hidden_size()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 16;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  lstm.Reset(expected_sample_rate, 64);
+
+  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test LSTM with different input sizes
+void test_lstm_different_input_size()
+{
+  const int in_channels = 3;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 3;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  lstm.Reset(expected_sample_rate, 64);
+
+  std::vector<NAM_SAMPLE> input1(numFrames, 0.1f);
+  std::vector<NAM_SAMPLE> input2(numFrames, 0.2f);
+  std::vector<NAM_SAMPLE> input3(numFrames, 0.3f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input1.data(), input2.data(), input3.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test LSTM state evolution over time
+void test_lstm_state_evolution()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 1;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  std::vector<float> weights = create_lstm_weights(num_layers, input_size, hidden_size, out_channels);
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 10;
+  lstm.Reset(expected_sample_rate, 64);
+
+  // Create a sine wave input
+  std::vector<NAM_SAMPLE> input(numFrames);
+  for (int i = 0; i < numFrames; i++)
+  {
+    input[i] = 0.5f * std::sin(2.0f * M_PI * i / numFrames);
+  }
+
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // Output should be finite and potentially show some variation due to state
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
+
+// Test LSTM with no layers (edge case)
+void test_lstm_no_layers()
+{
+  const int in_channels = 1;
+  const int out_channels = 1;
+  const int num_layers = 0;
+  const int input_size = 1;
+  const int hidden_size = 4;
+  const double expected_sample_rate = 48000.0;
+
+  // With no layers, we still need head weights
+  std::vector<float> weights;
+  // Head weight matrix (row-major: out_channels x hidden_size)
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    for (int h = 0; h < hidden_size; h++)
+    {
+      weights.push_back(0.0f); // Zero head weights
+    }
+  }
+  // Head bias
+  for (int out_ch = 0; out_ch < out_channels; out_ch++)
+  {
+    weights.push_back(0.0f);
+  }
+
+  nam::lstm::LSTM lstm(in_channels, out_channels, num_layers, input_size, hidden_size, weights, expected_sample_rate);
+
+  const int numFrames = 4;
+  lstm.Reset(expected_sample_rate, 64);
+
+  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
+  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+
+  lstm.process(inputPtrs, outputPtrs, numFrames);
+
+  // With zero head weights and bias, the output should be all zeros; at minimum it must be finite
+  for (int i = 0; i < numFrames; i++)
+  {
+    assert(std::isfinite(output[i]));
+  }
+}
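+
+// Sketch of a top-level driver for this suite (hypothetical; the actual test
+// harness wiring lives outside this file and may register tests differently):
+void run_all()
+{
+  test_lstm_basic();
+  test_lstm_multiple_layers();
+  test_lstm_zero_input();
+  test_lstm_different_buffer_sizes();
+  test_lstm_prewarm();
+  test_lstm_multiple_calls();
+  test_lstm_multichannel();
+  test_lstm_large_hidden_size();
+  test_lstm_different_input_size();
+  test_lstm_state_evolution();
+  test_lstm_no_layers();
+}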
+
+}; // namespace test_lstm
diff --git a/tools/test/test_wavenet/test_full.cpp b/tools/test/test_wavenet/test_full.cpp
index d75ae1c..122ea0b 100644
--- a/tools/test/test_wavenet/test_full.cpp
+++ b/tools/test/test_wavenet/test_full.cpp
@@ -47,7 +47,8 @@ void test_wavenet_model()
  weights.push_back(1.0f); // Head rechannel
  weights.push_back(head_scale); // Head scale

-  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(layer_array_params, head_scale, with_head, weights, 48000.0);
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head, weights, 48000.0);

  const int numFrames = 4;
  const int maxBufferSize = 64;
@@ -55,8 +56,10 @@
  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};

-  wavenet->process(input.data(), output.data(), numFrames);
+  wavenet->process(inputPtrs, outputPtrs, numFrames);

  // Verify output dimensions
  assert(output.size() == numFrames);
@@ -89,13 +92,13 @@ void test_wavenet_multiple_arrays()
  const int bottleneck = channels;
  const int groups_1x1 = 1;
  layer_array_params.push_back(nam::wavenet::LayerArrayParams(input_size, condition_size, head_size, channels,
-                                                              bottleneck, kernel_size, std::move(dilations1), activation,
-                                                              gated, head_bias, groups, groups_1x1));
+                                                              bottleneck, kernel_size, std::move(dilations1),
+                                                              activation, gated, head_bias, groups, groups_1x1));
  // Second array (head_size of first must match channels of second)
  std::vector<int> dilations2{1};
  layer_array_params.push_back(nam::wavenet::LayerArrayParams(head_size, condition_size, head_size, channels,
-                                                              bottleneck, kernel_size, std::move(dilations2), activation,
-                                                              gated, head_bias, groups, groups_1x1));
+                                                              bottleneck, kernel_size, std::move(dilations2),
+                                                              activation, gated, head_bias, groups, groups_1x1));

  std::vector<float> weights;
  // Array 0: rechannel, layer, head_rechannel
@@ -104,7 +107,8 @@
  weights.insert(weights.end(), {1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f});
  weights.push_back(head_scale);

-  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(layer_array_params, head_scale, with_head, weights, 48000.0);
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head, weights, 48000.0);

  const int numFrames = 4;
  const int maxBufferSize = 64;
@@ -112,8 +116,10 @@
  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};

-  wavenet->process(input.data(), output.data(), numFrames);
+  wavenet->process(inputPtrs, outputPtrs, numFrames);

  assert(output.size() == numFrames);
  for (int i = 0; i < numFrames; i++)
@@ -147,15 +153,18 @@ void test_wavenet_zero_input()

  std::vector<float> weights{1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f, head_scale};

-  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(layer_array_params, head_scale, with_head, weights, 48000.0);
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head, weights, 48000.0);

  const int numFrames = 4;
  wavenet->Reset(48000.0, numFrames);

  std::vector<NAM_SAMPLE> input(numFrames, 0.0f);
  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};

-  wavenet->process(input.data(), output.data(), numFrames);
+  wavenet->process(inputPtrs, outputPtrs, numFrames);

  // With zero input, output should be finite (may be zero or non-zero depending on bias)
  for (int i = 0; i < numFrames; i++)
@@ -189,18 +198,23 @@ void test_wavenet_different_buffer_sizes()

  std::vector<float> weights{1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f, head_scale};

-  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(layer_array_params, head_scale, with_head, weights, 48000.0);
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head, weights, 48000.0);

  // Test with different buffer sizes
  wavenet->Reset(48000.0, 64);
  std::vector<NAM_SAMPLE> input1(32, 1.0f);
  std::vector<NAM_SAMPLE> output1(32, 0.0f);
-  wavenet->process(input1.data(), output1.data(), 32);
+  NAM_SAMPLE* inputPtrs1[] = {input1.data()};
+  NAM_SAMPLE* outputPtrs1[] = {output1.data()};
+  wavenet->process(inputPtrs1, outputPtrs1, 32);

  wavenet->Reset(48000.0, 128);
  std::vector<NAM_SAMPLE> input2(64, 1.0f);
  std::vector<NAM_SAMPLE> output2(64, 0.0f);
-  wavenet->process(input2.data(), output2.data(), 64);
+  NAM_SAMPLE* inputPtrs2[] = {input2.data()};
+  NAM_SAMPLE* outputPtrs2[] = {output2.data()};
+  wavenet->process(inputPtrs2, outputPtrs2, 64);

  // Both should work without errors
  assert(output1.size() == 32);
@@ -251,7 +265,8 @@ void test_wavenet_prewarm()
  weights.push_back(1.0f);
  weights.push_back(head_scale);

-  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(layer_array_params, head_scale, with_head, weights, 48000.0);
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head, weights, 48000.0);

  // Test that prewarm can be called without errors
  wavenet->Reset(48000.0, 64);
@@ -261,7 +276,9 @@
  const int numFrames = 4;
  std::vector<NAM_SAMPLE> input(numFrames, 1.0f);
  std::vector<NAM_SAMPLE> output(numFrames, 0.0f);
-  wavenet->process(input.data(), output.data(), numFrames);
+  NAM_SAMPLE* inputPtrs[] = {input.data()};
+  NAM_SAMPLE* outputPtrs[] = {output.data()};
+  wavenet->process(inputPtrs, outputPtrs, numFrames);

  // Output should be finite
  for (int i = 0; i < numFrames; i++)
diff --git a/tools/test/test_wavenet/test_real_time_safe.cpp b/tools/test/test_wavenet/test_real_time_safe.cpp
index 91d8628..0a57539 100644
--- a/tools/test/test_wavenet/test_real_time_safe.cpp
+++ b/tools/test/test_wavenet/test_real_time_safe.cpp
@@ -437,8 +437,8 @@ void test_layer_process_realtime_safe()
  const int groups_input = 1;
  const int groups_1x1 = 1;

-  auto layer = nam::wavenet::_Layer(condition_size, channels, bottleneck, kernel_size, dilation, activation, gated,
-                                    groups_input, groups_1x1);
+  auto layer = nam::wavenet::_Layer(
+    condition_size, channels, bottleneck, kernel_size, dilation, activation, gated, groups_input, groups_1x1);

  // Set weights
  std::vector<float> weights{1.0f, 0.0f, // Conv (weight, bias)
@@ -492,8 +492,8 @@ void test_layer_bottleneck_process_realtime_safe()
  const int groups_input = 1;
  const int groups_1x1 = 1;

-  auto layer = nam::wavenet::_Layer(condition_size, channels, bottleneck, kernel_size, dilation, activation, gated,
-                                    groups_input, groups_1x1);
+  auto layer = nam::wavenet::_Layer(
+    condition_size, channels, bottleneck, kernel_size, dilation, activation, gated, groups_input, groups_1x1);

  // Set weights for bottleneck != channels
  // Conv: (channels, bottleneck, kernelSize=1) = (4, 2, 1) + bias
@@ -544,8 +544,8 @@
    input.setConstant(0.5f);
    condition.setConstant(0.5f);

-    std::string test_name = "Layer Process (bottleneck=" + std::to_string(bottleneck) + ", channels=" +
-                            std::to_string(channels) + ") - Buffer size " + std::to_string(buffer_size);
+    std::string test_name = "Layer Process (bottleneck=" + std::to_string(bottleneck) + ", channels="
+                            + std::to_string(channels) + ") - Buffer size " + std::to_string(buffer_size);
    run_allocation_test_no_allocations(
      nullptr, // No setup needed
      [&]() {
@@ -577,8 +577,8 @@ void test_layer_grouped_process_realtime_safe()
  const int groups_input = 2; // groups_input > 1
  const int groups_1x1 = 2; // 1x1 is also grouped

-  auto layer = nam::wavenet::_Layer(condition_size, channels, bottleneck, kernel_size, dilation, activation, gated,
-                                    groups_input, groups_1x1);
+  auto layer = nam::wavenet::_Layer(
+    condition_size, channels, bottleneck, kernel_size, dilation, activation, gated, groups_input, groups_1x1);

  // Set weights for grouped convolution
  // With groups_input=2, channels=4: each group has 2 in_channels and 2 out_channels
@@ -757,13 +757,13 @@ void test_process_realtime_safe()
  const int bottleneck = channels;
  const int groups_1x1 = 1;
  layer_array_params.push_back(nam::wavenet::LayerArrayParams(input_size, condition_size, head_size, channels,
-                                                              bottleneck, kernel_size, std::move(dilations1), activation,
-                                                              gated, head_bias, groups, groups_1x1));
+                                                              bottleneck, kernel_size, std::move(dilations1),
+                                                              activation, gated, head_bias, groups, groups_1x1));
  // Second layer array (head_size of first must match channels of second)
  std::vector<int> dilations2{1};
  layer_array_params.push_back(nam::wavenet::LayerArrayParams(head_size, condition_size, head_size, channels,
-                                                              bottleneck, kernel_size, std::move(dilations2), activation,
-                                                              gated, head_bias, groups, groups_1x1));
+                                                              bottleneck, kernel_size, std::move(dilations2),
+                                                              activation, gated, head_bias, groups, groups_1x1));

  // Weights: Array 0: rechannel(1), layer(conv:1+1, input_mixin:1, 1x1:1+1), head_rechannel(1)
  // Array 1: same structure
@@ -775,7 +775,8 @@
  weights.insert(weights.end(), {1.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f, 1.0f});
  weights.push_back(head_scale);

-  auto wavenet = std::make_unique<nam::wavenet::WaveNet>(layer_array_params, head_scale, with_head, weights, 48000.0);
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(input_size, layer_array_params, head_scale, with_head, weights, 48000.0);

  const int maxBufferSize = 256;
  wavenet->Reset(48000.0, maxBufferSize);
@@ -794,7 +795,9 @@
      nullptr, // No setup needed
      [&]() {
        // Call process() - this should not allocate or free
-        wavenet->process(input.data(), output.data(), buffer_size);
+        NAM_SAMPLE* inputPtrs[] = {input.data()};
+        NAM_SAMPLE* outputPtrs[] = {output.data()};
+        wavenet->process(inputPtrs, outputPtrs, buffer_size);
      },
      nullptr, // No teardown needed
      test_name.c_str());
@@ -806,4 +809,127 @@
    }
  }
}
+
+// Test that WaveNet::process() with 3 input channels and 2 output channels does not allocate or free memory
+void test_process_3in_2out_realtime_safe()
+{
+  // Setup: Create WaveNet with 3 input channels and 2 output channels
+  const int input_size = 3; // 3 input channels
+  const int condition_size = 3; // condition matches input channels
+  const int head_size = 2; // 2 output channels
+  const int channels = 4; // internal channels
+  const int bottleneck = 2; // bottleneck (will be used for head)
+  const int kernel_size = 1;
+  const std::string activation = "ReLU";
+  const bool gated = false;
+  const bool head_bias = false;
+  const float head_scale = 1.0f;
+  const bool with_head = false;
+  const int groups = 1;
+  const int groups_1x1 = 1;
+
+  std::vector<nam::wavenet::LayerArrayParams> layer_array_params;
+  std::vector<int> dilations1{1};
+  layer_array_params.push_back(nam::wavenet::LayerArrayParams(input_size, condition_size, head_size, channels,
+                                                              bottleneck, kernel_size, std::move(dilations1),
+                                                              activation, gated, head_bias, groups, groups_1x1));
+
+  // Calculate weights:
+  // _rechannel: Conv1x1(3, 4, bias=false) = 3*4 = 12 weights
+  // Layer:
+  //   _conv: Conv1D(4, 2, kernel_size=1, bias=true) = 1*(2*4) + 2 = 10 weights
+  //   _input_mixin: Conv1x1(3, 2, bias=false) = 3*2 = 6 weights
+  //   _1x1: Conv1x1(2, 4, bias=true) = 2*4 + 4 = 12 weights
+  // _head_rechannel: Conv1x1(2, 2, bias=false) = 2*2 = 4 weights
+  // Total: 12 + 10 + 6 + 12 + 4 = 44 weights
+  std::vector<float> weights;
+  // _rechannel weights (3->4): identity-like pattern
+  for (int out_ch = 0; out_ch < 4; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < 3; in_ch++)
+    {
+      weights.push_back((out_ch < 3 && out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // Layer: _conv weights (4->2, kernel_size=1, with bias)
+  // Weight layout: for each kernel position k, for each out_channel, for each in_channel
+  for (int out_ch = 0; out_ch < 2; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < 4; in_ch++)
+    {
+      weights.push_back((out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // _conv bias (2 values)
+  weights.insert(weights.end(), {0.0f, 0.0f});
+  // _input_mixin weights (3->2)
+  for (int out_ch = 0; out_ch < 2; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < 3; in_ch++)
+    {
+      weights.push_back((out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // _1x1 weights (2->4, with bias)
+  for (int out_ch = 0; out_ch < 4; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < 2; in_ch++)
+    {
+      weights.push_back((out_ch < 2 && out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  // _1x1 bias (4 values)
+  weights.insert(weights.end(), {0.0f, 0.0f, 0.0f, 0.0f});
+  // _head_rechannel weights (2->2)
+  for (int out_ch = 0; out_ch < 2; out_ch++)
+  {
+    for (int in_ch = 0; in_ch < 2; in_ch++)
+    {
+      weights.push_back((out_ch == in_ch) ? 1.0f : 0.0f);
+    }
+  }
+  weights.push_back(head_scale);
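+
+  // Sanity-check the count derived in the comment above: 44 layer-array
+  // weights plus the head_scale value
+  assert(weights.size() == 45);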
+
+  const int in_channels = 3;
+  auto wavenet =
+    std::make_unique<nam::wavenet::WaveNet>(in_channels, layer_array_params, head_scale, with_head, weights, 48000.0);
+
+  const int maxBufferSize = 256;
+  wavenet->Reset(48000.0, maxBufferSize);
+
+  // Test with several different buffer sizes
+  std::vector<int> buffer_sizes{1, 8, 16, 32, 64, 128, 256};
+
+  for (int buffer_size : buffer_sizes)
+  {
+    // Prepare input/output buffers for 3 input channels and 2 output channels (allocate before tracking)
+    std::vector<std::vector<NAM_SAMPLE>> input(3, std::vector<NAM_SAMPLE>(buffer_size, 0.5f));
+    std::vector<std::vector<NAM_SAMPLE>> output(2, std::vector<NAM_SAMPLE>(buffer_size, 0.0f));
+    std::vector<NAM_SAMPLE*> inputPtrs(3);
+    std::vector<NAM_SAMPLE*> outputPtrs(2);
+    for (int ch = 0; ch < 3; ch++)
+      inputPtrs[ch] = input[ch].data();
+    for (int ch = 0; ch < 2; ch++)
+      outputPtrs[ch] = output[ch].data();
+
+    std::string test_name = "WaveNet process (3in, 2out) - Buffer size " + std::to_string(buffer_size);
+    run_allocation_test_no_allocations(
+      nullptr, // No setup needed
+      [&]() {
+        // Call process() - this should not allocate or free
+        wavenet->process(inputPtrs.data(), outputPtrs.data(), buffer_size);
+      },
+      nullptr, // No teardown needed
+      test_name.c_str());
+
+    // Verify output is valid
+    for (int ch = 0; ch < 2; ch++)
+    {
+      for (int i = 0; i < buffer_size; i++)
+      {
+        assert(std::isfinite(output[ch][i]));
+      }
+    }
+  }
+}
} // namespace test_wavenet