diff --git a/speech/api/parse_arguments.cc b/speech/api/parse_arguments.cc
index 8f8cb26c..8da33c19 100644
--- a/speech/api/parse_arguments.cc
+++ b/speech/api/parse_arguments.cc
@@ -31,6 +31,9 @@ ParseResult ParseArguments(int argc, char* argv[]) {
       ("bitrate", po::value<int>()->default_value(16000),
        "the sample rate in Hz")
       //
+      ("ptime", po::value<int>()->default_value(0),
+       "(optional) packetization time in milliseconds")
+      //
       ("language-code", po::value<std::string>()->default_value("en"),
        "the language code for the audio")
       //
@@ -47,6 +50,7 @@ ParseResult ParseArguments(int argc, char* argv[]) {
 
   auto const path = vm["path"].as<std::string>();
   auto const bitrate = vm["bitrate"].as<int>();
+  auto const ptime = vm["ptime"].as<int>();
   auto const language_code = vm["language-code"].as<std::string>();
   // Validate the command-line options.
   if (path.empty()) {
@@ -57,11 +61,19 @@ ParseResult ParseArguments(int argc, char* argv[]) {
         "--bitrate option must be a positive number, value=" +
         std::to_string(bitrate));
   }
+  if (ptime < 0) {
+    throw std::runtime_error(
+        "--ptime option must be a non-negative number, value=" +
+        std::to_string(ptime));
+  }
 
   ParseResult result;
   result.path = path;
   result.config.set_language_code(language_code);
   result.config.set_sample_rate_hertz(bitrate);
+  result.bitrate = 0;
+  result.ptime = 0;
+  result.sample_size = 0;
 
   // Use the audio file extension to configure the encoding.
   auto const ext = [&] {
@@ -75,8 +87,14 @@ ParseResult ParseArguments(int argc, char* argv[]) {
 
   if (ext.empty() || ext == ".raw") {
     result.config.set_encoding(RecognitionConfig::LINEAR16);
+    result.bitrate = bitrate;
+    result.ptime = ptime;
+    result.sample_size = 2;
   } else if (ext == ".ulaw") {
     result.config.set_encoding(RecognitionConfig::MULAW);
+    result.bitrate = bitrate;
+    result.ptime = ptime;
+    result.sample_size = 1;
   } else if (ext == ".flac") {
     result.config.set_encoding(RecognitionConfig::FLAC);
   } else if (ext == ".amr") {
diff --git a/speech/api/parse_arguments.h b/speech/api/parse_arguments.h
index 2486ee05..91728916 100644
--- a/speech/api/parse_arguments.h
+++ b/speech/api/parse_arguments.h
@@ -23,6 +23,9 @@
 struct ParseResult {
   google::cloud::speech::v1::RecognitionConfig config;
   std::string path;
+  int bitrate;
+  int ptime;
+  int sample_size;
 };
 
 ParseResult ParseArguments(int argc, char* argv[]);
diff --git a/speech/api/streaming_transcribe.cc b/speech/api/streaming_transcribe.cc
index ddb035a6..bc84384a 100644
--- a/speech/api/streaming_transcribe.cc
+++ b/speech/api/streaming_transcribe.cc
@@ -27,17 +27,26 @@ using RecognizeStream = ::google::cloud::AsyncStreamingReadWriteRpc<
     speech::v1::StreamingRecognizeResponse>;
 
 auto constexpr kUsage = R"""(Usage:
-  streaming_transcribe [--bitrate N] audio.(raw|ulaw|flac|amr|awb)
+  streaming_transcribe [--bitrate N] [--ptime N] audio.(raw|ulaw|flac|amr|awb)
 )""";
 
-// Write the audio in 64k chunks at a time, simulating audio content arriving
-// from a microphone.
-void MicrophoneThreadMain(RecognizeStream& stream,
-                          std::string const& file_path) {
+// Write one audio packet every ptime ms, simulating audio content arriving
+// from a microphone in ptime ms intervals.
+void MicrophoneThreadMain(RecognizeStream& stream, std::string const& file_path,
+                          const int bitrate, const int ptime,
+                          const int sample_size) {
   speech::v1::StreamingRecognizeRequest request;
   std::ifstream file_stream(file_path, std::ios::binary);
-  auto constexpr kChunkSize = 64 * 1024;
-  std::vector<char> chunk(kChunkSize);
+  // By default, read 64k bytes every 1 second.
+  auto bytes_n = 64 * 1024;
+  auto wait_ms = 1000;
+  // If ptime is configured, read a packet every ptime ms (may only be set for
+  // "raw" and "ulaw").
+  if (ptime) {
+    wait_ms = ptime;
+    bytes_n = (bitrate / 1000) * sample_size * ptime;
+  }
+  std::vector<char> chunk(bytes_n);
   while (true) {
     // Read another chunk from the file.
     file_stream.read(chunk.data(), chunk.size());
@@ -45,7 +54,7 @@ void MicrophoneThreadMain(RecognizeStream& stream,
     // And write the chunk to the stream.
     if (bytes_read > 0) {
       request.set_audio_content(chunk.data(), bytes_read);
-      std::cout << "Sending " << bytes_read / 1024 << "k bytes." << std::endl;
+      std::cout << "Sending " << bytes_read << " bytes." << std::endl;
       if (!stream.Write(request, grpc::WriteOptions()).get()) break;
     }
     if (!file_stream) {
@@ -54,7 +63,7 @@ void MicrophoneThreadMain(RecognizeStream& stream,
       break;
     }
     // Wait a second before writing the next chunk.
-    std::this_thread::sleep_for(std::chrono::seconds(1));
+    std::this_thread::sleep_for(std::chrono::milliseconds(wait_ms));
   }
 }
 
@@ -65,6 +74,9 @@ int main(int argc, char** argv) try {
   // Parse command line arguments.
   auto args = ParseArguments(argc, argv);
   auto const file_path = args.path;
+  auto const bitrate = args.bitrate;
+  auto const ptime = args.ptime;
+  auto const sample_size = args.sample_size;
 
   speech::v1::StreamingRecognizeRequest request;
   auto& streaming_config = *request.mutable_streaming_config();
@@ -81,8 +93,8 @@ int main(int argc, char** argv) try {
   }
 
   // Simulate a microphone thread using the file as input.
-  auto microphone =
-      std::thread(MicrophoneThreadMain, std::ref(*stream), file_path);
+  auto microphone = std::thread(MicrophoneThreadMain, std::ref(*stream),
+                                file_path, bitrate, ptime, sample_size);
   // Read responses.
   auto read = [&stream] { return stream->Read().get(); };
   for (auto response = read(); response.has_value(); response = read()) {
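
For reference, a minimal standalone sketch of the chunk-size arithmetic the patch introduces in MicrophoneThreadMain. The concrete values below (a 16000 Hz LINEAR16 file sent with --ptime 20) are illustrative assumptions, not values taken from the patch:

// chunk_size_sketch.cc -- illustrates the ptime-based packet sizing.
// Build with: g++ -std=c++17 chunk_size_sketch.cc
#include <iostream>

int main() {
  int const bitrate = 16000;   // --bitrate: samples per second (assumed)
  int const sample_size = 2;   // bytes per sample: 2 for LINEAR16, 1 for MULAW
  int const ptime = 20;        // --ptime: milliseconds of audio per packet (assumed)

  // Same formula as the patch: (samples per ms) * (bytes per sample) * ms.
  int const bytes_n = (bitrate / 1000) * sample_size * ptime;
  std::cout << "each packet carries " << bytes_n << " bytes, written every "
            << ptime << " ms\n";  // prints 640 bytes every 20 ms for these values
  return 0;
}

With --ptime left at its default of 0, the tool keeps the previous behavior of sending 64 KiB chunks once per second; a hypothetical invocation exercising the new flag might look like: ./streaming_transcribe --bitrate 8000 --ptime 20 audio.ulaw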