Skip to content

Commit c1da61c

Browse files
committed
feat: Add inputAudioTranscription support to Java ADK
1 parent 54bee7b commit c1da61c

File tree

5 files changed

+63
-3
lines changed

5 files changed

+63
-3
lines changed

core/src/main/java/com/google/adk/agents/RunConfig.java

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,8 +48,12 @@ public enum StreamingMode {
4848

4949
public abstract @Nullable AudioTranscriptionConfig outputAudioTranscription();
5050

51+
public abstract @Nullable AudioTranscriptionConfig inputAudioTranscription();
52+
5153
public abstract int maxLlmCalls();
5254

55+
public abstract Builder toBuilder();
56+
5357
public static Builder builder() {
5458
return new AutoValue_RunConfig.Builder()
5559
.setSaveInputBlobsAsArtifacts(false)
@@ -65,7 +69,8 @@ public static Builder builder(RunConfig runConfig) {
6569
.setMaxLlmCalls(runConfig.maxLlmCalls())
6670
.setResponseModalities(runConfig.responseModalities())
6771
.setSpeechConfig(runConfig.speechConfig())
68-
.setOutputAudioTranscription(runConfig.outputAudioTranscription());
72+
.setOutputAudioTranscription(runConfig.outputAudioTranscription())
73+
.setInputAudioTranscription(runConfig.inputAudioTranscription());
6974
}
7075

7176
/** Builder for {@link RunConfig}. */
@@ -88,6 +93,10 @@ public abstract static class Builder {
8893
public abstract Builder setOutputAudioTranscription(
8994
AudioTranscriptionConfig outputAudioTranscription);
9095

96+
@CanIgnoreReturnValue
97+
public abstract Builder setInputAudioTranscription(
98+
AudioTranscriptionConfig inputAudioTranscription);
99+
91100
@CanIgnoreReturnValue
92101
public abstract Builder setMaxLlmCalls(int maxLlmCalls);
93102

core/src/main/java/com/google/adk/flows/llmflows/Basic.java

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,8 @@ public Single<RequestProcessor.RequestProcessingResult> processRequest(
4848
.ifPresent(liveConnectConfigBuilder::speechConfig);
4949
Optional.ofNullable(context.runConfig().outputAudioTranscription())
5050
.ifPresent(liveConnectConfigBuilder::outputAudioTranscription);
51+
Optional.ofNullable(context.runConfig().inputAudioTranscription())
52+
.ifPresent(liveConnectConfigBuilder::inputAudioTranscription);
5153

5254
LlmRequest.Builder builder =
5355
request.toBuilder()

core/src/main/java/com/google/adk/runner/Runner.java

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -366,8 +366,9 @@ private Single<Session> emitStateDeltaEvent(
366366
private InvocationContext newInvocationContextForLive(
367367
Session session, Optional<LiveRequestQueue> liveRequestQueue, RunConfig runConfig) {
368368
RunConfig.Builder runConfigBuilder = RunConfig.builder(runConfig);
369-
if (!CollectionUtils.isNullOrEmpty(runConfig.responseModalities())
370-
&& liveRequestQueue.isPresent()) {
369+
if (liveRequestQueue.isPresent() && !this.agent.subAgents().isEmpty()) {
370+
// Parity with Python: apply modality defaults and transcription settings
371+
// only for multi-agent live scenarios.
371372
// Default to AUDIO modality if not specified.
372373
if (CollectionUtils.isNullOrEmpty(runConfig.responseModalities())) {
373374
runConfigBuilder.setResponseModalities(
@@ -380,6 +381,10 @@ private InvocationContext newInvocationContextForLive(
380381
runConfigBuilder.setOutputAudioTranscription(AudioTranscriptionConfig.builder().build());
381382
}
382383
}
384+
// Need input transcription for agent transferring in live mode.
385+
if (runConfig.inputAudioTranscription() == null) {
386+
runConfigBuilder.setInputAudioTranscription(AudioTranscriptionConfig.builder().build());
387+
}
383388
}
384389
return newInvocationContext(
385390
session, /* newMessage= */ Optional.empty(), liveRequestQueue, runConfigBuilder.build());

core/src/test/java/com/google/adk/agents/RunConfigTest.java

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -25,6 +25,7 @@ public void testBuilderWithVariousValues() {
2525
.setSaveInputBlobsAsArtifacts(true)
2626
.setStreamingMode(RunConfig.StreamingMode.SSE)
2727
.setOutputAudioTranscription(audioTranscriptionConfig)
28+
.setInputAudioTranscription(audioTranscriptionConfig)
2829
.setMaxLlmCalls(10)
2930
.build();
3031

@@ -33,6 +34,7 @@ public void testBuilderWithVariousValues() {
3334
assertThat(runConfig.saveInputBlobsAsArtifacts()).isTrue();
3435
assertThat(runConfig.streamingMode()).isEqualTo(RunConfig.StreamingMode.SSE);
3536
assertThat(runConfig.outputAudioTranscription()).isEqualTo(audioTranscriptionConfig);
37+
assertThat(runConfig.inputAudioTranscription()).isEqualTo(audioTranscriptionConfig);
3638
assertThat(runConfig.maxLlmCalls()).isEqualTo(10);
3739
}
3840

@@ -45,6 +47,7 @@ public void testBuilderDefaults() {
4547
assertThat(runConfig.saveInputBlobsAsArtifacts()).isFalse();
4648
assertThat(runConfig.streamingMode()).isEqualTo(RunConfig.StreamingMode.NONE);
4749
assertThat(runConfig.outputAudioTranscription()).isNull();
50+
assertThat(runConfig.inputAudioTranscription()).isNull();
4851
assertThat(runConfig.maxLlmCalls()).isEqualTo(500);
4952
}
5053

@@ -66,6 +69,7 @@ public void testBuilderWithDifferentValues() {
6669
.setSaveInputBlobsAsArtifacts(true)
6770
.setStreamingMode(RunConfig.StreamingMode.BIDI)
6871
.setOutputAudioTranscription(audioTranscriptionConfig)
72+
.setInputAudioTranscription(audioTranscriptionConfig)
6973
.setMaxLlmCalls(20)
7074
.build();
7175

@@ -74,6 +78,24 @@ public void testBuilderWithDifferentValues() {
7478
assertThat(runConfig.saveInputBlobsAsArtifacts()).isTrue();
7579
assertThat(runConfig.streamingMode()).isEqualTo(RunConfig.StreamingMode.BIDI);
7680
assertThat(runConfig.outputAudioTranscription()).isEqualTo(audioTranscriptionConfig);
81+
assertThat(runConfig.inputAudioTranscription()).isEqualTo(audioTranscriptionConfig);
7782
assertThat(runConfig.maxLlmCalls()).isEqualTo(20);
7883
}
84+
85+
@Test
86+
public void testInputAudioTranscriptionOnly() {
87+
AudioTranscriptionConfig inputTranscriptionConfig = AudioTranscriptionConfig.builder().build();
88+
89+
RunConfig runConfig =
90+
RunConfig.builder()
91+
.setStreamingMode(RunConfig.StreamingMode.BIDI)
92+
.setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)))
93+
.setInputAudioTranscription(inputTranscriptionConfig)
94+
.build();
95+
96+
assertThat(runConfig.inputAudioTranscription()).isEqualTo(inputTranscriptionConfig);
97+
assertThat(runConfig.outputAudioTranscription()).isNull();
98+
assertThat(runConfig.streamingMode()).isEqualTo(RunConfig.StreamingMode.BIDI);
99+
assertThat(runConfig.responseModalities()).containsExactly(new Modality(Modality.Known.AUDIO));
100+
}
79101
}

core/src/test/java/com/google/adk/flows/llmflows/BasicTest.java

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -220,13 +220,33 @@ public void processRequest_buildsLiveConnectConfigFromRunConfig_outputAudioTrans
220220
assertThat(result.events()).isEmpty();
221221
}
222222

223+
@Test
224+
public void processRequest_buildsLiveConnectConfigFromRunConfig_inputAudioTranscription() {
225+
RunConfig runConfig =
226+
RunConfig.builder().setInputAudioTranscription(TEST_AUDIO_TRANSCRIPTION_CONFIG).build();
227+
LlmAgent agentWithConfig = LlmAgent.builder().name("agentWithConfig").model(testLlm).build();
228+
InvocationContext contextWithRunConfig = createInvocationContext(agentWithConfig, runConfig);
229+
230+
RequestProcessingResult result =
231+
basicProcessor.processRequest(contextWithRunConfig, initialRequest).blockingGet();
232+
233+
LlmRequest updatedRequest = result.updatedRequest();
234+
assertThat(updatedRequest.liveConnectConfig()).isNotNull();
235+
assertThat(updatedRequest.liveConnectConfig().responseModalities().get()).isEmpty();
236+
assertThat(updatedRequest.liveConnectConfig().speechConfig()).isEmpty();
237+
assertThat(updatedRequest.liveConnectConfig().inputAudioTranscription())
238+
.hasValue(TEST_AUDIO_TRANSCRIPTION_CONFIG);
239+
assertThat(result.events()).isEmpty();
240+
}
241+
223242
@Test
224243
public void processRequest_buildsLiveConnectConfigFromRunConfig_allFields() {
225244
RunConfig runConfig =
226245
RunConfig.builder()
227246
.setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)))
228247
.setSpeechConfig(TEST_SPEECH_CONFIG)
229248
.setOutputAudioTranscription(TEST_AUDIO_TRANSCRIPTION_CONFIG)
249+
.setInputAudioTranscription(TEST_AUDIO_TRANSCRIPTION_CONFIG)
230250
.build();
231251
LlmAgent agentWithConfig = LlmAgent.builder().name("agentWithConfig").model(testLlm).build();
232252
InvocationContext contextWithRunConfig = createInvocationContext(agentWithConfig, runConfig);
@@ -241,6 +261,8 @@ public void processRequest_buildsLiveConnectConfigFromRunConfig_allFields() {
241261
assertThat(updatedRequest.liveConnectConfig().speechConfig()).hasValue(TEST_SPEECH_CONFIG);
242262
assertThat(updatedRequest.liveConnectConfig().outputAudioTranscription())
243263
.hasValue(TEST_AUDIO_TRANSCRIPTION_CONFIG);
264+
assertThat(updatedRequest.liveConnectConfig().inputAudioTranscription())
265+
.hasValue(TEST_AUDIO_TRANSCRIPTION_CONFIG);
244266
assertThat(result.events()).isEmpty();
245267
}
246268
}

0 commit comments

Comments
 (0)