From 2abbb436692268993eee088cfeaab562a2cd9df4 Mon Sep 17 00:00:00 2001 From: "serhii.ku" Date: Wed, 23 Apr 2025 13:45:10 +0300 Subject: [PATCH 01/46] Added basic vad functionality --- demo/index.ts | 4 +- package.json | 1 + src/modules/audio.ts | 258 ++++++++++++++++++++++++++++++ vite.config.js => vite.config.mjs | 0 yarn.lock | 140 ++++++++++++++++ 5 files changed, 401 insertions(+), 2 deletions(-) rename vite.config.js => vite.config.mjs (100%) diff --git a/demo/index.ts b/demo/index.ts index bd2f3be..4d4b4bd 100644 --- a/demo/index.ts +++ b/demo/index.ts @@ -612,10 +612,10 @@ loginToAppFormEl?.addEventListener('submit', (event) => { imageSrc: base64Image }) - openSIPSJS.use(screenSharePlugin) + /*openSIPSJS.use(screenSharePlugin) //openSIPSJS.use(streamMaskPlugin) openSIPSJS.use(whiteBoardPlugin) - openSIPSJS.use(screenShareWhiteBoardPlugin) + openSIPSJS.use(screenShareWhiteBoardPlugin)*/ /* openSIPSJS Listeners */ openSIPSJS diff --git a/package.json b/package.json index 035c6ca..259d4c4 100644 --- a/package.json +++ b/package.json @@ -60,6 +60,7 @@ "vue": "3.2.25" }, "dependencies": { + "@ricky0123/vad-web": "^0.0.22", "@types/mime": "^3.0.1", "generate-unique-id": "^2.0.1", "jssip": "3.10.0", diff --git a/src/modules/audio.ts b/src/modules/audio.ts index c6945fa..dfe676c 100644 --- a/src/modules/audio.ts +++ b/src/modules/audio.ts @@ -19,6 +19,7 @@ import { import { isMobile, processAudioVolume, simplifyCallObject, syncStream } from '@/helpers/audio.helper' import { RTCSessionEvent } from 'jssip/lib/UA' import { forEach } from 'p-iteration' +import { MicVAD, utils } from '@ricky0123/vad-web' import audioContext from '@/helpers/audioContext' import { CALL_EVENT_LISTENER_TYPE } from '@/enum/call.event.listener.type' import { IncomingAckEvent, IncomingEvent, OutgoingAckEvent, OutgoingEvent } from 'jssip/lib/RTCSession' @@ -34,6 +35,23 @@ const STORAGE_KEYS = { } const CALL_STATUS_UNANSWERED = 0 +export function debounce (callback, wait) { + let timerId + + const debounced = (...args) => { + clearTimeout(timerId) + timerId = setTimeout(() => { + callback(...args) + }, wait) + } + + debounced.cancel = () => { + clearTimeout(timerId) + } + + return debounced +} + export class AudioModule { private context: OpenSIPSJS private currentActiveRoomIdValue: number | undefined @@ -66,6 +84,8 @@ export class AudioModule { private activeStreamValue: MediaStream | null = null private initialStreamValue: MediaStream | null = null + private vad: MicVAD | null = null + private vadSessions: object = {} private VUMeter: VUMeter @@ -693,6 +713,7 @@ export class AudioModule { } private async roomReconfigure (roomId: number | undefined) { + console.log('roomReconfigure start') if (roomId === undefined) { return } @@ -737,9 +758,12 @@ export class AudioModule { } else if (callsInRoom.length > 1) { await this.doConference(callsInRoom) } + + console.log('roomReconfigure end') } private async doConference (sessions: Array) { + console.log('doConference start') /*await forEach(sessions, async (session: ICall) => { if (session._localHold) { await this.unholdCall(session._id) @@ -788,12 +812,77 @@ export class AudioModule { sourceStream.connect(mixedOutput) } + console.log('doConference') + this.vad?.pause() + this.vad = null + + if (this.vadSessions[session._id]) { + this.vadSessions[session._id].vad.pause() + this.vadSessions[session._id].vad = null + console.log('vad session pause', session._id) + } else { + console.log('vad session else', session._id) + } + + console.log('typeof mixedOutput', typeof mixedOutput) + const mixedStreamCopy = mixedOutput.stream.clone() + const vadSession = await MicVAD.new({ + mixedStreamCopy, + model: 'v5', + //baseAssetPath: '/', + //onnxWASMBasePath: '/', + positiveSpeechThreshold: 0.4, + negativeSpeechThreshold: 0.4, + minSpeechFrames: 15, + preSpeechPadFrames: 30, + onFrameProcessed: async (probs, frame) => { + console.log('VAD probs.isSpeech conference', session._id, probs.isSpeech) + if (probs.isSpeech > 0.001) { + if (!this.vadSessions[session._id].isSpeakingState && mixedOutput) { + console.log('SET SPEAKING - YES') + this.vadSessions[session._id].isSpeakingState = true + clearTimeout(this.vadSessions[session._id].vadInterval) + this.vadSessions[session._id].vadInterval = null + + mixedOutput.stream.getTracks().forEach(track => track.enabled = true) + if (session.connection?.getSenders()[0]) { + await session.connection.getSenders()[0].replaceTrack(mixedOutput.stream.getTracks()[0]) + } + } + } else { + if (this.vadSessions[session._id].isSpeakingState && !this.vadSessions[session._id].vadInterval && mixedOutput) { + this.vadSessions[session._id].vadInterval = setTimeout(async () => { + console.log('SET SPEAKING - NO') + this.vadSessions[session._id].isSpeakingState = false + + mixedOutput.stream.getTracks().forEach(track => track.enabled = false) + if (session.connection?.getSenders()[0]) { + await session.connection.getSenders()[0].replaceTrack(mixedOutput.stream.getTracks()[0]) + } + }, 1500) + } + } + }, + onSpeechEnd: (arr) => { + console.log('VAD onSpeechEnd') + }, + }) + + this.vadSessions[session._id] = { + isSpeakingState: true, + vadInterval: null, + vad: vadSession + } + if (session.connection?.getSenders()[0]) { //mixedOutput.stream.getTracks().forEach(track => track.enabled = !getters.isMuted) // Uncomment to mute all callers on mute await session.connection.getSenders()[0].replaceTrack(mixedOutput.stream.getTracks()[0]) this.muteReconfigure(session) } + + vadSession.start() }) + console.log('doConference end') } private processCallerMute (callId: string, value: boolean) { @@ -1109,6 +1198,8 @@ export class AudioModule { this.setIsMuted(false) this.initialStreamValue?.getTracks().forEach((track) => track.stop()) this.initialStreamValue = null + this.vad?.pause() + this.vad = null } }) session.on('progress', (event: IncomingEvent | OutgoingEvent) => { @@ -1146,6 +1237,8 @@ export class AudioModule { this.setIsMuted(false) this.initialStreamValue?.getTracks().forEach((track) => track.stop()) this.initialStreamValue = null + this.vad?.pause() + this.vad = null } }) session.on('confirmed', (event: IncomingAckEvent | OutgoingAckEvent) => { @@ -1285,8 +1378,173 @@ export class AudioModule { if (this.initialStreamValue) { this.initialStreamValue.getTracks().forEach((track) => track.stop()) this.initialStreamValue = null + this.vad?.pause() + this.vad = null } this.initialStreamValue = stream + const vadStream = stream.clone() + + let isSpeakingState = false + let vadInterval = null + this.vad = await MicVAD.new({ + vadStream, + model: 'v5', + //baseAssetPath: '/', + //onnxWASMBasePath: '/', + positiveSpeechThreshold: 0.4, + negativeSpeechThreshold: 0.4, + minSpeechFrames: 15, + preSpeechPadFrames: 30, + onFrameProcessed: async (probs, frame) => { + console.log('VAD probs.isSpeech', probs.isSpeech) + if (probs.isSpeech > 0.001) { + if (!isSpeakingState && this.initialStreamValue) { + console.log('SET SPEAKING - YES') + isSpeakingState = true + clearTimeout(vadInterval) + vadInterval = null + + const callsInRoom = Object.values(this.extendedCalls) + .filter(call => call.roomId === this.currentActiveRoomId) + + if ( + callsInRoom.length === 1 && + callsInRoom[0].connection && + callsInRoom[0].connection?.getSenders()[0] + ) { + //const processedStream = this.getActiveStream() + /*await */ + this.initialStreamValue.getTracks().forEach(track => track.enabled = true) + await callsInRoom[0].connection.getSenders()[0].replaceTrack(this.initialStreamValue.getTracks()[0]) + //this.muteReconfigure(callsInRoom[0]) + } /*else if (callsInRoom.length > 1) { + const receivedTracks: Array = [] + + callsInRoom.forEach(session => { + if (session !== null && session !== undefined) { + session.connection.getReceivers().forEach((receiver: RTCRtpReceiver) => { + receivedTracks.push(receiver.track) + }) + } + }) + + await forEach(callsInRoom, async (session: ICall) => { + if (session === null || session === undefined) { + return + } + + const allReceivedMediaStreams = new MediaStream() + const mixedOutput = audioContext.createMediaStreamDestination() + + session.connection.getReceivers().forEach((receiver: RTCRtpReceiver) => { + receivedTracks.forEach(track => { + allReceivedMediaStreams.addTrack(receiver.track) + + if (receiver.track.id !== track.id) { + const sourceStream = audioContext.createMediaStreamSource(new MediaStream([ track ])) + sourceStream.connect(mixedOutput) + } + }) + }) + + const sourceStream = audioContext.createMediaStreamSource(this.initialStreamValue) + sourceStream.connect(mixedOutput) + + if (session.connection?.getSenders()[0]) { + await session.connection.getSenders()[0].replaceTrack(mixedOutput.stream.getTracks()[0]) + } + }) + //await this.doConference(callsInRoom) + }*/ + //this.roomReconfigure(this.currentActiveRoomId) + } + } else { + if (isSpeakingState && !vadInterval && this.initialStreamValue) { + vadInterval = setTimeout(async () => { + console.log('SET SPEAKING - NO') + isSpeakingState = false + + /*const callsInRoom = Object.values(this.extendedCalls) + .filter(call => call.roomId === this.currentActiveRoomId) + + if (callsInRoom[0].connection && callsInRoom[0].connection?.getSenders()[0]) { + //const processedStream = this.getActiveStream() + /!*await *!/ + callsInRoom[0].connection.getSenders()[0].replaceTrack(this.initialStreamValue.getTracks()[0]) + //this.muteReconfigure(callsInRoom[0]) + }*/ + + const callsInRoom = Object.values(this.extendedCalls) + .filter(call => call.roomId === this.currentActiveRoomId) + + if ( + callsInRoom.length === 1 && + callsInRoom[0].connection && + callsInRoom[0].connection?.getSenders()[0] + ) { + //const processedStream = this.getActiveStream() + /*await */ + this.initialStreamValue.getTracks().forEach(track => track.enabled = false) + await callsInRoom[0].connection.getSenders()[0].replaceTrack(this.initialStreamValue.getTracks()[0]) + //this.muteReconfigure(callsInRoom[0]) + } /*else if (callsInRoom.length > 1) { + const receivedTracks: Array = [] + + callsInRoom.forEach(session => { + if (session !== null && session !== undefined) { + session.connection.getReceivers().forEach((receiver: RTCRtpReceiver) => { + receivedTracks.push(receiver.track) + }) + } + }) + + await forEach(callsInRoom, async (session: ICall) => { + if (session === null || session === undefined) { + return + } + + const allReceivedMediaStreams = new MediaStream() + const mixedOutput = audioContext.createMediaStreamDestination() + + session.connection.getReceivers().forEach((receiver: RTCRtpReceiver) => { + receivedTracks.forEach(track => { + allReceivedMediaStreams.addTrack(receiver.track) + + if (receiver.track.id !== track.id) { + const sourceStream = audioContext.createMediaStreamSource(new MediaStream([ track ])) + sourceStream.connect(mixedOutput) + } + }) + }) + + const sourceStream = audioContext.createMediaStreamSource(this.initialStreamValue) + sourceStream.connect(mixedOutput) + + if (session.connection?.getSenders()[0]) { + await session.connection.getSenders()[0].replaceTrack(mixedOutput.stream.getTracks()[0]) + } + }) + + }*/ + //this.roomReconfigure(this.currentActiveRoomId) + }, 1500) + } + } + //const indicatorColor = interpolateInferno(probs.isSpeech / 2) + //document.body.style.setProperty("--indicator-color", indicatorColor) + }, + onSpeechEnd: (arr) => { + console.log('VAD onSpeechEnd') + /*const wavBuffer = utils.encodeWAV(arr) + const base64 = utils.arrayBufferToBase64(wavBuffer) + const url = `data:audio/wav;base64,${base64}` + const el = addAudio(url) + const speechList = document.getElementById("playlist") + speechList.prepend(el)*/ + }, + }) + + this.vad.start() } private async triggerAddStream (event: RTCTrackEvent, call: ICall) { diff --git a/vite.config.js b/vite.config.mjs similarity index 100% rename from vite.config.js rename to vite.config.mjs diff --git a/yarn.lock b/yarn.lock index e134d75..cbdbf60 100644 --- a/yarn.lock +++ b/yarn.lock @@ -870,6 +870,66 @@ "@nodelib/fs.scandir" "2.1.5" fastq "^1.6.0" +"@protobufjs/aspromise@^1.1.1", "@protobufjs/aspromise@^1.1.2": + version "1.1.2" + resolved "https://registry.yarnpkg.com/@protobufjs/aspromise/-/aspromise-1.1.2.tgz#9b8b0cc663d669a7d8f6f5d0893a14d348f30fbf" + integrity sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ== + +"@protobufjs/base64@^1.1.2": + version "1.1.2" + resolved "https://registry.yarnpkg.com/@protobufjs/base64/-/base64-1.1.2.tgz#4c85730e59b9a1f1f349047dbf24296034bb2735" + integrity sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg== + +"@protobufjs/codegen@^2.0.4": + version "2.0.4" + resolved "https://registry.yarnpkg.com/@protobufjs/codegen/-/codegen-2.0.4.tgz#7ef37f0d010fb028ad1ad59722e506d9262815cb" + integrity sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg== + +"@protobufjs/eventemitter@^1.1.0": + version "1.1.0" + resolved "https://registry.yarnpkg.com/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz#355cbc98bafad5978f9ed095f397621f1d066b70" + integrity sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q== + +"@protobufjs/fetch@^1.1.0": + version "1.1.0" + resolved "https://registry.yarnpkg.com/@protobufjs/fetch/-/fetch-1.1.0.tgz#ba99fb598614af65700c1619ff06d454b0d84c45" + integrity sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ== + dependencies: + "@protobufjs/aspromise" "^1.1.1" + "@protobufjs/inquire" "^1.1.0" + +"@protobufjs/float@^1.0.2": + version "1.0.2" + resolved "https://registry.yarnpkg.com/@protobufjs/float/-/float-1.0.2.tgz#5e9e1abdcb73fc0a7cb8b291df78c8cbd97b87d1" + integrity sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ== + +"@protobufjs/inquire@^1.1.0": + version "1.1.0" + resolved "https://registry.yarnpkg.com/@protobufjs/inquire/-/inquire-1.1.0.tgz#ff200e3e7cf2429e2dcafc1140828e8cc638f089" + integrity sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q== + +"@protobufjs/path@^1.1.2": + version "1.1.2" + resolved "https://registry.yarnpkg.com/@protobufjs/path/-/path-1.1.2.tgz#6cc2b20c5c9ad6ad0dccfd21ca7673d8d7fbf68d" + integrity sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA== + +"@protobufjs/pool@^1.1.0": + version "1.1.0" + resolved "https://registry.yarnpkg.com/@protobufjs/pool/-/pool-1.1.0.tgz#09fd15f2d6d3abfa9b65bc366506d6ad7846ff54" + integrity sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw== + +"@protobufjs/utf8@^1.1.0": + version "1.1.0" + resolved "https://registry.yarnpkg.com/@protobufjs/utf8/-/utf8-1.1.0.tgz#a777360b5b39a1a2e5106f8e858f2fd2d060c570" + integrity sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw== + +"@ricky0123/vad-web@^0.0.22": + version "0.0.22" + resolved "https://registry.yarnpkg.com/@ricky0123/vad-web/-/vad-web-0.0.22.tgz#35fd050b85a14d2f16e97f2970fd398d3d08b70a" + integrity sha512-679R6sfwXx4jkquK+FJ9RC2W29oulWC+9ZINK6LVpuy90IBV7UaTGNN79oQXufpJTJs5z4X/22nw1DQ4+Rh8CA== + dependencies: + onnxruntime-web "1.14.0" + "@rollup/pluginutils@^5.0.2": version "5.1.0" resolved "https://registry.npmjs.org/@rollup/pluginutils/-/pluginutils-5.1.0.tgz" @@ -1427,6 +1487,11 @@ resolved "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.11.tgz" integrity sha512-wOuvG1SN4Us4rez+tylwwwCV1psiNVOkJeM3AUWUNWg/jDQY2+HE/444y5gc+jBmRqASOm2Oeh5c1axHobwRKQ== +"@types/long@^4.0.1": + version "4.0.2" + resolved "https://registry.yarnpkg.com/@types/long/-/long-4.0.2.tgz#b74129719fc8d11c01868010082d483b7545591a" + integrity sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA== + "@types/mime@^3.0.1": version "3.0.1" resolved "https://registry.npmjs.org/@types/mime/-/mime-3.0.1.tgz" @@ -1437,6 +1502,13 @@ resolved "https://registry.npmjs.org/@types/ms/-/ms-0.7.31.tgz" integrity sha512-iiUgKzV9AuaEkZqkOLDIvlQiL6ltuZd9tGcW3gwpnX8JbuiuhFlEGmmFXEXkN50Cvq7Os88IY2v0dkDqXYWVgA== +"@types/node@>=13.7.0": + version "22.13.10" + resolved "https://registry.yarnpkg.com/@types/node/-/node-22.13.10.tgz#df9ea358c5ed991266becc3109dc2dc9125d77e4" + integrity sha512-I6LPUvlRH+O6VRUqYOcMudhaIdUVWfsjnZavnsraHvpBwaEyMN29ry+0UVJhImYL16xsscu0aske3yA+uPOWfw== + dependencies: + undici-types "~6.20.0" + "@types/node@^18.14.2": version "18.16.14" resolved "https://registry.npmjs.org/@types/node/-/node-18.16.14.tgz" @@ -2179,6 +2251,11 @@ flat-cache@^3.0.4: flatted "^3.1.0" rimraf "^3.0.2" +flatbuffers@^1.12.0: + version "1.12.0" + resolved "https://registry.yarnpkg.com/flatbuffers/-/flatbuffers-1.12.0.tgz#72e87d1726cb1b216e839ef02658aa87dcef68aa" + integrity sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ== + flatted@^3.1.0: version "3.2.7" resolved "https://registry.npmjs.org/flatted/-/flatted-3.2.7.tgz" @@ -2273,6 +2350,11 @@ graphemer@^1.4.0: resolved "https://registry.npmjs.org/graphemer/-/graphemer-1.4.0.tgz" integrity sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag== +guid-typescript@^1.0.9: + version "1.0.9" + resolved "https://registry.yarnpkg.com/guid-typescript/-/guid-typescript-1.0.9.tgz#e35f77003535b0297ea08548f5ace6adb1480ddc" + integrity sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ== + has-flag@^4.0.0: version "4.0.0" resolved "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz" @@ -2462,6 +2544,11 @@ loglevel@^1.8.1: resolved "https://registry.npmjs.org/loglevel/-/loglevel-1.8.1.tgz" integrity sha512-tCRIJM51SHjAayKwC+QAg8hT8vg6z7GSgLJKGvzuPb1Wc+hLzqtuVLxp6/HzSPOozuK+8ErAhy7U/sVzw8Dgfg== +long@^4.0.0: + version "4.0.0" + resolved "https://registry.yarnpkg.com/long/-/long-4.0.0.tgz#9a7b71cfb7d361a194ea555241c92f7468d5bf28" + integrity sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA== + lru-cache@^4.0.1: version "4.1.5" resolved "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.5.tgz" @@ -2570,6 +2657,30 @@ once@^1.3.0: dependencies: wrappy "1" +onnx-proto@^4.0.4: + version "4.0.4" + resolved "https://registry.yarnpkg.com/onnx-proto/-/onnx-proto-4.0.4.tgz#2431a25bee25148e915906dda0687aafe3b9e044" + integrity sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA== + dependencies: + protobufjs "^6.8.8" + +onnxruntime-common@~1.14.0: + version "1.14.0" + resolved "https://registry.yarnpkg.com/onnxruntime-common/-/onnxruntime-common-1.14.0.tgz#2bb5dac5261269779aa5fb6536ca379657de8bf6" + integrity sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew== + +onnxruntime-web@1.14.0: + version "1.14.0" + resolved "https://registry.yarnpkg.com/onnxruntime-web/-/onnxruntime-web-1.14.0.tgz#c8cee538781b1d4c1c6b043934f4a3e6ddf1466e" + integrity sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw== + dependencies: + flatbuffers "^1.12.0" + guid-typescript "^1.0.9" + long "^4.0.0" + onnx-proto "^4.0.4" + onnxruntime-common "~1.14.0" + platform "^1.3.6" + optionator@^0.9.1: version "0.9.1" resolved "https://registry.npmjs.org/optionator/-/optionator-0.9.1.tgz" @@ -2653,6 +2764,11 @@ picomatch@^2.0.4, picomatch@^2.2.1, picomatch@^2.3.1: resolved "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz" integrity sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA== +platform@^1.3.6: + version "1.3.6" + resolved "https://registry.yarnpkg.com/platform/-/platform-1.3.6.tgz#48b4ce983164b209c2d45a107adb31f473a6e7a7" + integrity sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg== + plimit-lit@^1.2.6: version "1.5.0" resolved "https://registry.npmjs.org/plimit-lit/-/plimit-lit-1.5.0.tgz" @@ -2693,6 +2809,25 @@ process@^0.11.10: resolved "https://registry.npmjs.org/process/-/process-0.11.10.tgz" integrity sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A== +protobufjs@^6.8.8: + version "6.11.4" + resolved "https://registry.yarnpkg.com/protobufjs/-/protobufjs-6.11.4.tgz#29a412c38bf70d89e537b6d02d904a6f448173aa" + integrity sha512-5kQWPaJHi1WoCpjTGszzQ32PG2F4+wRY6BmAT4Vfw56Q2FZ4YZzK20xUYQH4YkfehY1e6QSICrJquM6xXZNcrw== + dependencies: + "@protobufjs/aspromise" "^1.1.2" + "@protobufjs/base64" "^1.1.2" + "@protobufjs/codegen" "^2.0.4" + "@protobufjs/eventemitter" "^1.1.0" + "@protobufjs/fetch" "^1.1.0" + "@protobufjs/float" "^1.0.2" + "@protobufjs/inquire" "^1.1.0" + "@protobufjs/path" "^1.1.2" + "@protobufjs/pool" "^1.1.0" + "@protobufjs/utf8" "^1.1.0" + "@types/long" "^4.0.1" + "@types/node" ">=13.7.0" + long "^4.0.0" + pseudomap@^1.0.2: version "1.0.2" resolved "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz" @@ -2969,6 +3104,11 @@ typescript@~5.0.4: resolved "https://registry.npmjs.org/typescript/-/typescript-5.0.4.tgz" integrity sha512-cW9T5W9xY37cc+jfEnaUvX91foxtHkza3Nw3wkoF4sSlKn0MONdkdEndig/qPBWXNkmplh3NzayQzCiHM4/hqw== +undici-types@~6.20.0: + version "6.20.0" + resolved "https://registry.yarnpkg.com/undici-types/-/undici-types-6.20.0.tgz#8171bf22c1f588d1554d55bf204bc624af388433" + integrity sha512-Ny6QZ2Nju20vw1SRHe3d9jVu6gJ+4e3+MMpqu7pqE5HT6WsTSlce++GQmK5UXS8mzV8DSYHrQH+Xrf2jVcuKNg== + universalify@^0.1.0: version "0.1.2" resolved "https://registry.npmjs.org/universalify/-/universalify-0.1.2.tgz" From 6a553fcd862c2006b1e27b78be45bfaaa6b7ab38 Mon Sep 17 00:00:00 2001 From: "serhii.ku" Date: Fri, 2 May 2025 12:29:39 +0300 Subject: [PATCH 02/46] Improved vad usage --- src/modules/audio.ts | 144 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 119 insertions(+), 25 deletions(-) diff --git a/src/modules/audio.ts b/src/modules/audio.ts index dfe676c..4120a8b 100644 --- a/src/modules/audio.ts +++ b/src/modules/audio.ts @@ -84,7 +84,7 @@ export class AudioModule { private activeStreamValue: MediaStream | null = null private initialStreamValue: MediaStream | null = null - private vad: MicVAD | null = null + //private vad: MicVAD | null = null private vadSessions: object = {} private VUMeter: VUMeter @@ -712,6 +712,90 @@ export class AudioModule { } } + private async processSessionVad (session, newStream) { + if (this.vadSessions[session._id]) { + this.vadSessions[session._id].pause() + this.vadSessions[session._id] = null + console.log('vad session pause', session._id) + } else { + console.log('vad session else', session._id) + } + + console.log('typeof mixedOutput', typeof newStream) + const streamCopy = newStream.clone() + const vadSession = await MicVAD.new({ + streamCopy, + model: 'v5', + //baseAssetPath: '/', + //onnxWASMBasePath: '/', + positiveSpeechThreshold: 0.4, + negativeSpeechThreshold: 0.4, + minSpeechFrames: 15, + preSpeechPadFrames: 30, + /*onFrameProcessed: async (probs, frame) => { + console.log('VAD probs.isSpeech conference', session._id, probs.isSpeech) + if (probs.isSpeech > 0.001) { + if (!this.vadSessions[session._id].isSpeakingState && newStream) { + console.log('SET SPEAKING - YES') + this.vadSessions[session._id].isSpeakingState = true + clearTimeout(this.vadSessions[session._id].vadInterval) + this.vadSessions[session._id].vadInterval = null + + newStream.getTracks().forEach(track => track.enabled = true) + if (session.connection?.getSenders()[0]) { + await session.connection.getSenders()[0].replaceTrack(newStream.getTracks()[0]) + } + } + } else { + if (this.vadSessions[session._id].isSpeakingState && !this.vadSessions[session._id].vadInterval && newStream) { + this.vadSessions[session._id].vadInterval = setTimeout(async () => { + console.log('SET SPEAKING - NO') + this.vadSessions[session._id].isSpeakingState = false + + newStream.getTracks().forEach(track => track.enabled = false) + if (session.connection?.getSenders()[0]) { + await session.connection.getSenders()[0].replaceTrack(newStream.getTracks()[0]) + } + }, 1500) + } + } + }, + onSpeechEnd: (arr) => { + console.log('VAD onSpeechEnd') + },*/ + onSpeechStart: async () => { + console.log('onSpeechStart') + //this.vadSessions[session._id].isSpeakingState = true + //clearTimeout(this.vadSessions[session._id].vadInterval) + //this.vadSessions[session._id].vadInterval = null + + newStream.getTracks().forEach(track => track.enabled = true) + if (session.connection?.getSenders()[0]) { + await session.connection.getSenders()[0].replaceTrack(newStream.getTracks()[0]) + } + }, + onVADMisfire: async () => { + console.log('onVADMisfire') + //this.vadSessions[session._id].isSpeakingState = false + + newStream.getTracks().forEach(track => track.enabled = false) + if (session.connection?.getSenders()[0]) { + await session.connection.getSenders()[0].replaceTrack(newStream.getTracks()[0]) + } + } + }) + + this.vadSessions[session._id] = vadSession + vadSession.start() + } + + private stopSessionVad (session) { + if (this.vadSessions[session._id]) { + this.vadSessions[session._id].pause() + this.vadSessions[session._id] = null + } + } + private async roomReconfigure (roomId: number | undefined) { console.log('roomReconfigure start') if (roomId === undefined) { @@ -752,6 +836,9 @@ export class AudioModule { if (callsInRoom[0].connection && callsInRoom[0].connection?.getSenders()[0]) { const processedStream = this.getActiveStream() + + this.processSessionVad(callsInRoom[0], processedStream) + await callsInRoom[0].connection.getSenders()[0].replaceTrack(processedStream.getTracks()[0]) this.muteReconfigure(callsInRoom[0]) } @@ -813,10 +900,13 @@ export class AudioModule { } console.log('doConference') - this.vad?.pause() - this.vad = null - if (this.vadSessions[session._id]) { + this.processSessionVad(session, mixedOutput.stream) + + //this.vad?.pause() + //this.vad = null + + /*if (this.vadSessions[session._id]) { this.vadSessions[session._id].vad.pause() this.vadSessions[session._id].vad = null console.log('vad session pause', session._id) @@ -872,15 +962,13 @@ export class AudioModule { isSpeakingState: true, vadInterval: null, vad: vadSession - } + }*/ if (session.connection?.getSenders()[0]) { //mixedOutput.stream.getTracks().forEach(track => track.enabled = !getters.isMuted) // Uncomment to mute all callers on mute await session.connection.getSenders()[0].replaceTrack(mixedOutput.stream.getTracks()[0]) this.muteReconfigure(session) } - - vadSession.start() }) console.log('doConference end') } @@ -1184,6 +1272,8 @@ export class AudioModule { session, event }) + + this.stopSessionVad(session) const s = this.getActiveCalls[session.id] if (s) { @@ -1198,8 +1288,8 @@ export class AudioModule { this.setIsMuted(false) this.initialStreamValue?.getTracks().forEach((track) => track.stop()) this.initialStreamValue = null - this.vad?.pause() - this.vad = null + //this.vad?.pause() + //this.vad = null } }) session.on('progress', (event: IncomingEvent | OutgoingEvent) => { @@ -1219,6 +1309,8 @@ export class AudioModule { event }) + this.stopSessionVad(session) + if (session.id === this.callAddingInProgress) { this.callAddingInProgress = undefined } @@ -1237,8 +1329,8 @@ export class AudioModule { this.setIsMuted(false) this.initialStreamValue?.getTracks().forEach((track) => track.stop()) this.initialStreamValue = null - this.vad?.pause() - this.vad = null + //this.vad?.pause() + //this.vad = null } }) session.on('confirmed', (event: IncomingAckEvent | OutgoingAckEvent) => { @@ -1378,11 +1470,11 @@ export class AudioModule { if (this.initialStreamValue) { this.initialStreamValue.getTracks().forEach((track) => track.stop()) this.initialStreamValue = null - this.vad?.pause() - this.vad = null + //this.vad?.pause() + //this.vad = null } this.initialStreamValue = stream - const vadStream = stream.clone() + /*const vadStream = stream.clone() let isSpeakingState = false let vadInterval = null @@ -1413,11 +1505,11 @@ export class AudioModule { callsInRoom[0].connection?.getSenders()[0] ) { //const processedStream = this.getActiveStream() - /*await */ + /!*await *!/ this.initialStreamValue.getTracks().forEach(track => track.enabled = true) await callsInRoom[0].connection.getSenders()[0].replaceTrack(this.initialStreamValue.getTracks()[0]) //this.muteReconfigure(callsInRoom[0]) - } /*else if (callsInRoom.length > 1) { + } /!*else if (callsInRoom.length > 1) { const receivedTracks: Array = [] callsInRoom.forEach(session => { @@ -1455,7 +1547,7 @@ export class AudioModule { } }) //await this.doConference(callsInRoom) - }*/ + }*!/ //this.roomReconfigure(this.currentActiveRoomId) } } else { @@ -1464,7 +1556,7 @@ export class AudioModule { console.log('SET SPEAKING - NO') isSpeakingState = false - /*const callsInRoom = Object.values(this.extendedCalls) + /!*const callsInRoom = Object.values(this.extendedCalls) .filter(call => call.roomId === this.currentActiveRoomId) if (callsInRoom[0].connection && callsInRoom[0].connection?.getSenders()[0]) { @@ -1472,7 +1564,7 @@ export class AudioModule { /!*await *!/ callsInRoom[0].connection.getSenders()[0].replaceTrack(this.initialStreamValue.getTracks()[0]) //this.muteReconfigure(callsInRoom[0]) - }*/ + }*!/ const callsInRoom = Object.values(this.extendedCalls) .filter(call => call.roomId === this.currentActiveRoomId) @@ -1483,11 +1575,11 @@ export class AudioModule { callsInRoom[0].connection?.getSenders()[0] ) { //const processedStream = this.getActiveStream() - /*await */ + /!*await *!/ this.initialStreamValue.getTracks().forEach(track => track.enabled = false) await callsInRoom[0].connection.getSenders()[0].replaceTrack(this.initialStreamValue.getTracks()[0]) //this.muteReconfigure(callsInRoom[0]) - } /*else if (callsInRoom.length > 1) { + } /!*else if (callsInRoom.length > 1) { const receivedTracks: Array = [] callsInRoom.forEach(session => { @@ -1525,7 +1617,7 @@ export class AudioModule { } }) - }*/ + }*!/ //this.roomReconfigure(this.currentActiveRoomId) }, 1500) } @@ -1535,16 +1627,16 @@ export class AudioModule { }, onSpeechEnd: (arr) => { console.log('VAD onSpeechEnd') - /*const wavBuffer = utils.encodeWAV(arr) + /!*const wavBuffer = utils.encodeWAV(arr) const base64 = utils.arrayBufferToBase64(wavBuffer) const url = `data:audio/wav;base64,${base64}` const el = addAudio(url) const speechList = document.getElementById("playlist") - speechList.prepend(el)*/ + speechList.prepend(el)*!/ }, }) - this.vad.start() + this.vad.start()*/ } private async triggerAddStream (event: RTCTrackEvent, call: ICall) { @@ -1567,6 +1659,8 @@ export class AudioModule { this.setupVUMeter(stream, call._id) this.getCallQuality(call) this.updateCall(call) + + this.processSessionVad(call, stream) } //@requireInitialization() From 4a8ad54eb77557da9205078b1a6f9baf4cc95b4e Mon Sep 17 00:00:00 2001 From: "serhii.ku" Date: Tue, 13 May 2025 15:00:59 +0300 Subject: [PATCH 03/46] Added form for vad configuration --- demo/index.html | 132 +++++++++++++ demo/index.ts | 49 +++++ package.json | 2 +- src/modules/audio.ts | 456 ++++++++++++++++--------------------------- src/types/rtc.d.ts | 11 ++ yarn.lock | 8 +- 6 files changed, 363 insertions(+), 295 deletions(-) diff --git a/demo/index.html b/demo/index.html index 6c9e399..ecc9239 100644 --- a/demo/index.html +++ b/demo/index.html @@ -10,6 +10,8 @@ Document + +
OpenSIPS
@@ -63,6 +65,9 @@

Audio Calls

+ Playlist +
+
@@ -156,6 +161,133 @@

Audio Calls

+
+

VAD Configuration

+ + + + + + + + + + + + + +
+ +
+ +
+

RoomList


diff --git a/demo/index.ts b/demo/index.ts index 4d4b4bd..04a6ef5 100644 --- a/demo/index.ts +++ b/demo/index.ts @@ -62,6 +62,21 @@ const terminateJanusSessionButtonEl = document.getElementById('terminateJanusSes const agentVoiceLevelContainerEl = document.getElementById('agentVoiceLevelContainer') +const saveVADConfigurationButtonEl = document.getElementById('saveVADConfigurationButton') +const positiveSpeechThresholdInputEl = document.getElementById('positiveSpeechThreshold') as HTMLInputElement +const negativeSpeechThresholdInputEl = document.getElementById('negativeSpeechThreshold') as HTMLInputElement +const preSpeechPadFramesInputEl = document.getElementById('preSpeechPadFrames') as HTMLInputElement +const redemptionFramesInputEl = document.getElementById('redemptionFrames') as HTMLInputElement +const frameSamplesInputEl = document.getElementById('frameSamples') as HTMLInputElement +const minSpeechFramesInputEl = document.getElementById('minSpeechFrames') as HTMLInputElement +const submitUserSpeechOnPauseCheckboxEl = document.getElementById('submitUserSpeechOnPause') as HTMLInputElement +const baseAssetPathInputEl = document.getElementById('baseAssetPath') as HTMLInputElement +const onnxWASMBasePathInputEl = document.getElementById('onnxWASMBasePath') as HTMLInputElement +const modelInputEl = document.getElementById('model') as HTMLInputElement +const startOnLoadCheckboxEl = document.getElementById('startOnLoad') as HTMLInputElement +const userSpeakingThresholdInputEl = document.getElementById('userSpeakingThreshold') as HTMLInputElement + + const activeCallsCounterEl = document.getElementById('activeCallsCounter') const roomSelectEl = document.getElementById('roomSelect') as HTMLSelectElement @@ -1485,6 +1500,40 @@ dtmfForm?.addEventListener( openSIPSJS.audio.sendDTMF(callsInActiveRoom[0]._id, dtmfTarget) }) +saveVADConfigurationButtonEl?.addEventListener( + 'click', + async (event) => { + event.preventDefault() + + const positiveSpeechThreshold = Number(positiveSpeechThresholdInputEl.value) + const negativeSpeechThreshold = Number(negativeSpeechThresholdInputEl.value) + const preSpeechPadFrames = Number(preSpeechPadFramesInputEl.value) + const redemptionFrames = Number(redemptionFramesInputEl.value) + const frameSamples = Number(frameSamplesInputEl.value) + const minSpeechFrames = Number(minSpeechFramesInputEl.value) + const submitUserSpeechOnPause = submitUserSpeechOnPauseCheckboxEl.checked + const baseAssetPath = baseAssetPathInputEl.value + const onnxWASMBasePath = onnxWASMBasePathInputEl.value + const model = modelInputEl.value + const startOnLoad = startOnLoadCheckboxEl.checked + const userSpeakingThreshold = Number(userSpeakingThresholdInputEl.value) + + openSIPSJS.audio.setVADConfiguration({ + positiveSpeechThreshold, + negativeSpeechThreshold, + preSpeechPadFrames, + redemptionFrames, + frameSamples, + minSpeechFrames, + submitUserSpeechOnPause, + baseAssetPath, + onnxWASMBasePath, + model, + startOnLoad, + userSpeakingThreshold + }) + }) + roomSelectEl?.addEventListener( 'change', async (event) => { diff --git a/package.json b/package.json index 259d4c4..b326c33 100644 --- a/package.json +++ b/package.json @@ -60,7 +60,7 @@ "vue": "3.2.25" }, "dependencies": { - "@ricky0123/vad-web": "^0.0.22", + "@ricky0123/vad-web": "^0.0.24", "@types/mime": "^3.0.1", "generate-unique-id": "^2.0.1", "jssip": "3.10.0", diff --git a/src/modules/audio.ts b/src/modules/audio.ts index 4120a8b..072d85b 100644 --- a/src/modules/audio.ts +++ b/src/modules/audio.ts @@ -5,7 +5,8 @@ import { IntervalType, IRoom, IRoomUpdate, - RTCSessionExtended + RTCSessionExtended, + VADOptions } from '@/types/rtc' import { CallTime, ITimeData, TempTimeData } from '@/types/timer' import { setupTime } from '@/helpers/time.helper' @@ -19,7 +20,7 @@ import { import { isMobile, processAudioVolume, simplifyCallObject, syncStream } from '@/helpers/audio.helper' import { RTCSessionEvent } from 'jssip/lib/UA' import { forEach } from 'p-iteration' -import { MicVAD, utils } from '@ricky0123/vad-web' +import { MicVAD, utils, getDefaultRealTimeVADOptions } from '@ricky0123/vad-web' import audioContext from '@/helpers/audioContext' import { CALL_EVENT_LISTENER_TYPE } from '@/enum/call.event.listener.type' import { IncomingAckEvent, IncomingEvent, OutgoingAckEvent, OutgoingEvent } from 'jssip/lib/RTCSession' @@ -29,6 +30,9 @@ import { METRIC_KEYS_TO_INCLUDE } from '@/enum/metric.keys.to.include' import VUMeter from '@/helpers/VUMeter' import OpenSIPSJS from '@/index' +import * as ort from 'onnxruntime-web' +ort.env.wasm.wasmPaths = '/' + const STORAGE_KEYS = { SELECTED_INPUT_DEVICE: 'OpensipsJSInputDevice', SELECTED_OUTPUT_DEVICE: 'OpensipsJSOutputDevice' @@ -52,6 +56,16 @@ export function debounce (callback, wait) { return debounced } +function addAudio (audioUrl) { + const entry = document.createElement('li') + const audio = document.createElement('audio') + audio.controls = true + audio.src = audioUrl + entry.classList.add('newItem') + entry.appendChild(audio) + return entry +} + export class AudioModule { private context: OpenSIPSJS private currentActiveRoomIdValue: number | undefined @@ -84,8 +98,10 @@ export class AudioModule { private activeStreamValue: MediaStream | null = null private initialStreamValue: MediaStream | null = null - //private vad: MicVAD | null = null + + private useVAD = false private vadSessions: object = {} + private vadConfiguration: Partial = {} private VUMeter: VUMeter @@ -101,9 +117,29 @@ export class AudioModule { onChangeFunction: this.emitVolumeChange.bind(this) }) + this.processVADConfiguration() this.initializeMediaDevices() } + public setVADConfiguration (options) { + console.log('setVADConfiguration', options) + this.vadConfiguration = { + ...options + } + } + + private processVADConfiguration () { + this.useVAD = this.context.options.configuration?.useVAD ?? true + + /*this.vadConfiguration = { + model: this.context.options.configuration?.VADOptions?.model ?? 'legacy', + positiveSpeechThreshold: this.context.options.configuration?.VADOptions?.positiveSpeechThreshold ?? 0.4, + negativeSpeechThreshold: this.context.options.configuration?.VADOptions?.negativeSpeechThreshold ?? 0.4, + minSpeechFrames: this.context.options.configuration?.VADOptions?.minSpeechFrames ?? 15, + preSpeechPadFrames: this.context.options.configuration?.VADOptions?.preSpeechPadFrames ?? 30 + }*/ + } + public get sipOptions () { const options = { ...this.context.options.sipOptions, @@ -712,81 +748,143 @@ export class AudioModule { } } - private async processSessionVad (session, newStream) { - if (this.vadSessions[session._id]) { - this.vadSessions[session._id].pause() - this.vadSessions[session._id] = null - console.log('vad session pause', session._id) - } else { - console.log('vad session else', session._id) - } + private async processVAD (session, stream) { + const clonedStream = stream.clone() + + clonedStream.getTracks().forEach((track) => track.enabled = true) + + const newStream = await navigator.mediaDevices.getUserMedia({ + audio: { + sampleRate: 16000, // <-- ensure this is at least 16000 + channelCount: 1, + echoCancellation: false, + noiseSuppression: false, + autoGainControl: false + }, + video: false + }) + + const options = getDefaultRealTimeVADOptions('legacy') + console.log('VAD Options', { + ...options, + ...this.vadConfiguration, + stream: newStream, + }) - console.log('typeof mixedOutput', typeof newStream) - const streamCopy = newStream.clone() const vadSession = await MicVAD.new({ - streamCopy, - model: 'v5', + ...options, + /*positiveSpeechThreshold: 0.35, + negativeSpeechThreshold: 0.35, + preSpeechPadFrames: 4, + redemptionFrames: 8, + frameSamples: 1536, + minSpeechFrames: 2, + submitUserSpeechOnPause: false,*/ + ...this.vadConfiguration, + stream: newStream, + /*model: 'v5', //baseAssetPath: '/', //onnxWASMBasePath: '/', positiveSpeechThreshold: 0.4, negativeSpeechThreshold: 0.4, minSpeechFrames: 15, - preSpeechPadFrames: 30, - /*onFrameProcessed: async (probs, frame) => { - console.log('VAD probs.isSpeech conference', session._id, probs.isSpeech) - if (probs.isSpeech > 0.001) { - if (!this.vadSessions[session._id].isSpeakingState && newStream) { - console.log('SET SPEAKING - YES') - this.vadSessions[session._id].isSpeakingState = true - clearTimeout(this.vadSessions[session._id].vadInterval) - this.vadSessions[session._id].vadInterval = null - - newStream.getTracks().forEach(track => track.enabled = true) - if (session.connection?.getSenders()[0]) { - await session.connection.getSenders()[0].replaceTrack(newStream.getTracks()[0]) - } - } - } else { - if (this.vadSessions[session._id].isSpeakingState && !this.vadSessions[session._id].vadInterval && newStream) { - this.vadSessions[session._id].vadInterval = setTimeout(async () => { - console.log('SET SPEAKING - NO') - this.vadSessions[session._id].isSpeakingState = false - - newStream.getTracks().forEach(track => track.enabled = false) - if (session.connection?.getSenders()[0]) { - await session.connection.getSenders()[0].replaceTrack(newStream.getTracks()[0]) - } - }, 1500) - } + preSpeechPadFrames: 30,*/ + /*model: 'legacy', + positiveSpeechThreshold: 0.4, + negativeSpeechThreshold: 0.4, + minSpeechFrames: 15, + preSpeechPadFrames: 30,*/ + /*model: 'v5', + positiveSpeechThreshold: 0.5, + negativeSpeechThreshold: 0.35, + minSpeechFrames: 9, + preSpeechPadFrames: 3, + redemptionFrames: 24, + frameSamples: 512, + submitUserSpeechOnPause: false, + baseAssetPath: 'https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.20/dist/', + onnxWASMBasePath: 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/',*/ + onVADMisfire: () => { + console.log('Vad misfire') + /*if (session.connection?.getSenders()[0]) { + session.connection.getSenders()[0].track.enabled = false + console.log('After disable', session._id) + }*/ + }, + onFrameProcessed: (probabilities, frame) => { + console.log('onFrameProcessed') + }, + onSpeechStart: () => { + console.log('Speech start') + if (session.connection?.getSenders()[0]) { + session.connection.getSenders()[0].track.enabled = true + console.log('After enable', session._id) } }, + onSpeechRealStart: () => { + console.log('Speech real start') + }, onSpeechEnd: (arr) => { - console.log('VAD onSpeechEnd') - },*/ - onSpeechStart: async () => { - console.log('onSpeechStart') - //this.vadSessions[session._id].isSpeakingState = true - //clearTimeout(this.vadSessions[session._id].vadInterval) - //this.vadSessions[session._id].vadInterval = null + console.log('Speech end') + /*const wavBuffer = utils.encodeWAV(audio) + const base64 = utils.arrayBufferToBase64(wavBuffer) + const url = `data:audio/wav;base64,${base64}`*/ + if (session.connection?.getSenders()[0]) { + session.connection.getSenders()[0].track.enabled = false + console.log('After disable', session._id) + } - newStream.getTracks().forEach(track => track.enabled = true) + + const wavBuffer = utils.encodeWAV(arr) + const base64 = utils.arrayBufferToBase64(wavBuffer) + const url = `data:audio/wav;base64,${base64}` + const el = addAudio(url) + const speechList = document.getElementById('playlist') + console.log('prepend') + speechList.prepend(el) + //setAudioList((old) => [ url, ...old ]) + }, + /*onFrameProcessed: (probabilities, frame) => { + console.log('onFrameProcessed') + }, + onSpeechRealStart: () => { + console.log('Speech real start') + }, + onSpeechStart: async () => { + console.log('onSpeechStart', session._id) if (session.connection?.getSenders()[0]) { - await session.connection.getSenders()[0].replaceTrack(newStream.getTracks()[0]) + session.connection.getSenders()[0].track.enabled = true + console.log('After enable', session._id) } }, onVADMisfire: async () => { - console.log('onVADMisfire') - //this.vadSessions[session._id].isSpeakingState = false - - newStream.getTracks().forEach(track => track.enabled = false) + console.log('onVADMisfire', session._id) if (session.connection?.getSenders()[0]) { - await session.connection.getSenders()[0].replaceTrack(newStream.getTracks()[0]) + session.connection.getSenders()[0].track.enabled = false + console.log('After disable', session._id) } - } + }, + onSpeechEnd: (arr) => { + console.log('onSpeechEnd') + const wavBuffer = utils.encodeWAV(arr) + const base64 = utils.arrayBufferToBase64(wavBuffer) + const url = `data:audio/wav;base64,${base64}` + const el = addAudio(url) + const speechList = document.getElementById('playlist') + console.log('prepend') + speechList.prepend(el) + },*/ }) + if (this.vadSessions[session._id]) { + this.vadSessions[session._id].pause() + delete this.vadSessions[session._id] + } + this.vadSessions[session._id] = vadSession vadSession.start() + + console.log('this.vadSessions', this.vadSessions) } private stopSessionVad (session) { @@ -797,7 +895,6 @@ export class AudioModule { } private async roomReconfigure (roomId: number | undefined) { - console.log('roomReconfigure start') if (roomId === undefined) { return } @@ -837,7 +934,9 @@ export class AudioModule { if (callsInRoom[0].connection && callsInRoom[0].connection?.getSenders()[0]) { const processedStream = this.getActiveStream() - this.processSessionVad(callsInRoom[0], processedStream) + if (this.useVAD) { + this.processVAD(callsInRoom[0], processedStream) + } await callsInRoom[0].connection.getSenders()[0].replaceTrack(processedStream.getTracks()[0]) this.muteReconfigure(callsInRoom[0]) @@ -845,12 +944,9 @@ export class AudioModule { } else if (callsInRoom.length > 1) { await this.doConference(callsInRoom) } - - console.log('roomReconfigure end') } private async doConference (sessions: Array) { - console.log('doConference start') /*await forEach(sessions, async (session: ICall) => { if (session._localHold) { await this.unholdCall(session._id) @@ -899,78 +995,16 @@ export class AudioModule { sourceStream.connect(mixedOutput) } - console.log('doConference') - - this.processSessionVad(session, mixedOutput.stream) - - //this.vad?.pause() - //this.vad = null - - /*if (this.vadSessions[session._id]) { - this.vadSessions[session._id].vad.pause() - this.vadSessions[session._id].vad = null - console.log('vad session pause', session._id) - } else { - console.log('vad session else', session._id) + if (this.useVAD) { + this.processVAD(session, mixedOutput.stream) } - console.log('typeof mixedOutput', typeof mixedOutput) - const mixedStreamCopy = mixedOutput.stream.clone() - const vadSession = await MicVAD.new({ - mixedStreamCopy, - model: 'v5', - //baseAssetPath: '/', - //onnxWASMBasePath: '/', - positiveSpeechThreshold: 0.4, - negativeSpeechThreshold: 0.4, - minSpeechFrames: 15, - preSpeechPadFrames: 30, - onFrameProcessed: async (probs, frame) => { - console.log('VAD probs.isSpeech conference', session._id, probs.isSpeech) - if (probs.isSpeech > 0.001) { - if (!this.vadSessions[session._id].isSpeakingState && mixedOutput) { - console.log('SET SPEAKING - YES') - this.vadSessions[session._id].isSpeakingState = true - clearTimeout(this.vadSessions[session._id].vadInterval) - this.vadSessions[session._id].vadInterval = null - - mixedOutput.stream.getTracks().forEach(track => track.enabled = true) - if (session.connection?.getSenders()[0]) { - await session.connection.getSenders()[0].replaceTrack(mixedOutput.stream.getTracks()[0]) - } - } - } else { - if (this.vadSessions[session._id].isSpeakingState && !this.vadSessions[session._id].vadInterval && mixedOutput) { - this.vadSessions[session._id].vadInterval = setTimeout(async () => { - console.log('SET SPEAKING - NO') - this.vadSessions[session._id].isSpeakingState = false - - mixedOutput.stream.getTracks().forEach(track => track.enabled = false) - if (session.connection?.getSenders()[0]) { - await session.connection.getSenders()[0].replaceTrack(mixedOutput.stream.getTracks()[0]) - } - }, 1500) - } - } - }, - onSpeechEnd: (arr) => { - console.log('VAD onSpeechEnd') - }, - }) - - this.vadSessions[session._id] = { - isSpeakingState: true, - vadInterval: null, - vad: vadSession - }*/ - if (session.connection?.getSenders()[0]) { //mixedOutput.stream.getTracks().forEach(track => track.enabled = !getters.isMuted) // Uncomment to mute all callers on mute await session.connection.getSenders()[0].replaceTrack(mixedOutput.stream.getTracks()[0]) this.muteReconfigure(session) } }) - console.log('doConference end') } private processCallerMute (callId: string, value: boolean) { @@ -1465,7 +1499,7 @@ export class AudioModule { } async setupStream () { - const stream = await navigator.mediaDevices.getUserMedia(this.getUserMediaConstraints) + //const stream = await navigator.mediaDevices.getUserMedia(this.getUserMediaConstraints) if (this.initialStreamValue) { this.initialStreamValue.getTracks().forEach((track) => track.stop()) @@ -1473,170 +1507,10 @@ export class AudioModule { //this.vad?.pause() //this.vad = null } - this.initialStreamValue = stream - /*const vadStream = stream.clone() - let isSpeakingState = false - let vadInterval = null - this.vad = await MicVAD.new({ - vadStream, - model: 'v5', - //baseAssetPath: '/', - //onnxWASMBasePath: '/', - positiveSpeechThreshold: 0.4, - negativeSpeechThreshold: 0.4, - minSpeechFrames: 15, - preSpeechPadFrames: 30, - onFrameProcessed: async (probs, frame) => { - console.log('VAD probs.isSpeech', probs.isSpeech) - if (probs.isSpeech > 0.001) { - if (!isSpeakingState && this.initialStreamValue) { - console.log('SET SPEAKING - YES') - isSpeakingState = true - clearTimeout(vadInterval) - vadInterval = null - - const callsInRoom = Object.values(this.extendedCalls) - .filter(call => call.roomId === this.currentActiveRoomId) - - if ( - callsInRoom.length === 1 && - callsInRoom[0].connection && - callsInRoom[0].connection?.getSenders()[0] - ) { - //const processedStream = this.getActiveStream() - /!*await *!/ - this.initialStreamValue.getTracks().forEach(track => track.enabled = true) - await callsInRoom[0].connection.getSenders()[0].replaceTrack(this.initialStreamValue.getTracks()[0]) - //this.muteReconfigure(callsInRoom[0]) - } /!*else if (callsInRoom.length > 1) { - const receivedTracks: Array = [] - - callsInRoom.forEach(session => { - if (session !== null && session !== undefined) { - session.connection.getReceivers().forEach((receiver: RTCRtpReceiver) => { - receivedTracks.push(receiver.track) - }) - } - }) - - await forEach(callsInRoom, async (session: ICall) => { - if (session === null || session === undefined) { - return - } - - const allReceivedMediaStreams = new MediaStream() - const mixedOutput = audioContext.createMediaStreamDestination() - - session.connection.getReceivers().forEach((receiver: RTCRtpReceiver) => { - receivedTracks.forEach(track => { - allReceivedMediaStreams.addTrack(receiver.track) - - if (receiver.track.id !== track.id) { - const sourceStream = audioContext.createMediaStreamSource(new MediaStream([ track ])) - sourceStream.connect(mixedOutput) - } - }) - }) - - const sourceStream = audioContext.createMediaStreamSource(this.initialStreamValue) - sourceStream.connect(mixedOutput) - - if (session.connection?.getSenders()[0]) { - await session.connection.getSenders()[0].replaceTrack(mixedOutput.stream.getTracks()[0]) - } - }) - //await this.doConference(callsInRoom) - }*!/ - //this.roomReconfigure(this.currentActiveRoomId) - } - } else { - if (isSpeakingState && !vadInterval && this.initialStreamValue) { - vadInterval = setTimeout(async () => { - console.log('SET SPEAKING - NO') - isSpeakingState = false - - /!*const callsInRoom = Object.values(this.extendedCalls) - .filter(call => call.roomId === this.currentActiveRoomId) - - if (callsInRoom[0].connection && callsInRoom[0].connection?.getSenders()[0]) { - //const processedStream = this.getActiveStream() - /!*await *!/ - callsInRoom[0].connection.getSenders()[0].replaceTrack(this.initialStreamValue.getTracks()[0]) - //this.muteReconfigure(callsInRoom[0]) - }*!/ - - const callsInRoom = Object.values(this.extendedCalls) - .filter(call => call.roomId === this.currentActiveRoomId) - - if ( - callsInRoom.length === 1 && - callsInRoom[0].connection && - callsInRoom[0].connection?.getSenders()[0] - ) { - //const processedStream = this.getActiveStream() - /!*await *!/ - this.initialStreamValue.getTracks().forEach(track => track.enabled = false) - await callsInRoom[0].connection.getSenders()[0].replaceTrack(this.initialStreamValue.getTracks()[0]) - //this.muteReconfigure(callsInRoom[0]) - } /!*else if (callsInRoom.length > 1) { - const receivedTracks: Array = [] - - callsInRoom.forEach(session => { - if (session !== null && session !== undefined) { - session.connection.getReceivers().forEach((receiver: RTCRtpReceiver) => { - receivedTracks.push(receiver.track) - }) - } - }) - - await forEach(callsInRoom, async (session: ICall) => { - if (session === null || session === undefined) { - return - } - - const allReceivedMediaStreams = new MediaStream() - const mixedOutput = audioContext.createMediaStreamDestination() - - session.connection.getReceivers().forEach((receiver: RTCRtpReceiver) => { - receivedTracks.forEach(track => { - allReceivedMediaStreams.addTrack(receiver.track) - - if (receiver.track.id !== track.id) { - const sourceStream = audioContext.createMediaStreamSource(new MediaStream([ track ])) - sourceStream.connect(mixedOutput) - } - }) - }) - - const sourceStream = audioContext.createMediaStreamSource(this.initialStreamValue) - sourceStream.connect(mixedOutput) - - if (session.connection?.getSenders()[0]) { - await session.connection.getSenders()[0].replaceTrack(mixedOutput.stream.getTracks()[0]) - } - }) - - }*!/ - //this.roomReconfigure(this.currentActiveRoomId) - }, 1500) - } - } - //const indicatorColor = interpolateInferno(probs.isSpeech / 2) - //document.body.style.setProperty("--indicator-color", indicatorColor) - }, - onSpeechEnd: (arr) => { - console.log('VAD onSpeechEnd') - /!*const wavBuffer = utils.encodeWAV(arr) - const base64 = utils.arrayBufferToBase64(wavBuffer) - const url = `data:audio/wav;base64,${base64}` - const el = addAudio(url) - const speechList = document.getElementById("playlist") - speechList.prepend(el)*!/ - }, - }) + const stream = await navigator.mediaDevices.getUserMedia(this.getUserMediaConstraints) - this.vad.start()*/ + this.initialStreamValue = stream } private async triggerAddStream (event: RTCTrackEvent, call: ICall) { @@ -1660,7 +1534,9 @@ export class AudioModule { this.getCallQuality(call) this.updateCall(call) - this.processSessionVad(call, stream) + if (this.useVAD) { + this.processVAD(call, processedStream) + } } //@requireInitialization() diff --git a/src/types/rtc.d.ts b/src/types/rtc.d.ts index 23826d4..f72f85e 100644 --- a/src/types/rtc.d.ts +++ b/src/types/rtc.d.ts @@ -13,6 +13,7 @@ import { IncomingRequest } from 'jssip/lib/SIPMessage' import { UAConfiguration } from 'jssip/lib/UA' +import { RealTimeVADOptions } from '@ricky0123/vad-web' import { MODULES } from '@/enum/modules' @@ -132,8 +133,18 @@ export type MSRPModuleName = typeof MODULES.MSRP export type Modules = AudioModuleName | VideoModuleName | MSRPModuleName +export interface VADOptions { + model: 'v5' | 'legacy' + positiveSpeechThreshold: number + negativeSpeechThreshold: number + minSpeechFrames: number + preSpeechPadFrames: number +} + type UAConfigurationExtended = UAConfiguration & { overrideUserAgent?: (userAgent: string) => string + useVAD?: boolean + VADOptions?: Partial } export type IOpenSIPSConfiguration = Omit diff --git a/yarn.lock b/yarn.lock index cbdbf60..ecd23c5 100644 --- a/yarn.lock +++ b/yarn.lock @@ -923,10 +923,10 @@ resolved "https://registry.yarnpkg.com/@protobufjs/utf8/-/utf8-1.1.0.tgz#a777360b5b39a1a2e5106f8e858f2fd2d060c570" integrity sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw== -"@ricky0123/vad-web@^0.0.22": - version "0.0.22" - resolved "https://registry.yarnpkg.com/@ricky0123/vad-web/-/vad-web-0.0.22.tgz#35fd050b85a14d2f16e97f2970fd398d3d08b70a" - integrity sha512-679R6sfwXx4jkquK+FJ9RC2W29oulWC+9ZINK6LVpuy90IBV7UaTGNN79oQXufpJTJs5z4X/22nw1DQ4+Rh8CA== +"@ricky0123/vad-web@^0.0.24": + version "0.0.24" + resolved "https://registry.yarnpkg.com/@ricky0123/vad-web/-/vad-web-0.0.24.tgz#e65053bce876b9fa778a5176216809420a0fedde" + integrity sha512-uv6GWW/kq8BkVErMQzXp3uwSyYMT3w/3QJiUerVaaKp7EwhOTIRY+96EoyFdG2WOFU5RkLk/2CVGbI7nDlxhEg== dependencies: onnxruntime-web "1.14.0" From cece4b821273b9b32fb6fb24ab91964eed143e84 Mon Sep 17 00:00:00 2001 From: "serhii.ku" Date: Wed, 14 May 2025 11:23:59 +0300 Subject: [PATCH 04/46] Updated docs due to latest vad demo changes --- docs/components/ExampleContent.vue | 173 ++++++++++++++++++++++++++--- 1 file changed, 157 insertions(+), 16 deletions(-) diff --git a/docs/components/ExampleContent.vue b/docs/components/ExampleContent.vue index 18b4e10..6a163ef 100644 --- a/docs/components/ExampleContent.vue +++ b/docs/components/ExampleContent.vue @@ -25,12 +25,12 @@
- +
- +
@@ -41,23 +41,23 @@