Build your own Voice AI pipeline with different transcription, LLM, and speech synthesis options while maintaining realistic agent interactivity.
Why this is needed
- While Voximplant offers many speech options, general providers focus on popular languages and do not cover every language and dialect. Specialized speech providers are often needed, which requires building custom Voice AI pipelines where the developer controls the Speech-to-Text (STT), Large Language Model (LLM), and Text-to-Speech (TTS) components
- Voice Activity Detection (VAD) and turn detection (i.e., whose turn it is to talk) are needed for quick, natural agent interactivity when you build an end-to-end Voice AI Agent pipeline
- Previous Voice AI connectors all took direct speech input with VAD and turn detection already built in; a custom pipeline must provide these components itself
- OpenAI has several popular API options that can be used for the LLM component of the pipeline
- Developers want flexibility in their LLM options. Several third-party LLM vendors and AI suppliers mimic OpenAI’s API in their products; Voximplant now supports direct integration with these in addition to OpenAI’s
- Developers using OpenAI for text chat who wanted to support voice previously had to rebuild their LLM infrastructure around a realtime API; now they can keep their LLM configuration and expand on it to support voice interaction
New features:
1. Speech detection - detect when a user starts or stops speaking so the scenario can start / stop agent interaction
a. VAD
- Based on Silero VAD
- What it does: detects voice activity
- Benefits: lets a scenario detect speech, so it knows when to start sending speech to an agent or when to stop recording (e.g., when a caller finishes leaving a message)
b. Turn detection
- Based on Pipecat’s Smart Turn model
- What it does: detects when a user has stopped speaking and is ready for a response.
- Benefits: enables realistic conversational AI agents that can navigate variable pauses and speech disfluencies (e.g., “ahh”, “ummm”) for rapid back-and-forth between speakers without lengthy delays or talking over one another; see the first sketch after this list
2. OpenAI compatible connectors
a. Features - a direct interface to these APIs, which provide access to OpenAI LLM variants
- OpenAI Chat Completions API Client - simpler, lower-level, older API
- OpenAI Responses API Client - newer, agentic API with more features such as multi-turn state handling and built-in tools
b. Benefits
- Use these APIs as your LLM - more choice, plus consistency with text-based bot implementations
- The Responses API Client uses WebSockets, which is ideal for long-running, tool-call-heavy workflows
- Allows use of third-party OpenAI-compatible LLMs; see the second sketch after this list
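A minimal sketch of how VAD and turn detection cooperate, trimmed down from the full example at the end of this document (the speechEndAt field is taken from the event payloads shown there; check the Silero and Pipecat module references for exact shapes):

require(Modules.Silero);
require(Modules.Pipecat);

VoxEngine.addEventListener(AppEvents.CallAlerting, async ({ call }) => {
  call.answer();
  const vad = await Silero.createVAD({ threshold: 0.5 });
  const turnDetector = await Pipecat.createTurnDetector({ threshold: 0.5 });
  vad.addEventListener(Silero.VADEvents.Result, (event) => {
    // When VAD reports that speech has ended, ask the turn detector
    // whether the caller has actually finished their turn
    if (event.speechEndAt) turnDetector.predict();
  });
  turnDetector.addEventListener(Pipecat.TurnEvents.Result, (event) => {
    // The end-of-turn verdict arrives here; this is the moment to hand
    // the utterance to your STT/LLM stage
    Logger.write(JSON.stringify(event));
  });
  // Both detectors consume the caller's audio in parallel
  call.sendMediaTo(vad);
  call.sendMediaTo(turnDetector);
});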
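And a sketch of pointing the Chat Completions client at a third-party OpenAI-compatible endpoint. The endpoint parameter name used here (baseUrl) is a placeholder assumption, not a confirmed option - check the OpenAI module API reference for the actual way to target a third-party endpoint:

require(Modules.OpenAI);

VoxEngine.addEventListener(AppEvents.CallAlerting, async ({ call }) => {
  call.answer();
  const client = await OpenAI.createChatCompletionsClient({
    apiKey: 'YOUR_VENDOR_API_KEY',
    // Hypothetical parameter: see the module reference for the real
    // option that targets an OpenAI-compatible third-party endpoint
    baseUrl: 'https://llm.example.com/v1',
    storeContext: true,
  });
  client.addEventListener(OpenAI.ChatCompletionsEvents.ContentDone, (event) => {
    Logger.write(JSON.stringify(event));
  });
  // The request body is the same shape you already use for text bots
  client.createChatCompletions({
    model: 'your-vendor-model',
    messages: [{ role: 'user', content: 'Hello!' }],
  });
});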
Pricing
- End-of-turn detection: 0.002 per minute per stream
- VAD: free
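- Example: end-of-turn detection on one audio stream during a 10-minute call costs 10 × 0.002 = 0.02; adding VAD to the same stream costs nothing extra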
Links
OpenAI
OpenAI product page: https://voximplant.com/products/openai-client
Chat Completions API Client Guide: https://voximplant.com/docs/voice-ai/openai/chat-completions-client
Responses API Client Guide: https://voximplant.com/docs/voice-ai/openai/responses-client
OpenAI module API reference: https://voximplant.com/docs/references/voxengine/openai
OpenAI Chat Completions API reference: https://platform.openai.com/docs/api-reference/chat
OpenAI Responses API reference: https://platform.openai.com/docs/api-reference/responses
VAD and Turn Detection
VAD and Turn Detection product page: https://voximplant.com/products/turn-detection
VAD / Turn Detection Guide: https://voximplant.com/docs/guides/speech/vad-turn-detection#vad_/_turn_detection_0
Silero module (VAD) API reference: https://voximplant.com/docs/references/voxengine/silero
Pipecat module (Turn detection) API reference: https://voximplant.com/docs/references/voxengine/pipecat
Voximplant Voice AI platform: https://voximplant.ai
Examples (for reference only)
Chat Completions
require(Modules.OpenAI);
VoxEngine.addEventListener(AppEvents.CallAlerting, async ({ call }) => {
let chatCompletionsClient = undefined;
call.answer();
const callBaseHandler = () => {
if (chatCompletionsClient) chatCompletionsClient.close();
VoxEngine.terminate();
};
call.addEventListener(CallEvents.Disconnected, callBaseHandler);
call.addEventListener(CallEvents.Failed, callBaseHandler);
const OPENAI_API_KEY = 'YOUR_OPENAI_API_KEY'; // insert your API key here
const onWebSocketClose = (event) => {
Logger.write('===ON_WEB_SOCKET_CLOSE==');
Logger.write(JSON.stringify(event));
VoxEngine.terminate();
};
const chatCompletionsClientParameters = {
apiKey: OPENAI_API_KEY,
storeContext: true, // this enables memory in your conversation
onWebSocketClose,
};
try {
chatCompletionsClient = await OpenAI.createChatCompletionsClient(chatCompletionsClientParameters);
chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.ChatCompletionsError, (event) => {
Logger.write('===OpenAI.ChatCompletionsEvents.ChatCompletionsError===');
Logger.write(JSON.stringify(event));
});
    const createChatCompletionParameters = {
      model: 'gpt-4o',
      messages: [
        {
          role: 'developer',
          content: 'You are a helpful assistant.'
        },
        { role: 'user', content: 'my name is John' }
      ]
    };
chatCompletionsClient.createChatCompletions(createChatCompletionParameters);
chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.Chunk, (event) => {
Logger.write('===OpenAI.ChatCompletionsEvents.Chunk===');
Logger.write(JSON.stringify(event));
});
chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.Content, (event) => {
Logger.write('===OpenAI.ChatCompletionsEvents.Content===');
Logger.write(JSON.stringify(event));
});
chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.ContentDelta, (event) => {
Logger.write('===OpenAI.ChatCompletionsEvents.ContentDelta===');
Logger.write(JSON.stringify(event));
});
    let followUpPending = true; // send one follow-up message after the first completion finishes
    chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.ContentDone, (event) => {
      Logger.write('===OpenAI.ChatCompletionsEvents.ContentDone===');
      Logger.write(JSON.stringify(event));
      if (followUpPending) {
        // Because storeContext is enabled, the client remembers the earlier
        // "my name is John" message, so the model can answer this question
        const createChatCompletionParameters = {
          model: 'gpt-4o',
          messages: [
            { role: 'user', content: 'What is my name?' }
          ]
        };
        chatCompletionsClient.createChatCompletions(createChatCompletionParameters);
        followUpPending = false;
      }
    });
chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.RefusalDelta, (event) => {
Logger.write('===OpenAI.ChatCompletionsEvents.RefusalDelta===');
Logger.write(JSON.stringify(event));
});
chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.RefusalDone, (event) => {
Logger.write('===OpenAI.ChatCompletionsEvents.RefusalDone===');
Logger.write(JSON.stringify(event));
});
chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.FunctionToolCallArgumentsDelta, (event) => {
Logger.write('===OpenAI.ChatCompletionsEvents.FunctionToolCallArgumentsDelta===');
Logger.write(JSON.stringify(event));
});
chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.FunctionToolCallArgumentsDone, (event) => {
Logger.write('===OpenAI.ChatCompletionsEvents.FunctionToolCallArgumentsDone===');
Logger.write(JSON.stringify(event));
});
chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.LogProbsContentDelta, (event) => {
Logger.write('===OpenAI.ChatCompletionsEvents.LogProbsContentDelta===');
Logger.write(JSON.stringify(event));
});
chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.LogProbsContentDone, (event) => {
Logger.write('===OpenAI.ChatCompletionsEvents.LogProbsContentDone===');
Logger.write(JSON.stringify(event));
});
chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.LogProbsRefusalDelta, (event) => {
Logger.write('===OpenAI.ChatCompletionsEvents.LogProbsRefusalDelta===');
Logger.write(JSON.stringify(event));
});
chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.LogProbsRefusalDone, (event) => {
Logger.write('===OpenAI.ChatCompletionsEvents.LogProbsRefusalDone===');
Logger.write(JSON.stringify(event));
});
} catch (error) {
Logger.write('===SOMETHING_WENT_WRONG===');
Logger.write(error);
VoxEngine.terminate();
}
});
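To close the voice loop, the finished text from ContentDone can be handed to the platform's TTS and streamed back to the caller. A minimal sketch that could sit alongside the other listeners in the example above - it assumes the ContentDone payload exposes the final text as event.content (check the module reference for the exact field), and the voice choice is purely illustrative:

chatCompletionsClient.addEventListener(OpenAI.ChatCompletionsEvents.ContentDone, (event) => {
  // Assumption: event.content holds the completed response text
  const player = VoxEngine.createTTSPlayer(event.content, {
    language: VoiceList.Amazon.en_US_Joanna, // illustrative voice; any VoiceList entry works
  });
  player.sendMediaTo(call);
});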
Responses API
require(Modules.OpenAI);
VoxEngine.addEventListener(AppEvents.CallAlerting, async ({ call }) => {
let responsesClient = undefined;
call.answer();
const callBaseHandler = () => {
if (responsesClient) responsesClient.close();
VoxEngine.terminate();
};
call.addEventListener(CallEvents.Disconnected, callBaseHandler);
call.addEventListener(CallEvents.Failed, callBaseHandler);
const OPENAI_API_KEY = 'YOUR_OPENAI_API_KEY'; // insert your API key here
const onWebSocketClose = (event) => {
Logger.write('===ON_WEB_SOCKET_CLOSE==');
Logger.write(JSON.stringify(event));
VoxEngine.terminate();
};
const responsesClientParameters = {
apiKey: OPENAI_API_KEY,
storeContext: true, // this enables memory in your conversation
onWebSocketClose,
};
try {
responsesClient = await OpenAI.createResponsesClient(responsesClientParameters);
responsesClient.addEventListener(OpenAI.ResponsesEvents.ResponsesError, (event) => {
Logger.write('===OpenAI.ResponsesEvents.ResponsesError===');
Logger.write(JSON.stringify(event));
});
    const createResponsesParameters = {
      model: 'gpt-4o',
      input: 'my name is John'
    };
responsesClient.createResponses(createResponsesParameters);
const responsesEventsToListen = [
["ResponseAudioDone", OpenAI.ResponsesEvents.ResponseAudioDone],
["ResponseAudioTranscriptDelta", OpenAI.ResponsesEvents.ResponseAudioTranscriptDelta],
["ResponseAudioTranscriptDone", OpenAI.ResponsesEvents.ResponseAudioTranscriptDone],
["ResponseCodeInterpreterCallCodeDelta", OpenAI.ResponsesEvents.ResponseCodeInterpreterCallCodeDelta],
["ResponseCodeInterpreterCallCodeDone", OpenAI.ResponsesEvents.ResponseCodeInterpreterCallCodeDone],
["ResponseCodeInterpreterCallCompleted", OpenAI.ResponsesEvents.ResponseCodeInterpreterCallCompleted],
["ResponseCodeInterpreterCallInProgress", OpenAI.ResponsesEvents.ResponseCodeInterpreterCallInProgress],
["ResponseCodeInterpreterCallInterpreting", OpenAI.ResponsesEvents.ResponseCodeInterpreterCallInterpreting],
["ResponseCompleted", OpenAI.ResponsesEvents.ResponseCompleted],
["ResponseContentPartAdded", OpenAI.ResponsesEvents.ResponseContentPartAdded],
["ResponseContentPartDone", OpenAI.ResponsesEvents.ResponseContentPartDone],
["ResponseCreated", OpenAI.ResponsesEvents.ResponseCreated],
["ResponseError", OpenAI.ResponsesEvents.ResponseError],
["ResponseFileSearchCallCompleted", OpenAI.ResponsesEvents.ResponseFileSearchCallCompleted],
["ResponseFileSearchCallInProgress", OpenAI.ResponsesEvents.ResponseFileSearchCallInProgress],
["ResponseFileSearchCallSearching", OpenAI.ResponsesEvents.ResponseFileSearchCallSearching],
["ResponseFunctionCallArgumentsDelta", OpenAI.ResponsesEvents.ResponseFunctionCallArgumentsDelta],
["ResponseFunctionCallArgumentsDone", OpenAI.ResponsesEvents.ResponseFunctionCallArgumentsDone],
["ResponseInProgress", OpenAI.ResponsesEvents.ResponseInProgress],
["ResponseFailed", OpenAI.ResponsesEvents.ResponseFailed],
["ResponseIncomplete", OpenAI.ResponsesEvents.ResponseIncomplete],
["ResponseOutputItemAdded", OpenAI.ResponsesEvents.ResponseOutputItemAdded],
["ResponseOutputItemDone", OpenAI.ResponsesEvents.ResponseOutputItemDone],
["ResponseReasoningSummaryPartAdded", OpenAI.ResponsesEvents.ResponseReasoningSummaryPartAdded],
["ResponseReasoningSummaryPartDone", OpenAI.ResponsesEvents.ResponseReasoningSummaryPartDone],
["ResponseReasoningSummaryTextDelta", OpenAI.ResponsesEvents.ResponseReasoningSummaryTextDelta],
["ResponseReasoningSummaryTextDone", OpenAI.ResponsesEvents.ResponseReasoningSummaryTextDone],
["ResponseReasoningTextDelta", OpenAI.ResponsesEvents.ResponseReasoningTextDelta],
["ResponseReasoningTextDone", OpenAI.ResponsesEvents.ResponseReasoningTextDone],
["ResponseRefusalDelta", OpenAI.ResponsesEvents.ResponseRefusalDelta],
["ResponseRefusalDone", OpenAI.ResponsesEvents.ResponseRefusalDone],
["ResponseTextDelta", OpenAI.ResponsesEvents.ResponseTextDelta],
["ResponseTextDone", OpenAI.ResponsesEvents.ResponseTextDone],
["ResponseWebSearchCallCompleted", OpenAI.ResponsesEvents.ResponseWebSearchCallCompleted],
["ResponseWebSearchCallInProgress", OpenAI.ResponsesEvents.ResponseWebSearchCallInProgress],
["ResponseWebSearchCallSearching", OpenAI.ResponsesEvents.ResponseWebSearchCallSearching],
["ResponseImageGenCallCompleted", OpenAI.ResponsesEvents.ResponseImageGenCallCompleted],
["ResponseImageGenCallGenerating", OpenAI.ResponsesEvents.ResponseImageGenCallGenerating],
["ResponseImageGenCallInProgress", OpenAI.ResponsesEvents.ResponseImageGenCallInProgress],
["ResponseImageGenCallPartialImage", OpenAI.ResponsesEvents.ResponseImageGenCallPartialImage],
["ResponseMCPCallArgumentsDelta", OpenAI.ResponsesEvents.ResponseMCPCallArgumentsDelta],
["ResponseMCPCallArgumentsDone", OpenAI.ResponsesEvents.ResponseMCPCallArgumentsDone],
["ResponseMCPCallCompleted", OpenAI.ResponsesEvents.ResponseMCPCallCompleted],
["ResponseMCPCallFailed", OpenAI.ResponsesEvents.ResponseMCPCallFailed],
["ResponseMCPCallInProgress", OpenAI.ResponsesEvents.ResponseMCPCallInProgress],
["ResponseMCPListToolsCompleted", OpenAI.ResponsesEvents.ResponseMCPListToolsCompleted],
["ResponseMCPListToolsFailed", OpenAI.ResponsesEvents.ResponseMCPListToolsFailed],
["ResponseMCPListToolsInProgress", OpenAI.ResponsesEvents.ResponseMCPListToolsInProgress],
["ResponseOutputTextAnnotationAdded", OpenAI.ResponsesEvents.ResponseOutputTextAnnotationAdded],
["ResponseQueued", OpenAI.ResponsesEvents.ResponseQueued],
["ResponseCustomToolCallInputDelta", OpenAI.ResponsesEvents.ResponseCustomToolCallInputDelta],
["ResponseCustomToolCallInputDone", OpenAI.ResponsesEvents.ResponseCustomToolCallInputDone]
];
responsesEventsToListen.forEach(([name, ev]) => responsesClient.addEventListener(ev, (event) => {
Logger.write(`===OpenAI.ResponsesEvents.${name}===`);
Logger.write(JSON.stringify(event));
})
);
    let followUpPending = true; // send one follow-up request after the first response completes
    responsesClient.addEventListener(OpenAI.ResponsesEvents.ResponseCompleted, (event) => {
      Logger.write('===OpenAI.ResponsesEvents.ResponseCompleted===');
      Logger.write(JSON.stringify(event));
      if (followUpPending) {
        // storeContext is enabled, so the client carries the conversation
        // state forward and the model can recall the caller's name
        const createResponsesParameters = {
          model: 'gpt-4o',
          input: 'What is my name?'
        };
        responsesClient.createResponses(createResponsesParameters);
        followUpPending = false;
      }
    });
} catch (error) {
Logger.write('===SOMETHING_WENT_WRONG===');
Logger.write(error);
VoxEngine.terminate();
}
});
VAD and Turn Detection
require(Modules.Silero);
require(Modules.Pipecat);
const SILERO_VAD_THRESHOLD = 0.5;
const SILERO_VAD_MIN_SILENCE_DURATION_MS = 300;
const SILERO_VAD_SPEECH_PAD_MS = 10;
// NOTE: this is a "secret" parameter for developers' needs, NOT FOR PUBLIC SHARING
// const SILERO_VAD_MODEL = 'YOUR_SILERO_VAD_MODEL'; // e.g., "silero_vad_v3"
const vadParameters = {
threshold: SILERO_VAD_THRESHOLD,
minSilenceDurationMs: SILERO_VAD_MIN_SILENCE_DURATION_MS,
speechPadMs: SILERO_VAD_SPEECH_PAD_MS,
// model: SILERO_VAD_MODEL,
};
const PIPECAT_TURN_THRESHOLD = 0.5; // The end-of-turn probability cutoff in (0, 1). The default value is 0.5.
// NOTE: these are "secret" parameters for developers' needs, NOT FOR PUBLIC SHARING
// const PIPECAT_TURN_MODEL = 'YOUR_PIPECAT_MODEL'; // e.g., "pipecat_smart-turn-v3.1-cpu"
// const PIPECAT_TURN_CPU_COUNT = 1; // The number of intra-op threads for ONNX Runtime.
// const PIPECAT_TURN_SNAPSHOT_SECONDS = 8; // The number of seconds to capture audio for each snapshot.
const turnParameters = {
threshold: PIPECAT_TURN_THRESHOLD,
// model: PIPECAT_TURN_MODEL,
// cpuCount: PIPECAT_TURN_CPU_COUNT,
// snapshotSeconds: PIPECAT_TURN_SNAPSHOT_SECONDS,
};
VoxEngine.addEventListener(AppEvents.CallAlerting, async ({ call }) => {
try {
    // Declare the detector handles up front so the cleanup handler can run
    // safely even if the call ends before they finish initializing
    let vad;
    let turnDetector;
    const callBaseHandler = () => {
      vad?.close();
      turnDetector?.close();
      VoxEngine.terminate();
    };
    call.addEventListener(CallEvents.Disconnected, callBaseHandler);
    call.addEventListener(CallEvents.Failed, callBaseHandler);
    vad = await Silero.createVAD(vadParameters);
    turnDetector = await Pipecat.createTurnDetector(turnParameters);
    vad.addEventListener(Silero.VADEvents.Result, (event) => {
      Logger.write('===Silero.VADEvents.Result===');
      Logger.write(JSON.stringify(event));
      // When VAD reports the end of speech, ask the turn detector whether
      // the caller has actually finished their turn
      if (event.speechEndAt) {
        turnDetector.predict();
      }
    });
vad.addEventListener(Silero.VADEvents.Error, (event) => {
Logger.write('===Silero.VADEvents.Error===');
Logger.write(JSON.stringify(event));
});
vad.addEventListener(Silero.VADEvents.ConnectorInformation, (event) => {
Logger.write('===Silero.VADEvents.ConnectorInformation===');
Logger.write(JSON.stringify(event));
});
    turnDetector.addEventListener(Pipecat.TurnEvents.Result, (event) => {
      Logger.write('===Pipecat.TurnEvents.Result===');
      Logger.write(JSON.stringify(event));
    });
turnDetector.addEventListener(Pipecat.TurnEvents.Error, (event) => {
Logger.write('===Pipecat.TurnEvents.Error===');
Logger.write(JSON.stringify(event));
});
turnDetector.addEventListener(Pipecat.TurnEvents.ConnectorInformation, (event) => {
Logger.write('===Pipecat.TurnEvents.ConnectorInformation===');
Logger.write(JSON.stringify(event));
});
call.answer();
call.sendMediaTo(vad);
call.sendMediaTo(turnDetector);
} catch (error) {
Logger.write('===SOMETHING_WENT_WRONG===');
Logger.write(error);
VoxEngine.terminate();
}
});
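Putting it together: the sketch below shows how the pieces above can combine into one pipeline - VAD flags the end of speech, the turn detector confirms the user is done, and only then is the transcript sent to the LLM. Assumptions are marked in the comments: the transcript accumulation is a placeholder for whatever STT component you choose, and the isTurnComplete field and ResponseTextDone payload shape should be verified against the module references.

require(Modules.Silero);
require(Modules.Pipecat);
require(Modules.OpenAI);

VoxEngine.addEventListener(AppEvents.CallAlerting, async ({ call }) => {
  call.answer();
  const vad = await Silero.createVAD({ threshold: 0.5 });
  const turnDetector = await Pipecat.createTurnDetector({ threshold: 0.5 });
  const llm = await OpenAI.createResponsesClient({ apiKey: 'YOUR_OPENAI_API_KEY', storeContext: true });
  // Placeholder: wire the STT component of your choice here and have it
  // accumulate the caller's words into `transcript`
  let transcript = '';
  vad.addEventListener(Silero.VADEvents.Result, (event) => {
    if (event.speechEndAt) turnDetector.predict(); // speech paused; check if the turn is over
  });
  turnDetector.addEventListener(Pipecat.TurnEvents.Result, (event) => {
    // Assumption: the Result payload carries an end-of-turn verdict;
    // check the Pipecat module reference for the exact field name
    if (event.isTurnComplete && transcript) {
      llm.createResponses({ model: 'gpt-4o', input: transcript });
      transcript = '';
    }
  });
  llm.addEventListener(OpenAI.ResponsesEvents.ResponseTextDone, (event) => {
    // Assumption: the finished reply text is in this payload; hand it to
    // your TTS component (e.g., VoxEngine.createTTSPlayer) to speak it
    Logger.write(JSON.stringify(event));
  });
  call.sendMediaTo(vad);
  call.sendMediaTo(turnDetector);
});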