Skip to content
Docs

Voice AI Integrations

Voice AI enables natural, hands-free interactions through spoken language. Whether you are building a customer support phone agent, a voice-controlled assistant, or a real-time conversation system, the voice pipeline handles the full audio lifecycle: capturing speech, converting it to text for your agent, and synthesizing spoken responses.

Beluga AI provides a frame-based voice pipeline with separate registries for speech-to-text (STT), text-to-speech (TTS), speech-to-speech (S2S), voice activity detection (VAD), and real-time transport. Each category follows the standard registry pattern, so you can swap providers without changing your application logic.

Audio In → Transport → VAD → STT → Agent → TTS → Transport → Audio Out
└─── S2S (bypasses STT/TTS) ──┘

The voice pipeline processes audio as discrete Frame values through a chain of FrameProcessor implementations. STT, TTS, and S2S engines can each operate as frame processors within this pipeline.

Speech-to-text providers convert audio to text, supporting both batch and streaming transcription.

| Provider | Registry Name | Streaming | Languages | Import Path |
| --- | --- | --- | --- | --- |
| Deepgram | `deepgram` | Yes | 36+ | `voice/stt/providers/deepgram` |
| AssemblyAI | `assemblyai` | Yes | 20+ | `voice/stt/providers/assemblyai` |
| Whisper | `whisper` | No | 99 | `voice/stt/providers/whisper` |
| ElevenLabs | `elevenlabs` | Yes | 29 | `voice/stt/providers/elevenlabs` |
| Groq | `groq` | No | 50+ | `voice/stt/providers/groq` |
| Gladia | `gladia` | Yes | 100+ | `voice/stt/providers/gladia` |
// STT converts spoken audio into text. Implementations are registered
// under a provider name and constructed via stt.New.
type STT interface {
// Transcribe performs batch transcription: it takes a complete audio
// buffer and returns the full transcript text.
Transcribe(ctx context.Context, audio []byte) (string, error)
// TranscribeStream consumes a stream of audio chunks and yields
// TranscriptEvent values incrementally as speech is recognized.
TranscribeStream(ctx context.Context, audio iter.Seq2[[]byte, error]) iter.Seq2[TranscriptEvent, error]
}
Set your Deepgram API key in the environment before running the example:
export DEEPGRAM_API_KEY="..."
// Blank import registers the Deepgram provider with the STT registry.
import _ "github.com/lookatitude/beluga-ai/voice/stt/providers/deepgram"
// Construct the engine by registry name; provider-specific settings go in Extra.
engine, err := stt.New("deepgram", stt.Config{
Language: "en",
SampleRate: 16000,
Extra: map[string]any{
"api_key": os.Getenv("DEEPGRAM_API_KEY"),
"model": "nova-2",
},
})
if err != nil {
log.Fatal(err)
}
// Batch transcription
text, err := engine.Transcribe(ctx, audioBytes)
// Streaming transcription
for event, err := range engine.TranscribeStream(ctx, audioStream) {
if err != nil {
// Example keeps error handling minimal; production code should log or propagate.
break
}
// Only final (non-interim) results are printed here.
if event.IsFinal {
fmt.Printf("[%v] %s\n", event.Timestamp, event.Text)
}
}
// Blank import registers the Groq provider with the STT registry.
import _ "github.com/lookatitude/beluga-ai/voice/stt/providers/groq"
engine, err := stt.New("groq", stt.Config{
Language: "en",
Extra: map[string]any{
"api_key": os.Getenv("GROQ_API_KEY"),
"model": "whisper-large-v3",
},
})
// Adapt the engine so it can participate in the frame pipeline.
processor := stt.AsFrameProcessor(engine)
// processor implements voice.FrameProcessor and can be inserted into the pipeline

Text-to-speech providers convert text to audio, supporting both batch and streaming synthesis.

| Provider | Registry Name | Streaming | Voices | Import Path |
| --- | --- | --- | --- | --- |
| ElevenLabs | `elevenlabs` | Yes | 1000+ | `voice/tts/providers/elevenlabs` |
| Cartesia | `cartesia` | Yes | Custom | `voice/tts/providers/cartesia` |
| PlayHT | `playht` | Yes | 600+ | `voice/tts/providers/playht` |
| LMNT | `lmnt` | Yes | Custom | `voice/tts/providers/lmnt` |
| Fish Audio | `fish` | Yes | Custom | `voice/tts/providers/fish` |
| Groq | `groq` | No | Standard | `voice/tts/providers/groq` |
| Smallest AI | `smallest` | Yes | Custom | `voice/tts/providers/smallest` |
// TTS converts text into synthesized speech audio. Implementations are
// registered under a provider name and constructed via tts.New.
type TTS interface {
// Synthesize performs batch synthesis: it takes complete text and
// returns the full audio payload.
Synthesize(ctx context.Context, text string) ([]byte, error)
// SynthesizeStream consumes a stream of text fragments and yields
// audio chunks incrementally, enabling low-latency playback.
SynthesizeStream(ctx context.Context, text iter.Seq2[string, error]) iter.Seq2[[]byte, error]
// OutputFormat reports the audio encoding of the synthesized output.
OutputFormat() AudioFormat
// SampleRate reports the sample rate (Hz) of the synthesized output.
SampleRate() int
}
Set your ElevenLabs API key in the environment before running the example:
export ELEVENLABS_API_KEY="..."
// Blank import registers the ElevenLabs provider with the TTS registry.
import _ "github.com/lookatitude/beluga-ai/voice/tts/providers/elevenlabs"
engine, err := tts.New("elevenlabs", tts.Config{
Voice: "rachel",
SampleRate: 24000,
Extra: map[string]any{
"api_key": os.Getenv("ELEVENLABS_API_KEY"),
"model_id": "eleven_multilingual_v2",
// Raw PCM output matching the configured 24 kHz sample rate.
"output_format": "pcm_24000",
},
})
if err != nil {
log.Fatal(err)
}
// Batch synthesis
audio, err := engine.Synthesize(ctx, "Hello, how can I help you?")
// Streaming synthesis
for chunk, err := range engine.SynthesizeStream(ctx, textStream) {
if err != nil {
// Example keeps error handling minimal; production code should log or propagate.
break
}
// Forward each audio chunk to the client as it is produced.
transport.Send(chunk)
}
// Blank import registers the Cartesia provider with the TTS registry.
import _ "github.com/lookatitude/beluga-ai/voice/tts/providers/cartesia"
engine, err := tts.New("cartesia", tts.Config{
Voice: "sonic-english",
SampleRate: 24000,
Extra: map[string]any{
"api_key": os.Getenv("CARTESIA_API_KEY"),
},
})
// Adapt the engine into a frame processor; the second argument is the
// output sample rate for the pipeline (here matching the engine's 24000).
processor := tts.AsFrameProcessor(engine, 24000)

Speech-to-speech providers handle bidirectional audio conversations, bypassing separate STT and TTS stages for lower latency.

| Provider | Registry Name | Model | Import Path |
| --- | --- | --- | --- |
| OpenAI Realtime | `openai_realtime` | GPT-4o Realtime | `voice/s2s/providers/openai` |
| Amazon Nova | `nova` | Nova S2S | `voice/s2s/providers/nova` |
| Google Gemini | `gemini` | Gemini Live | `voice/s2s/providers/gemini` |
| Silero | `silero` | Silero S2S | `voice/s2s/providers/silero` |
// S2S is a speech-to-speech engine that handles audio conversations
// directly, without separate STT and TTS stages.
type S2S interface {
// Start opens a bidirectional conversation session with the provider.
Start(ctx context.Context) (Session, error)
}
// Session is a live speech-to-speech conversation. Audio and text can be
// sent upstream while events are received from the provider.
type Session interface {
// SendAudio forwards a chunk of caller audio to the provider.
SendAudio(ctx context.Context, audio []byte) error
// SendText injects a text message into the conversation.
SendText(ctx context.Context, text string) error
// Receive yields provider events (audio, transcripts, ...) as they arrive.
Receive(ctx context.Context) iter.Seq2[Event, error]
// Close terminates the session and releases its resources.
Close() error
}
// Blank import registers the OpenAI Realtime provider with the S2S registry.
import _ "github.com/lookatitude/beluga-ai/voice/s2s/providers/openai"
engine, err := s2s.New("openai_realtime", s2s.Config{
Voice: "alloy",
Model: "gpt-4o-realtime-preview",
Extra: map[string]any{
"api_key": os.Getenv("OPENAI_API_KEY"),
},
})
if err != nil {
log.Fatal(err)
}
// Open a live bidirectional session with the provider.
session, err := engine.Start(ctx)
if err != nil {
log.Fatal(err)
}
defer session.Close()
// Send audio and receive responses
if err := session.SendAudio(ctx, audioChunk); err != nil {
log.Fatal(err)
}
// Drain provider events; audio goes to the transport, transcripts to the log.
for event, err := range session.Receive(ctx) {
if err != nil {
// Example keeps error handling minimal; production code should log or propagate.
break
}
switch event.Type {
case s2s.EventAudio:
transport.Send(event.Audio)
case s2s.EventTranscript:
fmt.Println("Agent:", event.Text)
}
}

Voice activity detection determines when speech starts and stops in an audio stream.

| Provider | Registry Name | Type | Import Path |
| --- | --- | --- | --- |
| Silero | `silero` | Neural network | `voice/vad/providers/silero` |
| WebRTC | `webrtc` | Traditional DSP | `voice/vad/providers/webrtc` |

Silero VAD uses a neural network model for high-accuracy speech detection.

// Blank import registers the Silero provider with the VAD registry.
import _ "github.com/lookatitude/beluga-ai/voice/vad/providers/silero"
detector, err := vad.New("silero", vad.Config{
// Speech probability threshold; frames scoring above 0.5 count as speech.
Threshold: 0.5,
SampleRate: 16000,
})

WebRTC VAD is lighter weight and suitable for environments where neural network inference is too expensive.

// Blank import registers the WebRTC provider with the VAD registry.
import _ "github.com/lookatitude/beluga-ai/voice/vad/providers/webrtc"
detector, err := vad.New("webrtc", vad.Config{
// Same config shape as Silero, so the two detectors are drop-in swappable.
Threshold: 0.5,
SampleRate: 16000,
})

Transport providers handle real-time audio streaming between clients and the voice pipeline.

| Provider | Registry Name | Protocol | Import Path |
| --- | --- | --- | --- |
| WebSocket | `websocket` | WebSocket | `voice/transport` |
| LiveKit | `livekit` | WebRTC (LiveKit) | `voice/transport/providers/livekit` |
| Daily | `daily` | WebRTC (Daily) | `voice/transport/providers/daily` |
| Pipecat | `pipecat` | Pipecat protocol | `voice/transport/providers/pipecat` |

WebSocket is the built-in transport for browser-based audio streaming.

// WebSocket transport is built in, so no provider import is needed.
transport, err := transport.New("websocket", transport.Config{
// Mono 16 kHz audio, matching the STT examples above.
SampleRate: 16000,
Channels: 1,
})

LiveKit provides WebRTC-based transport with room management for multi-party voice.

// Blank import registers the LiveKit provider with the transport registry.
import _ "github.com/lookatitude/beluga-ai/voice/transport/providers/livekit"
transport, err := transport.New("livekit", transport.Config{
Extra: map[string]any{
// LiveKit server URL and credentials come from the environment.
"url": os.Getenv("LIVEKIT_URL"),
"api_key": os.Getenv("LIVEKIT_API_KEY"),
"api_secret": os.Getenv("LIVEKIT_API_SECRET"),
"room_name": "voice-agent-room",
},
})

Assemble a full voice pipeline by combining STT, TTS, VAD, and transport:

// Example program assembling a complete voice pipeline: Silero VAD gates
// the audio, Deepgram transcribes it, and ElevenLabs speaks the replies.
package main
import (
"context"
"log"
"github.com/lookatitude/beluga-ai/voice"
"github.com/lookatitude/beluga-ai/voice/stt"
"github.com/lookatitude/beluga-ai/voice/tts"
"github.com/lookatitude/beluga-ai/voice/vad"
// Blank imports register each provider with its registry.
_ "github.com/lookatitude/beluga-ai/voice/stt/providers/deepgram"
_ "github.com/lookatitude/beluga-ai/voice/tts/providers/elevenlabs"
_ "github.com/lookatitude/beluga-ai/voice/vad/providers/silero"
)
func main() {
ctx := context.Background()
// Construct each engine by registry name; each New can fail independently.
sttEngine, err := stt.New("deepgram", stt.Config{Language: "en"})
if err != nil {
log.Fatal(err)
}
ttsEngine, err := tts.New("elevenlabs", tts.Config{Voice: "rachel"})
if err != nil {
log.Fatal(err)
}
vadEngine, err := vad.New("silero", vad.Config{Threshold: 0.5})
if err != nil {
log.Fatal(err)
}
// Processors run in order: VAD gates audio, STT produces text frames,
// the agent responds, and TTS turns replies back into audio.
pipeline := voice.NewPipeline(
vad.AsFrameProcessor(vadEngine),
stt.AsFrameProcessor(sttEngine),
// Your agent processor here
// 24000 is the TTS output sample rate for the pipeline.
tts.AsFrameProcessor(ttsEngine, 24000),
)
// Run blocks until the pipeline stops or the context is cancelled.
if err := pipeline.Run(ctx); err != nil {
log.Fatal(err)
}
}
| Component | Low Latency | High Accuracy | Cost Effective |
| --- | --- | --- | --- |
| STT | Deepgram Nova-2 | AssemblyAI | Groq Whisper |
| TTS | Cartesia | ElevenLabs | Groq |
| S2S | OpenAI Realtime | OpenAI Realtime | Gemini Live |
| VAD | WebRTC | Silero | WebRTC |
| Transport | WebSocket | LiveKit | WebSocket |