I’m building a voice chat system using the Gemini 2.0 Multimodal WebSocket in Node.js for the backend and React for the frontend. Google provides an example repository here: https://github.com/google-gemini/multimodal-live-api-web-console.
However, the issue is that the API key is exposed in the WebSocket connection on the frontend, which is a security risk. To mitigate this, I am trying to implement a proxy server in Node.js so the frontend communicates with my backend, and the backend securely connects to Gemini’s WebSocket API.
Here’s the workflow I am attempting to build:
Frontend: Sends voice data (via WebSocket) to my Node.js proxy server.
Backend (Node.js): Connects to Gemini’s WebSocket API with the API key and relays data/responses to/from the frontend.
The problem is:
The WebSocket connection between my backend and Gemini’s API establishes correctly the first time.
When I send the voice data, I don’t receive any response from the Gemini WebSocket server.
Here’s a simplified version of my backend proxy WebSocket implementation:
/// backend code
`const express = require('express');
const http = require('http');
const { Server } = require('socket.io');
const cors = require('cors');
const WebSocket = require('ws');
const app = express();
app.use(cors());
const server = http.createServer(app);
const io = new Server(server, {
cors: {
origin: "*",
methods: ["GET", "POST"],
},
});
const GeminiKey= 'Your api key'
const model = `models/gemini-2.0-flash-exp`;
const GEMINI_API_URL = 'wss://generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1alpha.GenerativeService.BidiGenerateContent';
const clientGeminiSockets = new Map();
const clientRetryCounts = new Map();
const createGeminiConnection = (socketId) => {
const geminiSocket = new WebSocket(`${GEMINI_API_URL}?key=${GeminiKey}`);
geminiSocket.onopen = () => {
console.log(`Connected to Gemini API for client: ${socketId}`);
const setupMessage = {
setup: {
generation_config: { response_modalities: ["AUDIO"] },
model: model
},
};
geminiSocket.send(JSON.stringify(setupMessage));
// Reset retry count on successful connection
clientRetryCounts.set(socketId, 0);
};
geminiSocket.onmessage = (event) => {
try {
console.log(`Received message from Gemini API for client ${socketId}:`, event.data);
// Handle binary data
if (event.data instanceof Buffer) {
console.log(`Sending Gemini Audio data to client ${socketId}: binary data`);
//Send binary data
io.to(socketId).emit('geminiAudio', event.data.toString('base64'));
}
else{
const message = JSON.parse(event.data)
io.to(socketId).emit('message', message);
}
} catch (err) {
console.error("Error handling gemini message:", err);
io.to(socketId).emit('message', JSON.stringify({ error: "Error handling Gemini API message" }));
}
};
geminiSocket.onerror = (error) => {
console.error(`Gemini API WebSocket Error for client ${socketId}:`, error);
io.to(socketId).emit('message', JSON.stringify({ error: `Gemini API WebSocket Error: ${error.message}` }));
handleReconnection(socketId, geminiSocket);
};
geminiSocket.onclose = (event) => {
console.log(`Gemini API WebSocket closed for client ${socketId}:`, event.code, event.reason);
io.to(socketId).emit('message', JSON.stringify({ error: `Gemini API WebSocket Closed ${event.code}, ${event.reason}` }));
clientGeminiSockets.delete(socketId);
clientRetryCounts.delete(socketId)
};
clientGeminiSockets.set(socketId, geminiSocket);
return geminiSocket
};
const handleReconnection = (socketId, geminiSocket) => {
const retryCount = clientRetryCounts.get(socketId) || 0;
const delay = Math.min(1000 * Math.pow(2, retryCount), 30000); // Maximum 30 sec
console.log(`Attempting to reconnect to Gemini for client: ${socketId} in ${delay}ms`);
setTimeout(() => {
if(!geminiSocket || geminiSocket.readyState !== WebSocket.OPEN){
console.log(`Reconnecting Gemini API for client ${socketId}`);
createGeminiConnection(socketId);
}
}, delay);
clientRetryCounts.set(socketId, retryCount + 1);
}
io.on('connection', (socket) => {
console.log(`Client connected: ${socket.id}`);
let geminiSocket = createGeminiConnection(socket.id);
socket.on('message', (formattedMessage) => {
try {
console.log(`Received message from client ${socket.id}:`,);
if (geminiSocket && geminiSocket.readyState === WebSocket.OPEN) {
const messageToSend = {
realtimeInput: {
mediaChunks: [
{
mimeType: "audio/pcm;rate=16000",
data: formattedMessage.mediaChunks[0].data,
}
]
}
};
console.log(`Sending data to gemini API for client ${socket.id}:`, );
geminiSocket.send(JSON.stringify(messageToSend));
} else {
console.error(`Gemini API Websocket is not open for client ${socket.id}`);
socket.emit('message', JSON.stringify({ error: "Gemini API Websocket is not open" }));
}
} catch (error) {
console.error(`Error forwarding message to Gemini API for client ${socket.id}:`, error);
socket.emit('message', JSON.stringify({ error: `Error forwarding message ${error.message}` }));
}
});
socket.on('disconnect', () => {
console.log(`Client disconnected: ${socket.id}`);
const geminiSocket = clientGeminiSockets.get(socket.id);
if (geminiSocket) {
geminiSocket.close();
clientGeminiSockets.delete(socket.id);
clientRetryCounts.delete(socket.id)
console.log(`Gemini connection closed for client: ${socket.id}`);
} else {
console.log(`No gemini connection found for client: ${socket.id}`);
}
});
});
const PORT = 5000;
server.listen(PORT, () => {
console.log(`Proxy server running on port ${PORT}`);
});
`
/// frontend code
`
import { useEffect, useRef, useState } from 'react';
import io from 'socket.io-client';
export default function Home() {
const [isRecording, setIsRecording] = useState(false);
const [socket, setSocket] = useState(null);
const audioRef = useRef(null);
const [geminiAudio, setGeminiAudio] = useState(null)
const audioContextRef = useRef(null);
const scriptProcessorRef = useRef(null);
const [connectionError, setConnectionError] = useState(null)
const startRecording = async () => {
if (!socket) {
console.error("Socket is not connected.");
return;
}
if(connectionError){
console.log("There is a connection error, cannot start recording")
return
}
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
audioContextRef.current = new AudioContext({ sampleRate: 16000 }); // Create audio context with 16kHz
const source = audioContextRef.current.createMediaStreamSource(stream);
scriptProcessorRef.current = audioContextRef.current.createScriptProcessor(4096, 1, 1); // Create script processor
source.connect(scriptProcessorRef.current);
scriptProcessorRef.current.connect(audioContextRef.current.destination);
scriptProcessorRef.current.onaudioprocess = (event) => {
if (socket && socket.connected) {
const pcmData = event.inputBuffer.getChannelData(0); // Get PCM data
console.log("PCM data", pcmData)
const base64Audio = arrayBufferToBase64(pcmData.buffer);
console.log("base64Audio data", base64Audio);
const formattedMessage = {
mediaChunks: [
{
mimeType: "audio/pcm;rate=16000",
data: base64Audio,
},
],
};
console.log("Formatted message sent to backend:", formattedMessage);
socket.emit('message', formattedMessage);
} else {
console.error("Socket is not connected");
}
};
setIsRecording(true);
} catch (err) {
console.error("Error recoring audio", err)
}
};
const stopRecording = () => {
if (scriptProcessorRef.current) {
scriptProcessorRef.current.disconnect();
scriptProcessorRef.current.onaudioprocess = null;
audioContextRef.current.close();
setIsRecording(false);
}
};
function arrayBufferToBase64(buffer) {
let binary = '';
const bytes = new Uint8Array(buffer);
const len = bytes.byteLength;
for (let i = 0; i < len; i++) {
binary += String.fromCharCode(bytes[i]);
}
return btoa(binary);
}
useEffect(() => {
const newSocket = io('http://localhost:5000'); // Replace with your backend URL
newSocket.on('connect', () => {
console.log("connected to websocket server")
setConnectionError(null)
});
newSocket.on('message', (message) => {
console.log('Received message:', message);
try {
const parsedMessage = JSON.parse(message)
if(parsedMessage.error){
setConnectionError(parsedMessage.error)
console.log("Setting connection error", parsedMessage.error)
}
}`your text`
catch (error){
console.log("Message is not in JSON format")
}
});
newSocket.on('geminiAudio', (audioData) => {
console.log('Received geminiAudio data from backend:', audioData);
setGeminiAudio(audioData)
})
setSocket(newSocket);
return () => {
newSocket.disconnect();
};
}, []);
useEffect(() => {
if (geminiAudio) {
const audio = audioRef.current;
const audioBlob = new Blob([Uint8Array.from(atob(geminiAudio), c => c.charCodeAt(0))], { type: 'audio/webm' });
console.log("Audio blob created", audioBlob);
const audioUrl = URL.createObjectURL(audioBlob);
console.log("Audio URL created", audioUrl);
audio.src = audioUrl;
audio.play();
}
}, [geminiAudio])
return (
<div className="flex flex-col items-center justify-center min-h-screen bg-gray-100">
<h1 className="text-3xl font-bold mb-6">Live Voice Conversation</h1>
{connectionError && <div className="text-red-500 mb-4">{connectionError}</div>}
<div className="space-x-4">
<button
className={`px-4 py-2 rounded ${
isRecording
? 'bg-red-500 text-white'
: 'bg-green-500 text-white hover:bg-green-600'
}`}
onClick={isRecording ? stopRecording : startRecording}
>
{isRecording ? 'Stop Recording' : 'Start Recording'}
</button>
<audio ref={audioRef} controls>
Your browser does not support the audio element.
</audio>
</div>
</div>
);
}`
// What I Tried :
Checked WebSocket Connection – The WebSocket successfully connects to both the frontend and the Gemini API.
Logged Sent Messages – Verified that the frontend is sending voice data correctly, and the backend is forwarding it to Gemini.
Checked for Responses – Added console.log to print any incoming messages from Gemini’s WebSocket.
Handled Errors – No error messages are appearing in the WebSocket error event.
// What I Expected :
When the frontend sends voice data, the backend should relay it to Gemini’s WebSocket API.
Gemini should process the voice input and return a response, which my backend would send back to the frontend.
The frontend should receive the response and display/play it.