I’m working on a chatbot application where I need to convert text to speech (TTS) on the backend and send both the generated audio and the original text to the frontend over a WebSocket connection. The text is displayed in the chat interface, and the audio is played back as part of the chatbot’s response.
What would be the best approach to send both the text and the audio over the WebSocket? Should I encode the audio file (e.g., as base64) and send it alongside the text in a single JSON message, or is there a more efficient method? Also, how should I receive and separate the two types of data on the frontend?
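One alternative I've been considering, instead of base64, is to send the audio as a raw binary WebSocket frame immediately after a small JSON "header" frame that carries the text. A rough sketch of what I mean on the backend (FastAPI; synthesize_mp3 is a hypothetical stand-in for the gTTS code shown below):

async def send_voice_reply(websocket: WebSocket, text: str) -> None:
    mp3_bytes = synthesize_mp3(text)  # hypothetical helper returning MP3 bytes
    # JSON header frame: carries the text and announces the binary frame
    await websocket.send_json({
        "type": "voice",
        "text": text,
        "audio_size": len(mp3_bytes),  # lets the client sanity-check the pairing
    })
    # Binary frame: the raw MP3 bytes, with no base64 inflation
    await websocket.send_bytes(mp3_bytes)

Since WebSocket frames arrive in order, the frontend could pair each binary message it receives (event.data instanceof Blob) with the JSON header that preceded it. Is that interleaving safe to rely on in practice, or is a single JSON message with base64 audio the more robust choice?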
Any suggestions or code examples would be greatly appreciated!
Thanks in advance!
Backend code:
import base64
import json
from io import BytesIO

from fastapi import WebSocket, WebSocketDisconnect
from gtts import gTTS


@router.websocket("/ws/voice-chat/")
async def websocket_audio_endpoint(websocket: WebSocket):
    await websocket.accept()
    try:
        while True:
            data = await websocket.receive_text()
            chatbot_model = ChatbotModel(**json.loads(data))
            usr_id = chatbot_model.usr_id or chatbot_model.guest_id
            stream_data = stream_ai_chatbot(chatbot_model.usr_query, usr_id)

            # Check for propertyData in stream_data
            if stream_data.get("propertyData"):
                write_conversation_data_into_database(
                    chatbot_model.usr_id,
                    chatbot_model.guest_id,
                    chatbot_model.usr_query,
                    response_data={"propertyData": stream_data.get("propertyData")},
                )
                await websocket.send_json({
                    "type": "text",
                    "content": stream_data.get("propertyData"),
                })

            # Check for real-time data in stream_data
            elif stream_data.get("realtime_data"):
                realtime_response_data = ""
                async for item in real_time_data(
                    chatbot_model.usr_query, stream_data.get("last_user_message")
                ):
                    # Generate MP3 audio using gTTS and encode it to base64
                    tts = gTTS(text=item, lang='en')
                    audio_fp = BytesIO()
                    tts.write_to_fp(audio_fp)
                    audio_fp.seek(0)
                    audio_base64 = base64.b64encode(audio_fp.read()).decode('utf-8')
                    # Send the base64-encoded audio and its text in one message
                    await websocket.send_json({
                        "type": "voice",
                        "audio": audio_base64,  # audio in base64 format
                        "text": item,           # corresponding text
                    })
                    realtime_response_data = item
                # Store conversation data in the database
                write_conversation_data_into_database(
                    chatbot_model.usr_id,
                    chatbot_model.guest_id,
                    chatbot_model.usr_query,
                    response_data={"realtimeData": realtime_response_data},
                )
                print("\nRealtime response data:", realtime_response_data)

            # Handle the generic response
            else:
                generic_response_data = ""
                response_text = ""
                async for item in generate_generic_response(
                    {"role": "user", "content": chatbot_model.usr_query},
                    stream_data.get("last_user_message"),
                ):
                    generic_response_data = item
                    response_text += item + " "
                    await websocket.send_text(item)
                print("response_text-----", response_text)
                # Generate audio for the full response using gTTS and encode to base64
                tts = gTTS(text=response_text, lang='en')
                audio_fp = BytesIO()
                tts.write_to_fp(audio_fp)
                audio_fp.seek(0)
                audio_base64 = base64.b64encode(audio_fp.read()).decode('utf-8')
                # Send the base64-encoded audio for the full response
                await websocket.send_json({
                    "type": "voice",
                    "audio": audio_base64,
                })
                write_conversation_data_into_database(
                    chatbot_model.usr_id,
                    chatbot_model.guest_id,
                    chatbot_model.usr_query,
                    response_data={"genericData": generic_response_data},
                )
    except WebSocketDisconnect:
        print("Client disconnected")
Frontend code:
function initializeVoiceWebSocket() {
    const wsEndpoint = "ws://0.0.0.0:5001/ws/voice-chat/";
    voiceSocket = new WebSocket(wsEndpoint);

    voiceSocket.onopen = function () {
        console.log("Voice WebSocket connection established");
    };

    voiceSocket.onmessage = function (event) {
        const response = JSON.parse(event.data);
        // If the response contains text, display it in the chat
        if (response.text) {
            chatMessages.append(genericBotMessage(response.text));
            chatMessages.scrollTop(chatMessages.prop("scrollHeight"));
        }
        // If the response contains audio, play it
        if (response.audio) {
            playBase64Audio(response.audio);
        }
    };

    voiceSocket.onerror = function (error) {
        console.error('Voice WebSocket Error:', error);
    };

    voiceSocket.onclose = function () {
        console.log("Voice WebSocket connection closed");
    };
}

// Send a text message over the text WebSocket
function sendTextMessage(prompt) {
    if (textSocket.readyState === WebSocket.OPEN) {
        const message = {
            usr_query: prompt,
            usr_id: "066463f4-4762-42f2-8c72-69e79b0c99c7", // static or dynamic user ID
            is_voice: isVoiceQuery // flag indicating whether it's a voice message
        };
        textSocket.send(JSON.stringify(message));
    } else {
        console.error('Text WebSocket is not open.');
    }
}

// Send a voice message over the voice WebSocket
function sendVoiceMessage(prompt) {
    if (voiceSocket.readyState === WebSocket.OPEN) {
        const message = {
            usr_query: prompt,
            usr_id: "066463f4-4762-42f2-8c72-69e79b0c99c7", // static or dynamic user ID
            is_voice: true // this is a voice-based message
        };
        voiceSocket.send(JSON.stringify(message));
    } else {
        console.error('Voice WebSocket is not open.');
    }
}

// Handle a message from the text WebSocket
function handleTextMessage(data) {
    chatMessages.append(genericBotMessage(data)); // display the bot response
}

// Decode base64 audio and play it
function playBase64Audio(base64Audio) {
    const audioBlob = base64ToBlob(base64Audio, 'audio/mpeg'); // gTTS produces MP3
    const audioUrl = URL.createObjectURL(audioBlob);
    const audio = new Audio(audioUrl);
    audio.play();
}

// Convert a base64 string to a Blob, 512 bytes at a time
function base64ToBlob(base64, mimeType) {
    const byteCharacters = atob(base64);
    const byteArrays = [];
    for (let offset = 0; offset < byteCharacters.length; offset += 512) {
        const slice = byteCharacters.slice(offset, offset + 512);
        const byteNumbers = new Array(slice.length);
        for (let i = 0; i < slice.length; i++) {
            byteNumbers[i] = slice.charCodeAt(i);
        }
        byteArrays.push(new Uint8Array(byteNumbers));
    }
    return new Blob(byteArrays, { type: mimeType });
}