How can I send both audio and text data simultaneously through a WebSocket connection in Python?

I’m working on a chatbot application where I need to convert text to speech (TTS) on the backend and send both the generated audio and the original text to the frontend through a WebSocket connection. The text is being used for displaying in the chat interface, and the audio is played for users as part of the chatbot’s response.

What would be the best approach to send both the text and audio data via WebSocket? Should I encode the audio file into a specific format (e.g., base64) and send it alongside the text? Or is there a more efficient method? Additionally, how should I handle receiving and separating the two types of data on the frontend?

Any suggestions or code examples would be greatly appreciated!

Thanks in advance!

Backend code:

@router.websocket("/ws/voice-chat/") async def websocket_audio_endpoint(websocket: WebSocket): await websocket.accept()
    try:
        while True:
            data = await websocket.receive_text()
            chabotbotmodel = ChatbotModel(**json.loads(data))
            usr_id = chabotbotmodel.usr_id or chabotbotmodel.guest_id
    
            stream_data = stream_ai_chatbot(chabotbotmodel.usr_query, usr_id)
    
            # Check for propertyData in stream_data
            if stream_data.get("propertyData"):
                write_conversation_data_into_database(
                    chatbot_model.get("usr_id"),
                    chatbot_model.get("guest_id"),
                    usr_query,
                    response_data={"propertyData": stream_data.get("propertyData")},
                )
                await websocket.send_json({
                    "type": "text",
                    "content": stream_data.get("propertyData")
                })
    
            # Check for real-time data in stream_data
            elif stream_data.get("realtime_data"):
                realtime_response_data = ""
    
                async for item in real_time_data(
                        usr_query, stream_data.get("last_user_message")
                ):
                    # Generate audio using gTTS and encode to base64
                    tts = gTTS(text=item, lang='en')
                    audio_fp = BytesIO()
                    tts.write_to_fp(audio_fp)
                    audio_fp.seek(0)
    
                    audio_base64 = base64.b64encode(audio_fp.read()).decode('utf-8')
    
                    # Send both the base64-encoded audio and text response in one message
                    await websocket.send_json({
                        "type": "voice",
                        "audio": audio_base64,  # Audio in base64 format
                        "text": item  # Corresponding text
                    })
                    realtime_response_data = item
    
                # Store conversation data in database
                write_conversation_data_into_database(
                    chatbot_model.get("usr_id"),
                    chatbot_model.get("guest_id"),
                    usr_query,
                    response_data={"realtimeData": realtime_response_data},
                )
                print("nRealtime response data:", realtime_response_data)
    
            # Handle generic data response
            else:
                generic_response_data = ""
                response_text = ""
                async for item in generate_generic_response(
                        {
                            "role": "user",
                            "content": chabotbotmodel.usr_query
                        },
                        stream_data.get("last_user_message")):
                    generic_response_data = item
                    response_text += item + " "
                    await websocket.send_text(item)
                print("response_text-----",response_text)
                # Generate audio using gTTS and encode to base64
                tts = gTTS(text=response_text, lang='en')
                audio_fp = BytesIO()
                tts.write_to_fp(audio_fp)
                audio_fp.seek(0)
    
                audio_base64 = base64.b64encode(audio_fp.read()).decode('utf-8')
    
                # Send both the base64-encoded audio and text response in one message
                await websocket.send_json({
                    "type": "voice",
                    "audio": audio_base64
                })
                write_conversation_data_into_database(
                    chabotbotmodel.usr_id,
                    chabotbotmodel.guest_id,
                    chabotbotmodel.usr_query,
                    response_data={"genericData": generic_response_data},
                )

except WebSocketDisconnect:
    print("Client disconnected")

Frontend code:

// Open the voice WebSocket and wire up its lifecycle and message handlers.
function initializeVoiceWebSocket() {
    const wsEndpoint = "ws://0.0.0.0:5001/ws/voice-chat/";
    voiceSocket = new WebSocket(wsEndpoint);

    voiceSocket.onopen = () => {
        console.log("Voice WebSocket connection established");
    };

    voiceSocket.onmessage = (event) => {
        const response = JSON.parse(event.data);

        // A single message may carry text, audio, or both.
        if (response.text) {
            // Show the text part in the chat window and keep it scrolled down.
            chatMessages.append(genericBotMessage(response.text));
            chatMessages.scrollTop(chatMessages.prop("scrollHeight"));
        }
        if (response.audio) {
            // Play the audio part, if present.
            playBase64Audio(response.audio);
        }
    };

    voiceSocket.onerror = (error) => {
        console.error('Voice WebSocket Error:', error);
    };

    voiceSocket.onclose = () => {
        console.log("Voice WebSocket connection closed");
    };
}

        // Send a user prompt over the text WebSocket as a JSON message.
        function sendTextMessage(prompt) {
            // Guard clause: only transmit while the socket is open.
            if (textSocket.readyState !== WebSocket.OPEN) {
                console.error('Text WebSocket is not open.');
                return;
            }
            const message = {
                usr_query: prompt,
                usr_id: "066463f4-4762-42f2-8c72-69e79b0c99c7",  // Static or dynamic user ID
                is_voice: isVoiceQuery  // Flag indicating whether it's a voice message
            };
            textSocket.send(JSON.stringify(message));
        }

        // Send a user prompt over the voice WebSocket as a JSON message.
        function sendVoiceMessage(prompt) {
            // Guard clause: only transmit while the socket is open.
            if (voiceSocket.readyState !== WebSocket.OPEN) {
                console.error('Voice WebSocket is not open.');
                return;
            }
            voiceSocket.send(JSON.stringify({
                usr_query: prompt,
                usr_id: "066463f4-4762-42f2-8c72-69e79b0c99c7",  // Static or dynamic user ID
                is_voice: true  // This is a voice-based message
            }));
        }

        // Render a bot reply received on the text WebSocket into the chat window.
        function handleTextMessage(data) {
            const bubble = genericBotMessage(data);
            chatMessages.append(bubble);
        }

        // Decode a base64 MP3 payload into a Blob URL and start playback.
        function playBase64Audio(base64Audio) {
            const blob = base64ToBlob(base64Audio, 'audio/mpeg');
            const objectUrl = URL.createObjectURL(blob);
            new Audio(objectUrl).play();
        }

        // Convert a base64 string into a Blob of the given MIME type.
        // (Fixed: removed a stray trailing backtick that broke the script, and
        // replaced the 512-byte chunking with one pre-sized byte array — the
        // resulting Blob content is identical.)
        function base64ToBlob(base64, mimeType) {
            // atob yields a "binary string"; copy each char code into a byte.
            const byteCharacters = atob(base64);
            const bytes = new Uint8Array(byteCharacters.length);
            for (let i = 0; i < byteCharacters.length; i++) {
                bytes[i] = byteCharacters.charCodeAt(i);
            }
            return new Blob([bytes], { type: mimeType });
        }