How to stream microphone audio over a WebSocket from a JS frontend to a Python backend?

I am working on a project that requires me to perform real-time VAD and STT on the client's incoming audio on the server. Preferably, the incoming data should also be turned into a pydub AudioSegment for ease of use.
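
For context, this is roughly the per-chunk processing I'm aiming for on the server (only a sketch, assuming I can somehow get raw 16 kHz, mono, 16-bit PCM for each chunk; process_chunk is just an illustrative name, not part of my actual code):

import webrtcvad
from pydub import AudioSegment

vad = webrtcvad.Vad(2)  # aggressiveness 0-3

def process_chunk(pcm_bytes):
    # Wrap the raw PCM in an AudioSegment for ease of use
    segment = AudioSegment(
        data=pcm_bytes,
        sample_width=2,   # 16-bit samples
        frame_rate=16000,
        channels=1,
    )
    # webrtcvad only accepts 10/20/30 ms frames of 16-bit mono PCM
    frame_ms = 30
    frame_bytes = int(16000 * frame_ms / 1000) * 2
    raw = segment.raw_data
    for i in range(0, len(raw) - frame_bytes + 1, frame_bytes):
        if vad.is_speech(raw[i:i + frame_bytes], 16000):
            pass  # speech in this frame -> hand the audio off to STT
    return segment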

Here is what I came up with:

Client:

const socket = new WebSocket("ws://localhost:8000/audio")
const btn = document.getElementById("mic")
let stream = null, recorder = null, blobbed = null
const constraints = { audio: {
    sampleRate: 16000,
    channelCount: 1,
    volume: 1.0,
    echoCancellation: true,
    noiseSuppression: true,
    autoGainControl: true
}}
const options = {
    mimeType: "audio/webm;codecs=opus",
}
const timeSliceMs = 3 * 1000 // ask MediaRecorder for a chunk every 3 seconds
let toggle = false

function start_recorder() {
    recorder.start(timeSliceMs)
}

function init_recorder() {
    navigator.mediaDevices.getUserMedia(constraints)
        .then(mediaStream => {
            // store the stream in the outer variable instead of shadowing it
            stream = mediaStream
            recorder = new MediaRecorder(stream, options)
            recorder.ondataavailable = e => {
                if (toggle && e.data.size > 0) {
                    blobbed = new Blob([e.data], { type: options.mimeType })
                    socket.send(blobbed)
                }
            }
            recorder.onstart = e => {
                btn.innerText = "ON"
            }
            recorder.onstop = e => {
                btn.innerText = "OFF"
            }

            start_recorder()
        })
        .catch(err => { console.error("Couldn't get user media!", err) })
}

function btnSwitch(e) {
    if (recorder == null) { 
        toggle = true
        init_recorder()
    } else {
        toggle = !toggle
        if (toggle) {
            start_recorder()
        } else {
            recorder.stop()
        }
    }
}

btn.addEventListener("click", btnSwitch)

socket.addEventListener("message", e => {
    let data = e.data
    try {
        data = JSON.stringify()
    } catch (err) { 
        console.error("Server error!") 
        return
    }
    console.log(data)
})

function close() {
    toggle = false
    if (recorder && recorder.state !== "inactive") {
        recorder.stop()
    }
    recorder = null
    btn.removeEventListener("click", btnSwitch)
    console.log("WS connection closed.")
}

socket.addEventListener("close", close)
socket.addEventListener("error", close)

Server:

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from pydub import AudioSegment
from webrtcvad import Vad
import speech_recognition as sr
import numpy as np
from io import BytesIO

app = FastAPI()
app.mount('/static', StaticFiles(directory='static', html=True), name='static')


@app.get("/audio")
def audio():
    with open("audio.html", "r") as f:
        html = f.read()
    return HTMLResponse(html)

@app.websocket("/audio")
async def audio(connection: WebSocket):
    await connection.accept()
    pad = await connection.receive_bytes()
    audio_segment = AudioSegment.from_file(BytesIO(pad), codec="opus", format="webm")
    pad_len = len(audio_segment)
    
    while True:
        data = await connection.receive_bytes()
        data = pad + data
        audio_segment = AudioSegment.from_file(BytesIO(data), codec="opus", format="webm")
        
        # Preforming VAD STT

The obvious problem with my approach is that the JS MediaRecorder is designed to deliver the recorded audio as one big blob on the stop event, so only the first batch of data contains the necessary container metadata (the WebM header). On the backend I therefore have to pad each subsequent chunk with the first one before decoding, and then slice the decoded segment to drop the first chunk's duration again.

This understandably isn't ideal, so I would like to find a different (and probably more correct) way to do this.
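
To make the workaround concrete, here is a standalone sketch of the decode step I'm describing (the chunk_0.webm / chunk_1.webm file names are just for illustration, as if two consecutive chunks from the recorder had been dumped to disk):

from io import BytesIO
from pydub import AudioSegment
from pydub.exceptions import CouldntDecodeError

with open("chunk_0.webm", "rb") as f:
    pad = f.read()    # first chunk: the only one with a WebM header
with open("chunk_1.webm", "rb") as f:
    chunk = f.read()  # later chunk: a headerless continuation

try:
    AudioSegment.from_file(BytesIO(chunk), codec="opus", format="webm")
except CouldntDecodeError:
    print("a later chunk on its own can't be decoded")

pad_len = len(AudioSegment.from_file(BytesIO(pad), codec="opus", format="webm"))
padded = AudioSegment.from_file(BytesIO(pad + chunk), codec="opus", format="webm")
new_audio = padded[pad_len:]  # only the audio that arrived in the later chunk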