https://github.com/Monadical-SAS/reflector.git
Big file upload (#349)
@@ -384,8 +384,8 @@ class PipelineMainDiarization(PipelineMainBase):
         # to let the start just do one job.
         pipeline.logger.bind(transcript_id=transcript.id)
         pipeline.logger.info("Diarization pipeline created")
-        self.push(audio_diarization_input)
-        self.flush()
+        await self.push(audio_diarization_input)
+        await self.flush()

         return pipeline
@@ -414,9 +414,9 @@ class PipelineMainFromTopics(PipelineMainBase):
         # push topics
         topics = self.get_transcript_topics(transcript)
         for topic in topics:
-            self.push(topic)
+            await self.push(topic)

-        self.flush()
+        await self.flush()

         return pipeline
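In both pipeline builders above, push() and flush() go from fire-and-forget calls to awaited ones, because the runner's push/flush become coroutines (see the PipelineRunner hunks below). A minimal sketch of the pattern with a toy runner, not the project's actual classes: each call only enqueues a command, and a bounded queue can suspend a fast producer.

import asyncio


class ToyRunner:
    # Stand-in for the real runner: push()/flush() only enqueue commands,
    # and a worker task drains them.
    def __init__(self):
        self._q: asyncio.Queue = asyncio.Queue(maxsize=2)

    async def push(self, data):
        # Awaitable: with a bounded queue, a fast producer is suspended here.
        await self._q.put(("PUSH", data))

    async def flush(self):
        await self._q.put(("FLUSH", None))

    async def run(self):
        while True:
            cmd, data = await self._q.get()
            print(cmd, data)
            if cmd == "FLUSH":
                break


async def main():
    runner = ToyRunner()
    worker = asyncio.create_task(runner.run())
    for topic in ["intro", "roadmap", "questions"]:
        await runner.push(topic)  # forgetting await would enqueue nothing
    await runner.flush()
    await worker


asyncio.run(main())

Calling runner.push(topic) without await would merely create a coroutine object and enqueue nothing; Python reports it as a "coroutine was never awaited" RuntimeWarning.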
@@ -653,10 +653,10 @@ async def pipeline_upload(transcript: Transcript, logger: Logger):
     try:
         logger.info("Start pushing audio into the pipeline")
         for frame in container.decode(audio=0):
-            pipeline.push(frame)
+            await pipeline.push(frame)
     finally:
         logger.info("Flushing the pipeline")
-        pipeline.flush()
+        await pipeline.flush()

     logger.info("Waiting for the pipeline to end")
     await pipeline.join()
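pipeline_upload feeds the uploaded file's decoded audio frames into the same async pipeline. A standalone sketch of that loop, assuming PyAV (av) for decoding and a pipeline exposing awaitable push/flush/join as in this commit; feed_audio_file and its arguments are illustrative names:

import av  # PyAV


async def feed_audio_file(path, pipeline):
    # Decode the first audio stream of the file and feed every frame
    # into the pipeline; awaiting push() lets a bounded command queue
    # apply back-pressure to the decode loop.
    container = av.open(path)
    try:
        for frame in container.decode(audio=0):
            await pipeline.push(frame)
    finally:
        await pipeline.flush()
        container.close()
    # Wait until the pipeline has fully processed what was pushed.
    await pipeline.join()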
@@ -66,17 +66,17 @@ class PipelineRunner(BaseModel):
         coro = self.run()
         asyncio.run(coro)

-    def push(self, data):
+    async def push(self, data):
         """
         Push data to the pipeline
         """
-        self._add_cmd("PUSH", data)
+        await self._add_cmd("PUSH", data)

-    def flush(self):
+    async def flush(self):
         """
         Flush the pipeline
         """
-        self._add_cmd("FLUSH", None)
+        await self._add_cmd("FLUSH", None)

     async def on_status(self, status):
         """
@@ -90,12 +90,26 @@ class PipelineRunner(BaseModel):
         """
         pass

-    def _add_cmd(self, cmd: str, data):
+    async def _add_cmd(
+        self, cmd: str, data, max_retries: int = 3, retry_time_limit: int = 3
+    ):
         """
         Enqueue a command to be executed in the runner.
         Currently supported commands: PUSH, FLUSH
         """
-        self._q_cmd.put_nowait([cmd, data])
+        for _ in range(max_retries):
+            try:
+                self._q_cmd.put_nowait([cmd, data])
+                break  # Break if put succeeds
+            except asyncio.queues.QueueFull:
+                # Handle only the QueueFull exception, retry after a small delay
+                self._logger.debug(
+                    f"Encountered a full queue, while trying to add [{cmd, data}]. "
+                    f"Retrying in {retry_time_limit} seconds"
+                )
+                await asyncio.sleep(retry_time_limit)
+        else:
+            print(f"Failed to add [{cmd, data}] after {max_retries} attempts.")

     async def _set_status(self, status):
         self._logger.debug("Runner status updated", status=status)
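_add_cmd now retries a bounded number of times when the command queue is full, instead of letting the first put_nowait raise; with the for/else, the final message is printed only if every attempt raised QueueFull and the loop never reached break. Note that asyncio.queues.QueueFull is the same exception importable as asyncio.QueueFull. The same idea as a standalone helper, a minimal sketch rather than the project's code:

import asyncio


async def put_with_retry(queue: asyncio.Queue, item,
                         max_retries: int = 3, retry_delay: float = 3.0) -> bool:
    # Try a non-blocking put; when the queue is full, back off and retry,
    # giving up (and returning False) after max_retries attempts.
    for _ in range(max_retries):
        try:
            queue.put_nowait(item)
            return True
        except asyncio.QueueFull:
            await asyncio.sleep(retry_delay)
    return False

An alternative is await asyncio.wait_for(queue.put(item), timeout=...), which waits for free space to appear instead of polling on a fixed delay.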
@@ -1,20 +1,20 @@
 """
 Utility file for all text processing related functionalities
 """

 import datetime
 from typing import List

 import nltk
 import torch
-from log_utils import LOGGER
 from nltk.corpus import stopwords
 from nltk.tokenize import word_tokenize
-from run_utils import CONFIG
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import BartForConditionalGeneration, BartTokenizer

+from log_utils import LOGGER
+from run_utils import CONFIG

 nltk.download("punkt", quiet=True)
@@ -40,7 +40,7 @@ class AudioStreamTrack(MediaStreamTrack):
         ctx = self.ctx
         frame = await self.track.recv()
         try:
-            ctx.pipeline_runner.push(frame)
+            await ctx.pipeline_runner.push(frame)
         except Exception as e:
             ctx.logger.error("Pipeline error", error=e)
         return frame
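The track's recv() is already a coroutine, so awaiting the push fits there directly. A hedged sketch of such a relay track built on aiortc's MediaStreamTrack; RelayAudioTrack, source, and runner are illustrative names, not the project's context object:

import logging

from aiortc import MediaStreamTrack

log = logging.getLogger(__name__)


class RelayAudioTrack(MediaStreamTrack):
    # Wraps a source track: every received frame is returned unchanged,
    # and also pushed into an async pipeline runner.
    kind = "audio"

    def __init__(self, source, runner):
        super().__init__()
        self.source = source
        self.runner = runner

    async def recv(self):
        frame = await self.source.recv()
        try:
            await self.runner.push(frame)
        except Exception:
            # A pipeline failure must not break the media flow back to the peer.
            log.exception("Pipeline error")
        return frame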
@@ -76,7 +76,7 @@ async def rtc_offer_base(
         # - when we receive the close event, we do nothing.
         # 2. or the client close the connection
         #    and there is nothing to do because it is already closed
-        ctx.pipeline_runner.flush()
+        await ctx.pipeline_runner.flush()
         if close:
             ctx.logger.debug("Closing peer connection")
             await pc.close()
@@ -3,6 +3,7 @@ Transcripts audio related endpoints
 ===================================
+
 """

 from typing import Annotated, Optional

 import httpx
@@ -3,6 +3,7 @@ Transcript participants API endpoints
 =====================================
+
 """

 from typing import Annotated, Optional

 import reflector.auth as auth
@@ -3,6 +3,7 @@ Reassign speakers in a transcript
 =================================
+
 """

 from typing import Annotated, Optional

 import reflector.auth as auth
@@ -3,6 +3,7 @@ Transcripts websocket API
 =========================
+
 """

 from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect
 from reflector.db.transcripts import transcripts_controller
 from reflector.ws_manager import get_ws_manager