Merge pull request #373 from Monadical-SAS/upload-big-files
Upload big files
.github/workflows/test_server.yml (vendored, 12 changes)
@@ -3,10 +3,10 @@ name: Unittests
 on:
   pull_request:
     paths:
-      - 'server/**'
+      - "server/**"
   push:
     paths:
-      - 'server/**'
+      - "server/**"

 jobs:
   pytest:
@@ -23,9 +23,9 @@ jobs:
       - name: Set up Python 3.x
         uses: actions/setup-python@v4
         with:
-          python-version: '3.11'
-          cache: 'poetry'
-          cache-dependency-path: 'server/poetry.lock'
+          python-version: "3.11"
+          cache: "poetry"
+          cache-dependency-path: "server/poetry.lock"
       - name: Install requirements
         run: |
           cd server
@@ -61,7 +61,7 @@ jobs:
         run: |
           pip install ruff
           cd server
-          ruff reflector tests
+          ruff check reflector tests

   docker:
     runs-on: ubuntu-latest
@@ -17,7 +17,9 @@ class UploadStatus(BaseModel):
 @router.post("/transcripts/{transcript_id}/record/upload")
 async def transcript_record_upload(
     transcript_id: str,
-    file: UploadFile,
+    chunk_number: int,
+    total_chunks: int,
+    chunk: UploadFile,
     user: Annotated[Optional[auth.UserInfo], Depends(auth.current_user_optional)],
 ):
     user_id = user["sub"] if user else None
@@ -34,22 +36,37 @@ async def transcript_record_upload(
             status_code=400, detail="There is already an upload in progress"
         )

-    # save the file to the transcript folder
-    extension = file.filename.split(".")[-1]
-    upload_filename = transcript.data_path / f"upload.{extension}"
-    upload_filename.parent.mkdir(parents=True, exist_ok=True)
+    # save the chunk to the transcript folder
+    extension = chunk.filename.split(".")[-1]
+    chunk_filename = transcript.data_path / f"upload_{chunk_number}.{extension}"
+    chunk_filename.parent.mkdir(parents=True, exist_ok=True)

-    # ensure the file is back to the beginning
-    await file.seek(0)
+    # ensure the chunk is back to the beginning
+    await chunk.seek(0)

-    # save the file to the transcript folder
+    # save the chunk to the transcript folder
     try:
-        with open(upload_filename, "wb") as f:
-            while True:
-                chunk = await file.read(16384)
-                if not chunk:
-                    break
-                f.write(chunk)
+        with open(chunk_filename, "wb") as f:
+            f.write(await chunk.read())
+    except Exception:
+        chunk_filename.unlink()
+        raise
+
+    # return if it's not the last chunk
+    if chunk_number < total_chunks - 1:
+        return UploadStatus(status="ok")
+
+    # merge chunks to a single file
+    upload_filename = transcript.data_path / f"upload.{extension}"
+    try:
+        with open(upload_filename, "ab") as f:
+            for chunk_number in range(0, total_chunks):
+                chunk_filename = (
+                    transcript.data_path / f"upload_{chunk_number}.{extension}"
+                )
+                with open(chunk_filename, "rb") as chunk:
+                    f.write(chunk.read())
+                chunk_filename.unlink()
     except Exception:
         upload_filename.unlink()
         raise
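For reference, a minimal client-side sketch of the contract this endpoint now expects: each chunk is POSTed to the same route with chunk_number and total_chunks as query parameters and the bytes in a multipart field named "chunk"; the server buffers chunks as upload_{n}.{ext} and concatenates them when the last chunk arrives. The route, parameter names and field name come from the diff above; baseUrl, the error handling and the 50 MB default are illustrative assumptions (50 MB mirrors the frontend change further down).

// Sketch only: a plain fetch-based client for the chunked upload route,
// assuming a browser-like environment with File, FormData and fetch.
async function uploadInChunks(
  baseUrl: string, // assumed; the API origin including any version prefix
  transcriptId: string,
  file: File,
  maxChunkSize = 50 * 1024 * 1024, // 50 MB, same size the frontend uses
): Promise<void> {
  const totalChunks = Math.ceil(file.size / maxChunkSize);
  for (let chunkNumber = 0; chunkNumber < totalChunks; chunkNumber++) {
    const start = chunkNumber * maxChunkSize;
    const end = Math.min(start + maxChunkSize, file.size);

    // Keep the original filename: the server splits chunk.filename on "."
    // to pick the extension for upload_{chunk_number}.{extension}.
    const form = new FormData();
    form.append("chunk", file.slice(start, end), file.name);

    const url =
      `${baseUrl}/transcripts/${transcriptId}/record/upload` +
      `?chunk_number=${chunkNumber}&total_chunks=${totalChunks}`;
    const response = await fetch(url, { method: "POST", body: form });
    if (!response.ok) {
      throw new Error(`chunk ${chunkNumber} failed with ${response.status}`);
    }
  }
}

Note that the last chunk has to be sent last: its request is the one that triggers the merge into upload.{ext}.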
@@ -1,5 +1,6 @@
-import pytest
+import asyncio
 
+import pytest
 from httpx import AsyncClient
 
 
@@ -27,13 +28,13 @@ async def test_transcript_upload_file(
 
     # upload mp3
    response = await ac.post(
-        f"/transcripts/{tid}/record/upload",
+        f"/transcripts/{tid}/record/upload?chunk_number=0&total_chunks=1",
        files={
-            "file": (
+            "chunk": (
                "test_short.wav",
                open("tests/records/test_short.wav", "rb"),
                "audio/mpeg",
-            )
+            ),
        },
    )
    assert response.status_code == 200
@@ -18,24 +18,46 @@ export default function FileUploadButton(props: FileUploadButton) {
     const file = event.target.files?.[0];
 
     if (file) {
-      console.log("Calling api.v1TranscriptRecordUpload()...");
-
-      // Create an object of the expected type
-      const uploadData = {
-        file: file,
-        // Add other properties if required by the type definition
-      };
+      const maxChunkSize = 50 * 1024 * 1024; // 50 MB
+      const totalChunks = Math.ceil(file.size / maxChunkSize);
+      let chunkNumber = 0;
+      let start = 0;
+      let uploadedSize = 0;
 
       api?.httpRequest.config.interceptors.request.use((request) => {
         request.onUploadProgress = (progressEvent) => {
-          setProgress((progressEvent.progress || 0) * 100);
+          const currentProgress = Math.floor(
+            ((uploadedSize + progressEvent.loaded) / file.size) * 100,
+          );
+          setProgress(currentProgress);
         };
         return request;
       });
-      api?.v1TranscriptRecordUpload({
+
+      const uploadNextChunk = async () => {
+        if (chunkNumber == totalChunks) return;
+
+        const chunkSize = Math.min(maxChunkSize, file.size - start);
+        const end = start + chunkSize;
+        const chunk = file.slice(start, end);
+
+        await api?.v1TranscriptRecordUpload({
          transcriptId: props.transcriptId,
-        formData: uploadData,
+          formData: {
+            chunk,
+          },
+          chunkNumber,
+          totalChunks,
        });
+
+        uploadedSize += chunkSize;
+        chunkNumber++;
+        start = end;
+
+        uploadNextChunk();
+      };
+
+      uploadNextChunk();
     }
   };
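The FileUploadButton change above slices the file into 50 MB pieces and sends them strictly one after another: uploadNextChunk awaits each v1TranscriptRecordUpload call and then re-invokes itself, while the request interceptor reports progress as uploadedSize plus the bytes of the in-flight chunk. Below is the same flow written as a plain loop, for comparison; the v1TranscriptRecordUpload argument shape (transcriptId, chunkNumber, totalChunks, formData.chunk) matches the generated client in this PR, whereas the helper name, the ApiClient type and the onProgress callback are illustrative.

// Sketch of the same sequential chunk upload as a loop rather than a
// self-calling async function. ApiClient only models the one method used here.
type ApiClient = {
  v1TranscriptRecordUpload(data: {
    transcriptId: string;
    chunkNumber: number;
    totalChunks: number;
    formData: { chunk: Blob | File };
  }): Promise<unknown>;
};

async function uploadFileInChunks(
  api: ApiClient,
  transcriptId: string,
  file: File,
  onProgress?: (percent: number) => void, // assumed callback, stands in for setProgress
): Promise<void> {
  const maxChunkSize = 50 * 1024 * 1024; // 50 MB, as in FileUploadButton
  const totalChunks = Math.ceil(file.size / maxChunkSize);
  let uploadedSize = 0;

  for (let chunkNumber = 0; chunkNumber < totalChunks; chunkNumber++) {
    const start = chunkNumber * maxChunkSize;
    const end = Math.min(start + maxChunkSize, file.size);
    const chunk = file.slice(start, end);

    // Chunks are awaited one by one; the server only merges them when the
    // request carrying the last chunk_number completes.
    await api.v1TranscriptRecordUpload({
      transcriptId,
      chunkNumber,
      totalChunks,
      formData: { chunk },
    });

    uploadedSize += end - start;
    onProgress?.(Math.floor((uploadedSize / file.size) * 100));
  }
}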
@@ -18,14 +18,14 @@ export const $AudioWaveform = {
 export const $Body_transcript_record_upload_v1_transcripts__transcript_id__record_upload_post =
   {
     properties: {
-      file: {
+      chunk: {
         type: "string",
         format: "binary",
-        title: "File",
+        title: "Chunk",
       },
     },
     type: "object",
-    required: ["file"],
+    required: ["chunk"],
     title:
       "Body_transcript_record_upload_v1_transcripts__transcript_id__record_upload_post",
   } as const;
@@ -497,6 +497,8 @@ export class DefaultService {
    * Transcript Record Upload
    * @param data The data for the request.
    * @param data.transcriptId
+   * @param data.chunkNumber
+   * @param data.totalChunks
    * @param data.formData
    * @returns unknown Successful Response
    * @throws ApiError
@@ -510,6 +512,10 @@ export class DefaultService {
       path: {
         transcript_id: data.transcriptId,
       },
+      query: {
+        chunk_number: data.chunkNumber,
+        total_chunks: data.totalChunks,
+      },
       formData: data.formData,
       mediaType: "multipart/form-data",
       errors: {
@@ -6,7 +6,7 @@ export type AudioWaveform = {
 
 export type Body_transcript_record_upload_v1_transcripts__transcript_id__record_upload_post =
   {
-    file: Blob | File;
+    chunk: Blob | File;
   };
 
 export type CreateParticipant = {
@@ -296,7 +296,9 @@ export type V1TranscriptMergeSpeakerData = {
 export type V1TranscriptMergeSpeakerResponse = SpeakerAssignmentStatus;
 
 export type V1TranscriptRecordUploadData = {
+  chunkNumber: number;
   formData: Body_transcript_record_upload_v1_transcripts__transcript_id__record_upload_post;
+  totalChunks: number;
   transcriptId: string;
 };
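Taken together, the regenerated client now requires chunkNumber and totalChunks next to formData, and the multipart field is chunk rather than file. A short sketch of building the new request shape; only the type names and fields come from the generated code above, while the import path and helper function are hypothetical.

// Hypothetical helper showing the new V1TranscriptRecordUploadData shape.
// "./types.gen" is an assumed path for the generated type definitions.
import type {
  Body_transcript_record_upload_v1_transcripts__transcript_id__record_upload_post as UploadBody,
  V1TranscriptRecordUploadData,
} from "./types.gen";

function buildChunkRequest(
  transcriptId: string,
  chunk: Blob,
  chunkNumber: number,
  totalChunks: number,
): V1TranscriptRecordUploadData {
  const formData: UploadBody = { chunk }; // the "chunk" field replaces the old "file"
  return { transcriptId, chunkNumber, totalChunks, formData };
}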