From df5b735959d2cd6befcbc73f4d7f0d9adf14bcbe Mon Sep 17 00:00:00 2001 From: Sergey Mankovsky Date: Fri, 12 Jul 2024 22:57:54 +0200 Subject: [PATCH 1/2] Chunked file upload --- server/reflector/views/transcripts_upload.py | 45 ++++++++++++------ server/tests/test_transcripts_upload.py | 9 ++-- .../[domain]/transcripts/fileUploadButton.tsx | 46 ++++++++++++++----- www/app/api/schemas.gen.ts | 6 +-- www/app/api/services.gen.ts | 6 +++ www/app/api/types.gen.ts | 4 +- 6 files changed, 82 insertions(+), 34 deletions(-) diff --git a/server/reflector/views/transcripts_upload.py b/server/reflector/views/transcripts_upload.py index 96b82d78..4fa45e3e 100644 --- a/server/reflector/views/transcripts_upload.py +++ b/server/reflector/views/transcripts_upload.py @@ -17,7 +17,9 @@ class UploadStatus(BaseModel): @router.post("/transcripts/{transcript_id}/record/upload") async def transcript_record_upload( transcript_id: str, - file: UploadFile, + chunk_number: int, + total_chunks: int, + chunk: UploadFile, user: Annotated[Optional[auth.UserInfo], Depends(auth.current_user_optional)], ): user_id = user["sub"] if user else None @@ -34,22 +36,37 @@ async def transcript_record_upload( status_code=400, detail="There is already an upload in progress" ) - # save the file to the transcript folder - extension = file.filename.split(".")[-1] - upload_filename = transcript.data_path / f"upload.{extension}" - upload_filename.parent.mkdir(parents=True, exist_ok=True) + # save the chunk to the transcript folder + extension = chunk.filename.split(".")[-1] + chunk_filename = transcript.data_path / f"upload_{chunk_number}.{extension}" + chunk_filename.parent.mkdir(parents=True, exist_ok=True) - # ensure the file is back to the beginning - await file.seek(0) + # ensure the chunk is back to the beginning + await chunk.seek(0) - # save the file to the transcript folder + # save the chunk to the transcript folder try: - with open(upload_filename, "wb") as f: - while True: - chunk = await 
file.read(16384) - if not chunk: - break - f.write(chunk) + with open(chunk_filename, "wb") as f: + f.write(await chunk.read()) + except Exception: + chunk_filename.unlink() + raise + + # return if it's not the last chunk + if chunk_number < total_chunks - 1: + return UploadStatus(status="ok") + + # merge chunks to a single file + upload_filename = transcript.data_path / f"upload.{extension}" + try: + with open(upload_filename, "ab") as f: + for chunk_number in range(0, total_chunks): + chunk_filename = ( + transcript.data_path / f"upload_{chunk_number}.{extension}" + ) + with open(chunk_filename, "rb") as chunk: + f.write(chunk.read()) + chunk_filename.unlink() except Exception: upload_filename.unlink() raise diff --git a/server/tests/test_transcripts_upload.py b/server/tests/test_transcripts_upload.py index 3cb482c1..fab21321 100644 --- a/server/tests/test_transcripts_upload.py +++ b/server/tests/test_transcripts_upload.py @@ -1,5 +1,6 @@ -import pytest import asyncio + +import pytest from httpx import AsyncClient @@ -27,13 +28,13 @@ async def test_transcript_upload_file( # upload mp3 response = await ac.post( - f"/transcripts/{tid}/record/upload", + f"/transcripts/{tid}/record/upload?chunk_number=0&total_chunks=1", files={ - "file": ( + "chunk": ( "test_short.wav", open("tests/records/test_short.wav", "rb"), "audio/mpeg", - ) + ), }, ) assert response.status_code == 200 diff --git a/www/app/[domain]/transcripts/fileUploadButton.tsx b/www/app/[domain]/transcripts/fileUploadButton.tsx index 3faba127..66246bf7 100644 --- a/www/app/[domain]/transcripts/fileUploadButton.tsx +++ b/www/app/[domain]/transcripts/fileUploadButton.tsx @@ -18,24 +18,46 @@ export default function FileUploadButton(props: FileUploadButton) { const file = event.target.files?.[0]; if (file) { - console.log("Calling api.v1TranscriptRecordUpload()..."); - - // Create an object of the expected type - const uploadData = { - file: file, - // Add other properties if required by the type definition - 
}; + const maxChunkSize = 50 * 1024 * 1024; // 50 MB + const totalChunks = Math.ceil(file.size / maxChunkSize); + let chunkNumber = 0; + let start = 0; + let uploadedSize = 0; api?.httpRequest.config.interceptors.request.use((request) => { request.onUploadProgress = (progressEvent) => { - setProgress((progressEvent.progress || 0) * 100); + const currentProgress = Math.floor( + ((uploadedSize + progressEvent.loaded) / file.size) * 100, + ); + setProgress(currentProgress); }; return request; }); - api?.v1TranscriptRecordUpload({ - transcriptId: props.transcriptId, - formData: uploadData, - }); + + const uploadNextChunk = async () => { + if (chunkNumber == totalChunks) return; + + const chunkSize = Math.min(maxChunkSize, file.size - start); + const end = start + chunkSize; + const chunk = file.slice(start, end); + + await api?.v1TranscriptRecordUpload({ + transcriptId: props.transcriptId, + formData: { + chunk, + }, + chunkNumber, + totalChunks, + }); + + uploadedSize += chunkSize; + chunkNumber++; + start = end; + + uploadNextChunk(); + }; + + uploadNextChunk(); } }; diff --git a/www/app/api/schemas.gen.ts b/www/app/api/schemas.gen.ts index 01e3f6ba..afa34829 100644 --- a/www/app/api/schemas.gen.ts +++ b/www/app/api/schemas.gen.ts @@ -18,14 +18,14 @@ export const $AudioWaveform = { export const $Body_transcript_record_upload_v1_transcripts__transcript_id__record_upload_post = { properties: { - file: { + chunk: { type: "string", format: "binary", - title: "File", + title: "Chunk", }, }, type: "object", - required: ["file"], + required: ["chunk"], title: "Body_transcript_record_upload_v1_transcripts__transcript_id__record_upload_post", } as const; diff --git a/www/app/api/services.gen.ts b/www/app/api/services.gen.ts index b546e7b4..e5a18eb8 100644 --- a/www/app/api/services.gen.ts +++ b/www/app/api/services.gen.ts @@ -497,6 +497,8 @@ export class DefaultService { * Transcript Record Upload * @param data The data for the request. 
* @param data.transcriptId + * @param data.chunkNumber + * @param data.totalChunks * @param data.formData * @returns unknown Successful Response * @throws ApiError @@ -510,6 +512,10 @@ export class DefaultService { path: { transcript_id: data.transcriptId, }, + query: { + chunk_number: data.chunkNumber, + total_chunks: data.totalChunks, + }, formData: data.formData, mediaType: "multipart/form-data", errors: { diff --git a/www/app/api/types.gen.ts b/www/app/api/types.gen.ts index db50d253..bc06d3c1 100644 --- a/www/app/api/types.gen.ts +++ b/www/app/api/types.gen.ts @@ -6,7 +6,7 @@ export type AudioWaveform = { export type Body_transcript_record_upload_v1_transcripts__transcript_id__record_upload_post = { - file: Blob | File; + chunk: Blob | File; }; export type CreateParticipant = { @@ -296,7 +296,9 @@ export type V1TranscriptMergeSpeakerData = { export type V1TranscriptMergeSpeakerResponse = SpeakerAssignmentStatus; export type V1TranscriptRecordUploadData = { + chunkNumber: number; formData: Body_transcript_record_upload_v1_transcripts__transcript_id__record_upload_post; + totalChunks: number; transcriptId: string; }; From 5007bd7875d653a260ca1e617317adbdb0fa886c Mon Sep 17 00:00:00 2001 From: Sergey Mankovsky Date: Mon, 15 Jul 2024 11:29:25 +0200 Subject: [PATCH 2/2] Fix formatting --- .github/workflows/test_server.yml | 104 +++++++++++++++--------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/.github/workflows/test_server.yml b/.github/workflows/test_server.yml index 9f3b9a6a..c4a36167 100644 --- a/.github/workflows/test_server.yml +++ b/.github/workflows/test_server.yml @@ -3,10 +3,10 @@ name: Unittests on: pull_request: paths: - - 'server/**' + - "server/**" push: paths: - - 'server/**' + - "server/**" jobs: pytest: @@ -17,65 +17,65 @@ jobs: ports: - 6379:6379 steps: - - uses: actions/checkout@v3 - - name: Install poetry - run: pipx install poetry - - name: Set up Python 3.x - uses: actions/setup-python@v4 - with: - python-version: 
'3.11' - cache: 'poetry' - cache-dependency-path: 'server/poetry.lock' - - name: Install requirements - run: | - cd server - poetry install - - name: Tests - run: | - cd server - poetry run python -m pytest -v tests + - uses: actions/checkout@v3 + - name: Install poetry + run: pipx install poetry + - name: Set up Python 3.x + uses: actions/setup-python@v4 + with: + python-version: "3.11" + cache: "poetry" + cache-dependency-path: "server/poetry.lock" + - name: Install requirements + run: | + cd server + poetry install + - name: Tests + run: | + cd server + poetry run python -m pytest -v tests formatting: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.x - uses: actions/setup-python@v4 - with: - python-version: 3.11 - - name: Validate formatting - run: | - pip install black - cd server - black --check reflector tests + - uses: actions/checkout@v3 + - name: Set up Python 3.x + uses: actions/setup-python@v4 + with: + python-version: 3.11 + - name: Validate formatting + run: | + pip install black + cd server + black --check reflector tests linting: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.x - uses: actions/setup-python@v4 - with: - python-version: 3.11 - - name: Validate formatting - run: | - pip install ruff - cd server - ruff reflector tests + - uses: actions/checkout@v3 + - name: Set up Python 3.x + uses: actions/setup-python@v4 + with: + python-version: 3.11 + - name: Validate formatting + run: | + pip install ruff + cd server + ruff check reflector tests docker: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Set up QEMU - uses: docker/setup-qemu-action@v2 - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 - - name: Build and push - id: docker_build - uses: docker/build-push-action@v4 - with: - context: server - platforms: linux/amd64,linux/arm64 - cache-from: type=gha - cache-to: type=gha,mode=max + - uses: actions/checkout@v3 + - name: Set 
up QEMU + uses: docker/setup-qemu-action@v2 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - name: Build and push + id: docker_build + uses: docker/build-push-action@v4 + with: + context: server + platforms: linux/amd64,linux/arm64 + cache-from: type=gha + cache-to: type=gha,mode=max