mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2026-02-06 10:46:46 +00:00
Compare commits
5 Commits
brady-bunc
...
release-pl
| Author | SHA1 | Date | |
|---|---|---|---|
| 46a10af349 | |||
| 15ab2e306e | |||
| 1ce1c7a910 | |||
|
|
984795357e | ||
| fa3cf5da0f |
@@ -4,3 +4,4 @@ docs/docs/installation/daily-setup.md:curl-auth-header:277
|
|||||||
gpu/self_hosted/DEV_SETUP.md:curl-auth-header:74
|
gpu/self_hosted/DEV_SETUP.md:curl-auth-header:74
|
||||||
gpu/self_hosted/DEV_SETUP.md:curl-auth-header:83
|
gpu/self_hosted/DEV_SETUP.md:curl-auth-header:83
|
||||||
server/reflector/worker/process.py:generic-api-key:465
|
server/reflector/worker/process.py:generic-api-key:465
|
||||||
|
server/reflector/worker/process.py:generic-api-key:594
|
||||||
|
|||||||
21
CHANGELOG.md
21
CHANGELOG.md
@@ -1,5 +1,26 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## [0.33.0](https://github.com/Monadical-SAS/reflector/compare/v0.32.2...v0.33.0) (2026-02-05)
|
||||||
|
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
* Daily+hatchet default ([#846](https://github.com/Monadical-SAS/reflector/issues/846)) ([15ab2e3](https://github.com/Monadical-SAS/reflector/commit/15ab2e306eacf575494b4b5d2b2ad779d44a1c7f))
|
||||||
|
|
||||||
|
|
||||||
|
### Bug Fixes
|
||||||
|
|
||||||
|
* websocket tests ([#825](https://github.com/Monadical-SAS/reflector/issues/825)) ([1ce1c7a](https://github.com/Monadical-SAS/reflector/commit/1ce1c7a910b6c374115d2437b17f9d288ef094dc))
|
||||||
|
|
||||||
|
## [0.32.2](https://github.com/Monadical-SAS/reflector/compare/v0.32.1...v0.32.2) (2026-02-03)
|
||||||
|
|
||||||
|
|
||||||
|
### Bug Fixes
|
||||||
|
|
||||||
|
* increase TIMEOUT_MEDIUM from 2m to 5m for LLM tasks ([#843](https://github.com/Monadical-SAS/reflector/issues/843)) ([4acde4b](https://github.com/Monadical-SAS/reflector/commit/4acde4b7fdef88cc02ca12cf38c9020b05ed96ac))
|
||||||
|
* make caddy optional ([#841](https://github.com/Monadical-SAS/reflector/issues/841)) ([a2ed7d6](https://github.com/Monadical-SAS/reflector/commit/a2ed7d60d557b551a5b64e4dfd909b63a791d9fc))
|
||||||
|
* use Daily API recording.duration as master source for transcript duration ([#844](https://github.com/Monadical-SAS/reflector/issues/844)) ([8707c66](https://github.com/Monadical-SAS/reflector/commit/8707c6694a80c939b6214bbc13331741f192e082))
|
||||||
|
|
||||||
## [0.32.1](https://github.com/Monadical-SAS/reflector/compare/v0.32.0...v0.32.1) (2026-01-30)
|
## [0.32.1](https://github.com/Monadical-SAS/reflector/compare/v0.32.0...v0.32.1) (2026-01-30)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -4,27 +4,31 @@ ENV PYTHONUNBUFFERED=1 \
|
|||||||
UV_LINK_MODE=copy \
|
UV_LINK_MODE=copy \
|
||||||
UV_NO_CACHE=1
|
UV_NO_CACHE=1
|
||||||
|
|
||||||
|
# patch until nvidia updates the sha1 repo
|
||||||
|
ADD sequoia.config /etc/crypto-policies/back-ends/sequoia.config
|
||||||
|
|
||||||
WORKDIR /tmp
|
WORKDIR /tmp
|
||||||
RUN apt-get update \
|
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||||
|
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||||
|
apt-get update \
|
||||||
&& apt-get install -y \
|
&& apt-get install -y \
|
||||||
ffmpeg \
|
ffmpeg \
|
||||||
curl \
|
curl \
|
||||||
ca-certificates \
|
ca-certificates \
|
||||||
gnupg \
|
gnupg \
|
||||||
wget \
|
wget
|
||||||
&& apt-get clean
|
|
||||||
# Add NVIDIA CUDA repo for Debian 12 (bookworm) and install cuDNN 9 for CUDA 12
|
# Add NVIDIA CUDA repo for Debian 12 (bookworm) and install cuDNN 9 for CUDA 12
|
||||||
ADD https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb /cuda-keyring.deb
|
ADD https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb /cuda-keyring.deb
|
||||||
RUN dpkg -i /cuda-keyring.deb \
|
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||||
|
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||||
|
dpkg -i /cuda-keyring.deb \
|
||||||
&& rm /cuda-keyring.deb \
|
&& rm /cuda-keyring.deb \
|
||||||
&& apt-get update \
|
&& apt-get update \
|
||||||
&& apt-get install -y --no-install-recommends \
|
&& apt-get install -y --no-install-recommends \
|
||||||
cuda-cudart-12-6 \
|
cuda-cudart-12-6 \
|
||||||
libcublas-12-6 \
|
libcublas-12-6 \
|
||||||
libcudnn9-cuda-12 \
|
libcudnn9-cuda-12 \
|
||||||
libcudnn9-dev-cuda-12 \
|
libcudnn9-dev-cuda-12
|
||||||
&& apt-get clean \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
|
||||||
ADD https://astral.sh/uv/install.sh /uv-installer.sh
|
ADD https://astral.sh/uv/install.sh /uv-installer.sh
|
||||||
RUN sh /uv-installer.sh && rm /uv-installer.sh
|
RUN sh /uv-installer.sh && rm /uv-installer.sh
|
||||||
ENV PATH="/root/.local/bin/:$PATH"
|
ENV PATH="/root/.local/bin/:$PATH"
|
||||||
@@ -39,6 +43,13 @@ COPY ./app /app/app
|
|||||||
COPY ./main.py /app/
|
COPY ./main.py /app/
|
||||||
COPY ./runserver.sh /app/
|
COPY ./runserver.sh /app/
|
||||||
|
|
||||||
|
# prevent uv failing with too many open files on big cpus
|
||||||
|
ENV UV_CONCURRENT_INSTALLS=16
|
||||||
|
|
||||||
|
# first install
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
|
uv sync --compile-bytecode --locked
|
||||||
|
|
||||||
EXPOSE 8000
|
EXPOSE 8000
|
||||||
|
|
||||||
CMD ["sh", "/app/runserver.sh"]
|
CMD ["sh", "/app/runserver.sh"]
|
||||||
|
|||||||
2
gpu/self_hosted/sequoia.config
Normal file
2
gpu/self_hosted/sequoia.config
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
[hash_algorithms]
|
||||||
|
sha1 = "always"
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
"""drop_use_celery_column
|
||||||
|
|
||||||
|
Revision ID: 3aa20b96d963
|
||||||
|
Revises: e69f08ead8ea
|
||||||
|
Create Date: 2026-02-05 10:12:44.065279
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = "3aa20b96d963"
|
||||||
|
down_revision: Union[str, None] = "e69f08ead8ea"
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = None
|
||||||
|
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
|
||||||
|
with op.batch_alter_table("room", schema=None) as batch_op:
|
||||||
|
batch_op.drop_column("use_celery")
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
|
||||||
|
with op.batch_alter_table("room", schema=None) as batch_op:
|
||||||
|
batch_op.add_column(
|
||||||
|
sa.Column(
|
||||||
|
"use_celery",
|
||||||
|
sa.Boolean(),
|
||||||
|
server_default=sa.text("false"),
|
||||||
|
nullable=False,
|
||||||
|
)
|
||||||
|
)
|
||||||
@@ -57,12 +57,6 @@ rooms = sqlalchemy.Table(
|
|||||||
sqlalchemy.String,
|
sqlalchemy.String,
|
||||||
nullable=False,
|
nullable=False,
|
||||||
),
|
),
|
||||||
sqlalchemy.Column(
|
|
||||||
"use_celery",
|
|
||||||
sqlalchemy.Boolean,
|
|
||||||
nullable=False,
|
|
||||||
server_default=false(),
|
|
||||||
),
|
|
||||||
sqlalchemy.Column(
|
sqlalchemy.Column(
|
||||||
"skip_consent",
|
"skip_consent",
|
||||||
sqlalchemy.Boolean,
|
sqlalchemy.Boolean,
|
||||||
@@ -97,7 +91,6 @@ class Room(BaseModel):
|
|||||||
ics_last_sync: datetime | None = None
|
ics_last_sync: datetime | None = None
|
||||||
ics_last_etag: str | None = None
|
ics_last_etag: str | None = None
|
||||||
platform: Platform = Field(default_factory=lambda: settings.DEFAULT_VIDEO_PLATFORM)
|
platform: Platform = Field(default_factory=lambda: settings.DEFAULT_VIDEO_PLATFORM)
|
||||||
use_celery: bool = False
|
|
||||||
skip_consent: bool = False
|
skip_consent: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -15,14 +15,10 @@ from hatchet_sdk.clients.rest.exceptions import ApiException, NotFoundException
|
|||||||
from hatchet_sdk.clients.rest.models import V1TaskStatus
|
from hatchet_sdk.clients.rest.models import V1TaskStatus
|
||||||
|
|
||||||
from reflector.db.recordings import recordings_controller
|
from reflector.db.recordings import recordings_controller
|
||||||
from reflector.db.rooms import rooms_controller
|
|
||||||
from reflector.db.transcripts import Transcript, transcripts_controller
|
from reflector.db.transcripts import Transcript, transcripts_controller
|
||||||
from reflector.hatchet.client import HatchetClientManager
|
from reflector.hatchet.client import HatchetClientManager
|
||||||
from reflector.logger import logger
|
from reflector.logger import logger
|
||||||
from reflector.pipelines.main_file_pipeline import task_pipeline_file_process
|
from reflector.pipelines.main_file_pipeline import task_pipeline_file_process
|
||||||
from reflector.pipelines.main_multitrack_pipeline import (
|
|
||||||
task_pipeline_multitrack_process,
|
|
||||||
)
|
|
||||||
from reflector.utils.string import NonEmptyString
|
from reflector.utils.string import NonEmptyString
|
||||||
|
|
||||||
|
|
||||||
@@ -181,124 +177,98 @@ async def dispatch_transcript_processing(
|
|||||||
Returns AsyncResult for Celery tasks, None for Hatchet workflows.
|
Returns AsyncResult for Celery tasks, None for Hatchet workflows.
|
||||||
"""
|
"""
|
||||||
if isinstance(config, MultitrackProcessingConfig):
|
if isinstance(config, MultitrackProcessingConfig):
|
||||||
use_celery = False
|
# Multitrack processing always uses Hatchet (no Celery fallback)
|
||||||
if config.room_id:
|
# First check if we can replay (outside transaction since it's read-only)
|
||||||
room = await rooms_controller.get_by_id(config.room_id)
|
transcript = await transcripts_controller.get_by_id(config.transcript_id)
|
||||||
use_celery = room.use_celery if room else False
|
if transcript and transcript.workflow_run_id and not force:
|
||||||
|
can_replay = await HatchetClientManager.can_replay(
|
||||||
use_hatchet = not use_celery
|
transcript.workflow_run_id
|
||||||
|
|
||||||
if use_celery:
|
|
||||||
logger.info(
|
|
||||||
"Room uses legacy Celery processing",
|
|
||||||
room_id=config.room_id,
|
|
||||||
transcript_id=config.transcript_id,
|
|
||||||
)
|
)
|
||||||
|
if can_replay:
|
||||||
if use_hatchet:
|
await HatchetClientManager.replay_workflow(transcript.workflow_run_id)
|
||||||
# First check if we can replay (outside transaction since it's read-only)
|
logger.info(
|
||||||
transcript = await transcripts_controller.get_by_id(config.transcript_id)
|
"Replaying Hatchet workflow",
|
||||||
if transcript and transcript.workflow_run_id and not force:
|
workflow_id=transcript.workflow_run_id,
|
||||||
can_replay = await HatchetClientManager.can_replay(
|
|
||||||
transcript.workflow_run_id
|
|
||||||
)
|
)
|
||||||
if can_replay:
|
return None
|
||||||
await HatchetClientManager.replay_workflow(
|
else:
|
||||||
transcript.workflow_run_id
|
# Workflow can't replay (CANCELLED, COMPLETED, or 404 deleted)
|
||||||
)
|
# Log and proceed to start new workflow
|
||||||
logger.info(
|
|
||||||
"Replaying Hatchet workflow",
|
|
||||||
workflow_id=transcript.workflow_run_id,
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
else:
|
|
||||||
# Workflow can't replay (CANCELLED, COMPLETED, or 404 deleted)
|
|
||||||
# Log and proceed to start new workflow
|
|
||||||
try:
|
|
||||||
status = await HatchetClientManager.get_workflow_run_status(
|
|
||||||
transcript.workflow_run_id
|
|
||||||
)
|
|
||||||
logger.info(
|
|
||||||
"Old workflow not replayable, starting new",
|
|
||||||
old_workflow_id=transcript.workflow_run_id,
|
|
||||||
old_status=status.value,
|
|
||||||
)
|
|
||||||
except NotFoundException:
|
|
||||||
# Workflow deleted from Hatchet but ID still in DB
|
|
||||||
logger.info(
|
|
||||||
"Old workflow not found in Hatchet, starting new",
|
|
||||||
old_workflow_id=transcript.workflow_run_id,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Force: cancel old workflow if exists
|
|
||||||
if force and transcript and transcript.workflow_run_id:
|
|
||||||
try:
|
|
||||||
await HatchetClientManager.cancel_workflow(
|
|
||||||
transcript.workflow_run_id
|
|
||||||
)
|
|
||||||
logger.info(
|
|
||||||
"Cancelled old workflow (--force)",
|
|
||||||
workflow_id=transcript.workflow_run_id,
|
|
||||||
)
|
|
||||||
except NotFoundException:
|
|
||||||
logger.info(
|
|
||||||
"Old workflow already deleted (--force)",
|
|
||||||
workflow_id=transcript.workflow_run_id,
|
|
||||||
)
|
|
||||||
await transcripts_controller.update(
|
|
||||||
transcript, {"workflow_run_id": None}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Re-fetch and check for concurrent dispatch (optimistic approach).
|
|
||||||
# No database lock - worst case is duplicate dispatch, but Hatchet
|
|
||||||
# workflows are idempotent so this is acceptable.
|
|
||||||
transcript = await transcripts_controller.get_by_id(config.transcript_id)
|
|
||||||
if transcript and transcript.workflow_run_id:
|
|
||||||
# Another process started a workflow between validation and now
|
|
||||||
try:
|
try:
|
||||||
status = await HatchetClientManager.get_workflow_run_status(
|
status = await HatchetClientManager.get_workflow_run_status(
|
||||||
transcript.workflow_run_id
|
transcript.workflow_run_id
|
||||||
)
|
)
|
||||||
if status in (V1TaskStatus.RUNNING, V1TaskStatus.QUEUED):
|
logger.info(
|
||||||
logger.info(
|
"Old workflow not replayable, starting new",
|
||||||
"Concurrent workflow detected, skipping dispatch",
|
old_workflow_id=transcript.workflow_run_id,
|
||||||
workflow_id=transcript.workflow_run_id,
|
old_status=status.value,
|
||||||
)
|
)
|
||||||
return None
|
except NotFoundException:
|
||||||
except ApiException:
|
# Workflow deleted from Hatchet but ID still in DB
|
||||||
# Workflow might be gone (404) or API issue - proceed with new workflow
|
logger.info(
|
||||||
pass
|
"Old workflow not found in Hatchet, starting new",
|
||||||
|
old_workflow_id=transcript.workflow_run_id,
|
||||||
|
)
|
||||||
|
|
||||||
workflow_id = await HatchetClientManager.start_workflow(
|
# Force: cancel old workflow if exists
|
||||||
workflow_name="DiarizationPipeline",
|
if force and transcript and transcript.workflow_run_id:
|
||||||
input_data={
|
try:
|
||||||
"recording_id": config.recording_id,
|
await HatchetClientManager.cancel_workflow(transcript.workflow_run_id)
|
||||||
"tracks": [{"s3_key": k} for k in config.track_keys],
|
logger.info(
|
||||||
"bucket_name": config.bucket_name,
|
"Cancelled old workflow (--force)",
|
||||||
"transcript_id": config.transcript_id,
|
workflow_id=transcript.workflow_run_id,
|
||||||
"room_id": config.room_id,
|
)
|
||||||
},
|
except NotFoundException:
|
||||||
additional_metadata={
|
logger.info(
|
||||||
"transcript_id": config.transcript_id,
|
"Old workflow already deleted (--force)",
|
||||||
"recording_id": config.recording_id,
|
workflow_id=transcript.workflow_run_id,
|
||||||
"daily_recording_id": config.recording_id,
|
)
|
||||||
},
|
await transcripts_controller.update(transcript, {"workflow_run_id": None})
|
||||||
|
|
||||||
|
# Re-fetch and check for concurrent dispatch (optimistic approach).
|
||||||
|
# No database lock - worst case is duplicate dispatch, but Hatchet
|
||||||
|
# workflows are idempotent so this is acceptable.
|
||||||
|
transcript = await transcripts_controller.get_by_id(config.transcript_id)
|
||||||
|
if transcript and transcript.workflow_run_id:
|
||||||
|
# Another process started a workflow between validation and now
|
||||||
|
try:
|
||||||
|
status = await HatchetClientManager.get_workflow_run_status(
|
||||||
|
transcript.workflow_run_id
|
||||||
|
)
|
||||||
|
if status in (V1TaskStatus.RUNNING, V1TaskStatus.QUEUED):
|
||||||
|
logger.info(
|
||||||
|
"Concurrent workflow detected, skipping dispatch",
|
||||||
|
workflow_id=transcript.workflow_run_id,
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
except ApiException:
|
||||||
|
# Workflow might be gone (404) or API issue - proceed with new workflow
|
||||||
|
pass
|
||||||
|
|
||||||
|
workflow_id = await HatchetClientManager.start_workflow(
|
||||||
|
workflow_name="DiarizationPipeline",
|
||||||
|
input_data={
|
||||||
|
"recording_id": config.recording_id,
|
||||||
|
"tracks": [{"s3_key": k} for k in config.track_keys],
|
||||||
|
"bucket_name": config.bucket_name,
|
||||||
|
"transcript_id": config.transcript_id,
|
||||||
|
"room_id": config.room_id,
|
||||||
|
},
|
||||||
|
additional_metadata={
|
||||||
|
"transcript_id": config.transcript_id,
|
||||||
|
"recording_id": config.recording_id,
|
||||||
|
"daily_recording_id": config.recording_id,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
if transcript:
|
||||||
|
await transcripts_controller.update(
|
||||||
|
transcript, {"workflow_run_id": workflow_id}
|
||||||
)
|
)
|
||||||
|
|
||||||
if transcript:
|
logger.info("Hatchet workflow dispatched", workflow_id=workflow_id)
|
||||||
await transcripts_controller.update(
|
return None
|
||||||
transcript, {"workflow_run_id": workflow_id}
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info("Hatchet workflow dispatched", workflow_id=workflow_id)
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Celery pipeline (durable workflows disabled)
|
|
||||||
return task_pipeline_multitrack_process.delay(
|
|
||||||
transcript_id=config.transcript_id,
|
|
||||||
bucket_name=config.bucket_name,
|
|
||||||
track_keys=config.track_keys,
|
|
||||||
)
|
|
||||||
elif isinstance(config, FileProcessingConfig):
|
elif isinstance(config, FileProcessingConfig):
|
||||||
return task_pipeline_file_process.delay(transcript_id=config.transcript_id)
|
return task_pipeline_file_process.delay(transcript_id=config.transcript_id)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
from pydantic.types import PositiveInt
|
from pydantic.types import PositiveInt
|
||||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
from reflector.schemas.platform import WHEREBY_PLATFORM, Platform
|
from reflector.schemas.platform import DAILY_PLATFORM, Platform
|
||||||
from reflector.utils.string import NonEmptyString
|
from reflector.utils.string import NonEmptyString
|
||||||
|
|
||||||
|
|
||||||
@@ -155,7 +155,7 @@ class Settings(BaseSettings):
|
|||||||
None # Webhook UUID for this environment. Not used by production code
|
None # Webhook UUID for this environment. Not used by production code
|
||||||
)
|
)
|
||||||
# Platform Configuration
|
# Platform Configuration
|
||||||
DEFAULT_VIDEO_PLATFORM: Platform = WHEREBY_PLATFORM
|
DEFAULT_VIDEO_PLATFORM: Platform = DAILY_PLATFORM
|
||||||
|
|
||||||
# Zulip integration
|
# Zulip integration
|
||||||
ZULIP_REALM: str | None = None
|
ZULIP_REALM: str | None = None
|
||||||
|
|||||||
@@ -27,9 +27,6 @@ from reflector.db.transcripts import (
|
|||||||
from reflector.hatchet.client import HatchetClientManager
|
from reflector.hatchet.client import HatchetClientManager
|
||||||
from reflector.pipelines.main_file_pipeline import task_pipeline_file_process
|
from reflector.pipelines.main_file_pipeline import task_pipeline_file_process
|
||||||
from reflector.pipelines.main_live_pipeline import asynctask
|
from reflector.pipelines.main_live_pipeline import asynctask
|
||||||
from reflector.pipelines.main_multitrack_pipeline import (
|
|
||||||
task_pipeline_multitrack_process,
|
|
||||||
)
|
|
||||||
from reflector.pipelines.topic_processing import EmptyPipeline
|
from reflector.pipelines.topic_processing import EmptyPipeline
|
||||||
from reflector.processors import AudioFileWriterProcessor
|
from reflector.processors import AudioFileWriterProcessor
|
||||||
from reflector.processors.audio_waveform_processor import AudioWaveformProcessor
|
from reflector.processors.audio_waveform_processor import AudioWaveformProcessor
|
||||||
@@ -351,49 +348,29 @@ async def _process_multitrack_recording_inner(
|
|||||||
room_id=room.id,
|
room_id=room.id,
|
||||||
)
|
)
|
||||||
|
|
||||||
use_celery = room and room.use_celery
|
# Multitrack processing always uses Hatchet (no Celery fallback)
|
||||||
use_hatchet = not use_celery
|
workflow_id = await HatchetClientManager.start_workflow(
|
||||||
|
workflow_name="DiarizationPipeline",
|
||||||
if use_celery:
|
input_data={
|
||||||
logger.info(
|
"recording_id": recording_id,
|
||||||
"Room uses legacy Celery processing",
|
"tracks": [{"s3_key": k} for k in filter_cam_audio_tracks(track_keys)],
|
||||||
room_id=room.id,
|
"bucket_name": bucket_name,
|
||||||
transcript_id=transcript.id,
|
"transcript_id": transcript.id,
|
||||||
)
|
"room_id": room.id,
|
||||||
|
},
|
||||||
if use_hatchet:
|
additional_metadata={
|
||||||
workflow_id = await HatchetClientManager.start_workflow(
|
"transcript_id": transcript.id,
|
||||||
workflow_name="DiarizationPipeline",
|
"recording_id": recording_id,
|
||||||
input_data={
|
"daily_recording_id": recording_id,
|
||||||
"recording_id": recording_id,
|
},
|
||||||
"tracks": [{"s3_key": k} for k in filter_cam_audio_tracks(track_keys)],
|
|
||||||
"bucket_name": bucket_name,
|
|
||||||
"transcript_id": transcript.id,
|
|
||||||
"room_id": room.id,
|
|
||||||
},
|
|
||||||
additional_metadata={
|
|
||||||
"transcript_id": transcript.id,
|
|
||||||
"recording_id": recording_id,
|
|
||||||
"daily_recording_id": recording_id,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
logger.info(
|
|
||||||
"Started Hatchet workflow",
|
|
||||||
workflow_id=workflow_id,
|
|
||||||
transcript_id=transcript.id,
|
|
||||||
)
|
|
||||||
|
|
||||||
await transcripts_controller.update(
|
|
||||||
transcript, {"workflow_run_id": workflow_id}
|
|
||||||
)
|
|
||||||
return
|
|
||||||
|
|
||||||
# Celery pipeline (runs when durable workflows disabled)
|
|
||||||
task_pipeline_multitrack_process.delay(
|
|
||||||
transcript_id=transcript.id,
|
|
||||||
bucket_name=bucket_name,
|
|
||||||
track_keys=filter_cam_audio_tracks(track_keys),
|
|
||||||
)
|
)
|
||||||
|
logger.info(
|
||||||
|
"Started Hatchet workflow",
|
||||||
|
workflow_id=workflow_id,
|
||||||
|
transcript_id=transcript.id,
|
||||||
|
)
|
||||||
|
|
||||||
|
await transcripts_controller.update(transcript, {"workflow_run_id": workflow_id})
|
||||||
|
|
||||||
|
|
||||||
@shared_task
|
@shared_task
|
||||||
@@ -1072,66 +1049,43 @@ async def reprocess_failed_daily_recordings():
|
|||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
use_celery = room and room.use_celery
|
# Multitrack reprocessing always uses Hatchet (no Celery fallback)
|
||||||
use_hatchet = not use_celery
|
if not transcript:
|
||||||
|
logger.warning(
|
||||||
if use_hatchet:
|
"No transcript for Hatchet reprocessing, skipping",
|
||||||
if not transcript:
|
|
||||||
logger.warning(
|
|
||||||
"No transcript for Hatchet reprocessing, skipping",
|
|
||||||
recording_id=recording.id,
|
|
||||||
)
|
|
||||||
continue
|
|
||||||
|
|
||||||
workflow_id = await HatchetClientManager.start_workflow(
|
|
||||||
workflow_name="DiarizationPipeline",
|
|
||||||
input_data={
|
|
||||||
"recording_id": recording.id,
|
|
||||||
"tracks": [
|
|
||||||
{"s3_key": k}
|
|
||||||
for k in filter_cam_audio_tracks(recording.track_keys)
|
|
||||||
],
|
|
||||||
"bucket_name": bucket_name,
|
|
||||||
"transcript_id": transcript.id,
|
|
||||||
"room_id": room.id if room else None,
|
|
||||||
},
|
|
||||||
additional_metadata={
|
|
||||||
"transcript_id": transcript.id,
|
|
||||||
"recording_id": recording.id,
|
|
||||||
"reprocess": True,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
await transcripts_controller.update(
|
|
||||||
transcript, {"workflow_run_id": workflow_id}
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
"Queued Daily recording for Hatchet reprocessing",
|
|
||||||
recording_id=recording.id,
|
recording_id=recording.id,
|
||||||
workflow_id=workflow_id,
|
|
||||||
room_name=meeting.room_name,
|
|
||||||
track_count=len(recording.track_keys),
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.info(
|
|
||||||
"Queueing Daily recording for Celery reprocessing",
|
|
||||||
recording_id=recording.id,
|
|
||||||
room_name=meeting.room_name,
|
|
||||||
track_count=len(recording.track_keys),
|
|
||||||
transcript_status=transcript.status if transcript else None,
|
|
||||||
)
|
)
|
||||||
|
continue
|
||||||
|
|
||||||
# For reprocessing, pass actual recording time (though it's ignored - see _process_multitrack_recording_inner)
|
workflow_id = await HatchetClientManager.start_workflow(
|
||||||
# Reprocessing uses recording.meeting_id directly instead of time-based matching
|
workflow_name="DiarizationPipeline",
|
||||||
recording_start_ts = int(recording.recorded_at.timestamp())
|
input_data={
|
||||||
|
"recording_id": recording.id,
|
||||||
|
"tracks": [
|
||||||
|
{"s3_key": k}
|
||||||
|
for k in filter_cam_audio_tracks(recording.track_keys)
|
||||||
|
],
|
||||||
|
"bucket_name": bucket_name,
|
||||||
|
"transcript_id": transcript.id,
|
||||||
|
"room_id": room.id if room else None,
|
||||||
|
},
|
||||||
|
additional_metadata={
|
||||||
|
"transcript_id": transcript.id,
|
||||||
|
"recording_id": recording.id,
|
||||||
|
"reprocess": True,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
await transcripts_controller.update(
|
||||||
|
transcript, {"workflow_run_id": workflow_id}
|
||||||
|
)
|
||||||
|
|
||||||
process_multitrack_recording.delay(
|
logger.info(
|
||||||
bucket_name=bucket_name,
|
"Queued Daily recording for Hatchet reprocessing",
|
||||||
daily_room_name=meeting.room_name,
|
recording_id=recording.id,
|
||||||
recording_id=recording.id,
|
workflow_id=workflow_id,
|
||||||
track_keys=recording.track_keys,
|
room_name=meeting.room_name,
|
||||||
recording_start_ts=recording_start_ts,
|
track_count=len(recording.track_keys),
|
||||||
)
|
)
|
||||||
|
|
||||||
reprocessed_count += 1
|
reprocessed_count += 1
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ broadcast messages to all connected websockets.
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import threading
|
|
||||||
|
|
||||||
import redis.asyncio as redis
|
import redis.asyncio as redis
|
||||||
from fastapi import WebSocket
|
from fastapi import WebSocket
|
||||||
@@ -98,6 +97,7 @@ class WebsocketManager:
|
|||||||
|
|
||||||
async def _pubsub_data_reader(self, pubsub_subscriber):
|
async def _pubsub_data_reader(self, pubsub_subscriber):
|
||||||
while True:
|
while True:
|
||||||
|
# timeout=1.0 prevents tight CPU loop when no messages available
|
||||||
message = await pubsub_subscriber.get_message(
|
message = await pubsub_subscriber.get_message(
|
||||||
ignore_subscribe_messages=True
|
ignore_subscribe_messages=True
|
||||||
)
|
)
|
||||||
@@ -109,29 +109,38 @@ class WebsocketManager:
|
|||||||
await socket.send_json(data)
|
await socket.send_json(data)
|
||||||
|
|
||||||
|
|
||||||
|
# Process-global singleton to ensure only one WebsocketManager instance exists.
|
||||||
|
# Multiple instances would cause resource leaks and CPU issues.
|
||||||
|
_ws_manager: WebsocketManager | None = None
|
||||||
|
|
||||||
|
|
||||||
def get_ws_manager() -> WebsocketManager:
|
def get_ws_manager() -> WebsocketManager:
|
||||||
"""
|
"""
|
||||||
Returns the WebsocketManager instance for managing websockets.
|
Returns the global WebsocketManager singleton.
|
||||||
|
|
||||||
This function initializes and returns the WebsocketManager instance,
|
Creates instance on first call, subsequent calls return cached instance.
|
||||||
which is responsible for managing websockets and handling websocket
|
Thread-safe via GIL. Concurrent initialization may create duplicate
|
||||||
connections.
|
instances but last write wins (acceptable for this use case).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
WebsocketManager: The initialized WebsocketManager instance.
|
WebsocketManager: The global WebsocketManager instance.
|
||||||
|
|
||||||
Raises:
|
|
||||||
ImportError: If the 'reflector.settings' module cannot be imported.
|
|
||||||
RedisConnectionError: If there is an error connecting to the Redis server.
|
|
||||||
"""
|
"""
|
||||||
local = threading.local()
|
global _ws_manager
|
||||||
if hasattr(local, "ws_manager"):
|
|
||||||
return local.ws_manager
|
|
||||||
|
|
||||||
|
if _ws_manager is not None:
|
||||||
|
return _ws_manager
|
||||||
|
|
||||||
|
# No lock needed - GIL makes this safe enough
|
||||||
|
# Worst case: race creates two instances, last assignment wins
|
||||||
pubsub_client = RedisPubSubManager(
|
pubsub_client = RedisPubSubManager(
|
||||||
host=settings.REDIS_HOST,
|
host=settings.REDIS_HOST,
|
||||||
port=settings.REDIS_PORT,
|
port=settings.REDIS_PORT,
|
||||||
)
|
)
|
||||||
ws_manager = WebsocketManager(pubsub_client=pubsub_client)
|
_ws_manager = WebsocketManager(pubsub_client=pubsub_client)
|
||||||
local.ws_manager = ws_manager
|
return _ws_manager
|
||||||
return ws_manager
|
|
||||||
|
|
||||||
|
def reset_ws_manager() -> None:
|
||||||
|
"""Reset singleton for testing. DO NOT use in production."""
|
||||||
|
global _ws_manager
|
||||||
|
_ws_manager = None
|
||||||
|
|||||||
@@ -1,11 +1,10 @@
|
|||||||
import os
|
import os
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
from tempfile import NamedTemporaryFile
|
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from reflector.schemas.platform import WHEREBY_PLATFORM
|
from reflector.schemas.platform import DAILY_PLATFORM, WHEREBY_PLATFORM
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session", autouse=True)
|
@pytest.fixture(scope="session", autouse=True)
|
||||||
@@ -15,6 +14,7 @@ def register_mock_platform():
|
|||||||
from reflector.video_platforms.registry import register_platform
|
from reflector.video_platforms.registry import register_platform
|
||||||
|
|
||||||
register_platform(WHEREBY_PLATFORM, MockPlatformClient)
|
register_platform(WHEREBY_PLATFORM, MockPlatformClient)
|
||||||
|
register_platform(DAILY_PLATFORM, MockPlatformClient)
|
||||||
yield
|
yield
|
||||||
|
|
||||||
|
|
||||||
@@ -333,11 +333,14 @@ def celery_enable_logging():
|
|||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
def celery_config():
|
def celery_config():
|
||||||
with NamedTemporaryFile() as f:
|
redis_host = os.environ.get("REDIS_HOST", "localhost")
|
||||||
yield {
|
redis_port = os.environ.get("REDIS_PORT", "6379")
|
||||||
"broker_url": "memory://",
|
# Use db 2 to avoid conflicts with main app
|
||||||
"result_backend": f"db+sqlite:///{f.name}",
|
redis_url = f"redis://{redis_host}:{redis_port}/2"
|
||||||
}
|
yield {
|
||||||
|
"broker_url": redis_url,
|
||||||
|
"result_backend": redis_url,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
@pytest.fixture(scope="session")
|
||||||
@@ -370,9 +373,12 @@ async def ws_manager_in_memory(monkeypatch):
|
|||||||
def __init__(self, queue: asyncio.Queue):
|
def __init__(self, queue: asyncio.Queue):
|
||||||
self.queue = queue
|
self.queue = queue
|
||||||
|
|
||||||
async def get_message(self, ignore_subscribe_messages: bool = True):
|
async def get_message(
|
||||||
|
self, ignore_subscribe_messages: bool = True, timeout: float | None = None
|
||||||
|
):
|
||||||
|
wait_timeout = timeout if timeout is not None else 0.05
|
||||||
try:
|
try:
|
||||||
return await asyncio.wait_for(self.queue.get(), timeout=0.05)
|
return await asyncio.wait_for(self.queue.get(), timeout=wait_timeout)
|
||||||
except Exception:
|
except Exception:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import time
|
import time
|
||||||
from unittest.mock import patch
|
from unittest.mock import AsyncMock, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from httpx import ASGITransport, AsyncClient
|
from httpx import ASGITransport, AsyncClient
|
||||||
@@ -142,17 +142,17 @@ async def test_whereby_recording_uses_file_pipeline(client):
|
|||||||
"reflector.services.transcript_process.task_pipeline_file_process"
|
"reflector.services.transcript_process.task_pipeline_file_process"
|
||||||
) as mock_file_pipeline,
|
) as mock_file_pipeline,
|
||||||
patch(
|
patch(
|
||||||
"reflector.services.transcript_process.task_pipeline_multitrack_process"
|
"reflector.services.transcript_process.HatchetClientManager"
|
||||||
) as mock_multitrack_pipeline,
|
) as mock_hatchet,
|
||||||
):
|
):
|
||||||
response = await client.post(f"/transcripts/{transcript.id}/process")
|
response = await client.post(f"/transcripts/{transcript.id}/process")
|
||||||
|
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
assert response.json()["status"] == "ok"
|
assert response.json()["status"] == "ok"
|
||||||
|
|
||||||
# Whereby recordings should use file pipeline
|
# Whereby recordings should use file pipeline, not Hatchet
|
||||||
mock_file_pipeline.delay.assert_called_once_with(transcript_id=transcript.id)
|
mock_file_pipeline.delay.assert_called_once_with(transcript_id=transcript.id)
|
||||||
mock_multitrack_pipeline.delay.assert_not_called()
|
mock_hatchet.start_workflow.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("setup_database")
|
@pytest.mark.usefixtures("setup_database")
|
||||||
@@ -177,8 +177,6 @@ async def test_dailyco_recording_uses_multitrack_pipeline(client):
|
|||||||
recording_trigger="automatic-2nd-participant",
|
recording_trigger="automatic-2nd-participant",
|
||||||
is_shared=False,
|
is_shared=False,
|
||||||
)
|
)
|
||||||
# Force Celery backend for test
|
|
||||||
await rooms_controller.update(room, {"use_celery": True})
|
|
||||||
|
|
||||||
transcript = await transcripts_controller.add(
|
transcript = await transcripts_controller.add(
|
||||||
"",
|
"",
|
||||||
@@ -213,18 +211,23 @@ async def test_dailyco_recording_uses_multitrack_pipeline(client):
|
|||||||
"reflector.services.transcript_process.task_pipeline_file_process"
|
"reflector.services.transcript_process.task_pipeline_file_process"
|
||||||
) as mock_file_pipeline,
|
) as mock_file_pipeline,
|
||||||
patch(
|
patch(
|
||||||
"reflector.services.transcript_process.task_pipeline_multitrack_process"
|
"reflector.services.transcript_process.HatchetClientManager"
|
||||||
) as mock_multitrack_pipeline,
|
) as mock_hatchet,
|
||||||
):
|
):
|
||||||
|
mock_hatchet.start_workflow = AsyncMock(return_value="test-workflow-id")
|
||||||
|
|
||||||
response = await client.post(f"/transcripts/{transcript.id}/process")
|
response = await client.post(f"/transcripts/{transcript.id}/process")
|
||||||
|
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
assert response.json()["status"] == "ok"
|
assert response.json()["status"] == "ok"
|
||||||
|
|
||||||
# Daily.co multitrack recordings should use multitrack pipeline
|
# Daily.co multitrack recordings should use Hatchet workflow
|
||||||
mock_multitrack_pipeline.delay.assert_called_once_with(
|
mock_hatchet.start_workflow.assert_called_once()
|
||||||
transcript_id=transcript.id,
|
call_kwargs = mock_hatchet.start_workflow.call_args.kwargs
|
||||||
bucket_name="daily-bucket",
|
assert call_kwargs["workflow_name"] == "DiarizationPipeline"
|
||||||
track_keys=track_keys,
|
assert call_kwargs["input_data"]["transcript_id"] == transcript.id
|
||||||
)
|
assert call_kwargs["input_data"]["bucket_name"] == "daily-bucket"
|
||||||
|
assert call_kwargs["input_data"]["tracks"] == [
|
||||||
|
{"s3_key": k} for k in track_keys
|
||||||
|
]
|
||||||
mock_file_pipeline.delay.assert_not_called()
|
mock_file_pipeline.delay.assert_not_called()
|
||||||
|
|||||||
@@ -115,9 +115,7 @@ def appserver(tmpdir, setup_database, celery_session_app, celery_session_worker)
|
|||||||
settings.DATA_DIR = DATA_DIR
|
settings.DATA_DIR = DATA_DIR
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(scope="session")
|
# Using celery_includes from conftest.py which includes both pipelines
|
||||||
def celery_includes():
|
|
||||||
return ["reflector.pipelines.main_live_pipeline"]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("setup_database")
|
@pytest.mark.usefixtures("setup_database")
|
||||||
|
|||||||
@@ -56,7 +56,12 @@ def appserver_ws_user(setup_database):
|
|||||||
|
|
||||||
if server_instance:
|
if server_instance:
|
||||||
server_instance.should_exit = True
|
server_instance.should_exit = True
|
||||||
server_thread.join(timeout=30)
|
server_thread.join(timeout=2.0)
|
||||||
|
|
||||||
|
# Reset global singleton for test isolation
|
||||||
|
from reflector.ws_manager import reset_ws_manager
|
||||||
|
|
||||||
|
reset_ws_manager()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
@@ -133,6 +138,8 @@ async def test_user_ws_accepts_valid_token_and_receives_events(appserver_ws_user
|
|||||||
|
|
||||||
# Connect and then trigger an event via HTTP create
|
# Connect and then trigger an event via HTTP create
|
||||||
async with aconnect_ws(base_ws, subprotocols=subprotocols) as ws:
|
async with aconnect_ws(base_ws, subprotocols=subprotocols) as ws:
|
||||||
|
await asyncio.sleep(0.2)
|
||||||
|
|
||||||
# Emit an event to the user's room via a standard HTTP action
|
# Emit an event to the user's room via a standard HTTP action
|
||||||
from httpx import AsyncClient
|
from httpx import AsyncClient
|
||||||
|
|
||||||
@@ -150,6 +157,7 @@ async def test_user_ws_accepts_valid_token_and_receives_events(appserver_ws_user
|
|||||||
"email": "user-abc@example.com",
|
"email": "user-abc@example.com",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Use in-memory client (global singleton makes it share ws_manager)
|
||||||
async with AsyncClient(app=app, base_url=f"http://{host}:{port}/v1") as ac:
|
async with AsyncClient(app=app, base_url=f"http://{host}:{port}/v1") as ac:
|
||||||
# Create a transcript as this user so that the server publishes TRANSCRIPT_CREATED to user room
|
# Create a transcript as this user so that the server publishes TRANSCRIPT_CREATED to user room
|
||||||
resp = await ac.post("/transcripts", json={"name": "WS Test"})
|
resp = await ac.post("/transcripts", json={"name": "WS Test"})
|
||||||
|
|||||||
Reference in New Issue
Block a user