Mirror of https://github.com/Monadical-SAS/reflector.git
hatchet: restore zulip report
@@ -30,9 +30,13 @@ def main() -> None:
         debug=settings.HATCHET_DEBUG,
     )
 
-    # Import workflows to register them
-    from reflector.hatchet.client import HatchetClientManager
-    from reflector.hatchet.workflows import diarization_pipeline, track_workflow
+    # Import here (not top-level) - workflow imports trigger HatchetClientManager.get_client()
+    # which requires HATCHET_CLIENT_TOKEN; must validate settings first
+    from reflector.hatchet.client import HatchetClientManager  # noqa: PLC0415
+    from reflector.hatchet.workflows import (  # noqa: PLC0415
+        diarization_pipeline,
+        track_workflow,
+    )
 
     hatchet = HatchetClientManager.get_client()
 
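The rewritten comment carries the reasoning behind this hunk: the workflow modules call HatchetClientManager.get_client() as an import side effect, and that call needs HATCHET_CLIENT_TOKEN, so the imports must stay inside main() until settings are validated. A self-contained sketch of that ordering concern; validate_settings and the client stub are illustrative stand-ins, not the real reflector code:

import os

def get_client() -> str:
    # Stand-in for HatchetClientManager.get_client(); in the real code this
    # runs as a side effect of importing the workflow modules.
    token = os.environ.get("HATCHET_CLIENT_TOKEN")
    if token is None:
        raise RuntimeError("HATCHET_CLIENT_TOKEN is required")
    return f"client:{token[:4]}..."

def validate_settings() -> None:
    # Hypothetical fail-fast check that runs before anything can build
    # the client, turning a crash into a readable configuration error.
    if "HATCHET_CLIENT_TOKEN" not in os.environ:
        raise SystemExit("HATCHET_CLIENT_TOKEN is not set")

def main() -> None:
    validate_settings()    # clear error first...
    client = get_client()  # ...then the side-effectful client creation
    print("worker ready:", client)

if __name__ == "__main__":
    main()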
@@ -10,13 +10,17 @@ import functools
 import tempfile
 from contextlib import asynccontextmanager
 from datetime import timedelta
+from fractions import Fraction
 from pathlib import Path
 from typing import Callable
 
 import av
+import httpx
+from av.audio.resampler import AudioResampler
 from hatchet_sdk import Context
 from pydantic import BaseModel
 
+from reflector.dailyco_api.client import DailyApiClient
 from reflector.hatchet.client import HatchetClientManager
 from reflector.hatchet.progress import emit_progress_async
 from reflector.hatchet.workflows.models import (
@@ -36,6 +40,23 @@ from reflector.hatchet.workflows.models import (
 )
 from reflector.hatchet.workflows.track_processing import TrackInput, track_workflow
 from reflector.logger import logger
+from reflector.pipelines import topic_processing
+from reflector.processors import AudioFileWriterProcessor
+from reflector.processors.types import (
+    TitleSummary,
+    Word,
+)
+from reflector.processors.types import (
+    Transcript as TranscriptType,
+)
+from reflector.settings import settings
+from reflector.storage.storage_aws import AwsStorage
+from reflector.utils.audio_waveform import get_audio_waveform
+from reflector.utils.daily import (
+    filter_cam_audio_tracks,
+    parse_daily_recording_filename,
+)
+from reflector.zulip import post_transcript_notification
 
 # Audio constants
 OPUS_STANDARD_SAMPLE_RATE = 48000
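This hunk and the ones that follow in this file are two halves of one refactor: the block above hoists every name to module scope, and each later hunk deletes the now-redundant function-local import (the # noqa: PLC0415 markers elsewhere reference the Ruff rule that flags imports outside the top level). A minimal self-contained sketch of the before/after shape, using a stdlib module rather than the real reflector names:

# Before: each task body re-imports what it needs.
def task_before() -> float:
    from math import tau  # function-local import; Ruff flags this as PLC0415
    return tau

# After: imported once at module scope and shared by every task.
from math import tau

def task_after() -> float:
    return tau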
@@ -74,7 +95,6 @@ async def fresh_db_connection():
     import databases
 
     from reflector.db import _database_context
-    from reflector.settings import settings
 
     _database_context.set(None)
     db = databases.Database(settings.DATABASE_URL)
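The hunk above only drops fresh_db_connection's local settings import, but the visible lines hint at its job: clear the context-local database handle and open a dedicated connection per workflow step. A sketch of what such a context manager plausibly looks like; the ContextVar shape and the disconnect handling are assumptions, not shown in the diff:

import contextvars
from contextlib import asynccontextmanager

import databases

# Assumed shape of reflector.db._database_context.
_database_context: contextvars.ContextVar = contextvars.ContextVar(
    "database", default=None
)

@asynccontextmanager
async def fresh_db_connection(database_url: str):
    # Clear any connection inherited from another task/event loop,
    # then open a dedicated one for this workflow step.
    _database_context.set(None)
    db = databases.Database(database_url)
    await db.connect()
    _database_context.set(db)
    try:
        yield db
    finally:
        await db.disconnect()
        _database_context.set(None)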
@@ -116,9 +136,6 @@ async def set_workflow_error_status(transcript_id: str) -> bool:
 
 def _get_storage():
     """Create fresh storage instance."""
-    from reflector.settings import settings
-    from reflector.storage.storage_aws import AwsStorage
-
     return AwsStorage(
         aws_bucket_name=settings.TRANSCRIPT_STORAGE_AWS_BUCKET_NAME,
         aws_region=settings.TRANSCRIPT_STORAGE_AWS_REGION,
@@ -198,9 +215,6 @@ async def get_recording(input: PipelineInput, ctx: Context) -> RecordingResult:
         transcript_id=input.transcript_id,
     )
 
-    from reflector.dailyco_api.client import DailyApiClient
-    from reflector.settings import settings
-
     if not input.recording_id:
         # No recording_id in reprocess path - return minimal data
         await emit_progress_async(
@@ -257,13 +271,6 @@ async def get_participants(input: PipelineInput, ctx: Context) -> ParticipantsRe
     recording_data = _to_dict(ctx.task_output(get_recording))
     mtg_session_id = recording_data.get("mtg_session_id")
 
-    from reflector.dailyco_api.client import DailyApiClient
-    from reflector.settings import settings
-    from reflector.utils.daily import (
-        filter_cam_audio_tracks,
-        parse_daily_recording_filename,
-    )
-
     # Get transcript and reset events/topics/participants
     async with fresh_db_connection():
         from reflector.db.transcripts import (
@@ -488,12 +495,6 @@ async def mixdown_tracks(input: PipelineInput, ctx: Context) -> MixdownResult:
             padded_urls.append(url)
 
     # Use PipelineMainMultitrack.mixdown_tracks which uses PyAV filter graph
-    from fractions import Fraction
-
-    from av.audio.resampler import AudioResampler
-
-    from reflector.processors import AudioFileWriterProcessor
-
     valid_urls = [url for url in padded_urls if url]
     if not valid_urls:
         raise ValueError("No valid padded tracks to mixdown")
@@ -688,10 +689,7 @@ async def generate_waveform(input: PipelineInput, ctx: Context) -> WaveformResul
         input.transcript_id, "generate_waveform", "in_progress", ctx.workflow_run_id
     )
 
-    import httpx
-
     from reflector.db.transcripts import TranscriptWaveform, transcripts_controller
-    from reflector.utils.audio_waveform import get_audio_waveform
 
     # Cleanup temporary padded S3 files (deferred until after mixdown)
     track_data = _to_dict(ctx.task_output(process_tracks))
@@ -779,12 +777,9 @@ async def detect_topics(input: PipelineInput, ctx: Context) -> TopicsResult:
     target_language = track_data.get("target_language", "en")
 
     from reflector.db.transcripts import TranscriptTopic, transcripts_controller
-    from reflector.pipelines import topic_processing
     from reflector.processors.types import (
         TitleSummaryWithId as TitleSummaryWithIdProcessorType,
     )
-    from reflector.processors.types import Transcript as TranscriptType
-    from reflector.processors.types import Word
 
     # Convert word dicts to Word objects
     word_objects = [Word(**w) for w in words]
@@ -850,8 +845,6 @@ async def generate_title(input: PipelineInput, ctx: Context) -> TitleResult:
         TranscriptFinalTitle,
         transcripts_controller,
     )
-    from reflector.pipelines import topic_processing
-    from reflector.processors.types import TitleSummary
 
     topic_objects = [TitleSummary(**t) for t in topics]
 
@@ -913,8 +906,6 @@ async def generate_summary(input: PipelineInput, ctx: Context) -> SummaryResult:
         TranscriptFinalShortSummary,
         transcripts_controller,
     )
-    from reflector.pipelines import topic_processing
-    from reflector.processors.types import TitleSummary
 
     topic_objects = [TitleSummary(**t) for t in topics]
 
@@ -1100,8 +1091,6 @@ async def post_zulip(input: PipelineInput, ctx: Context) -> ZulipResult:
         input.transcript_id, "post_zulip", "in_progress", ctx.workflow_run_id
     )
 
-    from reflector.settings import settings
-
     if not settings.ZULIP_REALM:
         logger.info("[Hatchet] post_zulip skipped (Zulip not configured)")
         await emit_progress_async(
@@ -1109,8 +1098,6 @@ async def post_zulip(input: PipelineInput, ctx: Context) -> ZulipResult:
         )
         return ZulipResult(zulip_message_id=None, skipped=True)
 
-    from reflector.zulip import post_transcript_notification
-
     async with fresh_db_connection():
         from reflector.db.transcripts import transcripts_controller
 
@@ -1155,8 +1142,6 @@ async def send_webhook(input: PipelineInput, ctx: Context) -> WebhookResult:
         transcript = await transcripts_controller.get_by_id(input.transcript_id)
 
     if room and room.webhook_url and transcript:
-        import httpx
-
         webhook_payload = {
             "event": "transcript.completed",
             "transcript_id": input.transcript_id,
@@ -15,7 +15,8 @@ from hatchet_sdk.clients.rest.exceptions import ApiException
 from hatchet_sdk.clients.rest.models import V1TaskStatus
 
 from reflector.db.recordings import recordings_controller
-from reflector.db.transcripts import Transcript
+from reflector.db.rooms import rooms_controller
+from reflector.db.transcripts import Transcript, transcripts_controller
 from reflector.hatchet.client import HatchetClientManager
 from reflector.logger import logger
 from reflector.pipelines.main_file_pipeline import task_pipeline_file_process
@@ -180,9 +181,6 @@ async def dispatch_transcript_processing(
 
     Returns AsyncResult for Celery tasks, None for Hatchet workflows.
     """
-    from reflector.db.rooms import rooms_controller
-    from reflector.db.transcripts import transcripts_controller
-
     if isinstance(config, MultitrackProcessingConfig):
         # Check if room has use_hatchet=True (overrides env vars)
         room_forces_hatchet = False
@@ -17,7 +17,9 @@ from typing import Callable
 from celery.result import AsyncResult
 from hatchet_sdk.clients.rest.models import V1TaskStatus
 
+from reflector.db import get_database
 from reflector.db.transcripts import Transcript, transcripts_controller
+from reflector.hatchet.client import HatchetClientManager
 from reflector.services.transcript_process import (
     FileProcessingConfig,
     MultitrackProcessingConfig,
@@ -55,8 +57,6 @@ async def process_transcript(
         sync: If True, wait for task completion. If False, dispatch and exit.
         force: If True, cancel old workflow and start new (latest code). If False, replay failed workflow.
     """
-    from reflector.db import get_database
-
     database = get_database()
     await database.connect()
 
@@ -96,8 +96,6 @@ async def process_transcript(
     if result is None:
         # Hatchet workflow dispatched
         if sync:
-            from reflector.hatchet.client import HatchetClientManager
-
             # Re-fetch transcript to get workflow_run_id
             transcript = await transcripts_controller.get_by_id(transcript_id)
             if not transcript or not transcript.workflow_run_id:
@@ -24,6 +24,7 @@ from reflector.db.transcripts import (
     SourceKind,
     transcripts_controller,
 )
+from reflector.hatchet.client import HatchetClientManager
 from reflector.pipelines.main_file_pipeline import task_pipeline_file_process
 from reflector.pipelines.main_live_pipeline import asynctask
 from reflector.pipelines.main_multitrack_pipeline import (
@@ -298,8 +299,6 @@ async def _process_multitrack_recording_inner(
     )
 
     if use_hatchet:
-        from reflector.hatchet.client import HatchetClientManager  # noqa: PLC0415
-
         workflow_id = await HatchetClientManager.start_workflow(
             workflow_name="DiarizationPipeline",
             input_data={
@@ -3,7 +3,8 @@ from urllib.parse import urlparse
 
 import httpx
 
-from reflector.db.transcripts import Transcript
+from reflector.db.rooms import rooms_controller
+from reflector.db.transcripts import Transcript, transcripts_controller
 from reflector.settings import settings
 
 
@@ -113,6 +114,48 @@ def get_zulip_message(transcript: Transcript, include_topics: bool):
     return message
 
 
+async def post_transcript_notification(transcript: Transcript) -> int | None:
+    """Post or update transcript notification in Zulip.
+
+    Uses transcript.room_id directly (Hatchet flow).
+    Celery's pipeline_post_to_zulip uses recording→meeting→room path instead.
+    """
+    if not transcript.room_id:
+        return None
+
+    room = await rooms_controller.get_by_id(transcript.room_id)
+    if not room or not room.zulip_stream or not room.zulip_auto_post:
+        return None
+
+    message = get_zulip_message(transcript=transcript, include_topics=True)
+    message_updated = False
+
+    if transcript.zulip_message_id:
+        try:
+            await update_zulip_message(
+                transcript.zulip_message_id,
+                room.zulip_stream,
+                room.zulip_topic,
+                message,
+            )
+            message_updated = True
+        except Exception:
+            pass
+
+    if not message_updated:
+        response = await send_message_to_zulip(
+            room.zulip_stream, room.zulip_topic, message
+        )
+        message_id = response.get("id")
+        if message_id:
+            await transcripts_controller.update(
+                transcript, {"zulip_message_id": message_id}
+            )
+            return message_id
+
+    return transcript.zulip_message_id
+
+
 def extract_domain(url: str) -> str:
     return urlparse(url).netloc
 
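With post_transcript_notification restored here, the Hatchet post_zulip task imports it at module scope (see the import hunks above) instead of re-importing it inside the task body. A minimal usage sketch; the notify wrapper below is an assumption for illustration, only post_transcript_notification and transcripts_controller.get_by_id appear in the diff:

from reflector.db.transcripts import transcripts_controller
from reflector.zulip import post_transcript_notification

async def notify(transcript_id: str) -> int | None:
    transcript = await transcripts_controller.get_by_id(transcript_id)
    if transcript is None:
        return None
    # Returns the Zulip message id; None when transcript.room_id is unset
    # or the room has no zulip_stream / zulip_auto_post.
    return await post_transcript_notification(transcript)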