Files
reflector/server/tests/test_failed_runs_monitor.py
Juan Diego García 1f98790e7b feat: zulip dag monitor for failed runs (#928)
* feat: zulip dag monitor for failed runs

* fix: add collapsible tags to big information
2026-03-25 17:26:41 -05:00

291 lines
10 KiB
Python

"""
Tests for FailedRunsMonitor Hatchet cron workflow.
Tests cover:
- No Zulip message sent when no failures found
- Messages sent for failed main pipeline runs
- Child workflow failures filtered out
- Errors in the monitor itself are caught and logged
"""
import contextlib
from datetime import timezone
from unittest.mock import AsyncMock, MagicMock, patch

import pytest
from hatchet_sdk.clients.rest.models import V1TaskStatus
def _make_task_summary(
    workflow_name: str,
    workflow_run_external_id: str = "run-123",
    status: V1TaskStatus = V1TaskStatus.FAILED,
):
    """Build a MagicMock standing in for a Hatchet ``V1TaskSummary`` row."""
    # MagicMock assigns keyword arguments as plain attributes, so one call
    # replaces the attribute-by-attribute setup.
    return MagicMock(
        workflow_name=workflow_name,
        workflow_run_external_id=workflow_run_external_id,
        status=status,
    )
@pytest.mark.asyncio
class TestCheckFailedRuns:
    """Behavioral tests for the ``_check_failed_runs`` coroutine.

    The mock-client construction and the four-way patch stack were
    previously copy-pasted across five tests; they are factored into
    ``_make_client`` and ``_patched_monitor`` below. The two error/window
    tests keep their original narrow patch footprint on purpose.
    """

    # Dotted path of the module under test; every patch target hangs off it.
    MONITOR = "reflector.hatchet.workflows.failed_runs_monitor"

    @staticmethod
    def _make_client(rows):
        """Return a mock Hatchet client whose ``runs.aio_list`` yields *rows*.

        ``runs.aio_get`` is pre-wired to return a fresh MagicMock so tests
        that fetch run details need no extra setup; individual tests may
        override it (e.g. with a ``side_effect``).
        """
        list_result = MagicMock()
        list_result.rows = rows
        client = MagicMock()
        client.runs.aio_list = AsyncMock(return_value=list_result)
        client.runs.aio_get = AsyncMock(return_value=MagicMock())
        return client

    @classmethod
    @contextlib.contextmanager
    def _patched_monitor(cls, mock_client, render="**rendered**"):
        """Patch the monitor's collaborators for the duration of a test.

        Replaces the Hatchet client factory, the DAG renderer, the Zulip
        sender, and the settings object, then yields the Zulip-send mock
        so the caller can assert on outgoing messages.
        """
        with (
            patch(
                f"{cls.MONITOR}.HatchetClientManager.get_client",
                return_value=mock_client,
            ),
            patch(f"{cls.MONITOR}.render_run_detail", return_value=render),
            patch(
                f"{cls.MONITOR}.send_message_to_zulip",
                new_callable=AsyncMock,
                return_value={"id": 1},
            ) as mock_send,
            patch(f"{cls.MONITOR}.settings") as mock_settings,
        ):
            mock_settings.ZULIP_DAG_STREAM = "dag-stream"
            mock_settings.ZULIP_DAG_TOPIC = "dag-topic"
            yield mock_send

    async def test_no_failures_sends_no_message(self):
        """No failed runs -> nothing is sent to Zulip."""
        client = self._make_client([])
        with self._patched_monitor(client) as mock_send:
            # Imported lazily so patching is active before the module's
            # attributes are looked up at call time.
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()
            assert result["checked"] == 0
            assert result["reported"] == 0
            mock_send.assert_not_called()

    async def test_reports_failed_main_pipeline_runs(self):
        """Each failed main-pipeline run produces one Zulip message."""
        client = self._make_client(
            [
                _make_task_summary("DiarizationPipeline", "run-1"),
                _make_task_summary("FilePipeline", "run-2"),
            ]
        )
        with self._patched_monitor(client, render="**rendered DAG**") as mock_send:
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()
            assert result["checked"] == 2
            assert result["reported"] == 2
            assert mock_send.call_count == 2
            mock_send.assert_any_call("dag-stream", "dag-topic", "**rendered DAG**")

    async def test_filters_out_child_workflows(self):
        """Child-workflow failures are counted but never reported."""
        client = self._make_client(
            [
                _make_task_summary("DiarizationPipeline", "run-1"),
                _make_task_summary("TrackProcessing", "run-2"),
                _make_task_summary("TopicChunkProcessing", "run-3"),
                _make_task_summary("SubjectProcessing", "run-4"),
            ]
        )
        with self._patched_monitor(client) as mock_send:
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()
            # Only DiarizationPipeline should be reported
            assert result["checked"] == 4
            assert result["reported"] == 1
            assert mock_send.call_count == 1

    async def test_all_three_pipelines_reported(self):
        """All three main pipelines are recognized as reportable."""
        client = self._make_client(
            [
                _make_task_summary("DiarizationPipeline", "run-1"),
                _make_task_summary("FilePipeline", "run-2"),
                _make_task_summary("LivePostProcessingPipeline", "run-3"),
            ]
        )
        with self._patched_monitor(client) as mock_send:
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()
            assert result["reported"] == 3
            assert mock_send.call_count == 3

    async def test_continues_on_individual_run_failure(self):
        """If one run fails to report, the others should still be reported."""
        client = self._make_client(
            [
                _make_task_summary("DiarizationPipeline", "run-1"),
                _make_task_summary("FilePipeline", "run-2"),
            ]
        )
        # First detail fetch raises, second succeeds
        client.runs.aio_get = AsyncMock(
            side_effect=[Exception("Hatchet API error"), MagicMock()]
        )
        with self._patched_monitor(client) as mock_send:
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()
            # First run failed to report, second succeeded
            assert result["reported"] == 1
            assert mock_send.call_count == 1

    async def test_handles_list_api_failure(self):
        """If aio_list fails, should return error and not crash."""
        client = MagicMock()
        client.runs.aio_list = AsyncMock(side_effect=Exception("Connection refused"))
        # Deliberately patch only the client factory: the error path must
        # short-circuit before any rendering or Zulip interaction happens.
        with patch(
            f"{self.MONITOR}.HatchetClientManager.get_client",
            return_value=client,
        ):
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            result = await _check_failed_runs()
            assert result["checked"] == 0
            assert result["reported"] == 0
            assert "error" in result

    async def test_uses_correct_time_window(self):
        """Verify the correct since/until parameters are passed to aio_list."""
        client = self._make_client([])
        # Settings stay unpatched here so the window computation runs
        # against the real configuration path.
        with patch(
            f"{self.MONITOR}.HatchetClientManager.get_client",
            return_value=client,
        ):
            from reflector.hatchet.workflows.failed_runs_monitor import (
                _check_failed_runs,
            )

            await _check_failed_runs()
            call_kwargs = client.runs.aio_list.call_args
            assert call_kwargs.kwargs["statuses"] == [V1TaskStatus.FAILED]
            since = call_kwargs.kwargs["since"]
            until = call_kwargs.kwargs["until"]
            # Both bounds must be timezone-aware UTC
            assert since.tzinfo == timezone.utc
            assert until.tzinfo == timezone.utc
            # Window should be ~1 hour
            delta = until - since
            assert 3590 < delta.total_seconds() < 3610