feat: multitrack cli (#735)

* multitrack cli prd

* prd/todo (no-mistakes)

* multitrack cli (no-mistakes)

* multitrack cli (no-mistakes)

* multitrack cli (no-mistakes)

* multitrack cli (no-mistakes)

* remove multitrack tests (most were worthless)

* useless comments away

* useless comments away

---------

Co-authored-by: Igor Loskutov <igor.loskutoff@gmail.com>
Igor Monadical
2025-11-24 10:35:06 -05:00
committed by GitHub
parent 4287f8b8ae
commit 11731c9d38
3 changed files with 643 additions and 19 deletions

View File: reflector/tools/cli_multitrack.py

@@ -0,0 +1,347 @@
import asyncio
import sys
import time
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Protocol

import structlog
from celery.result import AsyncResult

from reflector.db import get_database
from reflector.db.transcripts import SourceKind, Transcript, transcripts_controller
from reflector.pipelines.main_multitrack_pipeline import (
    task_pipeline_multitrack_process,
)
from reflector.storage import get_transcripts_storage
from reflector.tools.process import (
    extract_result_from_entry,
    parse_s3_url,
    validate_s3_objects,
)

logger = structlog.get_logger(__name__)

DEFAULT_PROCESSING_TIMEOUT_SECONDS = 3600
MAX_ERROR_MESSAGE_LENGTH = 500
TASK_POLL_INTERVAL_SECONDS = 2


class StatusCallback(Protocol):
    def __call__(self, state: str, elapsed_seconds: int) -> None: ...


@dataclass
class MultitrackTaskResult:
    success: bool
    transcript_id: str
    error: Optional[str] = None


async def create_multitrack_transcript(
    bucket_name: str,
    track_keys: List[str],
    source_language: str,
    target_language: str,
    user_id: Optional[str] = None,
) -> Transcript:
    num_tracks = len(track_keys)
    track_word = "track" if num_tracks == 1 else "tracks"
    transcript_name = f"Multitrack ({num_tracks} {track_word})"
    transcript = await transcripts_controller.add(
        transcript_name,
        source_kind=SourceKind.FILE,
        source_language=source_language,
        target_language=target_language,
        user_id=user_id,
    )
    logger.info(
        "Created multitrack transcript",
        transcript_id=transcript.id,
        name=transcript_name,
        bucket=bucket_name,
        num_tracks=num_tracks,
    )
    return transcript


def submit_multitrack_task(
    transcript_id: str, bucket_name: str, track_keys: List[str]
) -> AsyncResult:
    result = task_pipeline_multitrack_process.delay(
        transcript_id=transcript_id,
        bucket_name=bucket_name,
        track_keys=track_keys,
    )
    logger.info(
        "Multitrack task submitted",
        transcript_id=transcript_id,
        task_id=result.id,
        bucket=bucket_name,
        num_tracks=len(track_keys),
    )
    return result
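
# Note: .delay() only enqueues the Celery task and returns an AsyncResult
# handle immediately; the actual processing happens on a worker.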


async def wait_for_task(
    result: AsyncResult,
    transcript_id: str,
    timeout_seconds: int = DEFAULT_PROCESSING_TIMEOUT_SECONDS,
    poll_interval: int = TASK_POLL_INTERVAL_SECONDS,
    status_callback: Optional[StatusCallback] = None,
) -> MultitrackTaskResult:
    start_time = time.time()
    last_status = None
    while not result.ready():
        elapsed = time.time() - start_time
        if elapsed > timeout_seconds:
            error_msg = (
                f"Task {result.id} did not complete within {timeout_seconds}s "
                f"for transcript {transcript_id}"
            )
            logger.error(
                "Task timeout",
                task_id=result.id,
                transcript_id=transcript_id,
                elapsed_seconds=elapsed,
            )
            raise TimeoutError(error_msg)
        if result.state != last_status:
            if status_callback:
                status_callback(result.state, int(elapsed))
            last_status = result.state
        await asyncio.sleep(poll_interval)

    if result.failed():
        error_info = result.info
        traceback_info = getattr(result, "traceback", None)
        logger.error(
            "Multitrack task failed",
            transcript_id=transcript_id,
            task_id=result.id,
            error=str(error_info),
            has_traceback=bool(traceback_info),
        )
        error_detail = str(error_info)
        if traceback_info:
            error_detail += f"\nTraceback:\n{traceback_info}"
        return MultitrackTaskResult(
            success=False, transcript_id=transcript_id, error=error_detail
        )

    logger.info(
        "Multitrack task completed",
        transcript_id=transcript_id,
        task_id=result.id,
        state=result.state,
    )
    return MultitrackTaskResult(success=True, transcript_id=transcript_id)
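
# Note: polling AsyncResult (.ready()/.state/.failed()) assumes a Celery
# result backend is configured; without one, task state cannot be observed.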


async def update_transcript_status(
    transcript_id: str,
    status: str,
    error: Optional[str] = None,
    max_error_length: int = MAX_ERROR_MESSAGE_LENGTH,
) -> None:
    database = get_database()
    connected = False
    try:
        await database.connect()
        connected = True
        transcript = await transcripts_controller.get_by_id(transcript_id)
        if transcript:
            update_data: Dict[str, Any] = {"status": status}
            if error:
                if len(error) > max_error_length:
                    error = error[: max_error_length - 3] + "..."
                update_data["error"] = error
            await transcripts_controller.update(transcript, update_data)
            logger.info(
                "Updated transcript status",
                transcript_id=transcript_id,
                status=status,
                has_error=bool(error),
            )
    except Exception as e:
        logger.warning(
            "Failed to update transcript status",
            transcript_id=transcript_id,
            error=str(e),
        )
    finally:
        if connected:
            try:
                await database.disconnect()
            except Exception as e:
                logger.warning(f"Database disconnect failed: {e}")


async def process_multitrack(
    bucket_name: str,
    track_keys: List[str],
    source_language: str,
    target_language: str,
    user_id: Optional[str] = None,
    timeout_seconds: int = DEFAULT_PROCESSING_TIMEOUT_SECONDS,
    status_callback: Optional[StatusCallback] = None,
) -> MultitrackTaskResult:
    """High-level orchestration for multitrack processing."""
    database = get_database()
    transcript = None
    connected = False
    try:
        await database.connect()
        connected = True
        transcript = await create_multitrack_transcript(
            bucket_name=bucket_name,
            track_keys=track_keys,
            source_language=source_language,
            target_language=target_language,
            user_id=user_id,
        )
        result = submit_multitrack_task(
            transcript_id=transcript.id, bucket_name=bucket_name, track_keys=track_keys
        )
    except Exception as e:
        if transcript:
            try:
                await update_transcript_status(
                    transcript_id=transcript.id, status="failed", error=str(e)
                )
            except Exception as update_error:
                logger.error(
                    "Failed to update transcript status after error",
                    original_error=str(e),
                    update_error=str(update_error),
                    transcript_id=transcript.id,
                )
        raise
    finally:
        if connected:
            try:
                await database.disconnect()
            except Exception as e:
                logger.warning(f"Database disconnect failed: {e}")

    # Poll outside database connection
    task_result = await wait_for_task(
        result=result,
        transcript_id=transcript.id,
        timeout_seconds=timeout_seconds,
        poll_interval=TASK_POLL_INTERVAL_SECONDS,
        status_callback=status_callback,
    )
    if not task_result.success:
        await update_transcript_status(
            transcript_id=transcript.id, status="failed", error=task_result.error
        )
    return task_result


def print_progress(message: str) -> None:
    """Print progress message to stderr for CLI visibility."""
    print(message, file=sys.stderr)


def create_status_callback() -> StatusCallback:
    """Create callback for task status updates during polling."""

    def callback(state: str, elapsed_seconds: int) -> None:
        print_progress(
            f"Multitrack pipeline status: {state} (elapsed: {elapsed_seconds}s)"
        )

    return callback
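
# Illustrative output: with this callback wired into wait_for_task, each state
# transition prints a stderr line such as
#   Multitrack pipeline status: STARTED (elapsed: 4s)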


async def process_multitrack_cli(
    s3_urls: List[str],
    source_language: str,
    target_language: str,
    output_path: Optional[str] = None,
) -> None:
    if not s3_urls:
        raise ValueError("At least one track required for multitrack processing")

    bucket_keys = []
    for url in s3_urls:
        try:
            bucket, key = parse_s3_url(url)
            bucket_keys.append((bucket, key))
        except ValueError as e:
            raise ValueError(f"Invalid S3 URL '{url}': {e}") from e

    buckets = {bucket for bucket, _ in bucket_keys}
    if len(buckets) > 1:
        raise ValueError(
            f"All tracks must be in the same S3 bucket. "
            f"Found {len(buckets)} different buckets: {sorted(buckets)}. "
            f"Please upload all files to a single bucket."
        )

    primary_bucket = bucket_keys[0][0]
    track_keys = [key for _, key in bucket_keys]
    print_progress(
        f"Starting multitrack CLI processing: "
        f"bucket={primary_bucket}, num_tracks={len(track_keys)}, "
        f"source_language={source_language}, target_language={target_language}"
    )

    storage = get_transcripts_storage()
    await validate_s3_objects(storage, bucket_keys)
    print_progress(f"S3 validation complete: {len(bucket_keys)} objects verified")

    result = await process_multitrack(
        bucket_name=primary_bucket,
        track_keys=track_keys,
        source_language=source_language,
        target_language=target_language,
        user_id=None,
        timeout_seconds=DEFAULT_PROCESSING_TIMEOUT_SECONDS,
        status_callback=create_status_callback(),
    )
    if not result.success:
        error_msg = f"Multitrack pipeline failed for transcript {result.transcript_id}\n"
        if result.error:
            error_msg += f"Error: {result.error}\n"
        raise RuntimeError(error_msg)

    print_progress(f"Multitrack processing complete for transcript {result.transcript_id}")

    database = get_database()
    await database.connect()
    try:
        await extract_result_from_entry(result.transcript_id, output_path)
    finally:
        await database.disconnect()
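
For orientation, a minimal programmatic driver for this module might look like the sketch below. The bucket, object keys, and languages are hypothetical; only the process_multitrack_cli signature comes from the file above.

    import asyncio

    from reflector.tools.cli_multitrack import process_multitrack_cli

    # Hypothetical inputs: one object per speaker track, all in the same bucket.
    asyncio.run(
        process_multitrack_cli(
            s3_urls=[
                "s3://my-bucket/meeting/track-1.webm",
                "s3://my-bucket/meeting/track-2.webm",
            ],
            source_language="en",
            target_language="en",
            output_path="output.jsonl",
        )
    )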

View File: reflector/tools/process.py

@@ -9,7 +9,10 @@ import shutil
import sys
import time
from pathlib import Path
-from typing import Any, Dict, List, Literal
+from typing import Any, Dict, List, Literal, Tuple
+from urllib.parse import unquote, urlparse
+from botocore.exceptions import BotoCoreError, ClientError, NoCredentialsError
from reflector.db.transcripts import SourceKind, TranscriptTopic, transcripts_controller
from reflector.logger import logger
@@ -20,10 +23,119 @@ from reflector.pipelines.main_live_pipeline import pipeline_post as live_pipeline_post
from reflector.pipelines.main_live_pipeline import (
    pipeline_process as live_pipeline_process,
)
from reflector.storage import Storage


def validate_s3_bucket_name(bucket: str) -> None:
    if not bucket:
        raise ValueError("Bucket name cannot be empty")
    if len(bucket) > 255:  # absolute max for any region
        raise ValueError(f"Bucket name too long: {len(bucket)} characters (max 255)")


def validate_s3_key(key: str) -> None:
    if not key:
        raise ValueError("S3 key cannot be empty")
    if len(key) > 1024:
        raise ValueError(f"S3 key too long: {len(key)} characters (max 1024)")


def parse_s3_url(url: str) -> Tuple[str, str]:
    parsed = urlparse(url)
    if parsed.scheme == "s3":
        bucket = parsed.netloc
        key = parsed.path.lstrip("/")
        if parsed.fragment:
            logger.debug(
                "URL fragment ignored (not part of S3 key)",
                url=url,
                fragment=parsed.fragment,
            )
        if not bucket or not key:
            raise ValueError(f"Invalid S3 URL: {url} (missing bucket or key)")
        bucket = unquote(bucket)
        key = unquote(key)
        validate_s3_bucket_name(bucket)
        validate_s3_key(key)
        return bucket, key
    elif parsed.scheme in ("http", "https"):
        # Virtual-hosted style: bucket.s3.amazonaws.com or bucket.s3.<region>.amazonaws.com
        if ".s3." in parsed.netloc or parsed.netloc.endswith(".s3.amazonaws.com"):
            bucket = parsed.netloc.split(".")[0]
            key = parsed.path.lstrip("/")
            if parsed.fragment:
                logger.debug("URL fragment ignored", url=url, fragment=parsed.fragment)
            if not bucket or not key:
                raise ValueError(f"Invalid S3 URL: {url} (missing bucket or key)")
            bucket = unquote(bucket)
            key = unquote(key)
            validate_s3_bucket_name(bucket)
            validate_s3_key(key)
            return bucket, key
        # Path style: s3.amazonaws.com/bucket/key or s3.<region>.amazonaws.com/bucket/key
        elif parsed.netloc.startswith("s3.") and "amazonaws.com" in parsed.netloc:
            path_parts = parsed.path.lstrip("/").split("/", 1)
            if len(path_parts) != 2:
                raise ValueError(f"Invalid S3 URL: {url} (missing bucket or key)")
            bucket, key = path_parts
            if parsed.fragment:
                logger.debug("URL fragment ignored", url=url, fragment=parsed.fragment)
            bucket = unquote(bucket)
            key = unquote(key)
            validate_s3_bucket_name(bucket)
            validate_s3_key(key)
            return bucket, key
        else:
            raise ValueError(f"Invalid S3 URL format: {url} (not recognized as S3 URL)")
    else:
        raise ValueError(f"Invalid S3 URL scheme: {url} (must be s3:// or https://)")


async def validate_s3_objects(
    storage: Storage, bucket_keys: List[Tuple[str, str]]
) -> None:
    async with storage.session.client("s3") as client:

        async def check_object(bucket: str, key: str) -> None:
            try:
                await client.head_object(Bucket=bucket, Key=key)
            except ClientError as e:
                error_code = e.response["Error"]["Code"]
                if error_code in ("404", "NoSuchKey"):
                    raise ValueError(f"S3 object not found: s3://{bucket}/{key}") from e
                elif error_code in ("403", "Forbidden", "AccessDenied"):
                    raise ValueError(
                        f"Access denied for S3 object: s3://{bucket}/{key}. "
                        f"Check AWS credentials and permissions"
                    ) from e
                else:
                    raise ValueError(
                        f"S3 error {error_code} for s3://{bucket}/{key}: "
                        f"{e.response['Error'].get('Message', 'Unknown error')}"
                    ) from e
            except NoCredentialsError as e:
                raise ValueError(
                    "AWS credentials not configured. Set AWS_ACCESS_KEY_ID and "
                    "AWS_SECRET_ACCESS_KEY environment variables"
                ) from e
            except BotoCoreError as e:
                raise ValueError(
                    f"AWS service error for s3://{bucket}/{key}: {e}"
                ) from e
            except Exception as e:
                raise ValueError(
                    f"Unexpected error validating s3://{bucket}/{key}: {e}"
                ) from e

        await asyncio.gather(
            *(check_object(bucket, key) for bucket, key in bucket_keys)
        )
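
# The HEAD checks run concurrently via asyncio.gather; the first failure
# propagates as ValueError and aborts validation.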


def serialize_topics(topics: List[TranscriptTopic]) -> List[Dict[str, Any]]:
    """Convert TranscriptTopic objects to JSON-serializable dicts"""
    serialized = []
    for topic in topics:
        topic_dict = topic.model_dump()
@@ -32,7 +144,6 @@ def serialize_topics(topics: List[TranscriptTopic]) -> List[Dict[str, Any]]:
def debug_print_speakers(serialized_topics: List[Dict[str, Any]]) -> None:
    """Print debug info about speakers found in topics"""
    all_speakers = set()
    for topic_dict in serialized_topics:
        for word in topic_dict.get("words", []):
@@ -47,8 +158,6 @@ def debug_print_speakers(serialized_topics: List[Dict[str, Any]]) -> None:
TranscriptId = str
# common interface for every flow: it needs an Entry in db with specific ceremony (file path + status + actual file in file system)
# ideally we want to get rid of it at some point
async def prepare_entry(
    source_path: str,
    source_language: str,
@@ -65,9 +174,7 @@ async def prepare_entry(
        user_id=None,
    )
-    logger.info(
-        f"Created empty transcript {transcript.id} for file {file_path.name} because technically we need an empty transcript before we start transcript"
-    )
+    logger.info(f"Created transcript {transcript.id} for {file_path.name}")

    # pipelines expect files as upload.*
@@ -83,7 +190,6 @@ async def prepare_entry(
    return transcript.id

# same reason as prepare_entry
async def extract_result_from_entry(
    transcript_id: TranscriptId, output_path: str
) -> None:
@@ -193,13 +299,20 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Process audio files with speaker diarization"
    )
-    parser.add_argument("source", help="Source file (mp3, wav, mp4...)")
+    parser.add_argument(
+        "source",
+        help="Source file (mp3, wav, mp4...) or comma-separated S3 URLs with --multitrack",
+    )
    parser.add_argument(
        "--pipeline",
        required=False,  # single-track only; enforced at runtime so --multitrack can omit it
        choices=["live", "file"],
        help="Pipeline type to use for processing (live: streaming/incremental, file: batch/parallel)",
    )
+    parser.add_argument(
+        "--multitrack",
+        action="store_true",
+        help="Process multiple audio tracks from comma-separated S3 URLs",
+    )
    parser.add_argument(
        "--source-language", default="en", help="Source language code (default: en)"
    )
@@ -209,12 +322,40 @@ if __name__ == "__main__":
parser.add_argument("--output", "-o", help="Output file (output.jsonl)")
args = parser.parse_args()
-    asyncio.run(
-        process(
-            args.source,
-            args.source_language,
-            args.target_language,
-            args.pipeline,
-            args.output,
-        )
-    )
+    if args.multitrack:
+        if not args.source:
+            parser.error("Source URLs required for multitrack processing")
+        s3_urls = [url.strip() for url in args.source.split(",") if url.strip()]
+        if not s3_urls:
+            parser.error("At least one S3 URL required for multitrack processing")
+
+        from reflector.tools.cli_multitrack import process_multitrack_cli
+
+        asyncio.run(
+            process_multitrack_cli(
+                s3_urls,
+                args.source_language,
+                args.target_language,
+                args.output,
+            )
+        )
+    else:
+        if not args.pipeline:
+            parser.error("--pipeline is required for single-track processing")
+        if "," in args.source:
+            parser.error(
+                "Multiple files detected. Use --multitrack flag for multitrack processing"
+            )
+        asyncio.run(
+            process(
+                args.source,
+                args.source_language,
+                args.target_language,
+                args.pipeline,
+                args.output,
+            )
+        )
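
For reference, the two dispatch branches above correspond to invocations like the following. The entry point, file names, and bucket are illustrative assumptions, not taken from this commit.

    # Single-track: local file, pipeline choice required.
    python -m reflector.tools.process meeting.mp4 --pipeline file -o output.jsonl

    # Multitrack: comma-separated S3 URLs in the same bucket; no --pipeline needed.
    python -m reflector.tools.process \
        "s3://my-bucket/meeting/track-1.webm,s3://my-bucket/meeting/track-2.webm" \
        --multitrack --source-language en -o output.jsonl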

View File

@@ -0,0 +1,136 @@
"""Tests for S3 URL parsing functionality in reflector.tools.process"""
import pytest
from reflector.tools.process import parse_s3_url
class TestParseS3URL:
"""Test cases for parse_s3_url function"""
def test_parse_s3_protocol(self):
"""Test parsing s3:// protocol URLs"""
bucket, key = parse_s3_url("s3://my-bucket/path/to/file.webm")
assert bucket == "my-bucket"
assert key == "path/to/file.webm"
def test_parse_s3_protocol_deep_path(self):
"""Test s3:// with deeply nested paths"""
bucket, key = parse_s3_url("s3://bucket-name/very/deep/path/to/audio.mp4")
assert bucket == "bucket-name"
assert key == "very/deep/path/to/audio.mp4"
def test_parse_https_subdomain_format(self):
"""Test parsing https://bucket.s3.amazonaws.com/key format"""
bucket, key = parse_s3_url("https://my-bucket.s3.amazonaws.com/path/file.webm")
assert bucket == "my-bucket"
assert key == "path/file.webm"
def test_parse_https_regional_subdomain(self):
"""Test parsing regional endpoint with subdomain"""
bucket, key = parse_s3_url(
"https://my-bucket.s3.us-west-2.amazonaws.com/path/file.webm"
)
assert bucket == "my-bucket"
assert key == "path/file.webm"
def test_parse_https_path_style(self):
"""Test parsing https://s3.amazonaws.com/bucket/key format"""
bucket, key = parse_s3_url("https://s3.amazonaws.com/my-bucket/path/file.webm")
assert bucket == "my-bucket"
assert key == "path/file.webm"
def test_parse_https_regional_path_style(self):
"""Test parsing regional endpoint with path style"""
bucket, key = parse_s3_url(
"https://s3.us-east-1.amazonaws.com/my-bucket/path/file.webm"
)
assert bucket == "my-bucket"
assert key == "path/file.webm"
def test_parse_url_encoded_keys(self):
"""Test parsing URL-encoded keys"""
bucket, key = parse_s3_url(
"s3://my-bucket/path%20with%20spaces/file%2Bname.webm"
)
assert bucket == "my-bucket"
assert key == "path with spaces/file+name.webm" # Should be decoded
def test_parse_url_encoded_https(self):
"""Test URL-encoded keys with HTTPS format"""
bucket, key = parse_s3_url(
"https://my-bucket.s3.amazonaws.com/file%20with%20spaces.webm"
)
assert bucket == "my-bucket"
assert key == "file with spaces.webm"
def test_invalid_url_no_scheme(self):
"""Test that URLs without scheme raise ValueError"""
with pytest.raises(ValueError, match="Invalid S3 URL scheme"):
parse_s3_url("my-bucket/path/file.webm")
def test_invalid_url_wrong_scheme(self):
"""Test that non-S3 schemes raise ValueError"""
with pytest.raises(ValueError, match="Invalid S3 URL scheme"):
parse_s3_url("ftp://my-bucket/path/file.webm")
def test_invalid_s3_missing_bucket(self):
"""Test s3:// URL without bucket raises ValueError"""
with pytest.raises(ValueError, match="missing bucket or key"):
parse_s3_url("s3:///path/file.webm")
def test_invalid_s3_missing_key(self):
"""Test s3:// URL without key raises ValueError"""
with pytest.raises(ValueError, match="missing bucket or key"):
parse_s3_url("s3://my-bucket/")
def test_invalid_s3_empty_key(self):
"""Test s3:// URL with empty key raises ValueError"""
with pytest.raises(ValueError, match="missing bucket or key"):
parse_s3_url("s3://my-bucket")
def test_invalid_https_not_s3(self):
"""Test HTTPS URL that's not S3 raises ValueError"""
with pytest.raises(ValueError, match="not recognized as S3 URL"):
parse_s3_url("https://example.com/path/file.webm")
def test_invalid_https_subdomain_missing_key(self):
"""Test HTTPS subdomain format without key raises ValueError"""
with pytest.raises(ValueError, match="missing bucket or key"):
parse_s3_url("https://my-bucket.s3.amazonaws.com/")
def test_invalid_https_path_style_missing_parts(self):
"""Test HTTPS path style with missing bucket/key raises ValueError"""
with pytest.raises(ValueError, match="missing bucket or key"):
parse_s3_url("https://s3.amazonaws.com/")
def test_bucket_with_dots(self):
"""Test parsing bucket names with dots"""
bucket, key = parse_s3_url("s3://my.bucket.name/path/file.webm")
assert bucket == "my.bucket.name"
assert key == "path/file.webm"
def test_bucket_with_hyphens(self):
"""Test parsing bucket names with hyphens"""
bucket, key = parse_s3_url("s3://my-bucket-name-123/path/file.webm")
assert bucket == "my-bucket-name-123"
assert key == "path/file.webm"
def test_key_with_special_chars(self):
"""Test keys with various special characters"""
# Note: # is treated as URL fragment separator, not part of key
bucket, key = parse_s3_url("s3://bucket/2024-01-01_12:00:00/file.webm")
assert bucket == "bucket"
assert key == "2024-01-01_12:00:00/file.webm"
def test_fragment_handling(self):
"""Test that URL fragments are properly ignored"""
bucket, key = parse_s3_url("s3://bucket/path/to/file.webm#fragment123")
assert bucket == "bucket"
assert key == "path/to/file.webm" # Fragment not included
def test_http_scheme_s3_url(self):
"""Test that HTTP (not HTTPS) S3 URLs are supported"""
bucket, key = parse_s3_url("http://my-bucket.s3.amazonaws.com/path/file.webm")
assert bucket == "my-bucket"
assert key == "path/file.webm"