feat: add auto-generated captions, speaker-colored progress bar with sync controls, and speaker tooltip to cloud video player (#926)

* feat: webvtt captions inside video with sync controls

* feat: highlight speaker timestamp progress bar
This commit is contained in:
Juan Diego García
2026-03-25 12:07:08 -05:00
committed by GitHub
parent e2ba502697
commit f19113a3cf
2 changed files with 371 additions and 17 deletions

View File

@@ -228,6 +228,8 @@ export default function TranscriptDetails(details: TranscriptDetails) {
duration={transcript.data?.cloud_video_duration ?? null}
expanded={videoExpanded}
onClose={() => setVideoExpanded(false)}
sourceLanguage={transcript.data?.source_language ?? null}
participants={transcript.data?.participants ?? null}
/>
</GridItem>
)}

View File

@@ -1,14 +1,19 @@
import { useEffect, useState } from "react";
import { useCallback, useEffect, useMemo, useRef, useState } from "react";
import { Box, Flex, Skeleton, Text } from "@chakra-ui/react";
import { LuVideo, LuX } from "react-icons/lu";
import { LuMinus, LuPlus, LuVideo, LuX } from "react-icons/lu";
import { useAuth } from "../../lib/AuthProvider";
import { API_URL } from "../../lib/apiClient";
import { generateHighContrastColor } from "../../lib/utils";
type SpeakerInfo = { speaker: number | null; name: string };
type VideoPlayerProps = {
transcriptId: string;
duration: number | null;
expanded: boolean;
onClose: () => void;
sourceLanguage?: string | null;
participants?: SpeakerInfo[] | null;
};
function formatDuration(seconds: number): string {
@@ -20,15 +25,203 @@ function formatDuration(seconds: number): string {
return `${m}:${String(s).padStart(2, "0")}`;
}
// Matches one WEBVTT cue timing line: "HH:MM:SS.mmm --> HH:MM:SS.mmm".
const VTT_TIMESTAMP_RE =
  /(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})/g;

/**
 * Converts a WEBVTT "HH:MM:SS.mmm" timestamp into a number of seconds
 * (fractional part carries the milliseconds).
 */
function parseVttTimestamp(ts: string): number {
  const [hours, minutes, secondsField] = ts.split(":");
  const [wholeSeconds, millis] = secondsField.split(".");
  const seconds =
    Number(hours) * 3600 + Number(minutes) * 60 + Number(wholeSeconds);
  return seconds + Number(millis) / 1000;
}
/**
 * Formats a duration in seconds as a WEBVTT "HH:MM:SS.mmm" timestamp.
 *
 * Works from a rounded total-millisecond count so rounding can never yield
 * an invalid ".1000" milliseconds field (the previous per-field rounding
 * did for inputs like 1.9999, producing "00:00:01.1000" and corrupting the
 * shifted caption file).
 */
function formatVttTimestamp(totalSeconds: number): string {
  // Negative results from a caption offset are clamped to the track start.
  const totalMs = Math.round(Math.max(0, totalSeconds) * 1000);
  const h = Math.floor(totalMs / 3_600_000);
  const m = Math.floor((totalMs % 3_600_000) / 60_000);
  const s = Math.floor((totalMs % 60_000) / 1000);
  const ms = totalMs % 1000;
  return `${String(h).padStart(2, "0")}:${String(m).padStart(2, "0")}:${String(s).padStart(2, "0")}.${String(ms).padStart(3, "0")}`;
}
/**
 * Returns a copy of the WEBVTT content with every cue timing shifted by
 * `offsetSeconds`. Negative offsets move captions earlier; resulting
 * timestamps are clamped at zero by formatVttTimestamp.
 */
function shiftVttTimestamps(vttContent: string, offsetSeconds: number): string {
  // No offset: hand back the original text untouched.
  if (offsetSeconds === 0) return vttContent;
  const shift = (ts: string): string =>
    formatVttTimestamp(parseVttTimestamp(ts) + offsetSeconds);
  return vttContent.replace(
    VTT_TIMESTAMP_RE,
    (_match, start: string, end: string) => `${shift(start)} --> ${shift(end)}`,
  );
}
// One speaker turn extracted from a WEBVTT voice cue, in seconds.
type VttSegment = { start: number; end: number; speaker: string };

// Cue timing line immediately followed by a voice tag, e.g.
//   00:00:01.000 --> 00:00:04.000
//   <v Alice>
// `\r?\n` accepts both LF and CRLF line endings; the previous bare `\n`
// matched nothing in CRLF-delimited VTT payloads, silently hiding the
// speaker progress bar.
const VTT_CUE_RE =
  /(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})\r?\n<v ([^>]+)>/g;

/**
 * Extracts per-speaker segments from WEBVTT content containing <v> voice
 * spans. Cues without a voice tag are ignored.
 */
function parseVttSegments(vttContent: string): VttSegment[] {
  const segments: VttSegment[] = [];
  // VTT_CUE_RE is a shared module-level global regex; reset its cursor in
  // case a previous scan was aborted mid-way.
  VTT_CUE_RE.lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = VTT_CUE_RE.exec(vttContent)) !== null) {
    segments.push({
      start: parseVttTimestamp(match[1]),
      end: parseVttTimestamp(match[2]),
      speaker: match[3],
    });
  }
  return segments;
}
// Same background as TopicSegment so speaker colors match the transcript UI
const SPEAKER_COLOR_BG: [number, number, number] = [96, 165, 250];

/**
 * Clickable progress bar rendered under the video: one colored span per
 * speaker cue, a white playhead line, and a speaker-name tooltip that
 * tracks the current playback position.
 *
 * All segment times are shifted by `captionOffset` so the bar stays in
 * sync with the user-adjusted captions.
 */
function SpeakerProgressBar({
  segments,
  videoDuration,
  currentTime,
  captionOffset,
  onSeek,
  participants,
}: {
  segments: VttSegment[];
  videoDuration: number;
  currentTime: number;
  captionOffset: number;
  onSeek: (time: number) => void;
  participants?: SpeakerInfo[] | null;
}) {
  const barRef = useRef<HTMLDivElement>(null);

  // Build a name→"Speaker N" reverse lookup so colors match TopicSegment
  const speakerColors = useMemo(() => {
    const nameToColorKey: Record<string, string> = {};
    if (participants) {
      for (const p of participants) {
        if (p.speaker != null) {
          nameToColorKey[p.name] = `Speaker ${p.speaker}`;
        }
      }
    }
    // One stable color per speaker name seen in the segments; names with no
    // "Speaker N" mapping are hashed by the display name itself.
    const map: Record<string, string | undefined> = {};
    for (const seg of segments) {
      if (!map[seg.speaker]) {
        const colorKey = nameToColorKey[seg.speaker] ?? seg.speaker;
        map[seg.speaker] = generateHighContrastColor(
          colorKey,
          SPEAKER_COLOR_BG,
        );
      }
    }
    return map;
  }, [segments, participants]);

  // Speaker whose offset-adjusted cue contains the playhead, or null when
  // playback sits between cues (first matching segment wins).
  const activeSpeaker = useMemo(() => {
    for (const seg of segments) {
      const adjStart = seg.start + captionOffset;
      const adjEnd = seg.end + captionOffset;
      if (currentTime >= adjStart && currentTime < adjEnd) {
        return seg.speaker;
      }
    }
    return null;
  }, [segments, currentTime, captionOffset]);

  // Seek proportionally to where the user clicked along the bar's width.
  const handleClick = (e: React.MouseEvent<HTMLDivElement>) => {
    if (!barRef.current || !videoDuration) return;
    const rect = barRef.current.getBoundingClientRect();
    const fraction = Math.max(
      0,
      Math.min(1, (e.clientX - rect.left) / rect.width),
    );
    onSeek(fraction * videoDuration);
  };

  const progressPct =
    videoDuration > 0 ? (currentTime / videoDuration) * 100 : 0;

  return (
    <Box position="relative" mb={4}>
      <Box
        ref={barRef}
        position="relative"
        h="8px"
        bg="gray.700"
        cursor="pointer"
        onClick={handleClick}
        borderBottomRadius="md"
        overflow="hidden"
      >
        {/* One absolutely-positioned span per cue; spans fully outside the
            video's duration (after offset adjustment) are dropped. */}
        {segments.map((seg, i) => {
          const adjStart = Math.max(0, seg.start + captionOffset);
          const adjEnd = Math.max(0, seg.end + captionOffset);
          if (adjEnd <= 0 || adjStart >= videoDuration) return null;
          const leftPct = (adjStart / videoDuration) * 100;
          const widthPct = ((adjEnd - adjStart) / videoDuration) * 100;
          return (
            <Box
              key={i}
              position="absolute"
              top={0}
              bottom={0}
              left={`${leftPct}%`}
              width={`${widthPct}%`}
              bg={speakerColors[seg.speaker]}
            />
          );
        })}
        {/* Playhead */}
        <Box
          position="absolute"
          top={0}
          bottom={0}
          left={`${progressPct}%`}
          w="2px"
          bg="white"
          zIndex={1}
          pointerEvents="none"
        />
      </Box>
      {/* Speaker tooltip below the bar */}
      {activeSpeaker && (
        <Text
          position="absolute"
          top="10px"
          left={`${progressPct}%`}
          transform="translateX(-50%)"
          fontSize="2xs"
          color={speakerColors[activeSpeaker]}
          fontWeight="semibold"
          whiteSpace="nowrap"
          pointerEvents="none"
        >
          {activeSpeaker}
        </Text>
      )}
    </Box>
  );
}
export default function VideoPlayer({
transcriptId,
duration,
expanded,
onClose,
sourceLanguage,
participants,
}: VideoPlayerProps) {
const [videoUrl, setVideoUrl] = useState<string | null>(null);
const [rawVtt, setRawVtt] = useState<string | null>(null);
const [captionsUrl, setCaptionsUrl] = useState<string | null>(null);
const [captionOffset, setCaptionOffset] = useState(0);
const [currentTime, setCurrentTime] = useState(0);
const [videoDuration, setVideoDuration] = useState(0);
const [loading, setLoading] = useState(false);
const [error, setError] = useState<string | null>(null);
const prevBlobUrl = useRef<string | null>(null);
const videoRef = useRef<HTMLVideoElement>(null);
const auth = useAuth();
const accessToken = auth.status === "authenticated" ? auth.accessToken : null;
@@ -63,6 +256,99 @@ export default function VideoPlayer({
fetchVideoUrl();
}, [expanded, transcriptId, accessToken, videoUrl]);
useEffect(() => {
if (!videoUrl || !transcriptId) return;
let cancelled = false;
const fetchCaptions = async () => {
try {
const url = `${API_URL}/v1/transcripts/${transcriptId}?transcript_format=webvtt-named`;
const headers: Record<string, string> = {};
if (accessToken) {
headers["Authorization"] = `Bearer ${accessToken}`;
}
const resp = await fetch(url, { headers });
if (!resp.ok) return;
const data = await resp.json();
const vttContent = data?.transcript;
if (!vttContent || cancelled) return;
setRawVtt(vttContent);
} catch {
// Captions are non-critical — fail silently
}
};
fetchCaptions();
return () => {
cancelled = true;
};
}, [videoUrl, transcriptId, accessToken]);
// Rebuild blob URL whenever rawVtt or captionOffset changes
useEffect(() => {
if (!rawVtt) return;
const shifted = shiftVttTimestamps(rawVtt, captionOffset);
const blob = new Blob([shifted], { type: "text/vtt" });
const blobUrl = URL.createObjectURL(blob);
if (prevBlobUrl.current) {
URL.revokeObjectURL(prevBlobUrl.current);
}
prevBlobUrl.current = blobUrl;
setCaptionsUrl(blobUrl);
return () => {
URL.revokeObjectURL(blobUrl);
prevBlobUrl.current = null;
};
}, [rawVtt, captionOffset]);
const adjustOffset = useCallback((delta: number) => {
setCaptionOffset((prev) => Math.round((prev + delta) * 10) / 10);
}, []);
const formattedOffset = useMemo(() => {
const sign = captionOffset >= 0 ? "+" : "";
return `${sign}${captionOffset.toFixed(1)}s`;
}, [captionOffset]);
const segments = useMemo(
() => (rawVtt ? parseVttSegments(rawVtt) : []),
[rawVtt],
);
// Track video currentTime and duration
useEffect(() => {
const video = videoRef.current;
if (!video) return;
const onTimeUpdate = () => setCurrentTime(video.currentTime);
const onDurationChange = () => {
if (video.duration && isFinite(video.duration)) {
setVideoDuration(video.duration);
}
};
video.addEventListener("timeupdate", onTimeUpdate);
video.addEventListener("loadedmetadata", onDurationChange);
video.addEventListener("durationchange", onDurationChange);
return () => {
video.removeEventListener("timeupdate", onTimeUpdate);
video.removeEventListener("loadedmetadata", onDurationChange);
video.removeEventListener("durationchange", onDurationChange);
};
}, [videoUrl]);
const handleSeek = useCallback((time: number) => {
if (videoRef.current) {
videoRef.current.currentTime = time;
}
}, []);
if (!expanded) return null;
if (loading) {
@@ -117,22 +403,64 @@ export default function VideoPlayer({
</Text>
)}
</Flex>
<Flex
align="center"
justify="center"
borderRadius="full"
p={1}
cursor="pointer"
onClick={onClose}
_hover={{ bg: "whiteAlpha.300" }}
transition="background 0.15s"
>
<LuX size={14} color="white" />
<Flex align="center" gap={3}>
{rawVtt && (
<Flex align="center" gap={1}>
<Text fontSize="2xs" color="gray.400">
CC sync
</Text>
<Flex
align="center"
justify="center"
borderRadius="sm"
p={0.5}
cursor="pointer"
onClick={() => adjustOffset(-0.5)}
_hover={{ bg: "whiteAlpha.300" }}
transition="background 0.15s"
>
<LuMinus size={12} color="white" />
</Flex>
<Text
fontSize="2xs"
color="gray.300"
fontFamily="mono"
minW="3.5em"
textAlign="center"
>
{formattedOffset}
</Text>
<Flex
align="center"
justify="center"
borderRadius="sm"
p={0.5}
cursor="pointer"
onClick={() => adjustOffset(0.5)}
_hover={{ bg: "whiteAlpha.300" }}
transition="background 0.15s"
>
<LuPlus size={12} color="white" />
</Flex>
</Flex>
)}
<Flex
align="center"
justify="center"
borderRadius="full"
p={1}
cursor="pointer"
onClick={onClose}
_hover={{ bg: "whiteAlpha.300" }}
transition="background 0.15s"
>
<LuX size={14} color="white" />
</Flex>
</Flex>
</Flex>
{/* Video element with visible controls */}
{/* eslint-disable-next-line jsx-a11y/media-has-caption */}
<video
ref={videoRef}
src={videoUrl}
controls
autoPlay
@@ -147,10 +475,34 @@ export default function VideoPlayer({
minHeight: "180px",
objectFit: "contain",
background: "black",
borderBottomLeftRadius: "0.375rem",
borderBottomRightRadius: "0.375rem",
...(segments.length === 0
? {
borderBottomLeftRadius: "0.375rem",
borderBottomRightRadius: "0.375rem",
}
: {}),
}}
/>
>
{captionsUrl && (
<track
kind="captions"
src={captionsUrl}
srcLang={sourceLanguage || "en"}
label="Auto-generated captions"
default
/>
)}
</video>
{segments.length > 0 && videoDuration > 0 && (
<SpeakerProgressBar
segments={segments}
videoDuration={videoDuration}
currentTime={currentTime}
captionOffset={captionOffset}
onSeek={handleSeek}
participants={participants}
/>
)}
</Box>
);
}