feat: add auto-generated captions, speaker-colored progress bar with sync controls, and speaker tooltip to cloud video player (#926)

* feat: webvtt captions inside video with sync controls

* feat: highlight speaker timestamp progress bar
This commit is contained in:
Juan Diego García
2026-03-25 12:07:08 -05:00
committed by GitHub
parent e2ba502697
commit f19113a3cf
2 changed files with 371 additions and 17 deletions

View File

@@ -228,6 +228,8 @@ export default function TranscriptDetails(details: TranscriptDetails) {
duration={transcript.data?.cloud_video_duration ?? null}
expanded={videoExpanded}
onClose={() => setVideoExpanded(false)}
sourceLanguage={transcript.data?.source_language ?? null}
participants={transcript.data?.participants ?? null}
/>
</GridItem>
)}

View File

@@ -1,14 +1,19 @@
import { useEffect, useState } from "react";
import { useCallback, useEffect, useMemo, useRef, useState } from "react";
import { Box, Flex, Skeleton, Text } from "@chakra-ui/react";
import { LuVideo, LuX } from "react-icons/lu";
import { LuMinus, LuPlus, LuVideo, LuX } from "react-icons/lu";
import { useAuth } from "../../lib/AuthProvider";
import { API_URL } from "../../lib/apiClient";
import { generateHighContrastColor } from "../../lib/utils";
type SpeakerInfo = { speaker: number | null; name: string };
type VideoPlayerProps = {
transcriptId: string;
duration: number | null;
expanded: boolean;
onClose: () => void;
sourceLanguage?: string | null;
participants?: SpeakerInfo[] | null;
};
function formatDuration(seconds: number): string {
@@ -20,15 +25,203 @@ function formatDuration(seconds: number): string {
return `${m}:${String(s).padStart(2, "0")}`;
}
// Matches one WEBVTT cue timing line: "HH:MM:SS.mmm --> HH:MM:SS.mmm".
const VTT_TIMESTAMP_RE =
  /(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})/g;

/**
 * Converts a WEBVTT "HH:MM:SS.mmm" timestamp into a number of seconds
 * (fractional part carries the milliseconds).
 */
function parseVttTimestamp(ts: string): number {
  const [hours, minutes, secondsField] = ts.split(":");
  const [wholeSeconds, millis] = secondsField.split(".");
  const seconds =
    Number(hours) * 3600 + Number(minutes) * 60 + Number(wholeSeconds);
  return seconds + Number(millis) / 1000;
}
/**
 * Formats a duration in seconds as a WEBVTT "HH:MM:SS.mmm" timestamp.
 *
 * Works from a rounded total-millisecond count so rounding can never yield
 * an invalid ".1000" milliseconds field (the previous per-field rounding
 * did for inputs like 1.9999, producing "00:00:01.1000" and corrupting the
 * shifted caption file).
 */
function formatVttTimestamp(totalSeconds: number): string {
  // Negative results from a caption offset are clamped to the track start.
  const totalMs = Math.round(Math.max(0, totalSeconds) * 1000);
  const h = Math.floor(totalMs / 3_600_000);
  const m = Math.floor((totalMs % 3_600_000) / 60_000);
  const s = Math.floor((totalMs % 60_000) / 1000);
  const ms = totalMs % 1000;
  return `${String(h).padStart(2, "0")}:${String(m).padStart(2, "0")}:${String(s).padStart(2, "0")}.${String(ms).padStart(3, "0")}`;
}
/**
 * Returns a copy of the WEBVTT content with every cue timing shifted by
 * `offsetSeconds`. Negative offsets move captions earlier; resulting
 * timestamps are clamped at zero by formatVttTimestamp.
 */
function shiftVttTimestamps(vttContent: string, offsetSeconds: number): string {
  // No offset: hand back the original text untouched.
  if (offsetSeconds === 0) return vttContent;
  const shift = (ts: string): string =>
    formatVttTimestamp(parseVttTimestamp(ts) + offsetSeconds);
  return vttContent.replace(
    VTT_TIMESTAMP_RE,
    (_match, start: string, end: string) => `${shift(start)} --> ${shift(end)}`,
  );
}
// One speaker turn extracted from a WEBVTT voice cue, in seconds.
type VttSegment = { start: number; end: number; speaker: string };

// Cue timing line immediately followed by a voice tag, e.g.
//   00:00:01.000 --> 00:00:04.000
//   <v Alice>
// `\r?\n` accepts both LF and CRLF line endings; the previous bare `\n`
// matched nothing in CRLF-delimited VTT payloads, silently hiding the
// speaker progress bar.
const VTT_CUE_RE =
  /(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})\r?\n<v ([^>]+)>/g;

/**
 * Extracts per-speaker segments from WEBVTT content containing <v> voice
 * spans. Cues without a voice tag are ignored.
 */
function parseVttSegments(vttContent: string): VttSegment[] {
  const segments: VttSegment[] = [];
  // VTT_CUE_RE is a shared module-level global regex; reset its cursor in
  // case a previous scan was aborted mid-way.
  VTT_CUE_RE.lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = VTT_CUE_RE.exec(vttContent)) !== null) {
    segments.push({
      start: parseVttTimestamp(match[1]),
      end: parseVttTimestamp(match[2]),
      speaker: match[3],
    });
  }
  return segments;
}
// Same background as TopicSegment so speaker colors match the transcript UI
const SPEAKER_COLOR_BG: [number, number, number] = [96, 165, 250];

/**
 * Clickable progress bar rendered under the video: one colored span per
 * speaker cue, a white playhead line, and a speaker-name tooltip that
 * tracks the current playback position.
 *
 * All segment times are shifted by `captionOffset` so the bar stays in
 * sync with the user-adjusted captions.
 */
function SpeakerProgressBar({
  segments,
  videoDuration,
  currentTime,
  captionOffset,
  onSeek,
  participants,
}: {
  segments: VttSegment[];
  videoDuration: number;
  currentTime: number;
  captionOffset: number;
  onSeek: (time: number) => void;
  participants?: SpeakerInfo[] | null;
}) {
  const barRef = useRef<HTMLDivElement>(null);

  // Build a name→"Speaker N" reverse lookup so colors match TopicSegment
  const speakerColors = useMemo(() => {
    const nameToColorKey: Record<string, string> = {};
    if (participants) {
      for (const p of participants) {
        if (p.speaker != null) {
          nameToColorKey[p.name] = `Speaker ${p.speaker}`;
        }
      }
    }
    // One stable color per speaker name seen in the segments; names with no
    // "Speaker N" mapping are hashed by the display name itself.
    const map: Record<string, string | undefined> = {};
    for (const seg of segments) {
      if (!map[seg.speaker]) {
        const colorKey = nameToColorKey[seg.speaker] ?? seg.speaker;
        map[seg.speaker] = generateHighContrastColor(
          colorKey,
          SPEAKER_COLOR_BG,
        );
      }
    }
    return map;
  }, [segments, participants]);

  // Speaker whose offset-adjusted cue contains the playhead, or null when
  // playback sits between cues (first matching segment wins).
  const activeSpeaker = useMemo(() => {
    for (const seg of segments) {
      const adjStart = seg.start + captionOffset;
      const adjEnd = seg.end + captionOffset;
      if (currentTime >= adjStart && currentTime < adjEnd) {
        return seg.speaker;
      }
    }
    return null;
  }, [segments, currentTime, captionOffset]);

  // Seek proportionally to where the user clicked along the bar's width.
  const handleClick = (e: React.MouseEvent<HTMLDivElement>) => {
    if (!barRef.current || !videoDuration) return;
    const rect = barRef.current.getBoundingClientRect();
    const fraction = Math.max(
      0,
      Math.min(1, (e.clientX - rect.left) / rect.width),
    );
    onSeek(fraction * videoDuration);
  };

  const progressPct =
    videoDuration > 0 ? (currentTime / videoDuration) * 100 : 0;

  return (
    <Box position="relative" mb={4}>
      <Box
        ref={barRef}
        position="relative"
        h="8px"
        bg="gray.700"
        cursor="pointer"
        onClick={handleClick}
        borderBottomRadius="md"
        overflow="hidden"
      >
        {/* One absolutely-positioned span per cue; spans fully outside the
            video's duration (after offset adjustment) are dropped. */}
        {segments.map((seg, i) => {
          const adjStart = Math.max(0, seg.start + captionOffset);
          const adjEnd = Math.max(0, seg.end + captionOffset);
          if (adjEnd <= 0 || adjStart >= videoDuration) return null;
          const leftPct = (adjStart / videoDuration) * 100;
          const widthPct = ((adjEnd - adjStart) / videoDuration) * 100;
          return (
            <Box
              key={i}
              position="absolute"
              top={0}
              bottom={0}
              left={`${leftPct}%`}
              width={`${widthPct}%`}
              bg={speakerColors[seg.speaker]}
            />
          );
        })}
        {/* Playhead */}
        <Box
          position="absolute"
          top={0}
          bottom={0}
          left={`${progressPct}%`}
          w="2px"
          bg="white"
          zIndex={1}
          pointerEvents="none"
        />
      </Box>
      {/* Speaker tooltip below the bar */}
      {activeSpeaker && (
        <Text
          position="absolute"
          top="10px"
          left={`${progressPct}%`}
          transform="translateX(-50%)"
          fontSize="2xs"
          color={speakerColors[activeSpeaker]}
          fontWeight="semibold"
          whiteSpace="nowrap"
          pointerEvents="none"
        >
          {activeSpeaker}
        </Text>
      )}
    </Box>
  );
}
export default function VideoPlayer({
transcriptId,
duration,
expanded,
onClose,
sourceLanguage,
participants,
}: VideoPlayerProps) {
const [videoUrl, setVideoUrl] = useState<string | null>(null);
const [rawVtt, setRawVtt] = useState<string | null>(null);
const [captionsUrl, setCaptionsUrl] = useState<string | null>(null);
const [captionOffset, setCaptionOffset] = useState(0);
const [currentTime, setCurrentTime] = useState(0);
const [videoDuration, setVideoDuration] = useState(0);
const [loading, setLoading] = useState(false);
const [error, setError] = useState<string | null>(null);
const prevBlobUrl = useRef<string | null>(null);
const videoRef = useRef<HTMLVideoElement>(null);
const auth = useAuth();
const accessToken = auth.status === "authenticated" ? auth.accessToken : null;
@@ -63,6 +256,99 @@ export default function VideoPlayer({
fetchVideoUrl();
}, [expanded, transcriptId, accessToken, videoUrl]);
useEffect(() => {
if (!videoUrl || !transcriptId) return;
let cancelled = false;
const fetchCaptions = async () => {
try {
const url = `${API_URL}/v1/transcripts/${transcriptId}?transcript_format=webvtt-named`;
const headers: Record<string, string> = {};
if (accessToken) {
headers["Authorization"] = `Bearer ${accessToken}`;
}
const resp = await fetch(url, { headers });
if (!resp.ok) return;
const data = await resp.json();
const vttContent = data?.transcript;
if (!vttContent || cancelled) return;
setRawVtt(vttContent);
} catch {
// Captions are non-critical — fail silently
}
};
fetchCaptions();
return () => {
cancelled = true;
};
}, [videoUrl, transcriptId, accessToken]);
// Rebuild blob URL whenever rawVtt or captionOffset changes
useEffect(() => {
if (!rawVtt) return;
const shifted = shiftVttTimestamps(rawVtt, captionOffset);
const blob = new Blob([shifted], { type: "text/vtt" });
const blobUrl = URL.createObjectURL(blob);
if (prevBlobUrl.current) {
URL.revokeObjectURL(prevBlobUrl.current);
}
prevBlobUrl.current = blobUrl;
setCaptionsUrl(blobUrl);
return () => {
URL.revokeObjectURL(blobUrl);
prevBlobUrl.current = null;
};
}, [rawVtt, captionOffset]);
const adjustOffset = useCallback((delta: number) => {
setCaptionOffset((prev) => Math.round((prev + delta) * 10) / 10);
}, []);
const formattedOffset = useMemo(() => {
const sign = captionOffset >= 0 ? "+" : "";
return `${sign}${captionOffset.toFixed(1)}s`;
}, [captionOffset]);
const segments = useMemo(
() => (rawVtt ? parseVttSegments(rawVtt) : []),
[rawVtt],
);
// Track video currentTime and duration
useEffect(() => {
const video = videoRef.current;
if (!video) return;
const onTimeUpdate = () => setCurrentTime(video.currentTime);
const onDurationChange = () => {
if (video.duration && isFinite(video.duration)) {
setVideoDuration(video.duration);
}
};
video.addEventListener("timeupdate", onTimeUpdate);
video.addEventListener("loadedmetadata", onDurationChange);
video.addEventListener("durationchange", onDurationChange);
return () => {
video.removeEventListener("timeupdate", onTimeUpdate);
video.removeEventListener("loadedmetadata", onDurationChange);
video.removeEventListener("durationchange", onDurationChange);
};
}, [videoUrl]);
const handleSeek = useCallback((time: number) => {
if (videoRef.current) {
videoRef.current.currentTime = time;
}
}, []);
if (!expanded) return null;
if (loading) {
@@ -117,22 +403,64 @@ export default function VideoPlayer({
</Text>
)}
</Flex>
<Flex
align="center"
justify="center"
borderRadius="full"
p={1}
cursor="pointer"
onClick={onClose}
_hover={{ bg: "whiteAlpha.300" }}
transition="background 0.15s"
>
<LuX size={14} color="white" />
<Flex align="center" gap={3}>
{rawVtt && (
<Flex align="center" gap={1}>
<Text fontSize="2xs" color="gray.400">
CC sync
</Text>
<Flex
align="center"
justify="center"
borderRadius="sm"
p={0.5}
cursor="pointer"
onClick={() => adjustOffset(-0.5)}
_hover={{ bg: "whiteAlpha.300" }}
transition="background 0.15s"
>
<LuMinus size={12} color="white" />
</Flex>
<Text
fontSize="2xs"
color="gray.300"
fontFamily="mono"
minW="3.5em"
textAlign="center"
>
{formattedOffset}
</Text>
<Flex
align="center"
justify="center"
borderRadius="sm"
p={0.5}
cursor="pointer"
onClick={() => adjustOffset(0.5)}
_hover={{ bg: "whiteAlpha.300" }}
transition="background 0.15s"
>
<LuPlus size={12} color="white" />
</Flex>
</Flex>
)}
<Flex
align="center"
justify="center"
borderRadius="full"
p={1}
cursor="pointer"
onClick={onClose}
_hover={{ bg: "whiteAlpha.300" }}
transition="background 0.15s"
>
<LuX size={14} color="white" />
</Flex>
</Flex>
</Flex>
{/* Video element with visible controls */}
{/* eslint-disable-next-line jsx-a11y/media-has-caption */}
<video
ref={videoRef}
src={videoUrl}
controls
autoPlay
@@ -147,10 +475,34 @@ export default function VideoPlayer({
minHeight: "180px",
objectFit: "contain",
background: "black",
borderBottomLeftRadius: "0.375rem",
borderBottomRightRadius: "0.375rem",
...(segments.length === 0
? {
borderBottomLeftRadius: "0.375rem",
borderBottomRightRadius: "0.375rem",
}
: {}),
}}
/>
>
{captionsUrl && (
<track
kind="captions"
src={captionsUrl}
srcLang={sourceLanguage || "en"}
label="Auto-generated captions"
default
/>
)}
</video>
{segments.length > 0 && videoDuration > 0 && (
<SpeakerProgressBar
segments={segments}
videoDuration={videoDuration}
currentTime={currentTime}
captionOffset={captionOffset}
onSeek={handleSeek}
participants={participants}
/>
)}
</Box>
);
}