mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2026-04-25 06:35:18 +00:00
* fix: better detect topics chunking depending on duration * fix: better subject detection + prompt improvements
71 lines
2.3 KiB
Python
71 lines
2.3 KiB
Python
"""
|
|
Shared transcript processing constants.
|
|
|
|
Used by both Hatchet workflows and Celery pipelines for consistent processing.
|
|
"""
|
|
|
|
import math
|
|
|
|
# Topic detection: legacy static chunk size, used as fallback
TOPIC_CHUNK_WORD_COUNT = 300

# Dynamic chunking curve parameters
# Formula: target_topics = _COEFFICIENT * duration_minutes ^ _EXPONENT
# The original note gives the anchors 5 min -> 3 topics, 180 min -> 40 topics.
# With the values below the curve actually evaluates to ~2.7 (rounds to 3) at
# 5 min and ~35.6 (rounds to 36) at 180 min.
# NOTE(review): confirm whether the anchors or the coefficient are authoritative.
_TOPIC_CURVE_COEFFICIENT = 0.833
_TOPIC_CURVE_EXPONENT = 0.723
# Bounds applied to the topic count before it is rounded to an integer.
_MIN_TOPICS = 2
_MAX_TOPICS = 50
# Bounds applied to the returned chunk size (in words).
_MIN_CHUNK_WORDS = 375
_MAX_CHUNK_WORDS = 1500


def compute_topic_chunk_size(duration_seconds: float, total_words: int) -> int:
    """Calculate the chunk size for topic detection from the recording duration.

    The target number of topics follows a power curve of the duration in
    minutes, so the topic count grows sublinearly with recording length. The
    target is clamped to [_MIN_TOPICS, _MAX_TOPICS] and rounded, the words are
    spread evenly over that many chunks, and the resulting size is clamped to
    [_MIN_CHUNK_WORDS, _MAX_CHUNK_WORDS].

    Args:
        duration_seconds: Recording length in seconds.
        total_words: Number of words in the transcript.

    Returns:
        The number of words per chunk. ``_MIN_CHUNK_WORDS`` is returned when
        either input is non-positive or the duration is NaN.
    """
    # `not x > 0` (rather than `x <= 0`) also rejects NaN, which would
    # otherwise pass the guard and be clamped up to _MAX_TOPICS.
    if total_words <= 0 or not duration_seconds > 0:
        return _MIN_CHUNK_WORDS

    duration_minutes = duration_seconds / 60.0
    raw_topics = _TOPIC_CURVE_COEFFICIENT * math.pow(
        duration_minutes, _TOPIC_CURVE_EXPONENT
    )
    target_topics = int(round(max(_MIN_TOPICS, min(_MAX_TOPICS, raw_topics))))

    # Ceiling division: splitting total_words into pieces of this size gives
    # at most target_topics chunks. Floor division would leave a remainder of
    # fewer than target_topics words that forms one extra, tiny chunk.
    chunk_size = -(-total_words // target_topics)
    return max(_MIN_CHUNK_WORDS, min(_MAX_CHUNK_WORDS, chunk_size))
|
|
|
|
|
|
# Subject extraction: scale max subjects with recording duration
# Short calls get fewer subjects to avoid over-analyzing trivial content
# Each entry is (upper bound in seconds, inclusive; max subjects), ascending.
_SUBJECT_DURATION_THRESHOLDS = [
    (5 * 60, 1),  # ≤ 5 min → 1 subject
    (15 * 60, 2),  # ≤ 15 min → 2 subjects
    (30 * 60, 3),  # ≤ 30 min → 3 subjects
    (45 * 60, 4),  # ≤ 45 min → 4 subjects
    (60 * 60, 5),  # ≤ 60 min → 5 subjects
]
# Used for any recording longer than the last threshold above.
_MAX_SUBJECTS = 6


def compute_max_subjects(duration_seconds: float) -> int:
    """Calculate the maximum number of subjects to extract for a recording.

    Uses a step function over the recording duration: the first threshold in
    ``_SUBJECT_DURATION_THRESHOLDS`` that the duration does not exceed decides
    the result, and anything longer than the last threshold gets
    ``_MAX_SUBJECTS``.

    Args:
        duration_seconds: Recording length in seconds.

    Returns:
        The subject limit. Non-positive and NaN durations return 1.
    """
    # `not x > 0` (rather than `x <= 0`) also rejects NaN, which would
    # otherwise fail every threshold comparison and return _MAX_SUBJECTS.
    if not duration_seconds > 0:
        return 1

    for threshold, max_subjects in _SUBJECT_DURATION_THRESHOLDS:
        if duration_seconds <= threshold:
            return max_subjects

    return _MAX_SUBJECTS
|