reflector/server/reflector/processors/audio_diarization.py
Mathieu Virbel 3ea7f6b7b6 feat: pipeline improvement with file processing, parakeet, silero-vad (#540)
* feat: improve pipeline threading, and transcriber (parakeet and silero vad)

* refactor: remove whisperx, implement parakeet

* refactor: make audio_chunker smarter and wait for speech instead of a fixed frame

* refactor: make audio merge always downsample the audio to 16k for transcription

* refactor: make the audio transcript modal accept batches

* refactor: improve type safety and remove prometheus metrics

- Add DiarizationSegment TypedDict for proper diarization typing
- Replace List/Optional with modern Python list/| None syntax
- Remove all Prometheus metrics from TranscriptDiarizationAssemblerProcessor
- Add comprehensive file processing pipeline with parallel execution
- Update processor imports and type annotations throughout
- Implement optimized file pipeline as default in process.py tool
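
The `DiarizationSegment` TypedDict mentioned above is what `audio_diarization.py` imports; judging from how segments are indexed there (`d["start"]`, `d["end"]`, `d["speaker"]`), it presumably looks roughly like this (the exact field types are an assumption):

```python
from typing import TypedDict

class DiarizationSegment(TypedDict):
    # field names match how segments are indexed in audio_diarization.py:
    # d["start"], d["end"], d["speaker"]
    start: float
    end: float
    speaker: int

segment: DiarizationSegment = {"start": 0.0, "end": 1.5, "speaker": 0}
```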

* refactor: convert FileDiarizationProcessor I/O types to BaseModel

Update FileDiarizationInput and FileDiarizationOutput to inherit from
BaseModel instead of plain classes, following the standard pattern
used by other processors in the codebase.

* test: add tests for file transcript and diarization with pytest-recording

* build: add pytest-recording

* feat: add local pyannote for testing

* fix: replace PyAV AudioResampler with torchaudio for reliable audio processing

- Replace problematic PyAV AudioResampler that was causing ValueError: [Errno 22] Invalid argument
- Use torchaudio.functional.resample for robust sample rate conversion
- Optimize processing: skip conversion for already 16kHz mono audio
- Add direct WAV writing with Python wave module for better performance
- Consolidate duplicate downsample checks for cleaner code
- Maintain list[av.AudioFrame] input interface
- Required for Silero VAD which needs 16kHz mono audio
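
A rough sketch of the decision described above (the torchaudio call appears only in a comment, and the helper names here are illustrative, not the actual processor code):

```python
import wave

TARGET_RATE = 16000  # Silero VAD requires 16 kHz mono audio

def needs_conversion(sample_rate: int, channels: int) -> bool:
    # skip conversion entirely when the audio is already 16 kHz mono
    return sample_rate != TARGET_RATE or channels != 1

# The actual conversion uses torchaudio, roughly:
#   waveform = torchaudio.functional.resample(waveform, orig_freq=rate, new_freq=16000)
# and the result is written directly with the stdlib wave module:
def write_wav_16k_mono(path: str, pcm16: bytes) -> None:
    with wave.open(path, "wb") as f:
        f.setnchannels(1)   # mono
        f.setsampwidth(2)   # 16-bit PCM
        f.setframerate(TARGET_RATE)
        f.writeframes(pcm16)
```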

* fix: replace PyAV AudioResampler with torchaudio solution

- Resolves ValueError: [Errno 22] Invalid argument in AudioMergeProcessor
- Replaces problematic PyAV AudioResampler with torchaudio.functional.resample
- Optimizes processing to skip unnecessary conversions when audio is already 16kHz mono
- Uses direct WAV writing with Python's wave module for better performance
- Fixes test_basic_process to disable diarization (pyannote dependency not installed)
- Updates test expectations to match actual processor behavior
- Removes unused pydub dependency from pyproject.toml
- Adds comprehensive TEST_ANALYSIS.md documenting test suite status

* feat: add parameterized test for both diarization modes

- Adds @pytest.mark.parametrize to test_basic_process with enable_diarization=[False, True]
- Test with diarization=False always passes (tests core AudioMergeProcessor functionality)
- Test with diarization=True gracefully skips when pyannote.audio is not installed
- Provides comprehensive test coverage for both pipeline configurations
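
The parameterization described above presumably looks something like this sketch (the test body is a placeholder, not the real pipeline assertions):

```python
import pytest

@pytest.mark.parametrize("enable_diarization", [False, True])
def test_basic_process(enable_diarization):
    if enable_diarization:
        # skip gracefully when pyannote.audio is not installed
        pytest.importorskip("pyannote.audio")
    # placeholder for the real pipeline run and output assertions
    assert enable_diarization in (False, True)
```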

* fix: resolve pipeline property naming conflict in AudioDiarizationPyannoteProcessor

- Renames 'pipeline' property to 'diarization_pipeline' to avoid conflict with base Processor.pipeline attribute
- Fixes AttributeError: 'property 'pipeline' object has no setter' when set_pipeline() is called
- Updates property usage in _diarize method to use new name
- Now correctly supports pipeline initialization for diarization processing
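
A minimal sketch of the conflict (class names here are illustrative, not the real reflector classes): a read-only `pipeline` property on the subclass shadows the attribute the base class writes, so the attribute assignment fails; renaming the property sidesteps the shadowing.

```python
class Base:
    """Stands in for the base Processor, which stores the pipeline as an attribute."""

    def set_pipeline(self, pipeline):
        self.pipeline = pipeline  # plain attribute write

class Broken(Base):
    @property
    def pipeline(self):  # read-only property shadows the attribute...
        return None      # ...so set_pipeline() raises AttributeError

class Fixed(Base):
    @property
    def diarization_pipeline(self):  # renamed: no shadowing
        return getattr(self, "pipeline", None)
```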

* fix: add local for pyannote

* test: add diarization test

* fix: resample on audio merge now works

* fix: correctly restore timestamp

* fix: display exception in a threaded processor if one happens

* Update pyproject.toml

* ci: remove option

* ci: update astral-sh/setup-uv

* test: add monadical url for pytest-recording

* refactor: remove previous version

* build: move faster whisper to local dep

* test: fix missing import

* refactor: improve main_file_pipeline organization and error handling

- Move all imports to the top of the file
- Create unified EmptyPipeline class to replace duplicate mock pipeline code
- Remove timeout and fallback logic - let processors handle their own retries
- Fix error handling to raise any exception from parallel tasks
- Add proper type hints and validation for captured results

* fix: wrong function

* fix: remove task_done

* feat: add configurable file processing timeouts for modal processors

- Add TRANSCRIPT_FILE_TIMEOUT setting (default: 600s) for file transcription
- Add DIARIZATION_FILE_TIMEOUT setting (default: 600s) for file diarization
- Replace hardcoded timeout=600 with configurable settings in modal processors
- Allows customization of timeout values via environment variables
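
The wiring presumably looks something like this sketch (the real settings live in the project's settings module; only the setting names and the 600s default come from the commit message):

```python
import os

# Configurable file-processing timeouts, overridable via environment
# variables, replacing the previously hardcoded timeout=600.
TRANSCRIPT_FILE_TIMEOUT = int(os.environ.get("TRANSCRIPT_FILE_TIMEOUT", "600"))
DIARIZATION_FILE_TIMEOUT = int(os.environ.get("DIARIZATION_FILE_TIMEOUT", "600"))
```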

* fix: use logger

* fix: worker process meetings now use file pipeline

* fix: topic not gathered

* refactor: remove prepare(), pipeline now works

* refactor: address many review comments from Igor

* test: add test for test_pipeline_main_file

* refactor: remove doc

* doc: add doc

* ci: update build to use native arm64 builder

* fix: merge fixes

* refactor: changes from Igor's review + add test (not run by default) for the GPU modal part

* ci: update to our own runner linux-amd64

* ci: try using suggested mode=min

* fix: update diarizer for latest modal, and use volume

* fix: modal file extension detection

* fix: put the diarizer as A100
2025-08-20 20:07:19 -06:00

194 lines
7.0 KiB
Python

from reflector.processors.base import Processor
from reflector.processors.types import (
    AudioDiarizationInput,
    DiarizationSegment,
    TitleSummary,
    Word,
)


class AudioDiarizationProcessor(Processor):
    INPUT_TYPE = AudioDiarizationInput
    OUTPUT_TYPE = TitleSummary

    async def _push(self, data: AudioDiarizationInput):
        try:
            self.logger.info("Diarization started", audio_file_url=data.audio_url)
            diarization = await self._diarize(data)
            self.logger.info("Diarization finished")
        except Exception:
            self.logger.exception("Diarization failed after retrying")
            raise

        # Reapply speakers to the topics (if any).
        # topics is a list[BaseModel] with an attribute words;
        # words is a list[BaseModel] with text, start and speaker attributes.
        # Create a view of the words across all topics; the algorithm
        # indexes into words, so we need a list, not a generator.
        words = list(self.iter_words_from_topics(data.topics))
        # assign a speaker to each word (mutates the words list)
        self.assign_speaker(words, diarization)
        # emit the updated topics
        for topic in data.topics:
            await self.emit(topic)

    async def _diarize(self, data: AudioDiarizationInput):
        raise NotImplementedError

    @classmethod
    def assign_speaker(cls, words: list[Word], diarization: list[DiarizationSegment]):
        cls._diarization_remove_overlap(diarization)
        cls._diarization_remove_segment_without_words(words, diarization)
        cls._diarization_merge_same_speaker(diarization)
        cls._diarization_assign_speaker(words, diarization)

    @staticmethod
    def iter_words_from_topics(topics: list[TitleSummary]):
        for topic in topics:
            for word in topic.transcript.words:
                yield word

    @staticmethod
    def is_word_continuation(word_prev, word):
        """
        Return True if the word is a continuation of the previous word,
        i.e. the previous word does not end with sentence-final punctuation
        and the current word does not start with a capital letter.
        """
        # does word_prev end with sentence-final punctuation?
        if word_prev.text and word_prev.text[-1] in ".?!":
            return False
        # does the current word start with a capital letter?
        if word.text and word.text[0].isupper():
            return False
        return True

    @staticmethod
    def _diarization_remove_overlap(diarization: list[DiarizationSegment]):
        """
        Remove overlaps in diarization results.

        Diarization algorithms can produce overlapping segments; this
        function resolves each overlap by keeping the longest segment.

        Warning: this function mutates the diarization list.
        """
        diarization_idx = 0
        while diarization_idx < len(diarization) - 1:
            d = diarization[diarization_idx]
            dnext = diarization[diarization_idx + 1]
            if d["end"] > dnext["start"]:
                # drop the shorter of the two overlapping segments
                if d["end"] - d["start"] > dnext["end"] - dnext["start"]:
                    diarization.pop(diarization_idx + 1)
                else:
                    diarization.pop(diarization_idx)
            else:
                diarization_idx += 1

    @staticmethod
    def _diarization_remove_segment_without_words(
        words: list[Word], diarization: list[DiarizationSegment]
    ):
        """
        Remove diarization segments that contain no words.

        Warning: this function mutates the diarization list.
        """
        # count the number of words in each diarization segment
        diarization_count = []
        for d in diarization:
            start = d["start"]
            end = d["end"]
            count = 0
            for word in words:
                if start <= word.start < end:
                    count += 1
                elif start < word.end <= end:
                    count += 1
            diarization_count.append(count)

        # remove segments with no words
        diarization_idx = 0
        while diarization_idx < len(diarization):
            if diarization_count[diarization_idx] == 0:
                diarization.pop(diarization_idx)
                diarization_count.pop(diarization_idx)
            else:
                diarization_idx += 1

    @staticmethod
    def _diarization_merge_same_speaker(diarization: list[DiarizationSegment]):
        """
        Merge contiguous diarization segments that share the same speaker.

        Warning: this function mutates the diarization list.
        """
        diarization_idx = 0
        while diarization_idx < len(diarization) - 1:
            d = diarization[diarization_idx]
            dnext = diarization[diarization_idx + 1]
            if d["speaker"] == dnext["speaker"]:
                d["end"] = dnext["end"]
                diarization.pop(diarization_idx + 1)
            else:
                diarization_idx += 1

    @classmethod
    def _diarization_assign_speaker(
        cls, words: list[Word], diarization: list[DiarizationSegment]
    ):
        """
        Assign a speaker to each word based on the diarization segments.

        Warning: this function mutates the words list.
        """
        word_idx = 0
        last_speaker = 0
        for d in diarization:
            start = d["start"]
            end = d["end"]
            speaker = d["speaker"]
            # Diarization may start after the first set of words; words
            # before the segment start get the last or the new speaker.
            for word in words[word_idx:]:
                if word.start < start:
                    # The speaker changed, but which one makes sense for this word?
                    # If it starts a new sentence, assign the new speaker;
                    # if it continues the previous sentence, keep the last speaker.
                    is_continuation = False
                    if 0 < word_idx < len(words) - 1:
                        is_continuation = cls.is_word_continuation(
                            *words[word_idx - 1 : word_idx + 1]
                        )
                    if is_continuation:
                        word.speaker = last_speaker
                    else:
                        word.speaker = speaker
                        last_speaker = speaker
                    word_idx += 1
                else:
                    break

            # assign the segment speaker until a word starts after the segment end
            for word in words[word_idx:]:
                if start <= word.start < end:
                    last_speaker = speaker
                    word.speaker = speaker
                    word_idx += 1
                elif word.start > end:
                    break

        # no more diarization segments available:
        # assign the last speaker to all remaining words
        for word in words[word_idx:]:
            word.speaker = last_speaker
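
For illustration, here is a toy run of the core idea behind `_diarization_assign_speaker`. The `Word` stub and the simplified loop below are not the real reflector types or method; the actual implementation additionally handles overlaps, sentence continuations across speaker changes, and trailing words.

```python
from dataclasses import dataclass

@dataclass
class Word:  # stub with the fields the processor relies on
    text: str
    start: float
    end: float
    speaker: int = 0

diarization = [
    {"start": 0.0, "end": 2.0, "speaker": 0},
    {"start": 2.0, "end": 4.0, "speaker": 1},
]
words = [Word("Hello", 0.1, 0.4), Word("there.", 0.5, 0.9), Word("Hi", 2.1, 2.4)]

# simplified core: a word takes the speaker of the segment containing its start
for w in words:
    for d in diarization:
        if d["start"] <= w.start < d["end"]:
            w.speaker = d["speaker"]
            break
```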