mirror of https://github.com/Monadical-SAS/reflector.git

add profanity filter, post-process topic/title
server/poetry.lock (generated, 27 changed lines)
@@ -1551,6 +1551,17 @@ files = [
     {file = "ifaddr-0.2.0.tar.gz", hash = "sha256:cc0cbfcaabf765d44595825fb96a99bb12c79716b73b44330ea38ee2b0c4aed4"},
 ]
 
+[[package]]
+name = "inflection"
+version = "0.5.1"
+description = "A port of Ruby on Rails inflector to Python"
+optional = false
+python-versions = ">=3.5"
+files = [
+    {file = "inflection-0.5.1-py2.py3-none-any.whl", hash = "sha256:f38b2b640938a4f35ade69ac3d053042959b62a0f1076a5bbaa1b9526605a8a2"},
+    {file = "inflection-0.5.1.tar.gz", hash = "sha256:1a29730d366e996aaacffb2f1f1cb9593dc38e2ddd30c91250c6dde09ea9b417"},
+]
+
 [[package]]
 name = "iniconfig"
 version = "2.0.0"
@@ -2097,6 +2108,20 @@ files = [
 dev = ["pre-commit", "tox"]
 testing = ["pytest", "pytest-benchmark"]
 
+[[package]]
+name = "profanityfilter"
+version = "2.0.6"
+description = "A universal Python library for detecting and/or filtering profane words."
+optional = false
+python-versions = "*"
+files = [
+    {file = "profanityfilter-2.0.6-py2.py3-none-any.whl", hash = "sha256:1706c080c2364f5bfe217b2330dc35d90e02e4afa0a00ed52d5673c410b45b64"},
+    {file = "profanityfilter-2.0.6.tar.gz", hash = "sha256:ca701e22799526696963415fc36d5e943c168f1917e3c83881ffda6bf5240a30"},
+]
+
+[package.dependencies]
+inflection = "*"
+
 [[package]]
 name = "prometheus-client"
 version = "0.17.1"
@@ -3744,4 +3769,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11"
-content-hash = "b67b094055950a6b39da80dc7ca26b2a0e1c778f174016a00185d7219a3348b5"
+content-hash = "a85cb09a0e4b68b29c4272d550e618d2e24ace5f16b707f29e8ac4ce915c1fae"
server/pyproject.toml

@@ -32,6 +32,7 @@ transformers = "^4.32.1"
 prometheus-fastapi-instrumentator = "^6.1.0"
 sentencepiece = "^0.1.99"
 protobuf = "^4.24.3"
+profanityfilter = "^2.0.6"
 
 
 [tool.poetry.group.dev.dependencies]
@@ -6,11 +6,12 @@ from typing import TypeVar
 
 import nltk
 from prometheus_client import Counter, Histogram
+from transformers import GenerationConfig
 
 from reflector.llm.llm_params import TaskParams
 from reflector.logger import logger as reflector_logger
 from reflector.settings import settings
 from reflector.utils.retry import retry
-from transformers import GenerationConfig
+
 
 T = TypeVar("T", bound="LLM")
@@ -221,6 +222,30 @@ class LLM:
 
         return title
 
+    def trim_title(self, title: str) -> str:
+        """
+        Apply manual trimming to the title.
+
+        Longer titles often start with filler phrases such as
+        "Discussion on", "Discussion about", etc. that add no
+        descriptive information, and in some cases this behaviour
+        repeats across several consecutive topics. We want to
+        handle these cases.
+        """
+        phrases_to_remove = ["Discussion on", "Discussion about"]
+        try:
+            pattern = (
+                r"\b(?:"
+                + "|".join(re.escape(phrase) for phrase in phrases_to_remove)
+                + r")\b"
+            )
+            title = re.sub(pattern, "", title, flags=re.IGNORECASE)
+        except Exception as e:
+            reflector_logger.info(
+                f"Failed to trim {title=} with exception: {str(e)}"
+            )
+        return title
+
 async def _generate(
     self, prompt: str, gen_schema: dict | None, gen_cfg: dict | None, **kwargs
 ) -> str:
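
For reference, a minimal standalone sketch of the trimming logic added above (a hypothetical free-function variant, not part of the commit):

import re

# Filler phrases stripped from generated titles, as in LLM.trim_title above.
PHRASES_TO_REMOVE = ["Discussion on", "Discussion about"]

def trim_title(title: str) -> str:
    # Builds \b(?:Discussion on|Discussion about)\b, matched case-insensitively.
    pattern = (
        r"\b(?:"
        + "|".join(re.escape(phrase) for phrase in PHRASES_TO_REMOVE)
        + r")\b"
    )
    return re.sub(pattern, "", title, flags=re.IGNORECASE)

# The match is removed but surrounding whitespace is kept, so a leading
# space can survive:
print(trim_title("Discussion on quarterly budget"))  # " quarterly budget"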
server/reflector/llm/llm_params_cod.py (new file, 150 lines)
@@ -0,0 +1,150 @@
+from typing import Optional, TypeVar
+
+from pydantic import BaseModel
+from transformers import GenerationConfig
+
+
+class TaskParams(BaseModel, arbitrary_types_allowed=True):
+    instruct: str
+    gen_cfg: Optional[GenerationConfig] = None
+    gen_schema: Optional[dict] = None
+
+
+T = TypeVar("T", bound="LLMTaskParams")
+
+
+class LLMTaskParams:
+    _registry = {}
+
+    @classmethod
+    def register(cls, task, klass) -> None:
+        cls._registry[task] = klass
+
+    @classmethod
+    def get_instance(cls, task: str) -> T:
+        return cls._registry[task]()
+
+    @property
+    def task_params(self) -> TaskParams | None:
+        """
+        Fetch the task related parameters
+        """
+        return self._get_task_params()
+
+    def _get_task_params(self) -> None:
+        pass
+
+
+class FinalLongSummaryParams(LLMTaskParams):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._gen_cfg = GenerationConfig(
+            max_new_tokens=800, num_beams=3, do_sample=True, temperature=0.3
+        )
+        self._instruct = """
+        Take the key ideas and takeaways from the text and create a short
+        summary. Be sure to keep the length of the response to a minimum.
+        Do not include trivial information in the summary.
+        """
+        self._schema = {
+            "type": "object",
+            "properties": {"long_summary": {"type": "string"}},
+        }
+        self._task_params = TaskParams(
+            instruct=self._instruct, gen_schema=self._schema, gen_cfg=self._gen_cfg
+        )
+
+    def _get_task_params(self) -> TaskParams:
+        """
+        Return the parameters associated with a specific LLM task
+        """
+        return self._task_params
+
+
+class FinalShortSummaryParams(LLMTaskParams):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._gen_cfg = GenerationConfig(
+            max_new_tokens=1300, num_beams=3, do_sample=True, temperature=0.3
+        )
+        self._instruct = """
+        Take the key ideas and takeaways from the text and create a short
+        summary. Be sure to keep the length of the response to a minimum.
+        Do not include trivial information in the summary.
+        """
+        self._schema = {
+            "type": "object",
+            "properties": {"short_summary": {"type": "string"}},
+        }
+        self._task_params = TaskParams(
+            instruct=self._instruct, gen_schema=self._schema, gen_cfg=self._gen_cfg
+        )
+
+    def _get_task_params(self) -> TaskParams:
+        """
+        Return the parameters associated with a specific LLM task
+        """
+        return self._task_params
+
+
+class FinalTitleParams(LLMTaskParams):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._gen_cfg = GenerationConfig(
+            max_new_tokens=200, num_beams=5, do_sample=True, temperature=0.5
+        )
+        self._instruct = """
+        Combine the following individual titles into one single short title that
+        condenses the essence of all titles.
+        """
+        self._schema = {
+            "type": "object",
+            "properties": {"title": {"type": "string"}},
+        }
+        self._task_params = TaskParams(
+            instruct=self._instruct, gen_schema=self._schema, gen_cfg=self._gen_cfg
+        )
+
+    def _get_task_params(self) -> TaskParams:
+        """
+        Return the parameters associated with a specific LLM task
+        """
+        return self._task_params
+
+
+class TopicParams(LLMTaskParams):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self._gen_cfg = GenerationConfig(
+            max_new_tokens=550, num_beams=6, do_sample=True, temperature=0.9
+        )
+        self._instruct = """
+        Create a JSON object as response. The JSON object must have 2 fields:
+        i) title and ii) summary.
+        For the title field, generate a very detailed and self-explanatory
+        title for the given text. Let the title be as descriptive as possible.
+        For the summary field, summarize the given text in a maximum of
+        three sentences.
+        """
+        self._schema = {
+            "type": "object",
+            "properties": {
+                "title": {"type": "string"},
+                "summary": {"type": "string"},
+            },
+        }
+        self._task_params = TaskParams(
+            instruct=self._instruct, gen_schema=self._schema, gen_cfg=self._gen_cfg
+        )
+
+    def _get_task_params(self) -> TaskParams:
+        """
+        Return the parameters associated with a specific LLM task
+        """
+        return self._task_params
+
+
+LLMTaskParams.register("topic", TopicParams)
+LLMTaskParams.register("final_title", FinalTitleParams)
+LLMTaskParams.register("final_short_summary", FinalShortSummaryParams)
+LLMTaskParams.register("final_long_summary", FinalLongSummaryParams)
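
A hypothetical usage sketch of the registry above (not part of the commit): task names map to parameter classes, and get_instance plus the task_params property yield a fully populated TaskParams:

# Assumes the new module is importable at this path.
from reflector.llm.llm_params_cod import LLMTaskParams

params = LLMTaskParams.get_instance("topic").task_params
print(params.instruct)            # the topic prompt text
print(params.gen_schema)          # {"type": "object", "properties": {...}}
print(params.gen_cfg.num_beams)   # 6, per TopicParams above

Because register stores classes rather than instances, each get_instance call builds a fresh GenerationConfig, so callers can mutate the returned parameters without affecting other tasks.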
server/reflector/processors/audio_transcript.py

@@ -1,4 +1,6 @@
+from profanityfilter import ProfanityFilter
 from prometheus_client import Counter, Histogram
+
 from reflector.processors.base import Processor
 from reflector.processors.types import AudioFile, Transcript
 
@@ -38,6 +40,8 @@ class AudioTranscriptProcessor(Processor):
         self.m_transcript_call = self.m_transcript_call.labels(name)
         self.m_transcript_success = self.m_transcript_success.labels(name)
         self.m_transcript_failure = self.m_transcript_failure.labels(name)
+        self.profanity_filter = ProfanityFilter()
+        self.profanity_filter.set_censor("|*|")
         super().__init__(*args, **kwargs)
 
     async def _push(self, data: AudioFile):
@@ -56,3 +60,11 @@ class AudioTranscriptProcessor(Processor):
 
     async def _transcript(self, data: AudioFile):
         raise NotImplementedError
+
+    def filter_profanity(self, text: str) -> str:
+        """
+        Remove censored words from the transcript
+        """
+        text = self.profanity_filter.censor(text)
+        text = text.replace("|*|", "")
+        return text
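
The censor-then-delete step above removes profane words outright instead of masking them: each match is first replaced with the "|*|" sentinel, then every sentinel occurrence is stripped. A minimal sketch of the same idea (assuming the profanityfilter package; depending on the library version the sentinel may be repeated once per character of the censored word, which the replace handles either way):

from profanityfilter import ProfanityFilter

pf = ProfanityFilter()
pf.set_censor("|*|")  # sentinel unlikely to appear in real transcripts

def filter_profanity(text: str) -> str:
    # censor() masks each profane word with the sentinel; deleting every
    # sentinel occurrence then drops the word entirely.
    return pf.censor(text).replace("|*|", "")

The deleted word's surrounding spaces are left in place, so filtered text can contain doubled spaces.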
@@ -15,6 +15,7 @@ API will be a POST request to TRANSCRIPT_URL:
 from time import monotonic
 
 import httpx
 
+from reflector.processors.audio_transcript import AudioTranscriptProcessor
 from reflector.processors.audio_transcript_auto import AudioTranscriptAutoProcessor
 from reflector.processors.types import AudioFile, Transcript, TranslationLanguages, Word
@@ -86,7 +87,7 @@ class AudioTranscriptModalProcessor(AudioTranscriptProcessor):
         if source_language != target_language and target_language in result["text"]:
             translation = result["text"][target_language]
         text = result["text"][source_language]
+        text = self.filter_profanity(text)
         transcript = Transcript(
             text=text,
             translation=translation,
@@ -60,6 +60,8 @@ class TranscriptFinalTitleProcessor(Processor):
 
         accumulated_titles = ".".join([chunk.title for chunk in self.chunks])
         title_result = await self.get_title(accumulated_titles)
+        final_title = self.llm.ensure_casing(title_result["title"])
+        final_title = self.llm.trim_title(final_title)
 
-        final_title = FinalTitle(title=title_result["title"])
+        final_title = FinalTitle(title=final_title)
         await self.emit(final_title)
@@ -55,8 +55,11 @@ class TranscriptTopicDetectorProcessor(Processor):
         self.logger.info(f"Topic detector got {len(text)} length transcript")
         topic_result = await self.get_topic(text=text)
 
+        title = self.llm.ensure_casing(topic_result["title"])
+        title = self.llm.trim_title(title)
+
         summary = TitleSummary(
-            title=self.llm.ensure_casing(topic_result["title"]),
+            title=title,
             summary=topic_result["summary"],
             timestamp=self.transcript.timestamp,
             duration=self.transcript.duration,
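
Both processors now share the same title post-processing chain: ensure_casing (an existing LLM method, not shown in this diff) followed by trim_title. A hypothetical sketch of the combined effect, with a stand-in for ensure_casing that merely capitalizes the first character:

def ensure_casing(title: str) -> str:
    # Stand-in only; the real method lives elsewhere in the LLM class.
    return title[:1].upper() + title[1:]

raw = "discussion about the release schedule"
title = trim_title(ensure_casing(raw))  # trim_title from the earlier sketch
print(repr(title))  # ' the release schedule': casing runs before trimming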