fix: remove unused settings and utils files (#522)

* fix: remove unused settings and utils files

* fix: remove migration done

* fix: remove outdated scripts

* fix: removing deployment of hermes, not used anymore

* fix: partially remove secret, still have to understand frontend.
This commit is contained in:
2025-07-31 17:45:48 -06:00
committed by GitHub
parent 4ee19ed015
commit ad56165b54
17 changed files with 8 additions and 1046 deletions

View File

@@ -1,21 +0,0 @@
TRANSCRIPT_BACKEND=modal
TRANSCRIPT_URL=https://monadical-sas--reflector-transcriber-web.modal.run
TRANSCRIPT_MODAL_API_KEY=***REMOVED***
LLM_BACKEND=modal
LLM_URL=https://monadical-sas--reflector-llm-web.modal.run
LLM_MODAL_API_KEY=***REMOVED***
TRANSLATE_URL=https://monadical-sas--reflector-translator-web.modal.run
ZEPHYR_LLM_URL=https://monadical-sas--reflector-llm-zephyr-web.modal.run
DIARIZATION_URL=https://monadical-sas--reflector-diarizer-web.modal.run
BASE_URL=https://xxxxx.ngrok.app
DIARIZATION_ENABLED=false
SQS_POLLING_TIMEOUT_SECONDS=60
# Summary LLM configuration
SUMMARY_MODEL=monadical/private/smart
SUMMARY_LLM_URL=
SUMMARY_LLM_API_KEY=

View File

@@ -20,7 +20,6 @@ AUTH_JWT_AUDIENCE=
## Using local whisper
#TRANSCRIPT_BACKEND=whisper
#WHISPER_MODEL_SIZE=tiny
## Using serverless modal.com (require reflector-gpu-modal deployed)
#TRANSCRIPT_BACKEND=modal
@@ -30,7 +29,7 @@ AUTH_JWT_AUDIENCE=
TRANSCRIPT_BACKEND=modal
TRANSCRIPT_URL=https://monadical-sas--reflector-transcriber-web.modal.run
TRANSCRIPT_MODAL_API_KEY=***REMOVED***
TRANSCRIPT_MODAL_API_KEY=
## =======================================================
## Transcription backend
@@ -50,7 +49,7 @@ TRANSLATE_URL=https://monadical-sas--reflector-translator-web.modal.run
## Using serverless modal.com (require reflector-gpu-modal deployed)
LLM_BACKEND=modal
LLM_URL=https://monadical-sas--reflector-llm-web.modal.run
LLM_MODAL_API_KEY=***REMOVED***
LLM_MODAL_API_KEY=
ZEPHYR_LLM_URL=https://monadical-sas--reflector-llm-zephyr-web.modal.run

View File

@@ -1,171 +0,0 @@
# # Run an OpenAI-Compatible vLLM Server
import modal
MODELS_DIR = "/llamas"
MODEL_NAME = "NousResearch/Hermes-3-Llama-3.1-8B"
N_GPU = 1
def download_llm():
from huggingface_hub import snapshot_download
print("Downloading LLM model")
snapshot_download(
MODEL_NAME,
local_dir=f"{MODELS_DIR}/{MODEL_NAME}",
ignore_patterns=[
"*.pt",
"*.bin",
"*.pth",
"original/*",
], # Ensure safetensors
)
print("LLM model downloaded")
def move_cache():
from transformers.utils import move_cache as transformers_move_cache
transformers_move_cache()
vllm_image = (
modal.Image.debian_slim(python_version="3.10")
.pip_install("vllm==0.5.3post1")
.env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})
.pip_install(
# "accelerate==0.34.2",
"einops==0.8.0",
"hf-transfer~=0.1",
)
.run_function(download_llm)
.run_function(move_cache)
.pip_install(
"bitsandbytes>=0.42.9",
)
)
app = modal.App("reflector-vllm-hermes3")
@app.function(
image=vllm_image,
gpu=modal.gpu.A100(count=N_GPU, size="40GB"),
timeout=60 * 5,
scaledown_window=60 * 5,
allow_concurrent_inputs=100,
secrets=[
modal.Secret.from_name("reflector-gpu"),
],
)
@modal.asgi_app()
def serve():
import os
import fastapi
import vllm.entrypoints.openai.api_server as api_server
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.logger import RequestLogger
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.usage.usage_lib import UsageContext
TOKEN = os.environ["REFLECTOR_GPU_APIKEY"]
# create a fastAPI app that uses vLLM's OpenAI-compatible router
web_app = fastapi.FastAPI(
title=f"OpenAI-compatible {MODEL_NAME} server",
description="Run an OpenAI-compatible LLM server with vLLM on modal.com",
version="0.0.1",
docs_url="/docs",
)
# security: CORS middleware for external requests
http_bearer = fastapi.security.HTTPBearer(
scheme_name="Bearer Token",
description="See code for authentication details.",
)
web_app.add_middleware(
fastapi.middleware.cors.CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# security: inject dependency on authed routes
async def is_authenticated(api_key: str = fastapi.Security(http_bearer)):
if api_key.credentials != TOKEN:
raise fastapi.HTTPException(
status_code=fastapi.status.HTTP_401_UNAUTHORIZED,
detail="Invalid authentication credentials",
)
return {"username": "authenticated_user"}
router = fastapi.APIRouter(dependencies=[fastapi.Depends(is_authenticated)])
# wrap vllm's router in auth router
router.include_router(api_server.router)
# add authed vllm to our fastAPI app
web_app.include_router(router)
engine_args = AsyncEngineArgs(
model=MODELS_DIR + "/" + MODEL_NAME,
tensor_parallel_size=N_GPU,
gpu_memory_utilization=0.90,
# max_model_len=8096,
enforce_eager=False, # capture the graph for faster inference, but slower cold starts (30s > 20s)
# --- 4 bits load
# quantization="bitsandbytes",
# load_format="bitsandbytes",
)
engine = AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.OPENAI_API_SERVER
)
model_config = get_model_config(engine)
request_logger = RequestLogger(max_log_len=2048)
api_server.openai_serving_chat = OpenAIServingChat(
engine,
model_config=model_config,
served_model_names=[MODEL_NAME],
chat_template=None,
response_role="assistant",
lora_modules=[],
prompt_adapters=[],
request_logger=request_logger,
)
api_server.openai_serving_completion = OpenAIServingCompletion(
engine,
model_config=model_config,
served_model_names=[MODEL_NAME],
lora_modules=[],
prompt_adapters=[],
request_logger=request_logger,
)
return web_app
def get_model_config(engine):
import asyncio
try: # adapted from vLLM source -- https://github.com/vllm-project/vllm/blob/507ef787d85dec24490069ffceacbd6b161f4f72/vllm/entrypoints/openai/api_server.py#L235C1-L247C1
event_loop = asyncio.get_running_loop()
except RuntimeError:
event_loop = None
if event_loop is not None and event_loop.is_running():
# If the current is instanced by Ray Serve,
# there is already a running event loop
model_config = event_loop.run_until_complete(engine.get_model_config())
else:
# When using single vLLM without engine_use_ray
model_config = asyncio.run(engine.get_model_config())
return model_config

View File

@@ -1,16 +0,0 @@
LOAD DATABASE
FROM sqlite:///app/reflector.sqlite3
INTO pgsql://reflector:reflector@postgres:5432/reflector
WITH
include drop,
create tables,
create indexes,
reset sequences,
preserve index names,
prefetch rows = 10
SET
work_mem to '512MB',
maintenance_work_mem to '1024MB'
CAST
column transcript.duration to float using (lambda (val) (when val (format nil "~f" val)))
;

View File

@@ -61,7 +61,7 @@ class LLM:
Return an instance depending on the settings.
Settings used:
- `LLM_BACKEND`: key of the backend, defaults to `oobabooga`
- `LLM_BACKEND`: key of the backend
- `LLM_URL`: url of the backend
"""
if name is None:

View File

@@ -1,29 +0,0 @@
import httpx
from reflector.llm.base import LLM
from reflector.settings import settings
class OobaboogaLLM(LLM):
def __init__(self, model_name: str | None = None):
super().__init__()
async def _generate(
self, prompt: str, gen_schema: dict | None, gen_cfg: dict | None, **kwargs
):
json_payload = {"prompt": prompt}
if gen_schema:
json_payload["gen_schema"] = gen_schema
if gen_cfg:
json_payload.update(gen_cfg)
async with httpx.AsyncClient() as client:
response = await client.post(
settings.LLM_URL,
headers={"Content-Type": "application/json"},
json=json_payload,
)
response.raise_for_status()
return response.json()
LLM.register("oobabooga", OobaboogaLLM)

View File

@@ -8,8 +8,6 @@ class Settings(BaseSettings):
extra="ignore",
)
OPENMP_KMP_DUPLICATE_LIB_OK: bool = False
# CORS
CORS_ORIGIN: str = "*"
CORS_ALLOW_CREDENTIALS: bool = False
@@ -20,26 +18,6 @@ class Settings(BaseSettings):
# local data directory (audio for no)
DATA_DIR: str = "./data"
# Whisper
WHISPER_MODEL_SIZE: str = "tiny"
WHISPER_REAL_TIME_MODEL_SIZE: str = "tiny"
# Summarizer
SUMMARIZER_MODEL: str = "facebook/bart-large-cnn"
SUMMARIZER_INPUT_ENCODING_MAX_LENGTH: int = 1024
SUMMARIZER_MAX_LENGTH: int = 2048
SUMMARIZER_BEAM_SIZE: int = 6
SUMMARIZER_MAX_CHUNK_LENGTH: int = 1024
SUMMARIZER_USING_CHUNKS: bool = True
# Audio
AUDIO_BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME: str = "aggregator"
AUDIO_AV_FOUNDATION_DEVICE_ID: int = 1
AUDIO_CHANNELS: int = 2
AUDIO_SAMPLING_RATE: int = 48000
AUDIO_SAMPLING_WIDTH: int = 2
AUDIO_BUFFER_SIZE: int = 256 * 960
# Audio Transcription
# backends: whisper, modal
TRANSCRIPT_BACKEND: str = "whisper"
@@ -63,8 +41,8 @@ class Settings(BaseSettings):
TRANSCRIPT_STORAGE_AWS_SECRET_ACCESS_KEY: str | None = None
# LLM
# available backend: openai, modal, oobabooga
LLM_BACKEND: str = "oobabooga"
# available backend: openai, modal
LLM_BACKEND: str = "modal"
# LLM common configuration
LLM_URL: str | None = None

View File

@@ -1,14 +1,13 @@
import asyncio
import time
import uuid
from os import environ
import httpx
import stamina
from aiortc import RTCPeerConnection, RTCSessionDescription
from aiortc.contrib.media import MediaPlayer, MediaRelay
from reflector.logger import logger
from reflector.settings import settings
class StreamClient:
@@ -43,8 +42,9 @@ class StreamClient:
else:
if self.relay is None:
self.relay = MediaRelay()
audio_device_id = int(environ.get("AUDIO_AV_FOUNDATION_DEVICE_ID", 1))
self.player = MediaPlayer(
f":{settings.AUDIO_AV_FOUNDATION_DEVICE_ID}",
f":{audio_device_id}",
format="avfoundation",
options={"channels": "2"},
)

View File

@@ -1,59 +0,0 @@
"""
Utility file for file handling related functions, including file downloads and
uploads to cloud storage
"""
import sys
from typing import List, NoReturn
import boto3
import botocore
from .log_utils import LOGGER
from .run_utils import SECRETS
BUCKET_NAME = SECRETS["AWS-S3"]["BUCKET_NAME"]
s3 = boto3.client(
"s3",
aws_access_key_id=SECRETS["AWS-S3"]["AWS_ACCESS_KEY"],
aws_secret_access_key=SECRETS["AWS-S3"]["AWS_SECRET_KEY"],
)
def upload_files(files_to_upload: List[str]) -> NoReturn:
"""
Upload a list of files to the configured S3 bucket
:param files_to_upload: List of files to upload
:return: None
"""
for key in files_to_upload:
LOGGER.info("Uploading file " + key)
try:
s3.upload_file(key, BUCKET_NAME, key)
except botocore.exceptions.ClientError as exception:
print(exception.response)
def download_files(files_to_download: List[str]) -> NoReturn:
"""
Download a list of files from the configured S3 bucket
:param files_to_download: List of files to download
:return: None
"""
for key in files_to_download:
LOGGER.info("Downloading file " + key)
try:
s3.download_file(BUCKET_NAME, key, key)
except botocore.exceptions.ClientError as exception:
if exception.response["Error"]["Code"] == "404":
print("The object does not exist.")
else:
raise
if __name__ == "__main__":
if sys.argv[1] == "download":
download_files([sys.argv[2]])
elif sys.argv[1] == "upload":
upload_files([sys.argv[2]])

View File

@@ -1,38 +0,0 @@
"""
Utility function to format the artefacts created during Reflector run
"""
import json
with open("../artefacts/meeting_titles_and_summaries.txt", "r", encoding="utf-8") as f:
outputs = f.read()
outputs = json.loads(outputs)
transcript_file = open("../artefacts/meeting_transcript.txt", "a", encoding="utf-8")
title_desc_file = open(
"../artefacts/meeting_title_description.txt", "a", encoding="utf-8"
)
summary_file = open("../artefacts/meeting_summary.txt", "a", encoding="utf-8")
for item in outputs["topics"]:
transcript_file.write(item["transcript"])
summary_file.write(item["description"])
title_desc_file.write("TITLE: \n")
title_desc_file.write(item["title"])
title_desc_file.write("\n")
title_desc_file.write("DESCRIPTION: \n")
title_desc_file.write(item["description"])
title_desc_file.write("\n")
title_desc_file.write("TRANSCRIPT: \n")
title_desc_file.write(item["transcript"])
title_desc_file.write("\n")
title_desc_file.write("---------------------------------------- \n\n")
transcript_file.close()
title_desc_file.close()
summary_file.close()

View File

@@ -1,55 +0,0 @@
"""
Utility file for server side asynchronous task running and config objects
"""
import asyncio
import contextlib
from functools import partial
from threading import Lock
from typing import ContextManager, Generic, TypeVar
def run_in_executor(func, *args, executor=None, **kwargs):
"""
Run the function in an executor, unblocking the main loop
:param func: Function to be run in executor
:param args: function parameters
:param executor: executor instance [Thread | Process]
:param kwargs: Additional parameters
:return: Future of function result upon completion
"""
callback = partial(func, *args, **kwargs)
loop = asyncio.get_event_loop()
return loop.run_in_executor(executor, callback)
# Genetic type template
T = TypeVar("T")
class Mutex(Generic[T]):
"""
Mutex class to implement lock/release of a shared
protected variable
"""
def __init__(self, value: T):
"""
Create an instance of Mutex wrapper for the given resource
:param value: Shared resources to be thread protected
"""
self.__value = value
self.__lock = Lock()
@contextlib.contextmanager
def lock(self) -> ContextManager[T]:
"""
Lock the resource with a mutex to be used within a context block
The lock is automatically released on context exit
:return: Shared resource
"""
self.__lock.acquire()
try:
yield self.__value
finally:
self.__lock.release()

View File

@@ -1,262 +0,0 @@
"""
Utility file for all text processing related functionalities
"""
import datetime
from typing import List
import nltk
import torch
from log_utils import LOGGER
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from run_utils import CONFIG
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer
nltk.download("punkt", quiet=True)
def preprocess_sentence(sentence: str) -> str:
"""
Filter out undesirable tokens from thr sentence
:param sentence:
:return:
"""
stop_words = set(stopwords.words("english"))
tokens = word_tokenize(sentence.lower())
tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
return " ".join(tokens)
def compute_similarity(sent1: str, sent2: str) -> float:
"""
Compute the similarity
"""
tfidf_vectorizer = TfidfVectorizer()
if sent1 is not None and sent2 is not None:
tfidf_matrix = tfidf_vectorizer.fit_transform([sent1, sent2])
return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
return 0.0
def remove_almost_alike_sentences(sentences: List[str], threshold=0.7) -> List[str]:
"""
Filter sentences that are similar beyond a set threshold
:param sentences:
:param threshold:
:return:
"""
num_sentences = len(sentences)
removed_indices = set()
for i in range(num_sentences):
if i not in removed_indices:
for j in range(i + 1, num_sentences):
if j not in removed_indices:
l_i = len(sentences[i])
l_j = len(sentences[j])
if l_i == 0 or l_j == 0:
if l_i == 0:
removed_indices.add(i)
if l_j == 0:
removed_indices.add(j)
else:
sentence1 = preprocess_sentence(sentences[i])
sentence2 = preprocess_sentence(sentences[j])
if len(sentence1) != 0 and len(sentence2) != 0:
similarity = compute_similarity(sentence1, sentence2)
if similarity >= threshold:
removed_indices.add(max(i, j))
filtered_sentences = [
sentences[i] for i in range(num_sentences) if i not in removed_indices
]
return filtered_sentences
def remove_outright_duplicate_sentences_from_chunk(chunk: str) -> List[str]:
"""
Remove repetitive sentences
:param chunk:
:return:
"""
chunk_text = chunk["text"]
sentences = nltk.sent_tokenize(chunk_text)
nonduplicate_sentences = list(dict.fromkeys(sentences))
return nonduplicate_sentences
def remove_whisper_repetitive_hallucination(
nonduplicate_sentences: List[str],
) -> List[str]:
"""
Remove sentences that are repeated as a result of Whisper
hallucinations
:param nonduplicate_sentences:
:return:
"""
chunk_sentences = []
for sent in nonduplicate_sentences:
temp_result = ""
seen = {}
words = nltk.word_tokenize(sent)
n_gram_filter = 3
for i in range(len(words)):
if (
str(words[i : i + n_gram_filter]) in seen
and seen[str(words[i : i + n_gram_filter])]
== words[i + 1 : i + n_gram_filter + 2]
):
pass
else:
seen[str(words[i : i + n_gram_filter])] = words[
i + 1 : i + n_gram_filter + 2
]
temp_result += words[i]
temp_result += " "
chunk_sentences.append(temp_result)
return chunk_sentences
def post_process_transcription(whisper_result: dict) -> dict:
"""
Parent function to perform post-processing on the transcription result
:param whisper_result:
:return:
"""
transcript_text = ""
for chunk in whisper_result["chunks"]:
nonduplicate_sentences = remove_outright_duplicate_sentences_from_chunk(chunk)
chunk_sentences = remove_whisper_repetitive_hallucination(
nonduplicate_sentences
)
similarity_matched_sentences = remove_almost_alike_sentences(chunk_sentences)
chunk["text"] = " ".join(similarity_matched_sentences)
transcript_text += chunk["text"]
whisper_result["text"] = transcript_text
return whisper_result
def summarize_chunks(chunks: List[str], tokenizer, model) -> List[str]:
"""
Summarize each chunk using a summarizer model
:param chunks:
:param tokenizer:
:param model:
:return:
"""
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summaries = []
for c in chunks:
input_ids = tokenizer.encode(c, return_tensors="pt")
input_ids = input_ids.to(device)
with torch.no_grad():
summary_ids = model.generate(
input_ids,
num_beams=int(CONFIG["SUMMARIZER"]["BEAM_SIZE"]),
length_penalty=2.0,
max_length=int(CONFIG["SUMMARIZER"]["MAX_LENGTH"]),
early_stopping=True,
)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
summaries.append(summary)
return summaries
def chunk_text(
text: str, max_chunk_length: int = int(CONFIG["SUMMARIZER"]["MAX_CHUNK_LENGTH"])
) -> List[str]:
"""
Split text into smaller chunks.
:param text: Text to be chunked
:param max_chunk_length: length of chunk
:return: chunked texts
"""
sentences = nltk.sent_tokenize(text)
chunks = []
current_chunk = ""
for sentence in sentences:
if len(current_chunk) + len(sentence) < max_chunk_length:
current_chunk += f" {sentence.strip()}"
else:
chunks.append(current_chunk.strip())
current_chunk = f"{sentence.strip()}"
chunks.append(current_chunk.strip())
return chunks
def summarize(
transcript_text: str,
timestamp: datetime.datetime.timestamp,
real_time: bool = False,
chunk_summarize: str = CONFIG["SUMMARIZER"]["SUMMARIZE_USING_CHUNKS"],
):
"""
Summarize the given text either as a whole or as chunks as needed
:param transcript_text:
:param timestamp:
:param real_time:
:param chunk_summarize:
:return:
"""
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary_model = CONFIG["SUMMARIZER"]["SUMMARY_MODEL"]
if not summary_model:
summary_model = "facebook/bart-large-cnn"
# Summarize the generated transcript using the BART model
LOGGER.info(f"Loading BART model: {summary_model}")
tokenizer = BartTokenizer.from_pretrained(summary_model)
model = BartForConditionalGeneration.from_pretrained(summary_model)
model = model.to(device)
output_file = "summary_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
if real_time:
output_file = "real_time_" + output_file
if chunk_summarize != "YES":
max_length = int(CONFIG["SUMMARIZER"]["INPUT_ENCODING_MAX_LENGTH"])
inputs = tokenizer.batch_encode_plus(
[transcript_text],
truncation=True,
padding="longest",
max_length=max_length,
return_tensors="pt",
)
inputs = inputs.to(device)
with torch.no_grad():
num_beans = int(CONFIG["SUMMARIZER"]["BEAM_SIZE"])
max_length = int(CONFIG["SUMMARIZER"]["MAX_LENGTH"])
summaries = model.generate(
inputs["input_ids"],
num_beams=num_beans,
length_penalty=2.0,
max_length=max_length,
early_stopping=True,
)
decoded_summaries = [
tokenizer.decode(
summary, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
for summary in summaries
]
summary = " ".join(decoded_summaries)
with open("./artefacts/" + output_file, "w", encoding="utf-8") as file:
file.write(summary.strip() + "\n")
else:
LOGGER.info("Breaking transcript into smaller chunks")
chunks = chunk_text(transcript_text)
LOGGER.info(f"Transcript broken into {len(chunks)} chunks of at most 500 words")
LOGGER.info(f"Writing summary text to: {output_file}")
with open(output_file, "w") as f:
summaries = summarize_chunks(chunks, tokenizer, model)
for summary in summaries:
f.write(summary.strip() + " ")

View File

@@ -1,283 +0,0 @@
"""
Utility file for all visualization related functions
"""
import ast
import collections
import datetime
import os
import pickle
from typing import NoReturn
import matplotlib.pyplot as plt
import pandas as pd
import scattertext as st
import spacy
from nltk.corpus import stopwords
from wordcloud import STOPWORDS, WordCloud
en = spacy.load("en_core_web_md")
spacy_stopwords = en.Defaults.stop_words
STOPWORDS = (
set(STOPWORDS).union(set(stopwords.words("english"))).union(set(spacy_stopwords))
)
def create_wordcloud(
timestamp: datetime.datetime.timestamp, real_time: bool = False
) -> NoReturn:
"""
Create a basic word cloud visualization of transcribed text
:return: None. The wordcloud image is saved locally
"""
filename = "transcript"
if real_time:
filename = (
"real_time_"
+ filename
+ "_"
+ timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+ ".txt"
)
else:
filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
with open("./artefacts/" + filename, "r") as f:
transcription_text = f.read()
# python_mask = np.array(PIL.Image.open("download1.png"))
wordcloud = WordCloud(
height=800,
width=800,
background_color="white",
stopwords=STOPWORDS,
min_font_size=8,
).generate(transcription_text)
# Plot wordcloud and save image
plt.figure(facecolor=None)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
wordcloud = "wordcloud"
if real_time:
wordcloud = (
"real_time_"
+ wordcloud
+ "_"
+ timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+ ".png"
)
else:
wordcloud += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
plt.savefig("./artefacts/" + wordcloud)
def create_talk_diff_scatter_viz(
timestamp: datetime.datetime.timestamp, real_time: bool = False
) -> NoReturn:
"""
Perform agenda vs transcription diff to see covered topics.
Create a scatter plot of words in topics.
:return: None. Saved locally.
"""
spacy_model = "en_core_web_md"
nlp = spacy.load(spacy_model)
nlp.add_pipe("sentencizer")
agenda_topics = []
agenda = []
# Load the agenda
with open(os.path.join(os.getcwd(), "agenda-headers.txt"), "r") as f:
for line in f.readlines():
if line.strip():
agenda.append(line.strip())
agenda_topics.append(line.split(":")[0])
# Load the transcription with timestamp
if real_time:
filename = (
"./artefacts/real_time_transcript_with_timestamp_"
+ timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+ ".txt"
)
else:
filename = (
"./artefacts/transcript_with_timestamp_"
+ timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+ ".txt"
)
with open(filename) as file:
transcription_timestamp_text = file.read()
res = ast.literal_eval(transcription_timestamp_text)
chunks = res["chunks"]
# create df for processing
df = pd.DataFrame.from_dict(res["chunks"])
covered_items = {}
# ts: timestamp
# Map each timestamped chunk with top1 and top2 matched agenda
ts_to_topic_mapping_top_1 = {}
ts_to_topic_mapping_top_2 = {}
# Also create a mapping of the different timestamps
# in which each topic was covered
topic_to_ts_mapping_top_1 = collections.defaultdict(list)
topic_to_ts_mapping_top_2 = collections.defaultdict(list)
similarity_threshold = 0.7
for c in chunks:
doc_transcription = nlp(c["text"])
topic_similarities = []
for item in range(len(agenda)):
item_doc = nlp(agenda[item])
# if not doc_transcription or not all
# (token.has_vector for token in doc_transcription):
if not doc_transcription:
continue
similarity = doc_transcription.similarity(item_doc)
topic_similarities.append((item, similarity))
topic_similarities.sort(key=lambda x: x[1], reverse=True)
for i in range(2):
if topic_similarities[i][1] >= similarity_threshold:
covered_items[agenda[topic_similarities[i][0]]] = True
# top1 match
if i == 0:
ts_to_topic_mapping_top_1[c["timestamp"]] = agenda_topics[
topic_similarities[i][0]
]
topic_to_ts_mapping_top_1[
agenda_topics[topic_similarities[i][0]]
].append(c["timestamp"])
# top2 match
else:
ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[
topic_similarities[i][0]
]
topic_to_ts_mapping_top_2[
agenda_topics[topic_similarities[i][0]]
].append(c["timestamp"])
def create_new_columns(record: dict) -> dict:
"""
Accumulate the mapping information into the df
:param record:
:return:
"""
record["ts_to_topic_mapping_top_1"] = ts_to_topic_mapping_top_1[
record["timestamp"]
]
record["ts_to_topic_mapping_top_2"] = ts_to_topic_mapping_top_2[
record["timestamp"]
]
return record
df = df.apply(create_new_columns, axis=1)
# Count the number of items covered and calculate the percentage
num_covered_items = sum(covered_items.values())
percentage_covered = num_covered_items / len(agenda) * 100
# Print the results
print("💬 Agenda items covered in the transcription:")
for item in agenda:
if item in covered_items and covered_items[item]:
print("", item)
else:
print("", item)
print("📊 Coverage: {:.2f}%".format(percentage_covered))
# Save df, mappings for further experimentation
df_name = "df"
if real_time:
df_name = (
"real_time_"
+ df_name
+ "_"
+ timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+ ".pkl"
)
else:
df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
df.to_pickle("./artefacts/" + df_name)
my_mappings = [
ts_to_topic_mapping_top_1,
ts_to_topic_mapping_top_2,
topic_to_ts_mapping_top_1,
topic_to_ts_mapping_top_2,
]
mappings_name = "mappings"
if real_time:
mappings_name = (
"real_time_"
+ mappings_name
+ "_"
+ timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+ ".pkl"
)
else:
mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
pickle.dump(my_mappings, open("./artefacts/" + mappings_name, "wb"))
# to load, my_mappings = pickle.load( open ("mappings.pkl", "rb") )
# pick the 2 most matched topic to be used for plotting
topic_times = collections.defaultdict(int)
for key in ts_to_topic_mapping_top_1.keys():
if key[0] is None or key[1] is None:
continue
duration = key[1] - key[0]
topic_times[ts_to_topic_mapping_top_1[key]] += duration
topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True)
if len(topic_times) > 1:
cat_1 = topic_times[0][0]
cat_1_name = topic_times[0][0]
cat_2_name = topic_times[1][0]
# Scatter plot of topics
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
corpus = (
st.CorpusFromParsedDocuments(
df, category_col="ts_to_topic_mapping_top_1", parsed_col="parse"
)
.build()
.get_unigram_corpus()
.compact(st.AssociationCompactor(2000))
)
html = st.produce_scattertext_explorer(
corpus,
category=cat_1,
category_name=cat_1_name,
not_category_name=cat_2_name,
minimum_term_frequency=0,
pmi_threshold_coefficient=0,
width_in_pixels=1000,
transform=st.Scalers.dense_rank,
)
if real_time:
with open(
"./artefacts/real_time_scatter_"
+ timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+ ".html",
"w",
) as file:
file.write(html)
else:
with open(
"./artefacts/scatter_"
+ timestamp.strftime("%m-%d-%Y_%H:%M:%S")
+ ".html",
"w",
) as file:
file.write(html)

View File

@@ -1,30 +0,0 @@
#!/bin/bash
# Directory to search for Python files
cwd=$(pwd)
last_component="${cwd##*/}"
if [ "$last_component" = "reflector" ]; then
directory="./artefacts"
elif [ "$last_component" = "scripts" ]; then
directory="../artefacts"
fi
# Pattern to match Python files (e.g., "*.py" for all .py files)
transcript_file_pattern="transcript_*.txt"
summary_file_pattern="summary_*.txt"
pickle_file_pattern="*.pkl"
html_file_pattern="*.html"
png_file_pattern="wordcloud*.png"
mp3_file_pattern="*.mp3"
mp4_file_pattern="*.mp4"
m4a_file_pattern="*.m4a"
find "$directory" -type f -name "$transcript_file_pattern" -delete
find "$directory" -type f -name "$summary_file_pattern" -delete
find "$directory" -type f -name "$pickle_file_pattern" -delete
find "$directory" -type f -name "$html_file_pattern" -delete
find "$directory" -type f -name "$png_file_pattern" -delete
find "$directory" -type f -name "$mp3_file_pattern" -delete
find "$directory" -type f -name "$mp4_file_pattern" -delete
find "$directory" -type f -name "$m4a_file_pattern" -delete

View File

@@ -1,39 +0,0 @@
#!/bin/sh
# Upgrade pip
pip install --upgrade pip
# Default to CPU Installation of JAX
jax_mode="jax[cpu]"
# Install JAX
if [ "$1" == "cpu" ]
then
jax_mode="jax[cpu]"
elif [ "$1" == "cuda11" ]
then
jax_mode="jax[cuda11_pip]"
elif [ "$1" == "cuda12" ]
then
jax_mode="jax[cuda12_pip]"
fi
pip install --upgrade "$jax_mode"
# Install Whisper-JAX base
pip install git+https://github.com/sanchit-gandhi/whisper-jax.git
# Update to latest version
pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit-gandhi/whisper-jax.git
cwd=$(pwd)
last_component="${cwd##*/}"
if [ "$last_component" = "reflector" ]; then
pip install -r pipeline-requirements.txt
elif [ "$last_component" = "scripts" ]; then
pip install -r ../pipeline-requirements.txt
fi
# download spacy models
spacy download en_core_web_sm
spacy download en_core_web_md

View File

@@ -1,11 +0,0 @@
#!/bin/sh
pip install --upgrade pip
cwd=$(pwd)
last_component="${cwd##*/}"
if [ "$last_component" = "reflector" ]; then
pip install -r server-requirements.txt
elif [ "$last_component" = "scripts" ]; then
pip install -r ../server-requirements.txt
fi

View File

@@ -1 +0,0 @@
ZULIP_API_KEY=<omitted, ask in zulip>