This commit is contained in:
Gokul Mohanarangan
2023-07-11 18:47:21 +05:30
parent d962ff1712
commit 71eb277fd7
11 changed files with 67 additions and 80 deletions

4
.gitignore vendored
View File

@@ -160,9 +160,6 @@ cython_debug/
#.idea/
*.mp4
summary.txt
transcript.txt
transcript_timestamps.txt
*.html
*.pkl
transcript_*.txt
@@ -176,3 +173,4 @@ test_samples/
.DS_Store/
.DS_Store
.vscode/
artefacts/

View File

@@ -1,22 +0,0 @@
[DEFAULT]
# Set exception rule for OpenMP error to allow duplicate lib initialization
KMP_DUPLICATE_LIB_OK = TRUE
# Export OpenAI API Key
OPENAI_APIKEY =
# Export Whisper Model Size
WHISPER_MODEL_SIZE = tiny
WHISPER_REAL_TIME_MODEL_SIZE = tiny
# AWS config
AWS_ACCESS_KEY = ***REMOVED***
AWS_SECRET_KEY = ***REMOVED***
BUCKET_NAME = 'reflector-bucket'
# Summarizer config
SUMMARY_MODEL = facebook/bart-large-cnn
INPUT_ENCODING_MAX_LENGTH = 1024
MAX_LENGTH = 2048
BEAM_SIZE = 6
MAX_CHUNK_LENGTH = 1024
SUMMARIZE_USING_CHUNKS = YES
# Audio device
BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME = aggregator
AV_FOUNDATION_DEVICE_ID = 2

View File

@@ -56,5 +56,4 @@ cached_property==1.5.2
stamina==23.1.0
httpx==0.24.1
sortedcontainers==2.4.0
openai-whisper@ git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8
https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz

View File

@@ -1,15 +1,24 @@
#!/bin/bash
# Directory to search for generated artefact files
directory="."
cwd=$(pwd)
last_component="${cwd##*/}"
if [ "$last_component" = "reflector" ]; then
directory="./artefacts"
elif [ "$last_component" = "scripts" ]; then
directory="../artefacts"
fi
# Patterns matching the generated artefact files to delete (transcripts, summaries, pickles, HTML, word clouds)
text_file_pattern="transcript_*.txt"
transcript_file_pattern="transcript_*.txt"
summary_file_pattern="summary_*.txt"
pickle_file_pattern="*.pkl"
html_file_pattern="*.html"
png_file_pattern="wordcloud*.png"
find "$directory" -type f -name "$text_file_pattern" -delete
find "$directory" -type f -name "$transcript_file_pattern" -delete
find "$directory" -type f -name "$summary_file_pattern" -delete
find "$directory" -type f -name "$pickle_file_pattern" -delete
find "$directory" -type f -name "$html_file_pattern" -delete
find "$directory" -type f -name "$png_file_pattern" -delete

View File

@@ -65,6 +65,7 @@ def get_transcription():
transcribe = True
if transcribe:
print("Transcribing..")
try:
sorted_message_queue[frames[0].time] = None
out_file = io.BytesIO()
@@ -113,7 +114,7 @@ def start_messaging_thread():
def start_transcription_thread(max_threads: int):
for i in range(max_threads):
t_thread = threading.Thread(target=get_transcription, args=(i,))
t_thread = threading.Thread(target=get_transcription)
t_thread.start()
@@ -128,7 +129,7 @@ async def offer(request: requests.Request):
def log_info(msg: str, *args):
logger.info(pc_id + " " + msg, *args)
log_info("Created for %s", request.remote)
log_info("Created for " + request.remote)
@pc.on("datachannel")
def on_datachannel(channel):
@@ -146,14 +147,14 @@ async def offer(request: requests.Request):
@pc.on("connectionstatechange")
async def on_connectionstatechange():
log_info("Connection state is %s", pc.connectionState)
log_info("Connection state is " + pc.connectionState)
if pc.connectionState == "failed":
await pc.close()
pcs.discard(pc)
@pc.on("track")
def on_track(track):
log_info("Track %s received", track.kind)
log_info("Track " + track.kind + " received")
pc.addTrack(AudioStreamTrack(relay.subscribe(track)))
# handle offer

View File

@@ -3,8 +3,8 @@ import sys
import boto3
import botocore
from log_utils import logger
from run_utils import config
from .log_utils import logger
from .run_utils import config
BUCKET_NAME = config["DEFAULT"]["BUCKET_NAME"]

View File

@@ -6,18 +6,18 @@ from threading import Lock
from typing import ContextManager, Generic, TypeVar
class ConfigParser:
__config = configparser.ConfigParser()
def __init__(self, config_file='../config.ini'):
self.__config.read(config_file)
class ReflectorConfig:
__config = None
@staticmethod
def get_config():
return ConfigParser.__config
if ReflectorConfig.__config is None:
ReflectorConfig.__config = configparser.ConfigParser()
ReflectorConfig.__config.read('utils/config.ini')
return ReflectorConfig.__config
config = ConfigParser.get_config()
config = ReflectorConfig.get_config()
def run_in_executor(func, *args, executor=None, **kwargs):

0
utils/test.py Normal file
View File

View File

@@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer
from log_utils import logger
from run_utils import config
from utils.log_utils import logger
from utils.run_utils import config
nltk.download('punkt', quiet=True)
@@ -186,7 +186,7 @@ def summarize(transcript_text, timestamp,
decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for summary in summaries]
summary = " ".join(decoded_summaries)
with open(output_filename, 'w') as f:
with open("./artefacts/" + output_filename, 'w') as f:
f.write(summary.strip() + "\n")
else:
logger.info("Breaking transcript into smaller chunks")

View File

@@ -52,7 +52,7 @@ def create_wordcloud(timestamp, real_time=False):
else:
wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
plt.savefig(wordcloud_name)
plt.savefig("./artefacts/" + wordcloud_name)
def create_talk_diff_scatter_viz(timestamp, real_time=False):
@@ -77,10 +77,10 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
# Load the transcription with timestamp
filename = ""
if real_time:
filename = "real_time_transcript_with_timestamp_" +\
filename = "./artefacts/real_time_transcript_with_timestamp_" +\
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
else:
filename = "transcript_with_timestamp_" +\
filename = "./artefacts/transcript_with_timestamp_" +\
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
with open(filename) as f:
transcription_timestamp_text = f.read()
@@ -162,7 +162,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
else:
df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
df.to_pickle(df_name)
df.to_pickle("./artefacts/" + df_name)
my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]
@@ -173,7 +173,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
else:
mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
pickle.dump(my_mappings, open(mappings_name, "wb"))
pickle.dump(my_mappings, open("./artefacts/" + mappings_name, "wb"))
# to load, my_mappings = pickle.load( open ("mappings.pkl", "rb") )
@@ -187,27 +187,28 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True)
cat_1 = topic_times[0][0]
cat_1_name = topic_times[0][0]
cat_2_name = topic_times[1][0]
if len(topic_times) > 1:
cat_1 = topic_times[0][0]
cat_1_name = topic_times[0][0]
cat_2_name = topic_times[1][0]
# Scatter plot of topics
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
corpus = st.CorpusFromParsedDocuments(
df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
html = st.produce_scattertext_explorer(
corpus,
category=cat_1,
category_name=cat_1_name,
not_category_name=cat_2_name,
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000,
transform=st.Scalers.dense_rank
)
if real_time:
open('./artefacts/real_time_scatter_' +
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
else:
open('./artefacts/scatter_' +
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
# Scatter plot of topics
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
corpus = st.CorpusFromParsedDocuments(
df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
html = st.produce_scattertext_explorer(
corpus,
category=cat_1,
category_name=cat_1_name,
not_category_name=cat_2_name,
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000,
transform=st.Scalers.dense_rank
)
if real_time:
open('./artefacts/real_time_scatter_' +
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
else:
open('./artefacts/scatter_' +
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)

View File

@@ -127,7 +127,7 @@ def main():
audio_filename = media_file
logger.info("Finished extracting audio")
logger.info("Transcribing")
# Convert the audio to text using the OpenAI Whisper model
pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
dtype=jnp.float16,
@@ -157,13 +157,14 @@ def main():
create_talk_diff_scatter_viz(NOW)
# S3 : Push artefacts to S3 bucket
prefix = "./artefacts/"
suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")
files_to_upload = ["transcript_" + suffix + ".txt",
"transcript_with_timestamp_" + suffix + ".txt",
"df_" + suffix + ".pkl",
"wordcloud_" + suffix + ".png",
"mappings_" + suffix + ".pkl",
"scatter_" + suffix + ".html"]
files_to_upload = [prefix + "transcript_" + suffix + ".txt",
prefix + "transcript_with_timestamp_" + suffix + ".txt",
prefix + "df_" + suffix + ".pkl",
prefix + "wordcloud_" + suffix + ".png",
prefix + "mappings_" + suffix + ".pkl",
prefix + "scatter_" + suffix + ".html"]
upload_files(files_to_upload)
summarize(transcript_text, NOW, False, False)