Merge pull request #40 from Monadical-SAS/feat/gokul

Code refactor and cleanup from Feat/gokul
Authored by projects-g on 2023-07-25 13:56:14 +05:30, committed by GitHub.
33 changed files with 700 additions and 242 deletions

.gitignore (2 changed lines)

@@ -165,7 +165,7 @@ cython_debug/
transcript_*.txt
test_*.txt
wordcloud*.png
*.ini
utils/config.ini
test_samples/
*.wav
*.mp3


@@ -5,15 +5,15 @@ import signal
from aiortc.contrib.signaling import (add_signaling_arguments,
create_signaling)
from stream_client import StreamClient
from utils.log_utils import logger
from stream_client import StreamClient
async def main():
parser = argparse.ArgumentParser(description="Data channels ping/pong")
parser.add_argument(
"--url", type=str, nargs="?", default="http://127.0.0.1:1250/offer"
"--url", type=str, nargs="?", default="http://0.0.0.0:1250/offer"
)
parser.add_argument(


@@ -2,8 +2,6 @@ pyaudio==0.2.13
keyboard==0.13.5
pynput==1.7.6
wave==0.0.2
aiohttp==3.8.4
aiosignal==1.3.1
async-timeout==4.0.2
attrs==23.1.0
certifi==2023.5.7
@@ -51,11 +49,8 @@ matplotlib==3.7.2
matplotlib-inline==0.1.6
termcolor==2.3.0
ffmpeg==1.4
aiortc==1.5.0
cached_property==1.5.2
stamina==23.1.0
httpx==0.24.1
sortedcontainers==2.4.0
https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz
gpt4all==1.0.5
aiohttp_cors==0.7.0


@@ -26,7 +26,7 @@ pip install git+https://github.com/sanchit-gandhi/whisper-jax.git
# Update to latest version
pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit-gandhi/whisper-jax.git
pip install -r ../requirements.txt
pip install -r ../server-requirements.txt
# download spacy models
spacy download en_core_web_sm


@@ -0,0 +1,4 @@
#!/bin/sh
pip install --upgrade pip
pip install -r ../server-requirements.txt

server-requirements.txt (new file, 50 lines)

@@ -0,0 +1,50 @@
aiohttp==3.8.5
aiohttp-cors==0.7.0
aioice==0.9.0
aiortc==1.5.0
aiosignal==1.3.1
anyio==3.7.1
async-timeout==4.0.2
attrs==23.1.0
av==10.0.0
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
coloredlogs==15.0.1
cryptography==41.0.2
ctranslate2==3.17.1
dnspython==2.4.0
faster-whisper==0.7.1
filelock==3.12.2
flatbuffers==23.5.26
frozenlist==1.4.0
fsspec==2023.6.0
google-crc32c==1.5.0
h11==0.14.0
httpcore==0.17.3
huggingface-hub==0.16.4
humanfriendly==10.0
idna==3.4
ifaddr==0.2.0
loguru==0.7.0
mpmath==1.3.0
multidict==6.0.4
numpy==1.25.1
onnxruntime==1.15.1
packaging==23.1
protobuf==4.23.4
pycparser==2.21
pyee==11.0.0
pylibsrtp==0.8.0
pyOpenSSL==23.2.0
PyYAML==6.0.1
requests==2.31.0
sniffio==1.3.0
sortedcontainers==2.4.0
sympy==1.12
tokenizers==0.13.3
tqdm==4.65.0
typing_extensions==4.7.1
urllib3==2.0.4
yarl==1.9.2
wave==0.0.2


@@ -1,29 +1,30 @@
import argparse
import asyncio
import datetime
import io
import json
import os
import uuid
import wave
from concurrent.futures import ThreadPoolExecutor
import aiohttp_cors
import jax.numpy as jnp
import requests
from aiohttp import web
from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription
from aiortc.contrib.media import MediaRelay
from av import AudioFifo
from faster_whisper import WhisperModel
from loguru import logger
from whisper_jax import FlaxWhisperPipline
from utils.run_utils import run_in_executor
from sortedcontainers import SortedDict
from utils.run_utils import run_in_executor, config
pcs = set()
relay = MediaRelay()
data_channel = None
pipeline = FlaxWhisperPipline("openai/whisper-tiny",
dtype=jnp.float16,
batch_size=16)
model = WhisperModel("tiny", device="cpu",
compute_type="float32",
num_workers=12)
CHANNELS = 2
RATE = 48000
@@ -31,8 +32,8 @@ audio_buffer = AudioFifo()
executor = ThreadPoolExecutor()
transcription_text = ""
last_transcribed_time = 0.0
LLM_MACHINE_IP = "216.153.52.83"
LLM_MACHINE_PORT = "5000"
LLM_MACHINE_IP = config["DEFAULT"]["LLM_MACHINE_IP"]
LLM_MACHINE_PORT = config["DEFAULT"]["LLM_MACHINE_PORT"]
LLM_URL = f"http://{LLM_MACHINE_IP}:{LLM_MACHINE_PORT}/api/v1/generate"
incremental_responses = []
sorted_transcripts = SortedDict()
@@ -43,7 +44,7 @@ blacklisted_messages = [" Thank you.", " See you next time!",
def get_title_and_summary(llm_input_text, last_timestamp):
print("Generating title and summary")
logger.info("Generating title and summary")
# output = llm.generate(prompt)
# Use monadical-ml to fire this query to an LLM and get result
@@ -53,11 +54,11 @@ def get_title_and_summary(llm_input_text, last_timestamp):
prompt = f"""
### Human:
Create a JSON object as response. The JSON object must have 2 fields:
i) title and ii) summary. For the title field,generate a short title
for the given text. For the summary field, summarize the given text
Create a JSON object as response. The JSON object must have 2 fields:
i) title and ii) summary. For the title field, generate a short title
for the given text. For the summary field, summarize the given text
in three sentences.
{llm_input_text}
### Assistant:
@@ -67,27 +68,28 @@ def get_title_and_summary(llm_input_text, last_timestamp):
"prompt": prompt
}
# To-do: Handle unexpected output formats from the model
# TODO: Handle unexpected output formats from the model
try:
response = requests.post(LLM_URL, headers=headers, json=data)
output = json.loads(response.json()["results"][0]["text"])
output["description"] = output.pop("summary")
output["transcript"] = llm_input_text
output["timestamp"] =\
output["timestamp"] = \
str(datetime.timedelta(seconds=round(last_timestamp)))
incremental_responses.append(output)
result = {
"cmd": "UPDATE_TOPICS",
"topics": incremental_responses,
}
except Exception as e:
print("Exception" + str(e))
logger.info("Exception" + str(e))
result = None
return result
def channel_log(channel, t, message):
print("channel(%s) %s %s" % (channel.label, t, message))
logger.info("channel(%s) %s %s" % (channel.label, t, message))
def channel_send(channel, message):
@@ -113,18 +115,25 @@ def channel_send_transcript(channel):
# Due to exceptions if one of the earlier batches can't return
# a transcript, we don't want to be stuck waiting for the result
# With the threshold size of 3, we pop the first(lost) element
elif len(sorted_transcripts) >= 3:
del sorted_transcripts[least_time]
else:
if len(sorted_transcripts) >= 3:
del sorted_transcripts[least_time]
except Exception as e:
print("Exception", str(e))
logger.info("Exception", str(e))
pass
def get_transcription(frames):
print("Transcribing..")
logger.info("Transcribing..")
sorted_transcripts[frames[0].time] = None
out_file = io.BytesIO()
wf = wave.open(out_file, "wb")
# TODO:
# Passing IO objects instead of temporary files throws an error
# Passing ndarrays (typecasted with float) does not give any
# transcription. Refer issue,
# https://github.com/guillaumekln/faster-whisper/issues/369
audiofilename = "test" + str(datetime.datetime.now())
wf = wave.open(audiofilename, "wb")
wf.setnchannels(CHANNELS)
wf.setframerate(RATE)
wf.setsampwidth(2)
@@ -133,22 +142,40 @@ def get_transcription(frames):
wf.writeframes(b"".join(frame.to_ndarray()))
wf.close()
# To-Do: Look into WhisperTimeStampLogitsProcessor exception
try:
whisper_result = pipeline(out_file.getvalue(), return_timestamps=True)
except Exception as e:
return
result_text = ""
global transcription_text, last_transcribed_time
transcription_text += whisper_result["text"]
duration = whisper_result["chunks"][0]["timestamp"][1]
if not duration:
duration = 5.0
last_transcribed_time += duration
try:
segments, _ = \
model.transcribe(audiofilename,
language="en",
beam_size=5,
vad_filter=True,
vad_parameters=dict(min_silence_duration_ms=500))
os.remove(audiofilename)
segments = list(segments)
result_text = ""
duration = 0.0
for segment in segments:
result_text += segment.text
start_time = segment.start
end_time = segment.end
if not segment.start:
start_time = 0.0
if not segment.end:
end_time = 5.5
duration += (end_time - start_time)
global last_transcribed_time, transcription_text
last_transcribed_time += duration
transcription_text += result_text
except Exception as e:
logger.info("Exception" + str(e))
pass
result = {
"cmd": "SHOW_TRANSCRIPTION",
"text": whisper_result["text"]
"text": result_text
}
sorted_transcripts[frames[0].time] = result
return result
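
The comment in channel_send_transcript above describes the ordering guarantee this relies on: results are flushed strictly in frame-time order, and a slot that never completes is dropped once three results are pending behind it. A minimal sketch of that pattern, with hypothetical names, not the server's exact code:

import sortedcontainers

pending = sortedcontainers.SortedDict()  # frame time -> result (None while in flight)

def start_batch(frame_time):
    # Reserve a slot as soon as transcription of a batch starts
    pending[frame_time] = None

def finish_batch(frame_time, result, send):
    pending[frame_time] = result
    # Flush completed results strictly in capture order
    while pending:
        least_time, value = pending.peekitem(0)
        if value is not None:
            send(value)
            del pending[least_time]
        elif len(pending) >= 3:
            # An earlier batch failed and will never complete;
            # drop it so later transcripts are not stuck behind it
            del pending[least_time]
        else:
            break
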
@@ -167,6 +194,9 @@ def get_final_summary_response():
seconds=round(last_transcribed_time))),
"summary": final_summary
}
with open("./artefacts/meeting_titles_and_summaries.txt", "a") as f:
f.write(json.dumps(incremental_responses))
return response
@@ -196,7 +226,7 @@ class AudioStreamTrack(MediaStreamTrack):
else None
)
if len(transcription_text) > 500:
if len(transcription_text) > 750:
llm_input_text = transcription_text
transcription_text = ""
llm_result = run_in_executor(get_title_and_summary,
@@ -245,7 +275,6 @@ async def offer(request):
if isinstance(message, str) and message.startswith("ping"):
channel_send(channel, "pong" + message[4:])
@pc.on("connectionstatechange")
async def on_connectionstatechange():
log_info("Connection state is " + pc.connectionState)
@@ -278,6 +307,16 @@ async def on_shutdown(app):
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="WebRTC based server for Reflector"
)
parser.add_argument(
"--host", default="0.0.0.0", help="Server host IP (def: 0.0.0.0)"
)
parser.add_argument(
"--port", type=int, default=1250, help="Server port (def: 1250)"
)
args = parser.parse_args()
app = web.Application()
cors = aiohttp_cors.setup(
app,
@@ -293,4 +332,4 @@ if __name__ == "__main__":
offer_resource = cors.add(app.router.add_resource("/offer"))
cors.add(offer_resource.add_route("POST", offer))
app.on_shutdown.append(on_shutdown)
web.run_app(app, access_log=None, host="127.0.0.1", port=1250)
web.run_app(app, access_log=None, host=args.host, port=args.port)
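
With these flags the bind address and port are no longer hard-coded. Assuming the module above is launched directly (file name hypothetical):

python server.py --host 127.0.0.1 --port 1250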


@@ -17,7 +17,7 @@ class StreamClient:
def __init__(
self,
signaling,
url="http://127.0.0.1:1250",
url="http://0.0.0.0:1250",
play_from=None,
ping_pong=False
):
@@ -114,7 +114,7 @@ class StreamClient:
self.channel_log(channel, "<", message)
if isinstance(message, str) and message.startswith("pong"):
elapsed_ms = (self.current_stamp() - int(message[5:]))\
elapsed_ms = (self.current_stamp() - int(message[5:])) \
/ 1000
print(" RTT %.2f ms" % elapsed_ms)

trials/__init__.py (new empty file)


@@ -0,0 +1,24 @@
# Steps to prepare data and submit/check OpenAI finetuning
# import subprocess
# subprocess.run("openai tools fine_tunes.prepare_data -f " + "finetuning_dataset.jsonl")
# export OPENAI_API_KEY=
# openai api fine_tunes.create -t <TRAIN_FILE_ID_OR_PATH> -m <BASE_MODEL>
# openai api fine_tunes.list
import openai
# Use your OpenAI API Key
openai.api_key = ""
sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . -> ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas . - > "]
# Give your finetuned model name here
# "davinci:ft-personal-2023-07-14-10-43-51"
model_name = ""
response = openai.Completion.create(
model=model_name,
prompt=sample_chunks[0])
print(response)


@@ -0,0 +1,98 @@
import json
import yt_dlp as youtube_dl
from whisper_jax import FlaxWhisperPipline
import jax.numpy as jnp
# Function to extract chapter information from a YouTube video URL
def get_youtube_chapters(video_id):
video_url = "https://www.youtube.com/watch?v=" + video_id
ydl_opts = {
'extract_flat': 'in_playlist',
'skip_download': True,
'quiet': True,
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
video_info = ydl.extract_info(video_url, download=False)
chapters = []
if 'chapters' in video_info:
for chapter in video_info['chapters']:
start_time = chapter['start_time']
end_time = chapter['end_time']
title = chapter['title']
chapters.append({
'start': start_time,
'end': end_time,
'title': title
})
return chapters
# Function to extract video transcription using yt_dlp
def get_youtube_transcription(video_id):
ydl_opts = {
'format': 'bestaudio/best',
'postprocessors': [{
'key': 'FFmpegExtractAudio',
'preferredcodec': 'mp3',
'preferredquality': '192',
}],
'outtmpl': './artefacts/audio', # Specify output file path and name
}
# Download the audio
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download(["https://www.youtube.com/watch?v=" + video_id])
media_file = "./artefacts/audio.mp3"
pipeline = FlaxWhisperPipline("openai/whisper-" + "tiny",
dtype=jnp.float16,
batch_size=16)
whisper_result = pipeline(media_file, return_timestamps=True)
return whisper_result["chunks"]
# Function to scrape YouTube video transcripts and chapter information
def scrape_youtube_data(video_id):
transcript_text = get_youtube_transcription(video_id)
chapters = get_youtube_chapters(video_id)
print("transcript_text", transcript_text)
print("chapters", chapters)
return transcript_text, chapters
# Function to generate fine-tuning dataset from YouTube data
def generate_finetuning_dataset(video_ids):
prompt_completion_pairs = []
for video_id in video_ids:
transcript_text, chapters = scrape_youtube_data(video_id)
if transcript_text is not None and chapters is not None:
for chapter in chapters:
start_time = chapter["start"]
end_time = chapter["end"]
chapter_text = chapter["title"]
prompt = ""
for transcript in transcript_text:
if transcript["timestamp"][0] >= start_time and transcript["timestamp"][1] < end_time:
prompt += transcript["text"]
if prompt:
completion = chapter_text
prompt_completion_pairs.append({"prompt": prompt, "completion": completion})
return prompt_completion_pairs
# Add all the video ids here, the videos must have captions [chapters]
video_ids = ["yTnSEZIwnkU"]
dataset = generate_finetuning_dataset(video_ids)
with open("finetuning_dataset.jsonl", "w") as f:
for example in dataset:
f.write(json.dumps(example) + "\n")
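
For reference, FlaxWhisperPipline with return_timestamps=True yields chunks shaped like {"timestamp": (start, end), "text": "..."}, which is what the timestamp filtering above iterates over. Each line of the resulting finetuning_dataset.jsonl is then one prompt/completion pair in the legacy OpenAI fine-tuning format; an illustrative line (values hypothetical):

{"prompt": "...chapter transcript text...", "completion": "Building LLMs on Google Cloud"}

The openai tools fine_tunes.prepare_data step referenced in the inference trial typically appends a separator such as " -> " to each prompt, which is why the sample prompts in that trial end with " -> ".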


@@ -1,98 +0,0 @@
# # Approach 1
# from transformers import GPTNeoForCausalLM, GPT2Tokenizer
#
# model_name = 'EleutherAI/gpt-neo-1.3B'
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPTNeoForCausalLM.from_pretrained(model_name)
#
# conversation = """
# Summarize the following conversation in 3 key sentences:
#
# We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI .
# Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development .
# Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations .
# Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude .
# Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council .
# Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas .
# """
#
# input_ids = tokenizer.encode(conversation, return_tensors='pt')
#
# output = model.generate(input_ids,
# max_length=30,
# num_return_sequences=1)
#
# caption = tokenizer.decode(output[0], skip_special_tokens=True)
# print("Caption:", caption[len(input_ids):])
#
# # Approach 2
# import torch
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
#
# model_name = "gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name)
#
# model.eval()
#
# text = """
# You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
# """
#
# tokenizer.pad_token = tokenizer.eos_token
# input_ids = tokenizer.encode(text,
# max_length=100,
# truncation=True,
# return_tensors="pt")
# attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
# output = model.generate(input_ids,
# max_new_tokens=20,
# num_return_sequences=1,
# num_beams=2,
# attention_mask=attention_mask)
#
# chapter_titles = [tokenizer.decode(output[i], skip_special_tokens=True) for i in range(output.shape[0])]
# for i, title in enumerate(chapter_titles):
# print("Caption: ", title)
# Approach 3
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
def generate_response(conversation, max_length=100):
input_text = ""
for entry in conversation:
role = entry["role"]
content = entry["content"]
input_text += f"{role}: {content}\n"
# Tokenize the entire conversation
input_ids = tokenizer.encode(input_text, return_tensors="pt")
# Generate text based on the entire conversation
with torch.no_grad():
output = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id)
# Decode the generated text and return it
response = tokenizer.decode(output[0], skip_special_tokens=True)
return response
if __name__ == "__main__":
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
sample_chunks = [
"You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
]
conversation = [
{"role": "system", "content": "Summarize this text" },
{"role": "user", "content": " text : " + sample_chunks[0]},
]
response = generate_response(conversation)
print("Response:", response)


@@ -16,8 +16,8 @@ from av import AudioFifo
from sortedcontainers import SortedDict
from whisper_jax import FlaxWhisperPipline
from utils.log_utils import logger
from utils.run_utils import config, Mutex
from reflector.utils.log_utils import logger
from reflector.utils.run_utils import config, Mutex
WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_REAL_TIME_MODEL_SIZE"]
pcs = set()


@@ -0,0 +1,57 @@
import requests
import spacy
# Enter the Machine where the LLM is hosted
LLM_MACHINE_IP = ""
# This is the URL of text-generation-webui
URL = f"http://{LLM_MACHINE_IP}:5000/api/v1/generate"
headers = {
"Content-Type": "application/json"
}
def split_text_file(filename, token_count):
nlp = spacy.load('en_core_web_md')
with open(filename, 'r') as file:
text = file.read()
doc = nlp(text)
total_tokens = len(doc)
parts = []
start_index = 0
while start_index < total_tokens:
end_index = start_index + token_count
part_tokens = doc[start_index:end_index - 5]
part = ' '.join(token.text for token in part_tokens)
parts.append(part)
start_index = end_index
return parts
final_summary = ""
parts = split_text_file("transcript.txt", 1600)
for part in parts:
prompt = f"""
### Human:
Given the following text, distill the most important information
into a short summary: {part}
### Assistant:
"""
data = {
"prompt": prompt
}
try:
response = requests.post(URL, headers=headers, json=data)
part_summary = response.json()["results"][0]["text"]
print(part_summary)
final_summary.append(part_summary)
except Exception as e:
print(str(e))
with open("summary.txt", "w") as summary_file:
summary_file.write(" ".join(final_summary))
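
For reference, text-generation-webui's /api/v1/generate endpoint replies with JSON of the shape the server code parses, roughly:

{"results": [{"text": " ...generated summary text... "}]}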


@@ -0,0 +1,43 @@
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Load the SentenceTransformer model
sentence_transformer_model = SentenceTransformer('average_word_embeddings_glove.6B.300d')
# Define the input text
text = "Your input text to be summarized goes here."
# Tokenize the text
tokens = tokenizer.tokenize(text)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([input_ids]).to(device)
# Get the BERT model output
with torch.no_grad():
outputs = model(input_ids)[0] # Extract the last hidden states
# Split the input text into sentences and calculate sentence embeddings
# (the mean-pooled BERT output above is a single vector for the whole text,
# so the SentenceTransformer embeddings are used for the per-sentence ranking)
sentences = [s.strip() + "." for s in text.split(".") if s.strip()]
sentence_embeddings = sentence_transformer_model.encode(sentences)
input_text_embedding = sentence_transformer_model.encode([text])[0]
# Calculate cosine similarity between each sentence and the input text
similarity_scores = cosine_similarity([input_text_embedding], sentence_embeddings)
# Sort the sentences by similarity score in descending order
sorted_sentences = [sent for _, sent in
sorted(zip(similarity_scores[0], sentences), reverse=True)]
# Choose the top sentences as the summary
num_summary_sentences = 2 # Adjust as needed
summary = ". ".join(sorted_sentences[:num_summary_sentences])
print("Summary:", summary)


@@ -0,0 +1,101 @@
# Approach 1
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
model_name = 'EleutherAI/gpt-neo-1.3B'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)
conversation = """
Summarize the following conversation in 3 key sentences:
We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI .
Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development .
Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations .
Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude .
Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council .
Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas .
"""
input_ids = tokenizer.encode(conversation, return_tensors='pt')
output = model.generate(input_ids,
max_length=30,
num_return_sequences=1)
caption = tokenizer.decode(output[0], skip_special_tokens=True)
print("Caption:", caption[len(input_ids):])
# Approach 2
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()
text = """
You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
"""
tokenizer.pad_token = tokenizer.eos_token
input_ids = tokenizer.encode(text,
max_length=100,
truncation=True,
return_tensors="pt")
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
output = model.generate(input_ids,
max_new_tokens=20,
num_return_sequences=1,
num_beams=2,
attention_mask=attention_mask)
chapter_titles = [tokenizer.decode(output[i], skip_special_tokens=True) for i in range(output.shape[0])]
for i, title in enumerate(chapter_titles):
print("Caption: ", title)
# Approach 3
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
def generate_response(conversation, max_length=100):
input_text = ""
for entry in conversation:
role = entry["role"]
content = entry["content"]
input_text += f"{role}: {content}\n"
# Tokenize the entire conversation
input_ids = tokenizer.encode(input_text, return_tensors="pt")
# Generate text based on the entire conversation
with torch.no_grad():
output = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id)
# Decode the generated text and return it
response = tokenizer.decode(output[0], skip_special_tokens=True)
return response
if __name__ == "__main__":
# Call appropriate approach from the main while experimenting
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
sample_chunks = [
"You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
]
conversation = [
{"role": "system", "content": "Summarize this text"},
{"role": "user", "content": " text : " + sample_chunks[0]},
]
response = generate_response(conversation)
print("Response:", response)


@@ -1,9 +1,11 @@
import spacy
import sys
# Observe the incremental summaries by performing summaries in chunks
with open("transcript.txt") as f:
transcription = f.read()
import spacy
def split_text_file(filename, token_count):
nlp = spacy.load('en_core_web_md')
@@ -26,8 +28,9 @@ def split_text_file(filename, token_count):
return parts
# Set the chunk length here to split the transcript and test
MAX_CHUNK_LENGTH=1000
MAX_CHUNK_LENGTH = 1000
chunks = split_text_file("transcript.txt", MAX_CHUNK_LENGTH)
print("Number of chunks", len(chunks))
@@ -41,19 +44,17 @@ with open("chunks" + str(MAX_CHUNK_LENGTH) + ".txt", "a") as f:
# ex. python incsum.py 1 => will run approach 1
# If no input, will run all approaches
import sys
try:
index = sys.argv[1]
except:
index = None
# Approach 1 : facebook/bart-large-cnn
if index == "1" or index is None:
SUMMARY_MODEL="facebook/bart-large-cnn"
MIN_LENGTH=5
MAX_LENGTH=10
BEAM_SIZE=2
SUMMARY_MODEL = "facebook/bart-large-cnn"
MIN_LENGTH = 5
MAX_LENGTH = 10
BEAM_SIZE = 2
print("Performing chunk summary : " + SUMMARY_MODEL)
@@ -81,7 +82,6 @@ if index == "1" or index is None:
for summary in summaries:
f.write(summary + "\n\n")
# Approach 2
if index == "2" or index is None:
print("Performing chunk summary : " + "gpt-neo-1.3B")
@@ -108,14 +108,14 @@ if index == "2" or index is None:
max_length=max_length,
attention_mask=attention_mask,
pad_token_id=model.config.eos_token_id,
num_beams=4,
length_penalty=2.0,
early_stopping=True)
num_beams=4,
length_penalty=2.0,
early_stopping=True)
summary_ids = output[0, input_length:]
summary = tokenizer.decode(summary_ids, skip_special_tokens=True)
summaries.append(summary)
with open("gptneo1.3B-summaries.txt", "a") as f:
f.write(summary + "\n\n")
f.write(summary + "\n\n")
# Approach 3
if index == "3" or index is None:
@@ -155,4 +155,3 @@ if index == "3" or index is None:
with open("mpt-7b-summaries.txt", "a") as f:
for summary in summaries:
f.write(summary + "\n\n")


@@ -0,0 +1,37 @@
# Use OpenAI API endpoint to send data to OpenAI
# along with prompts to caption/summarize the conversation
import openai
openai.api_key = ""
# to caption, user prompt used : "caption this conversation"
# max_tokens=20
# to incremental summarize, user prompt used : "summarize this conversation in a few sentences by taking key points"
# max_tokens=300
sample_chunks = [
"You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]
conversation = [
{"role": "system",
"content": sample_chunks[1]},
{"role": "user",
"content": "summarize this conversation in a few sentences by taking key points"}
]
model = "gpt-3.5-turbo"
response = openai.ChatCompletion.create(model=model,
messages=conversation,
n=1,
max_tokens=300)
# Try fine tuned model
# model = "davinci:ft-personal-2023-07-14-10-43-51"
# response = openai.Completion.create(model=model,
# prompt=sample_chunks[0] + " -> ")
caption = response.choices[0]
print(caption)
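
Note that response.choices[0] is the full choice object; with the pre-1.0 openai client used here, the generated text itself sits one level deeper, roughly response.choices[0].message.content for ChatCompletion (or response.choices[0].text for the commented-out fine-tuned Completion variant).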


@@ -0,0 +1,33 @@
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
# Load the Pegasus model and tokenizer
model_name = "google/pegasus-large"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)
# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]
# Define the input text for summarization
text = sample_chunks[1]
inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt").to(device)
# Generate the summary
summary_ids = model.generate(
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
max_length=200,
num_beams=4,
length_penalty=2.0,
early_stopping=True,
)
# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Summary:", summary)


@@ -1,36 +1,27 @@
# Use OpenAI API endpoint to send data to OpenAI
# along with prompts to caption/summarize the conversation
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
# Load the T5 model and tokenizer
model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)
import openai
openai.api_key = ""
# to caption, user prompt used : "caption this conversation"
# max_tokens=20
# to incremental summarize, user prompt used : "summarize this conversation in a few sentences by taking key points"
# max_tokens=300
# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]
conversation = [
{"role": "system",
"content": sample_chunks[1]},
{"role": "user",
"content": "summarize this conversation in a few sentences by taking key points"}
]
model = "gpt-3.5-turbo"
response = openai.ChatCompletion.create(model=model,
messages=conversation,
n=1,
max_tokens=300)
# Define the input text for summarization
text = "Summarize the following text in 3 key points. text : " + sample_chunks[1]
# Try finetuned model
# model = "davinci:ft-personal-2023-07-14-10-43-51"
# response = openai.Completion.create(model=model,
# prompt=sample_chunks[0] + " -> ")
# Tokenize the input text
inputs = tokenizer.encode(text, return_tensors="pt").to(device)
caption = response.choices[0]
print(caption)
# Generate the summary
summary_ids = model.generate(inputs, max_length=1000, num_beams=4, early_stopping=True)
# Decode and print the summary
summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
print("Summary:", summary)


@@ -0,0 +1,44 @@
from gpt4all import GPT4All
model = GPT4All("/Users/gokulmohanarangan/Library/Application Support/nomic.ai/GPT4All/ggml-vicuna-13b-1.1-q4_2.bin")
import spacy
def split_text_file(filename, token_count):
nlp = spacy.load('en_core_web_md')
with open(filename, 'r') as file:
text = file.read()
doc = nlp(text)
total_tokens = len(doc)
parts = []
start_index = 0
while start_index < total_tokens:
end_index = start_index + token_count
part_tokens = doc[start_index:end_index]
part = ' '.join(token.text for token in part_tokens)
parts.append(part)
start_index = end_index
return parts
parts = split_text_file("transcript.txt", 1800)
final_summary = []
for part in parts:
prompt = f"""
### Human:
Summarize the following text without missing any key points and action items.
{part}
### Assistant:
"""
output = model.generate(prompt)
final_summary.append(output)
with open("sum.txt", "w") as sum:
sum.write(" ".join(final_summary))


@@ -18,11 +18,11 @@ import nltk
import yt_dlp as youtube_dl
from whisper_jax import FlaxWhisperPipline
from utils.file_utils import download_files, upload_files
from utils.log_utils import logger
from utils.run_utils import config
from utils.text_utilities import post_process_transcription, summarize
from utils.viz_utilities import create_talk_diff_scatter_viz, create_wordcloud
from ...utils.file_utils import download_files, upload_files
from ...utils.log_utils import logger
from ...utils.run_utils import config
from ...utils.text_utils import post_process_transcription, summarize
from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
@@ -30,8 +30,8 @@ nltk.download('stopwords', quiet=True)
WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
NOW = datetime.now()
if not os.path.exists('./artefacts'):
os.makedirs('./artefacts')
if not os.path.exists('../../artefacts'):
os.makedirs('../../artefacts')
def init_argparse() -> argparse.ArgumentParser:
@@ -91,7 +91,7 @@ def main():
# Download the audio
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download([args.location])
media_file = "./artefacts/audio.mp3"
media_file = "../artefacts/audio.mp3"
logger.info("Saved downloaded YouTube video to: " + media_file)
else:


@@ -10,11 +10,11 @@ from pynput import keyboard
from termcolor import colored
from whisper_jax import FlaxWhisperPipline
from utils.file_utils import upload_files
from utils.log_utils import logger
from utils.run_utils import config
from utils.text_utilities import post_process_transcription, summarize
from utils.viz_utilities import create_talk_diff_scatter_viz, create_wordcloud
from ...utils.file_utils import upload_files
from ...utils.log_utils import logger
from ...utils.run_utils import config
from ...utils.text_utils import post_process_transcription, summarize
from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud
WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]


@@ -1,5 +1,6 @@
[DEFAULT]
#SetexceptionruleforOpenMPerrortoallowduplicatelibinitialization
# Set exception rule for OpenMP error
# to allow duplicate lib initialization
KMP_DUPLICATE_LIB_OK=TRUE
#ExportOpenAIAPIKey
OPENAI_APIKEY=
@@ -7,8 +8,8 @@ OPENAI_APIKEY=
WHISPER_MODEL_SIZE=tiny
WHISPER_REAL_TIME_MODEL_SIZE=tiny
#AWSconfig
AWS_ACCESS_KEY=***REMOVED***
AWS_SECRET_KEY=***REMOVED***
AWS_ACCESS_KEY=
AWS_SECRET_KEY=
BUCKET_NAME=reflector-bucket
#Summarizerconfig
SUMMARY_MODEL=facebook/bart-large-cnn
@@ -17,8 +18,9 @@ MAX_LENGTH=2048
BEAM_SIZE=6
MAX_CHUNK_LENGTH=1024
SUMMARIZE_USING_CHUNKS=YES
#Audiodevice
# Audio device
BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator
AV_FOUNDATION_DEVICE_ID=1
# LLM PATH
LLM_PATH=
# LLM configs
LLM_MACHINE_IP=
LLM_MACHINE_PORT=
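
The config object imported from utils.run_utils throughout the codebase is presumably a standard configparser instance over this file; a minimal sketch of that assumption:

import configparser

config = configparser.ConfigParser()
config.read("utils/config.ini")

llm_ip = config["DEFAULT"]["LLM_MACHINE_IP"]
llm_port = config["DEFAULT"]["LLM_MACHINE_PORT"]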

utils/format_output.py (new file, 32 lines)

@@ -0,0 +1,32 @@
import json
with open("../artefacts/meeting_titles_and_summaries.txt", "r") as f:
outputs = f.read()
outputs = json.loads(outputs)
transcript_file = open("../artefacts/meeting_transcript.txt", "a")
title_desc_file = open("../artefacts/meeting_title_description.txt", "a")
summary_file = open("../artefacts/meeting_summary.txt", "a")
for item in outputs["topics"]:
transcript_file.write(item["transcript"])
summary_file.write(item["description"])
title_desc_file.write("TITLE: \n")
title_desc_file.write(item["title"])
title_desc_file.write("\n")
title_desc_file.write("DESCRIPTION: \n")
title_desc_file.write(item["description"])
title_desc_file.write("\n")
title_desc_file.write("TRANSCRIPT: \n")
title_desc_file.write(item["transcript"])
title_desc_file.write("\n")
title_desc_file.write("---------------------------------------- \n\n")
transcript_file.close()
title_desc_file.close()
summary_file.close()
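
One shape mismatch to watch for: this script expects the artefact file to hold a JSON object with a "topics" key (the UPDATE_TOPICS payload built in the server), while the server's final-summary path shown earlier dumps the bare incremental_responses list, so the writer and reader would need to agree. Illustrative expected contents (values hypothetical):

{"topics": [{"title": "...", "description": "...", "transcript": "...", "timestamp": "0:05:12"}]}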


@@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer
from utils.log_utils import logger
from utils.run_utils import config
from log_utils import logger
from run_utils import config
nltk.download('punkt', quiet=True)
@@ -154,7 +154,7 @@ def chunk_text(text,
def summarize(transcript_text, timestamp,
real_time=False,
summarize_using_chunks=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]):
chunk_summarize=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary_model = config["DEFAULT"]["SUMMARY_MODEL"]
if not summary_model:
@@ -166,27 +166,35 @@ def summarize(transcript_text, timestamp,
model = BartForConditionalGeneration.from_pretrained(summary_model)
model = model.to(device)
output_filename = "summary_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
output_file = "summary_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
if real_time:
output_filename = "real_time_" + output_filename
output_file = "real_time_" + output_file
if summarize_using_chunks != "YES":
inputs = tokenizer.\
if chunk_summarize != "YES":
max_length = int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"])
inputs = tokenizer. \
batch_encode_plus([transcript_text], truncation=True,
padding='longest',
max_length=int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"]),
max_length=max_length,
return_tensors='pt')
inputs = inputs.to(device)
with torch.no_grad():
num_beams = int(config["DEFAULT"]["BEAM_SIZE"])
max_length = int(config["DEFAULT"]["MAX_LENGTH"])
summaries = model.generate(inputs['input_ids'],
num_beams=int(config["DEFAULT"]["BEAM_SIZE"]), length_penalty=2.0,
max_length=int(config["DEFAULT"]["MAX_LENGTH"]), early_stopping=True)
num_beams=num_beams,
length_penalty=2.0,
max_length=max_length,
early_stopping=True)
decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for summary in summaries]
decoded_summaries = \
[tokenizer.decode(summary,
skip_special_tokens=True,
clean_up_tokenization_spaces=False)
for summary in summaries]
summary = " ".join(decoded_summaries)
with open("./artefacts/" + output_filename, 'w') as f:
with open("./artefacts/" + output_file, 'w') as f:
f.write(summary.strip() + "\n")
else:
logger.info("Breaking transcript into smaller chunks")
@@ -195,8 +203,8 @@ def summarize(transcript_text, timestamp,
logger.info(f"Transcript broken into {len(chunks)} "
f"chunks of at most 500 words")
logger.info(f"Writing summary text to: {output_filename}")
with open(output_filename, 'w') as f:
logger.info(f"Writing summary text to: {output_file}")
with open(output_file, 'w') as f:
summaries = summarize_chunks(chunks, tokenizer, model)
for summary in summaries:
f.write(summary.strip() + " ")


@@ -13,7 +13,7 @@ from wordcloud import STOPWORDS, WordCloud
en = spacy.load('en_core_web_md')
spacy_stopwords = en.Defaults.stop_words
STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))).\
STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))). \
union(set(spacy_stopwords))
@@ -24,7 +24,7 @@ def create_wordcloud(timestamp, real_time=False):
"""
filename = "transcript"
if real_time:
filename = "real_time_" + filename + "_" +\
filename = "real_time_" + filename + "_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
else:
filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
@@ -45,24 +45,24 @@ def create_wordcloud(timestamp, real_time=False):
plt.axis("off")
plt.tight_layout(pad=0)
wordcloud_name = "wordcloud"
wordcloud = "wordcloud"
if real_time:
wordcloud_name = "real_time_" + wordcloud_name + "_" +\
wordcloud = "real_time_" + wordcloud + "_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
else:
wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
wordcloud += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
plt.savefig("./artefacts/" + wordcloud_name)
plt.savefig("./artefacts/" + wordcloud)
def create_talk_diff_scatter_viz(timestamp, real_time=False):
"""
Perform agenda vs transription diff to see covered topics.
Perform agenda vs transcription diff to see covered topics.
Create a scatter plot of words in topics.
:return: None. Saved locally.
"""
spaCy_model = "en_core_web_md"
nlp = spacy.load(spaCy_model)
spacy_model = "en_core_web_md"
nlp = spacy.load(spacy_model)
nlp.add_pipe('sentencizer')
agenda_topics = []
@@ -75,12 +75,11 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
agenda_topics.append(line.split(":")[0])
# Load the transcription with timestamp
filename = ""
if real_time:
filename = "./artefacts/real_time_transcript_with_timestamp_" +\
filename = "./artefacts/real_time_transcript_with_timestamp_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
else:
filename = "./artefacts/transcript_with_timestamp_" +\
filename = "./artefacts/transcript_with_timestamp_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
with open(filename) as f:
transcription_timestamp_text = f.read()
@@ -142,7 +141,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
df = df.apply(create_new_columns, axis=1)
# Count the number of items covered and calculatre the percentage
# Count the number of items covered and calculate the percentage
num_covered_items = sum(covered_items.values())
percentage_covered = num_covered_items / len(agenda) * 100
@@ -158,7 +157,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
# Save df, mappings for further experimentation
df_name = "df"
if real_time:
df_name = "real_time_" + df_name + "_" +\
df_name = "real_time_" + df_name + "_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
else:
df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
@@ -169,7 +168,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
mappings_name = "mappings"
if real_time:
mappings_name = "real_time_" + mappings_name + "_" +\
mappings_name = "real_time_" + mappings_name + "_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
else:
mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"