Mirror of https://github.com/Monadical-SAS/reflector.git (synced 2025-12-20 20:29:06 +00:00)

Merge pull request #40 from Monadical-SAS/feat/gokul
Code refactor and cleanup from Feat/gokul
2  .gitignore (vendored)
@@ -165,7 +165,7 @@ cython_debug/
transcript_*.txt
test_*.txt
wordcloud*.png
*.ini
utils/config.ini
test_samples/
*.wav
*.mp3
@@ -5,15 +5,15 @@ import signal
from aiortc.contrib.signaling import (add_signaling_arguments,
                                      create_signaling)

from stream_client import StreamClient
from utils.log_utils import logger
from stream_client import StreamClient


async def main():
    parser = argparse.ArgumentParser(description="Data channels ping/pong")

    parser.add_argument(
        "--url", type=str, nargs="?", default="http://127.0.0.1:1250/offer"
        "--url", type=str, nargs="?", default="http://0.0.0.0:1250/offer"
    )

    parser.add_argument(
@@ -2,8 +2,6 @@ pyaudio==0.2.13
keyboard==0.13.5
pynput==1.7.6
wave==0.0.2
aiohttp==3.8.4
aiosignal==1.3.1
async-timeout==4.0.2
attrs==23.1.0
certifi==2023.5.7
@@ -51,11 +49,8 @@ matplotlib==3.7.2
matplotlib-inline==0.1.6
termcolor==2.3.0
ffmpeg==1.4
aiortc==1.5.0
cached_property==1.5.2
stamina==23.1.0
httpx==0.24.1
sortedcontainers==2.4.0
https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz
gpt4all==1.0.5
aiohttp_cors==0.7.0
2  scripts/setup_dependencies.sh → scripts/setup_pipeline_dependencies.sh (executable file → normal file)
@@ -26,7 +26,7 @@ pip install git+https://github.com/sanchit-gandhi/whisper-jax.git
# Update to latest version
pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit-gandhi/whisper-jax.git

pip install -r ../requirements.txt
pip install -r ../server-requirements.txt

# download spacy models
spacy download en_core_web_sm
4  scripts/setup_server_dependencies.sh (executable file)
@@ -0,0 +1,4 @@
#!/bin/sh

pip install --upgrade pip
pip install -r ../server-requirements.txt
50  server-requirements.txt (normal file)
@@ -0,0 +1,50 @@
aiohttp==3.8.5
aiohttp-cors==0.7.0
aioice==0.9.0
aiortc==1.5.0
aiosignal==1.3.1
anyio==3.7.1
async-timeout==4.0.2
attrs==23.1.0
av==10.0.0
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
coloredlogs==15.0.1
cryptography==41.0.2
ctranslate2==3.17.1
dnspython==2.4.0
faster-whisper==0.7.1
filelock==3.12.2
flatbuffers==23.5.26
frozenlist==1.4.0
fsspec==2023.6.0
google-crc32c==1.5.0
h11==0.14.0
httpcore==0.17.3
huggingface-hub==0.16.4
humanfriendly==10.0
idna==3.4
ifaddr==0.2.0
loguru==0.7.0
mpmath==1.3.0
multidict==6.0.4
numpy==1.25.1
onnxruntime==1.15.1
packaging==23.1
protobuf==4.23.4
pycparser==2.21
pyee==11.0.0
pylibsrtp==0.8.0
pyOpenSSL==23.2.0
PyYAML==6.0.1
requests==2.31.0
sniffio==1.3.0
sortedcontainers==2.4.0
sympy==1.12
tokenizers==0.13.3
tqdm==4.65.0
typing_extensions==4.7.1
urllib3==2.0.4
yarl==1.9.2
wave==0.0.2
@@ -1,29 +1,30 @@
import argparse
import asyncio
import datetime
import io
import json
import os
import uuid
import wave
from concurrent.futures import ThreadPoolExecutor

import aiohttp_cors
import jax.numpy as jnp
import requests
from aiohttp import web
from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription
from aiortc.contrib.media import MediaRelay
from av import AudioFifo
from faster_whisper import WhisperModel
from loguru import logger
from whisper_jax import FlaxWhisperPipline
from utils.run_utils import run_in_executor
from sortedcontainers import SortedDict

from utils.run_utils import run_in_executor, config

pcs = set()
relay = MediaRelay()
data_channel = None
pipeline = FlaxWhisperPipline("openai/whisper-tiny",
                              dtype=jnp.float16,
                              batch_size=16)
model = WhisperModel("tiny", device="cpu",
                     compute_type="float32",
                     num_workers=12)

CHANNELS = 2
RATE = 48000
@@ -31,8 +32,8 @@ audio_buffer = AudioFifo()
executor = ThreadPoolExecutor()
transcription_text = ""
last_transcribed_time = 0.0
LLM_MACHINE_IP = "216.153.52.83"
LLM_MACHINE_PORT = "5000"
LLM_MACHINE_IP = config["DEFAULT"]["LLM_MACHINE_IP"]
LLM_MACHINE_PORT = config["DEFAULT"]["LLM_MACHINE_PORT"]
LLM_URL = f"http://{LLM_MACHINE_IP}:{LLM_MACHINE_PORT}/api/v1/generate"
incremental_responses = []
sorted_transcripts = SortedDict()
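The hardcoded LLM host above is replaced by values read from the config object that utils/run_utils.py exports; that module is not part of this diff. A minimal sketch of what it plausibly looks like, assuming a standard configparser read of the git-ignored utils/config.ini (names and values below are illustrative):

# utils/run_utils.py (hypothetical sketch; the real module is not shown in this diff)
import configparser
import os

# utils/config.ini is listed in .gitignore above, so every developer keeps a local copy.
_CONFIG_PATH = os.path.join(os.path.dirname(__file__), "config.ini")

config = configparser.ConfigParser()
config.read(_CONFIG_PATH)

# Example utils/config.ini (placeholder values):
#
# [DEFAULT]
# LLM_MACHINE_IP = 127.0.0.1
# LLM_MACHINE_PORT = 5000
# WHISPER_REAL_TIME_MODEL_SIZE = tiny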
@@ -43,7 +44,7 @@ blacklisted_messages = [" Thank you.", " See you next time!",
def get_title_and_summary(llm_input_text, last_timestamp):
    print("Generating title and summary")
    logger.info("Generating title and summary")
    # output = llm.generate(prompt)

    # Use monadical-ml to fire this query to an LLM and get result
@@ -67,27 +68,28 @@ def get_title_and_summary(llm_input_text, last_timestamp):
        "prompt": prompt
    }

    # To-do: Handle unexpected output formats from the model
    # TODO : Handle unexpected output formats from the model
    try:
        response = requests.post(LLM_URL, headers=headers, json=data)
        output = json.loads(response.json()["results"][0]["text"])
        output["description"] = output.pop("summary")
        output["transcript"] = llm_input_text
        output["timestamp"] =\
        output["timestamp"] = \
            str(datetime.timedelta(seconds=round(last_timestamp)))
        incremental_responses.append(output)
        result = {
            "cmd": "UPDATE_TOPICS",
            "topics": incremental_responses,
        }

    except Exception as e:
        print("Exception" + str(e))
        logger.info("Exception" + str(e))
        result = None
    return result


def channel_log(channel, t, message):
    print("channel(%s) %s %s" % (channel.label, t, message))
    logger.info("channel(%s) %s %s" % (channel.label, t, message))


def channel_send(channel, message):
@@ -113,18 +115,25 @@ def channel_send_transcript(channel):
        # Due to exceptions if one of the earlier batches can't return
        # a transcript, we don't want to be stuck waiting for the result
        # With the threshold size of 3, we pop the first(lost) element
        elif len(sorted_transcripts) >= 3:
            del sorted_transcripts[least_time]
        else:
            if len(sorted_transcripts) >= 3:
                del sorted_transcripts[least_time]
    except Exception as e:
        print("Exception", str(e))
        logger.info("Exception", str(e))
        pass


def get_transcription(frames):
    print("Transcribing..")
    logger.info("Transcribing..")
    sorted_transcripts[frames[0].time] = None
    out_file = io.BytesIO()
    wf = wave.open(out_file, "wb")

    # TODO:
    # Passing IO objects instead of temporary files throws an error
    # Passing ndarrays (typecasted with float) does not give any
    # transcription. Refer issue,
    # https://github.com/guillaumekln/faster-whisper/issues/369
    audiofilename = "test" + str(datetime.datetime.now())
    wf = wave.open(audiofilename, "wb")
    wf.setnchannels(CHANNELS)
    wf.setframerate(RATE)
    wf.setsampwidth(2)
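The transcript batches above are keyed by the first frame's timestamp in a SortedDict, so results that the thread pool finishes out of order are still delivered to the data channel in time order, and a batch that never produced a transcript is abandoned once three newer entries have queued behind it. A minimal sketch of that flush pattern (names are illustrative, not the repository's):

# Illustrative in-order delivery with sortedcontainers.SortedDict.
from sortedcontainers import SortedDict

pending = SortedDict()  # start_time -> result dict, or None while still transcribing

def flush_in_order(send):
    while pending:
        oldest_time, result = pending.peekitem(0)
        if result is not None:
            send(result)                 # oldest result is ready: deliver it
            del pending[oldest_time]
        elif len(pending) >= 3:
            del pending[oldest_time]     # oldest batch is presumed lost: drop it
        else:
            break                        # wait for the oldest batch to finish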
@@ -133,22 +142,40 @@ def get_transcription(frames):
        wf.writeframes(b"".join(frame.to_ndarray()))
    wf.close()

    # To-Do: Look into WhisperTimeStampLogitsProcessor exception
    try:
        whisper_result = pipeline(out_file.getvalue(), return_timestamps=True)
    except Exception as e:
        return
    result_text = ""

    global transcription_text, last_transcribed_time
    transcription_text += whisper_result["text"]
    duration = whisper_result["chunks"][0]["timestamp"][1]
    if not duration:
        duration = 5.0
    last_transcribed_time += duration
    try:
        segments, _ = \
            model.transcribe(audiofilename,
                             language="en",
                             beam_size=5,
                             vad_filter=True,
                             vad_parameters=dict(min_silence_duration_ms=500))
        os.remove(audiofilename)
        segments = list(segments)
        result_text = ""
        duration = 0.0
        for segment in segments:
            result_text += segment.text
            start_time = segment.start
            end_time = segment.end
            if not segment.start:
                start_time = 0.0
            if not segment.end:
                end_time = 5.5
            duration += (end_time - start_time)

        global last_transcribed_time, transcription_text
        last_transcribed_time += duration
        transcription_text += result_text

    except Exception as e:
        logger.info("Exception" + str(e))
        pass

    result = {
        "cmd": "SHOW_TRANSCRIPTION",
        "text": whisper_result["text"]
        "text": result_text
    }
    sorted_transcripts[frames[0].time] = result
    return result
@@ -167,6 +194,9 @@ def get_final_summary_response():
                                 seconds=round(last_transcribed_time))),
        "summary": final_summary
    }

    with open("./artefacts/meeting_titles_and_summaries.txt", "a") as f:
        f.write(json.dumps(incremental_responses))
    return response
@@ -196,7 +226,7 @@ class AudioStreamTrack(MediaStreamTrack):
                else None
            )

            if len(transcription_text) > 500:
            if len(transcription_text) > 750:
                llm_input_text = transcription_text
                transcription_text = ""
                llm_result = run_in_executor(get_title_and_summary,
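run_in_executor comes from utils/run_utils.py, which this diff does not include. It presumably hands the blocking LLM request to the thread pool so the asyncio loop driving the WebRTC track is not stalled; a minimal sketch under that assumption:

# Hypothetical sketch of utils.run_utils.run_in_executor (the real helper is not shown here).
import asyncio
from concurrent.futures import ThreadPoolExecutor
from functools import partial

executor = ThreadPoolExecutor()

def run_in_executor(func, *args, **kwargs):
    # Schedule a blocking call on the thread pool and return an awaitable future.
    loop = asyncio.get_event_loop()
    return loop.run_in_executor(executor, partial(func, *args, **kwargs))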
@@ -245,7 +275,6 @@ async def offer(request):
        if isinstance(message, str) and message.startswith("ping"):
            channel_send(channel, "pong" + message[4:])


    @pc.on("connectionstatechange")
    async def on_connectionstatechange():
        log_info("Connection state is " + pc.connectionState)
@@ -278,6 +307,16 @@ async def on_shutdown(app):
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="WebRTC based server for Reflector"
    )
    parser.add_argument(
        "--host", default="0.0.0.0", help="Server host IP (def: 0.0.0.0)"
    )
    parser.add_argument(
        "--port", type=int, default=1250, help="Server port (def: 1250)"
    )
    args = parser.parse_args()
    app = web.Application()
    cors = aiohttp_cors.setup(
        app,
@@ -293,4 +332,4 @@ if __name__ == "__main__":
    offer_resource = cors.add(app.router.add_resource("/offer"))
    cors.add(offer_resource.add_route("POST", offer))
    app.on_shutdown.append(on_shutdown)
    web.run_app(app, access_log=None, host="127.0.0.1", port=1250)
    web.run_app(app, access_log=None, host=args.host, port=args.port)
@@ -17,7 +17,7 @@ class StreamClient:
    def __init__(
        self,
        signaling,
        url="http://127.0.0.1:1250",
        url="http://0.0.0.0:1250",
        play_from=None,
        ping_pong=False
    ):
@@ -114,7 +114,7 @@ class StreamClient:
        self.channel_log(channel, "<", message)

        if isinstance(message, str) and message.startswith("pong"):
            elapsed_ms = (self.current_stamp() - int(message[5:]))\
            elapsed_ms = (self.current_stamp() - int(message[5:])) \
                / 1000
            print(" RTT %.2f ms" % elapsed_ms)
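The RTT is measured by embedding a timestamp in the ping payload and subtracting it when the matching pong arrives; the division by 1000 suggests the stamp is in microseconds, as in the aiortc data-channel examples. A small self-contained illustration of the pattern (not the repository's code):

# Illustrative ping/pong RTT measurement over a data channel.
# Assumes current_stamp() returns microseconds elapsed since the client started.
import time

_time_start = None

def current_stamp():
    global _time_start
    if _time_start is None:
        _time_start = time.time()
        return 0
    return int((time.time() - _time_start) * 1_000_000)

def send_ping(channel):
    channel.send("ping %d" % current_stamp())

def on_message(message):
    if isinstance(message, str) and message.startswith("pong"):
        elapsed_ms = (current_stamp() - int(message[5:])) / 1000
        print(" RTT %.2f ms" % elapsed_ms)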
0  trials/__init__.py (normal file)
0  trials/finetuning/__init__.py (normal file)
24  trials/finetuning/inference_fine_tuned.py (normal file)
@@ -0,0 +1,24 @@
# Steps to prepare data and submit/check OpenAI finetuning
# import subprocess
# subprocess.run("openai tools fine_tunes.prepare_data -f " + "finetuning_dataset.jsonl")
# export OPENAI_API_KEY=
# openai api fine_tunes.create -t <TRAIN_FILE_ID_OR_PATH> -m <BASE_MODEL>
# openai api fine_tunes.list


import openai

# Use your OpenAI API Key
openai.api_key = ""

sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . -> ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas . - > "]
# Give your finetuned model name here
# "davinci:ft-personal-2023-07-14-10-43-51"
model_name = ""
response = openai.Completion.create(
    model=model_name,
    prompt=sample_chunks[0])

print(response)
98  trials/finetuning/youtube_scraping.py (normal file)
@@ -0,0 +1,98 @@
import json
import yt_dlp as youtube_dl
from whisper_jax import FlaxWhisperPipline
import jax.numpy as jnp


# Function to extract chapter information from a YouTube video URL
def get_youtube_chapters(video_id):
    video_url = "https://www.youtube.com/watch?v=" + video_id
    ydl_opts = {
        'extract_flat': 'in_playlist',
        'skip_download': True,
        'quiet': True,
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        video_info = ydl.extract_info(video_url, download=False)

    chapters = []

    if 'chapters' in video_info:
        for chapter in video_info['chapters']:
            start_time = chapter['start_time']
            end_time = chapter['end_time']
            title = chapter['title']

            chapters.append({
                'start': start_time,
                'end': end_time,
                'title': title
            })

    return chapters


# Function to extract video transcription using yt_dlp
def get_youtube_transcription(video_id):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': './artefacts/audio',  # Specify output file path and name
    }

    # Download the audio
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download(["https://www.youtube.com/watch?v=" + video_id])
    media_file = "./artefacts/audio.mp3"

    pipeline = FlaxWhisperPipline("openai/whisper-" + "tiny",
                                  dtype=jnp.float16,
                                  batch_size=16)
    whisper_result = pipeline(media_file, return_timestamps=True)
    return whisper_result["chunks"]


# Function to scrape YouTube video transcripts and chapter information
def scrape_youtube_data(video_id):
    transcript_text = get_youtube_transcription(video_id)
    chapters = get_youtube_chapters(video_id)
    print("transcript_text", transcript_text)
    print("chapters", chapters)
    return transcript_text, chapters


# Function to generate fine-tuning dataset from YouTube data
def generate_finetuning_dataset(video_ids):
    prompt_completion_pairs = []
    for video_id in video_ids:
        transcript_text, chapters = scrape_youtube_data(video_id)
        if transcript_text is not None and chapters is not None:
            for chapter in chapters:
                start_time = chapter["start"]
                end_time = chapter["end"]
                chapter_text = chapter["title"]

                prompt = ""
                for transcript in transcript_text:
                    if transcript["timestamp"][0] >= start_time and transcript["timestamp"][1] < end_time:
                        prompt += transcript["text"]

                if prompt is not None:
                    completion = chapter_text
                    prompt_completion_pairs.append({"prompt": prompt, "completion": completion})

    return prompt_completion_pairs


# Add all the video ids here, the videos must have captions [chapters]
video_ids = ["yTnSEZIwnkU"]
dataset = generate_finetuning_dataset(video_ids)

with open("finetuning_dataset.jsonl", "w") as f:
    for example in dataset:
        f.write(json.dumps(example) + "\n")
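Each line of finetuning_dataset.jsonl written above is one JSON object pairing a chapter's transcript text with its title, which is the record shape that openai tools fine_tunes.prepare_data expects. inference_fine_tuned.py later prompts the tuned model with the transcript followed by a " -> " separator; a sketch of reading one record back and building such a prompt (the separator handling is an assumption):

# Sketch: load one record of the generated dataset and build an inference prompt.
import json

with open("finetuning_dataset.jsonl") as f:
    record = json.loads(f.readline())  # {"prompt": "<chapter transcript>", "completion": "<chapter title>"}

inference_prompt = record["prompt"] + " -> "   # same separator used in inference_fine_tuned.py
print(inference_prompt[:80], "... expected title:", record["completion"])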
@@ -1,98 +0,0 @@
# # Approach 1
# from transformers import GPTNeoForCausalLM, GPT2Tokenizer
#
# model_name = 'EleutherAI/gpt-neo-1.3B'
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPTNeoForCausalLM.from_pretrained(model_name)
#
# conversation = """
# Summarize the following conversation in 3 key sentences:
#
# We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI .
# Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development .
# Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations .
# Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude .
# Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council .
# Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas .
# """
#
# input_ids = tokenizer.encode(conversation, return_tensors='pt')
#
# output = model.generate(input_ids,
#                         max_length=30,
#                         num_return_sequences=1)
#
# caption = tokenizer.decode(output[0], skip_special_tokens=True)
# print("Caption:", caption[len(input_ids):])

#
# # Approach 2
# import torch
# from transformers import GPT2LMHeadModel, GPT2Tokenizer
#
# model_name = "gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name)
#
# model.eval()
#
# text = """
# You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
# """
#
# tokenizer.pad_token = tokenizer.eos_token
# input_ids = tokenizer.encode(text,
#                              max_length=100,
#                              truncation=True,
#                              return_tensors="pt")
# attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
# output = model.generate(input_ids,
#                         max_new_tokens=20,
#                         num_return_sequences=1,
#                         num_beams=2,
#                         attention_mask=attention_mask)
#
# chapter_titles = [tokenizer.decode(output[i], skip_special_tokens=True) for i in range(output.shape[0])]
# for i, title in enumerate(chapter_titles):
#     print("Caption: ", title)

# Approach 3

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def generate_response(conversation, max_length=100):
    input_text = ""
    for entry in conversation:
        role = entry["role"]
        content = entry["content"]
        input_text += f"{role}: {content}\n"

    # Tokenize the entire conversation
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate text based on the entire conversation
    with torch.no_grad():
        output = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id)

    # Decode the generated text and return it
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

if __name__ == "__main__":
    model_name = "gpt2"
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    sample_chunks = [
"You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
    ]

    conversation = [
        {"role": "system", "content": "Summarize this text" },
        {"role": "user", "content": " text : " + sample_chunks[0]},
    ]

    response = generate_response(conversation)
    print("Response:", response)
0  trials/server/__init__.py (normal file)
@@ -16,8 +16,8 @@ from av import AudioFifo
from sortedcontainers import SortedDict
from whisper_jax import FlaxWhisperPipline

from utils.log_utils import logger
from utils.run_utils import config, Mutex
from reflector.utils.log_utils import logger
from reflector.utils.run_utils import config, Mutex

WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_REAL_TIME_MODEL_SIZE"]
pcs = set()
0  trials/title_summary/__init__.py (normal file)
57  trials/title_summary/api.py (normal file)
@@ -0,0 +1,57 @@
import requests
import spacy

# Enter the Machine where the LLM is hosted
LLM_MACHINE_IP = ""
# This is the URL of text-generation-webui
URL = f"http://{LLM_MACHINE_IP}:5000/api/v1/generate"

headers = {
    "Content-Type": "application/json"
}


def split_text_file(filename, token_count):
    nlp = spacy.load('en_core_web_md')

    with open(filename, 'r') as file:
        text = file.read()

    doc = nlp(text)
    total_tokens = len(doc)

    parts = []
    start_index = 0

    while start_index < total_tokens:
        end_index = start_index + token_count
        part_tokens = doc[start_index:end_index - 5]
        part = ' '.join(token.text for token in part_tokens)
        parts.append(part)
        start_index = end_index

    return parts


final_summary = ""
parts = split_text_file("transcript.txt", 1600)

for part in parts:
    prompt = f"""
    ### Human:
    Given the following text, distill the most important information
    into a short summary: {part}

    ### Assistant:
    """
    data = {
        "prompt": prompt
    }
    try:
        response = requests.post(URL, headers=headers, json=data)
        print(response.json())
    except Exception as e:
        print(str(e))

with open("summary.txt", "w") as sum:
    sum.write(" ".join(final_summary))
43  trials/title_summary/bert.py (normal file)
@@ -0,0 +1,43 @@
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the SentenceTransformer model
sentence_transformer_model = SentenceTransformer('average_word_embeddings_glove.6B.300d')

# Define the input text
text = "Your input text to be summarized goes here."

# Tokenize the text
tokens = tokenizer.tokenize(text)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([input_ids]).to(device)

# Get the BERT model output
with torch.no_grad():
    outputs = model(input_ids)[0]  # Extract the last hidden states

# Calculate sentence embeddings
sentence_embeddings = outputs.mean(dim=1).squeeze().cpu().numpy()
input_text_embedding = sentence_transformer_model.encode([text])[0]

# Calculate cosine similarity between sentences and input text
similarity_scores = cosine_similarity([input_text_embedding], sentence_embeddings)

# Sort the sentences by similarity scores in descending order
sorted_sentences = [sent for _, sent in sorted(zip(similarity_scores[0], sentences), reverse=True)]

# Choose the top sentences as the summary
num_summary_sentences = 2  # Adjust as needed
summary = ". ".join(sorted_sentences[:num_summary_sentences])
print("Summary:", summary)
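The ranking step above sorts a sentences list that the script never defines, and it compares a 300-dimensional GloVe embedding of the whole text against BERT hidden states of a different size. A minimal sketch of the presumably intended flow, embedding individual sentences and the full text with the same SentenceTransformer before ranking (illustrative only, not the repository's code):

# Illustrative sketch: rank sentences against the whole text with one embedding model.
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

st_model = SentenceTransformer('average_word_embeddings_glove.6B.300d')

text = "Your input text to be summarized goes here. It has several sentences. Keep the best ones."
sentences = [s.strip() for s in text.split(".") if s.strip()]  # naive sentence split

sentence_embeddings = st_model.encode(sentences)   # shape: (n_sentences, 300)
text_embedding = st_model.encode([text])           # shape: (1, 300)

scores = cosine_similarity(text_embedding, sentence_embeddings)[0]
ranked = [s for _, s in sorted(zip(scores, sentences), reverse=True)]

num_summary_sentences = 2
print("Summary:", ". ".join(ranked[:num_summary_sentences]))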
101  trials/title_summary/gpt2.py (normal file)
@@ -0,0 +1,101 @@
# Approach 1
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

model_name = 'EleutherAI/gpt-neo-1.3B'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)

conversation = """
Summarize the following conversation in 3 key sentences:

We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI .
Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development .
Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations .
Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude .
Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council .
Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas .
"""

input_ids = tokenizer.encode(conversation, return_tensors='pt')

output = model.generate(input_ids,
                        max_length=30,
                        num_return_sequences=1)

caption = tokenizer.decode(output[0], skip_special_tokens=True)
print("Caption:", caption[len(input_ids):])


# Approach 2
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

model.eval()

text = """
You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
"""

tokenizer.pad_token = tokenizer.eos_token
input_ids = tokenizer.encode(text,
                             max_length=100,
                             truncation=True,
                             return_tensors="pt")
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
output = model.generate(input_ids,
                        max_new_tokens=20,
                        num_return_sequences=1,
                        num_beams=2,
                        attention_mask=attention_mask)

chapter_titles = [tokenizer.decode(output[i], skip_special_tokens=True) for i in range(output.shape[0])]
for i, title in enumerate(chapter_titles):
    print("Caption: ", title)

# Approach 3

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer


def generate_response(conversation, max_length=100):
    input_text = ""
    for entry in conversation:
        role = entry["role"]
        content = entry["content"]
        input_text += f"{role}: {content}\n"

    # Tokenize the entire conversation
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate text based on the entire conversation
    with torch.no_grad():
        output = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id)

    # Decode the generated text and return it
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response


if __name__ == "__main__":

    # Call appropriate approach from the main while experimenting
    model_name = "gpt2"
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    sample_chunks = [
"You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
    ]

    conversation = [
        {"role": "system", "content": "Summarize this text"},
        {"role": "user", "content": " text : " + sample_chunks[0]},
    ]

    response = generate_response(conversation)
    print("Response:", response)
@@ -1,9 +1,11 @@
import spacy
import sys


# Observe the incremental summaries by performing summaries in chunks
with open("transcript.txt") as f:
    transcription = f.read()

import spacy


def split_text_file(filename, token_count):
    nlp = spacy.load('en_core_web_md')
@@ -26,8 +28,9 @@ def split_text_file(filename, token_count):
    return parts


# Set the chunk length here to split the transcript and test
MAX_CHUNK_LENGTH=1000
MAX_CHUNK_LENGTH = 1000

chunks = split_text_file("transcript.txt", MAX_CHUNK_LENGTH)
print("Number of chunks", len(chunks))
@@ -41,19 +44,17 @@ with open("chunks" + str(MAX_CHUNK_LENGTH) + ".txt", "a") as f:
# ex. python incsum.py 1 => will run approach 1
# If no input, will run all approaches

import sys
try:
    index = sys.argv[1]
except:
    index = None


# Approach 1 : facebook/bart-large-cnn
if index == "1" or index is None:
    SUMMARY_MODEL="facebook/bart-large-cnn"
    MIN_LENGTH=5
    MAX_LENGTH=10
    BEAM_SIZE=2
    SUMMARY_MODEL = "facebook/bart-large-cnn"
    MIN_LENGTH = 5
    MAX_LENGTH = 10
    BEAM_SIZE = 2

    print("Performing chunk summary : " + SUMMARY_MODEL)
@@ -81,7 +82,6 @@ if index == "1" or index is None:
        for summary in summaries:
            f.write(summary + "\n\n")


# Approach 2
if index == "2" or index is None:
    print("Performing chunk summary : " + "gpt-neo-1.3B")
@@ -108,14 +108,14 @@ if index == "2" or index is None:
                                max_length=max_length,
                                attention_mask=attention_mask,
                                pad_token_id=model.config.eos_token_id,
                                num_beams=4,
                                length_penalty=2.0,
                                early_stopping=True)
                                num_beams=4,
                                length_penalty=2.0,
                                early_stopping=True)
        summary_ids = output[0, input_length:]
        summary = tokenizer.decode(summary_ids, skip_special_tokens=True)
        summaries.append(summary)
        with open("gptneo1.3B-summaries.txt", "a") as f:
            f.write(summary + "\n\n")
            f.write(summary + "\n\n")

# Approach 3
if index == "3" or index is None:
@@ -155,4 +155,3 @@ if index == "3" or index is None:
    with open("mpt-7b-summaries.txt", "a") as f:
        for summary in summaries:
            f.write(summary + "\n\n")
37  trials/title_summary/openai_endpoint.py (normal file)
@@ -0,0 +1,37 @@
# Use OpenAI API endpoint to send data to OpenAI
# along with prompts to caption/summarize the conversation

import openai

openai.api_key = ""

# to caption, user prompt used : "caption this conversation"
# max_tokens=20

# to incremental summarize, user prompt used : "summarize this conversation in a few sentences by taking key points"
# max_tokens=300

sample_chunks = [
"You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]
conversation = [
    {"role": "system",
     "content": sample_chunks[1]},
    {"role": "user",
     "content": "summarize this conversation in a few sentences by taking key points"}
]

model = "gpt-3.5-turbo"
response = openai.ChatCompletion.create(model=model,
                                        messages=conversation,
                                        n=1,
                                        max_tokens=300)

# Try fine tuned model
# model = "davinci:ft-personal-2023-07-14-10-43-51"
# response = openai.Completion.create(model=model,
#                                     prompt=sample_chunks[0] + " -> ")

caption = response.choices[0]
print(caption)
33  trials/title_summary/pegasus.py (normal file)
@@ -0,0 +1,33 @@
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
# Load the Pegasus model and tokenizer
model_name = "google/pegasus-large"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]
# Define the input text for summarization
text = sample_chunks[1]

inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt").to(device)

# Generate the summary
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=200,
    num_beams=4,
    length_penalty=2.0,
    early_stopping=True,
)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Summary:", summary)
@@ -1,36 +1,27 @@
# Use OpenAI API endpoint to send data to OpenAI
# along with prompts to caption/summarize the conversation
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
# Load the T5 model and tokenizer
model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

import openai

openai.api_key = ""

# to caption, user prompt used : "caption this conversation"
# max_tokens=20

# to incremental summarize, user prompt used : "summarize this conversation in a few sentences by taking key points"
# max_tokens=300
# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
|
||||
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]
|
||||
|
||||
conversation = [
|
||||
{"role": "system",
|
||||
"content": sample_chunks[1]},
|
||||
{"role": "user",
|
||||
"content": "summarize this conversation in a few sentences by taking key points"}
|
||||
]
|
||||
|
||||
model = "gpt-3.5-turbo"
|
||||
response = openai.ChatCompletion.create(model=model,
|
||||
messages=conversation,
|
||||
n=1,
|
||||
max_tokens=300)
|
||||
# Define the input text for summarization
|
||||
text = "Summarize the following text in 3 key points. text : " + sample_chunks[1]
|
||||
|
||||
# Try finetuned model
|
||||
# model = "davinci:ft-personal-2023-07-14-10-43-51"
|
||||
# response = openai.Completion.create(model=model,
|
||||
# prompt=sample_chunks[0] + " -> ")
|
||||
# Tokenize the input text
|
||||
inputs = tokenizer.encode(text, return_tensors="pt").to(device)
|
||||
|
||||
caption = response.choices[0]
|
||||
print(caption)
|
||||
# Generate the summary
|
||||
summary_ids = model.generate(inputs, max_length=1000, num_beams=4, early_stopping=True)
|
||||
|
||||
# Decode and print the summary
|
||||
summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
|
||||
print("Summary:", summary)
|
||||
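The commented prompt settings in this trial (a short "caption this conversation" request at max_tokens=20 and a longer incremental-summary request at max_tokens=300) can be exercised through one helper. A minimal sketch, assuming the pre-1.0 openai client used above; the helper name and the gpt-3.5-turbo default are illustrative, not part of the repository:

import openai

def caption_or_summarize(transcript_chunk, mode="summary"):
    # Hypothetical helper combining the two prompt/max_tokens settings noted above.
    if mode == "caption":
        prompt, max_tokens = "caption this conversation", 20
    else:
        prompt = "summarize this conversation in a few sentences by taking key points"
        max_tokens = 300
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "system", "content": transcript_chunk},
                  {"role": "user", "content": prompt}],
        n=1,
        max_tokens=max_tokens,
    )
    return response.choices[0].message["content"]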
44
trials/title_summary/vicuna.py
Normal file
@@ -0,0 +1,44 @@
from gpt4all import GPT4All

model = GPT4All("/Users/gokulmohanarangan/Library/Application Support/nomic.ai/GPT4All/ggml-vicuna-13b-1.1-q4_2.bin")

import spacy


def split_text_file(filename, token_count):
    nlp = spacy.load('en_core_web_md')

    with open(filename, 'r') as file:
        text = file.read()

    doc = nlp(text)
    total_tokens = len(doc)

    parts = []
    start_index = 0

    while start_index < total_tokens:
        end_index = start_index + token_count
        part_tokens = doc[start_index:end_index]
        part = ' '.join(token.text for token in part_tokens)
        parts.append(part)
        start_index = end_index

    return parts


parts = split_text_file("transcript.txt", 1800)
final_summary = []
for part in parts:
    prompt = f"""
### Human:
Summarize the following text without missing any key points and action items.

{part}
### Assistant:
"""
    output = model.generate(prompt)
    final_summary.append(output)


with open("sum.txt", "w") as sum:
    sum.write(" ".join(final_summary))
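The loop above only concatenates the per-chunk outputs into sum.txt. If a single condensed summary is wanted, a second reduce pass over the joined chunk summaries is a natural follow-up; this sketch reuses the same GPT4All model and prompt style and is illustrative only, not part of the trial:

combined = " ".join(final_summary)
reduce_prompt = f"""
### Human:
Combine the following partial summaries into one short summary, keeping key points and action items.

{combined}
### Assistant:
"""
# GPT4All.generate() returns the model's text completion for the prompt.
final = model.generate(reduce_prompt)
print(final)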
0
trials/whisper-jax/__init__.py
Normal file
@@ -18,11 +18,11 @@ import nltk
import yt_dlp as youtube_dl
from whisper_jax import FlaxWhisperPipline

from utils.file_utils import download_files, upload_files
from utils.log_utils import logger
from utils.run_utils import config
from utils.text_utilities import post_process_transcription, summarize
from utils.viz_utilities import create_talk_diff_scatter_viz, create_wordcloud
from ...utils.file_utils import download_files, upload_files
from ...utils.log_utils import logger
from ...utils.run_utils import config
from ...utils.text_utils import post_process_transcription, summarize
from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
@@ -30,8 +30,8 @@ nltk.download('stopwords', quiet=True)
WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
NOW = datetime.now()

if not os.path.exists('./artefacts'):
os.makedirs('./artefacts')
if not os.path.exists('../../artefacts'):
os.makedirs('../../artefacts')


def init_argparse() -> argparse.ArgumentParser:
@@ -91,7 +91,7 @@ def main():
# Download the audio
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
ydl.download([args.location])
media_file = "./artefacts/audio.mp3"
media_file = "../artefacts/audio.mp3"

logger.info("Saved downloaded YouTube video to: " + media_file)
else:
@@ -10,11 +10,11 @@ from pynput import keyboard
from termcolor import colored
from whisper_jax import FlaxWhisperPipline

from utils.file_utils import upload_files
from utils.log_utils import logger
from utils.run_utils import config
from utils.text_utilities import post_process_transcription, summarize
from utils.viz_utilities import create_talk_diff_scatter_viz, create_wordcloud
from ...utils.file_utils import upload_files
from ...utils.log_utils import logger
from ...utils.run_utils import config
from ...utils.text_utils import post_process_transcription, summarize
from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud

WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
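Both whisper-jax scripts build their transcription on FlaxWhisperPipline with the checkpoint size taken from WHISPER_MODEL_SIZE. A minimal sketch of that call path, assuming the whisper-jax pipeline API and an already-downloaded audio file; the checkpoint string is assembled from the config value rather than copied from the scripts:

from whisper_jax import FlaxWhisperPipline

from utils.run_utils import config  # pre-refactor import path, for a standalone run

# "tiny" is the default WHISPER_MODEL_SIZE in utils/config.ini
checkpoint = "openai/whisper-" + config["DEFAULT"]["WHISPER_MODEL_SIZE"]
pipeline = FlaxWhisperPipline(checkpoint)

outputs = pipeline("../artefacts/audio.mp3", task="transcribe", return_timestamps=True)
print(outputs["text"])       # full transcript
print(outputs["chunks"][0])  # first timestamped segment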
@@ -1,5 +1,6 @@
[DEFAULT]
#SetexceptionruleforOpenMPerrortoallowduplicatelibinitialization
#Set exception rule for OpenMP error
#to allow duplicate lib initialization
KMP_DUPLICATE_LIB_OK=TRUE
#ExportOpenAIAPIKey
OPENAI_APIKEY=
@@ -7,8 +8,8 @@ OPENAI_APIKEY=
WHISPER_MODEL_SIZE=tiny
WHISPER_REAL_TIME_MODEL_SIZE=tiny
#AWSconfig
AWS_ACCESS_KEY=***REMOVED***
AWS_SECRET_KEY=***REMOVED***
AWS_ACCESS_KEY=
AWS_SECRET_KEY=
BUCKET_NAME=reflector-bucket
#Summarizerconfig
SUMMARY_MODEL=facebook/bart-large-cnn
@@ -17,8 +18,9 @@ MAX_LENGTH=2048
BEAM_SIZE=6
MAX_CHUNK_LENGTH=1024
SUMMARIZE_USING_CHUNKS=YES
#Audiodevice
# Audiodevice
BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator
AV_FOUNDATION_DEVICE_ID=1
# LLM PATH
LLM_PATH=
# LLM configs
LLM_MACHINE_IP=
LLM_MACHINE_PORT=
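The scripts read these keys through config['DEFAULT'][...] imported from utils.run_utils. A minimal sketch of how such a loader can sit on the standard-library configparser; the exact path and variable names are assumptions, since run_utils itself is not part of this diff:

import configparser

config = configparser.ConfigParser()
config.read("utils/config.ini")  # path assumed; note .gitignore now ignores utils/config.ini

whisper_model_size = config["DEFAULT"]["WHISPER_MODEL_SIZE"]       # e.g. "tiny"
beam_size = int(config["DEFAULT"]["BEAM_SIZE"])                    # ini values are strings, cast as needed
use_chunks = config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"] == "YES"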
32
utils/format_output.py
Normal file
@@ -0,0 +1,32 @@
import json

with open("../artefacts/meeting_titles_and_summaries.txt", "r") as f:
    outputs = f.read()

outputs = json.loads(outputs)

transcript_file = open("../artefacts/meeting_transcript.txt", "a")
title_desc_file = open("../artefacts/meeting_title_description.txt", "a")
summary_file = open("../artefacts/meeting_summary.txt", "a")

for item in outputs["topics"]:
    transcript_file.write(item["transcript"])
    summary_file.write(item["description"])

    title_desc_file.write("TITLE: \n")
    title_desc_file.write(item["title"])
    title_desc_file.write("\n")

    title_desc_file.write("DESCRIPTION: \n")
    title_desc_file.write(item["description"])
    title_desc_file.write("\n")

    title_desc_file.write("TRANSCRIPT: \n")
    title_desc_file.write(item["transcript"])
    title_desc_file.write("\n")

    title_desc_file.write("---------------------------------------- \n\n")

transcript_file.close()
title_desc_file.close()
summary_file.close()
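The formatter above opens three output files and closes them manually at the end, so a failed write leaves the handles open. A shorter equivalent using a single with statement; the file names match the script, and the restructuring is only a suggestion:

import json

with open("../artefacts/meeting_titles_and_summaries.txt", "r") as f:
    outputs = json.loads(f.read())

# One with-statement closes all three handles even if a write raises.
with open("../artefacts/meeting_transcript.txt", "a") as transcript_file, \
     open("../artefacts/meeting_title_description.txt", "a") as title_desc_file, \
     open("../artefacts/meeting_summary.txt", "a") as summary_file:
    for item in outputs["topics"]:
        transcript_file.write(item["transcript"])
        summary_file.write(item["description"])
        title_desc_file.write("TITLE: \n" + item["title"] + "\n")
        title_desc_file.write("DESCRIPTION: \n" + item["description"] + "\n")
        title_desc_file.write("TRANSCRIPT: \n" + item["transcript"] + "\n")
        title_desc_file.write("---------------------------------------- \n\n")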
@@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer

from utils.log_utils import logger
from utils.run_utils import config
from log_utils import logger
from run_utils import config

nltk.download('punkt', quiet=True)
@@ -154,7 +154,7 @@ def chunk_text(text,

def summarize(transcript_text, timestamp,
real_time=False,
summarize_using_chunks=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]):
chunk_summarize=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary_model = config["DEFAULT"]["SUMMARY_MODEL"]
if not summary_model:
@@ -166,27 +166,35 @@ def summarize(transcript_text, timestamp,
model = BartForConditionalGeneration.from_pretrained(summary_model)
model = model.to(device)

output_filename = "summary_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
output_file = "summary_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
if real_time:
output_filename = "real_time_" + output_filename
output_file = "real_time_" + output_file

if summarize_using_chunks != "YES":
inputs = tokenizer.\
if chunk_summarize != "YES":
max_length = int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"])
inputs = tokenizer. \
batch_encode_plus([transcript_text], truncation=True,
padding='longest',
max_length=int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"]),
max_length=max_length,
return_tensors='pt')
inputs = inputs.to(device)

with torch.no_grad():
num_beans = int(config["DEFAULT"]["BEAM_SIZE"])
max_length = int(config["DEFAULT"]["MAX_LENGTH"])
summaries = model.generate(inputs['input_ids'],
num_beams=int(config["DEFAULT"]["BEAM_SIZE"]), length_penalty=2.0,
max_length=int(config["DEFAULT"]["MAX_LENGTH"]), early_stopping=True)
num_beams=num_beans,
length_penalty=2.0,
max_length=max_length,
early_stopping=True)

decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False)
for summary in summaries]
decoded_summaries = \
[tokenizer.decode(summary,
skip_special_tokens=True,
clean_up_tokenization_spaces=False)
for summary in summaries]
summary = " ".join(decoded_summaries)
with open("./artefacts/" + output_filename, 'w') as f:
with open("./artefacts/" + output_file, 'w') as f:
f.write(summary.strip() + "\n")
else:
logger.info("Breaking transcript into smaller chunks")
@@ -195,8 +203,8 @@ def summarize(transcript_text, timestamp,
logger.info(f"Transcript broken into {len(chunks)} "
f"chunks of at most 500 words")

logger.info(f"Writing summary text to: {output_filename}")
with open(output_filename, 'w') as f:
logger.info(f"Writing summary text to: {output_file}")
with open(output_file, 'w') as f:
summaries = summarize_chunks(chunks, tokenizer, model)
for summary in summaries:
f.write(summary.strip() + " ")
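The chunked branch relies on chunk_text and summarize_chunks, whose bodies fall outside this hunk. A rough sketch of what a per-chunk BART pass can look like with the same config-driven generation settings; the function name matches the call above, but the implementation here is assumed:

def summarize_chunks(chunks, tokenizer, model):
    # Assumed implementation: summarize each chunk independently with the BART model.
    # Uses the module-level `config` imported from run_utils, as in the diff above.
    device = next(model.parameters()).device
    summaries = []
    for chunk in chunks:
        inputs = tokenizer(chunk, truncation=True, max_length=1024,
                           return_tensors="pt").to(device)
        ids = model.generate(inputs["input_ids"],
                             num_beams=int(config["DEFAULT"]["BEAM_SIZE"]),
                             length_penalty=2.0,
                             max_length=int(config["DEFAULT"]["MAX_LENGTH"]),
                             early_stopping=True)
        summaries.append(tokenizer.decode(ids[0], skip_special_tokens=True))
    return summaries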
@@ -13,7 +13,7 @@ from wordcloud import STOPWORDS, WordCloud
en = spacy.load('en_core_web_md')
spacy_stopwords = en.Defaults.stop_words

STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))).\
STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))). \
union(set(spacy_stopwords))
@@ -24,7 +24,7 @@ def create_wordcloud(timestamp, real_time=False):
"""
filename = "transcript"
if real_time:
filename = "real_time_" + filename + "_" +\
filename = "real_time_" + filename + "_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
else:
filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
@@ -45,24 +45,24 @@ def create_wordcloud(timestamp, real_time=False):
plt.axis("off")
plt.tight_layout(pad=0)

wordcloud_name = "wordcloud"
wordcloud = "wordcloud"
if real_time:
wordcloud_name = "real_time_" + wordcloud_name + "_" +\
wordcloud = "real_time_" + wordcloud + "_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
else:
wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
wordcloud += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"

plt.savefig("./artefacts/" + wordcloud_name)
plt.savefig("./artefacts/" + wordcloud)


def create_talk_diff_scatter_viz(timestamp, real_time=False):
"""
Perform agenda vs transription diff to see covered topics.
Perform agenda vs transcription diff to see covered topics.
Create a scatter plot of words in topics.
:return: None. Saved locally.
"""
spaCy_model = "en_core_web_md"
nlp = spacy.load(spaCy_model)
spacy_model = "en_core_web_md"
nlp = spacy.load(spacy_model)
nlp.add_pipe('sentencizer')

agenda_topics = []
@@ -75,12 +75,11 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
agenda_topics.append(line.split(":")[0])

# Load the transcription with timestamp
filename = ""
if real_time:
filename = "./artefacts/real_time_transcript_with_timestamp_" +\
filename = "./artefacts/real_time_transcript_with_timestamp_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
else:
filename = "./artefacts/transcript_with_timestamp_" +\
filename = "./artefacts/transcript_with_timestamp_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
with open(filename) as f:
transcription_timestamp_text = f.read()
@@ -142,7 +141,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):

df = df.apply(create_new_columns, axis=1)

# Count the number of items covered and calculatre the percentage
# Count the number of items covered and calculate the percentage
num_covered_items = sum(covered_items.values())
percentage_covered = num_covered_items / len(agenda) * 100
@@ -158,7 +157,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):
# Save df, mappings for further experimentation
df_name = "df"
if real_time:
df_name = "real_time_" + df_name + "_" +\
df_name = "real_time_" + df_name + "_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
else:
df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
@@ -169,7 +168,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False):

mappings_name = "mappings"
if real_time:
mappings_name = "real_time_" + mappings_name + "_" +\
mappings_name = "real_time_" + mappings_name + "_" + \
timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
else:
mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
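The same timestamped naming pattern (an optional real_time_ prefix, a strftime suffix, an extension) is rebuilt by hand for summaries, transcripts, wordclouds, dataframes, and mappings. A small helper could centralize it; this is a refactoring sketch only, and the function does not exist in the repository:

from datetime import datetime

def artefact_name(base, timestamp, ext, real_time=False):
    # Hypothetical helper: e.g. "wordcloud_07-14-2023_10:43:51.png" or "real_time_df_....pkl"
    prefix = "real_time_" if real_time else ""
    return prefix + base + "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ext

# Example usage mirroring the call sites above:
# plt.savefig("./artefacts/" + artefact_name("wordcloud", timestamp, ".png", real_time))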