diff --git a/.gitignore b/.gitignore index c08eb9a3..c6d9edb2 100644 --- a/.gitignore +++ b/.gitignore @@ -165,7 +165,7 @@ cython_debug/ transcript_*.txt test_*.txt wordcloud*.png -*.ini +utils/config.ini test_samples/ *.wav *.mp3 diff --git a/trials/api.py b/__init__.py similarity index 100% rename from trials/api.py rename to __init__.py diff --git a/client.py b/client.py index b0fa46a5..b2167d3b 100644 --- a/client.py +++ b/client.py @@ -5,15 +5,15 @@ import signal from aiortc.contrib.signaling import (add_signaling_arguments, create_signaling) -from stream_client import StreamClient from utils.log_utils import logger +from stream_client import StreamClient async def main(): parser = argparse.ArgumentParser(description="Data channels ping/pong") parser.add_argument( - "--url", type=str, nargs="?", default="http://127.0.0.1:1250/offer" + "--url", type=str, nargs="?", default="http://0.0.0.0:1250/offer" ) parser.add_argument( diff --git a/requirements.txt b/pipeline-requirements.txt similarity index 91% rename from requirements.txt rename to pipeline-requirements.txt index fb69c4bd..24e7a092 100644 --- a/requirements.txt +++ b/pipeline-requirements.txt @@ -2,8 +2,6 @@ pyaudio==0.2.13 keyboard==0.13.5 pynput==1.7.6 wave==0.0.2 -aiohttp==3.8.4 -aiosignal==1.3.1 async-timeout==4.0.2 attrs==23.1.0 certifi==2023.5.7 @@ -51,11 +49,8 @@ matplotlib==3.7.2 matplotlib-inline==0.1.6 termcolor==2.3.0 ffmpeg==1.4 -aiortc==1.5.0 cached_property==1.5.2 stamina==23.1.0 httpx==0.24.1 -sortedcontainers==2.4.0 https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz gpt4all==1.0.5 -aiohttp_cors==0.7.0 diff --git a/scripts/setup_dependencies.sh b/scripts/setup_pipeline_dependencies.sh old mode 100755 new mode 100644 similarity index 93% rename from scripts/setup_dependencies.sh rename to scripts/setup_pipeline_dependencies.sh index b7dc6d77..95d5d41d --- a/scripts/setup_dependencies.sh +++ b/scripts/setup_pipeline_dependencies.sh @@ -26,7 +26,7 @@ pip install git+https://github.com/sanchit-gandhi/whisper-jax.git # Update to latest version pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit-gandhi/whisper-jax.git -pip install -r ../requirements.txt +pip install -r ../server-requirements.txt # download spacy models spacy download en_core_web_sm diff --git a/scripts/setup_server_dependencies.sh b/scripts/setup_server_dependencies.sh new file mode 100755 index 00000000..50288d54 --- /dev/null +++ b/scripts/setup_server_dependencies.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +pip install --upgrade pip +pip install -r ../server-requirements.txt \ No newline at end of file diff --git a/server-requirements.txt b/server-requirements.txt new file mode 100644 index 00000000..01d7af38 --- /dev/null +++ b/server-requirements.txt @@ -0,0 +1,50 @@ +aiohttp==3.8.5 +aiohttp-cors==0.7.0 +aioice==0.9.0 +aiortc==1.5.0 +aiosignal==1.3.1 +anyio==3.7.1 +async-timeout==4.0.2 +attrs==23.1.0 +av==10.0.0 +certifi==2023.7.22 +cffi==1.15.1 +charset-normalizer==3.2.0 +coloredlogs==15.0.1 +cryptography==41.0.2 +ctranslate2==3.17.1 +dnspython==2.4.0 +faster-whisper==0.7.1 +filelock==3.12.2 +flatbuffers==23.5.26 +frozenlist==1.4.0 +fsspec==2023.6.0 +google-crc32c==1.5.0 +h11==0.14.0 +httpcore==0.17.3 +huggingface-hub==0.16.4 +humanfriendly==10.0 +idna==3.4 +ifaddr==0.2.0 +loguru==0.7.0 +mpmath==1.3.0 +multidict==6.0.4 +numpy==1.25.1 +onnxruntime==1.15.1 +packaging==23.1 +protobuf==4.23.4 +pycparser==2.21 +pyee==11.0.0 +pylibsrtp==0.8.0 +pyOpenSSL==23.2.0 +PyYAML==6.0.1 +requests==2.31.0 +sniffio==1.3.0 
+sortedcontainers==2.4.0 +sympy==1.12 +tokenizers==0.13.3 +tqdm==4.65.0 +typing_extensions==4.7.1 +urllib3==2.0.4 +yarl==1.9.2 +wave==0.0.2 diff --git a/server_executor_cleaned.py b/server.py similarity index 73% rename from server_executor_cleaned.py rename to server.py index 2d8f3747..55066eef 100644 --- a/server_executor_cleaned.py +++ b/server.py @@ -1,29 +1,30 @@ +import argparse import asyncio import datetime -import io import json +import os import uuid import wave from concurrent.futures import ThreadPoolExecutor import aiohttp_cors -import jax.numpy as jnp import requests from aiohttp import web from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription from aiortc.contrib.media import MediaRelay from av import AudioFifo +from faster_whisper import WhisperModel from loguru import logger -from whisper_jax import FlaxWhisperPipline -from utils.run_utils import run_in_executor from sortedcontainers import SortedDict +from utils.run_utils import run_in_executor, config + pcs = set() relay = MediaRelay() data_channel = None -pipeline = FlaxWhisperPipline("openai/whisper-tiny", - dtype=jnp.float16, - batch_size=16) +model = WhisperModel("tiny", device="cpu", + compute_type="float32", + num_workers=12) CHANNELS = 2 RATE = 48000 @@ -31,8 +32,8 @@ audio_buffer = AudioFifo() executor = ThreadPoolExecutor() transcription_text = "" last_transcribed_time = 0.0 -LLM_MACHINE_IP = "216.153.52.83" -LLM_MACHINE_PORT = "5000" +LLM_MACHINE_IP = config["DEFAULT"]["LLM_MACHINE_IP"] +LLM_MACHINE_PORT = config["DEFAULT"]["LLM_MACHINE_PORT"] LLM_URL = f"http://{LLM_MACHINE_IP}:{LLM_MACHINE_PORT}/api/v1/generate" incremental_responses = [] sorted_transcripts = SortedDict() @@ -43,7 +44,7 @@ blacklisted_messages = [" Thank you.", " See you next time!", def get_title_and_summary(llm_input_text, last_timestamp): - print("Generating title and summary") + logger.info("Generating title and summary") # output = llm.generate(prompt) # Use monadical-ml to fire this query to an LLM and get result @@ -53,11 +54,11 @@ def get_title_and_summary(llm_input_text, last_timestamp): prompt = f""" ### Human: - Create a JSON object as response. The JSON object must have 2 fields: - i) title and ii) summary. For the title field,generate a short title - for the given text. For the summary field, summarize the given text + Create a JSON object as response. The JSON object must have 2 fields: + i) title and ii) summary. For the title field,generate a short title + for the given text. For the summary field, summarize the given text in three sentences. 
- + {llm_input_text} ### Assistant: @@ -67,27 +68,28 @@ def get_title_and_summary(llm_input_text, last_timestamp): "prompt": prompt } - # To-do: Handle unexpected output formats from the model + # TODO : Handle unexpected output formats from the model try: response = requests.post(LLM_URL, headers=headers, json=data) output = json.loads(response.json()["results"][0]["text"]) output["description"] = output.pop("summary") output["transcript"] = llm_input_text - output["timestamp"] =\ + output["timestamp"] = \ str(datetime.timedelta(seconds=round(last_timestamp))) incremental_responses.append(output) result = { "cmd": "UPDATE_TOPICS", "topics": incremental_responses, } + except Exception as e: - print("Exception" + str(e)) + logger.info("Exception" + str(e)) result = None return result def channel_log(channel, t, message): - print("channel(%s) %s %s" % (channel.label, t, message)) + logger.info("channel(%s) %s %s" % (channel.label, t, message)) def channel_send(channel, message): @@ -113,18 +115,25 @@ def channel_send_transcript(channel): # Due to exceptions if one of the earlier batches can't return # a transcript, we don't want to be stuck waiting for the result # With the threshold size of 3, we pop the first(lost) element - elif len(sorted_transcripts) >= 3: - del sorted_transcripts[least_time] + else: + if len(sorted_transcripts) >= 3: + del sorted_transcripts[least_time] except Exception as e: - print("Exception", str(e)) + logger.info("Exception", str(e)) pass def get_transcription(frames): - print("Transcribing..") + logger.info("Transcribing..") sorted_transcripts[frames[0].time] = None - out_file = io.BytesIO() - wf = wave.open(out_file, "wb") + + # TODO: + # Passing IO objects instead of temporary files throws an error + # Passing ndarrays (typecasted with float) does not give any + # transcription. 
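# (As an aside, a tempfile-based variant of this temporary-file workaround
#  could look like the following, assuming the same wave-file round trip;
#  the names here are illustrative and not part of the original change:
#      import tempfile
#      tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
#      audiofilename = tmp.name
#  which keeps the scratch file in the system temp directory under a valid
#  .wav name.)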
Refer issue, + # https://github.com/guillaumekln/faster-whisper/issues/369 + audiofilename = "test" + str(datetime.datetime.now()) + wf = wave.open(audiofilename, "wb") wf.setnchannels(CHANNELS) wf.setframerate(RATE) wf.setsampwidth(2) @@ -133,22 +142,40 @@ def get_transcription(frames): wf.writeframes(b"".join(frame.to_ndarray())) wf.close() - # To-Do: Look into WhisperTimeStampLogitsProcessor exception - try: - whisper_result = pipeline(out_file.getvalue(), return_timestamps=True) - except Exception as e: - return + result_text = "" - global transcription_text, last_transcribed_time - transcription_text += whisper_result["text"] - duration = whisper_result["chunks"][0]["timestamp"][1] - if not duration: - duration = 5.0 - last_transcribed_time += duration + try: + segments, _ = \ + model.transcribe(audiofilename, + language="en", + beam_size=5, + vad_filter=True, + vad_parameters=dict(min_silence_duration_ms=500)) + os.remove(audiofilename) + segments = list(segments) + result_text = "" + duration = 0.0 + for segment in segments: + result_text += segment.text + start_time = segment.start + end_time = segment.end + if not segment.start: + start_time = 0.0 + if not segment.end: + end_time = 5.5 + duration += (end_time - start_time) + + global last_transcribed_time, transcription_text + last_transcribed_time += duration + transcription_text += result_text + + except Exception as e: + logger.info("Exception" + str(e)) + pass result = { "cmd": "SHOW_TRANSCRIPTION", - "text": whisper_result["text"] + "text": result_text } sorted_transcripts[frames[0].time] = result return result @@ -167,6 +194,9 @@ def get_final_summary_response(): seconds=round(last_transcribed_time))), "summary": final_summary } + + with open("./artefacts/meeting_titles_and_summaries.txt", "a") as f: + f.write(json.dumps(incremental_responses)) return response @@ -196,7 +226,7 @@ class AudioStreamTrack(MediaStreamTrack): else None ) - if len(transcription_text) > 500: + if len(transcription_text) > 750: llm_input_text = transcription_text transcription_text = "" llm_result = run_in_executor(get_title_and_summary, @@ -245,7 +275,6 @@ async def offer(request): if isinstance(message, str) and message.startswith("ping"): channel_send(channel, "pong" + message[4:]) - @pc.on("connectionstatechange") async def on_connectionstatechange(): log_info("Connection state is " + pc.connectionState) @@ -278,6 +307,16 @@ async def on_shutdown(app): if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="WebRTC based server for Reflector" + ) + parser.add_argument( + "--host", default="0.0.0.0", help="Server host IP (def: 0.0.0.0)" + ) + parser.add_argument( + "--port", type=int, default=1250, help="Server port (def: 1250)" + ) + args = parser.parse_args() app = web.Application() cors = aiohttp_cors.setup( app, @@ -293,4 +332,4 @@ if __name__ == "__main__": offer_resource = cors.add(app.router.add_resource("/offer")) cors.add(offer_resource.add_route("POST", offer)) app.on_shutdown.append(on_shutdown) - web.run_app(app, access_log=None, host="127.0.0.1", port=1250) + web.run_app(app, access_log=None, host=args.host, port=args.port) diff --git a/stream_client.py b/stream_client.py index 124c734d..c2238ee5 100644 --- a/stream_client.py +++ b/stream_client.py @@ -17,7 +17,7 @@ class StreamClient: def __init__( self, signaling, - url="http://127.0.0.1:1250", + url="http://0.0.0.0:1250", play_from=None, ping_pong=False ): @@ -114,7 +114,7 @@ class StreamClient: self.channel_log(channel, "<", message) if isinstance(message, 
str) and message.startswith("pong"): - elapsed_ms = (self.current_stamp() - int(message[5:]))\ + elapsed_ms = (self.current_stamp() - int(message[5:])) \ / 1000 print(" RTT %.2f ms" % elapsed_ms) diff --git a/trials/__init__.py b/trials/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/trials/finetuning/__init__.py b/trials/finetuning/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/trials/finetuning/inference_fine_tuned.py b/trials/finetuning/inference_fine_tuned.py new file mode 100644 index 00000000..4a396071 --- /dev/null +++ b/trials/finetuning/inference_fine_tuned.py @@ -0,0 +1,24 @@ +# Steps to prepare data and submit/check OpenAI finetuning +# import subprocess +# subprocess.run("openai tools fine_tunes.prepare_data -f " + "finetuning_dataset.jsonl") +# export OPENAI_API_KEY= +# openai api fine_tunes.create -t -m +# openai api fine_tunes.list + + +import openai + +# Use your OpenAI API Key +openai.api_key = "" + +sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . -> ", + " We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . 
Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas . - > "] + +# Give your finetuned model name here +# "davinci:ft-personal-2023-07-14-10-43-51" +model_name = "" +response = openai.Completion.create( + model=model_name, + prompt=sample_chunks[0]) + +print(response) diff --git a/trials/finetuning/youtube_scraping.py b/trials/finetuning/youtube_scraping.py new file mode 100644 index 00000000..b0892f47 --- /dev/null +++ b/trials/finetuning/youtube_scraping.py @@ -0,0 +1,98 @@ +import json +import yt_dlp as youtube_dl +from whisper_jax import FlaxWhisperPipline +import jax.numpy as jnp + +# Function to extract chapter information from a YouTube video URL +def get_youtube_chapters(video_id): + video_url = "https://www.youtube.com/watch?v=" + video_id + ydl_opts = { + 'extract_flat': 'in_playlist', + 'skip_download': True, + 'quiet': True, + } + + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + video_info = ydl.extract_info(video_url, download=False) + + chapters = [] + + if 'chapters' in video_info: + for chapter in video_info['chapters']: + start_time = chapter['start_time'] + end_time = chapter['end_time'] + title = chapter['title'] + + chapters.append({ + 'start': start_time, + 'end': end_time, + 'title': title + }) + + return chapters + + +# Function to extract video transcription using yt_dlp +def get_youtube_transcription(video_id): + ydl_opts = { + 'format': 'bestaudio/best', + 'postprocessors': [{ + 'key': 'FFmpegExtractAudio', + 'preferredcodec': 'mp3', + 'preferredquality': '192', + }], + 'outtmpl': './artefacts/audio', # Specify output file path and name + } + + # Download the audio + with youtube_dl.YoutubeDL(ydl_opts) as ydl: + ydl.download(["https://www.youtube.com/watch?v=" + video_id]) + media_file = "./artefacts/audio.mp3" + + pipeline = FlaxWhisperPipline("openai/whisper-" + "tiny", + dtype=jnp.float16, + batch_size=16) + whisper_result = pipeline(media_file, return_timestamps=True) + return whisper_result["chunks"] + + + +# Function to scrape YouTube video transcripts and chapter information +def scrape_youtube_data(video_id): + transcript_text = get_youtube_transcription(video_id) + chapters = get_youtube_chapters(video_id) + print("transcript_text", transcript_text) + print("chapters", chapters) + return transcript_text, chapters + + +# Function to generate fine-tuning dataset from YouTube data +def generate_finetuning_dataset(video_ids): + prompt_completion_pairs = [] + for video_id in video_ids: + transcript_text, chapters = scrape_youtube_data(video_id) + if transcript_text is not None and chapters is not None: + for chapter in chapters: + start_time = chapter["start"] + end_time = chapter["end"] + chapter_text = chapter["title"] + + prompt = "" + for transcript 
in transcript_text: + if transcript["timestamp"][0] >= start_time and transcript["timestamp"][1] < end_time: + prompt += transcript["text"] + + if prompt is not None: + completion = chapter_text + prompt_completion_pairs.append({"prompt": prompt, "completion": completion}) + + return prompt_completion_pairs + + +# Add all the video ids here, the videos must have captions [chapters] +video_ids = ["yTnSEZIwnkU"] +dataset = generate_finetuning_dataset(video_ids) + +with open("finetuning_dataset.jsonl", "w") as f: + for example in dataset: + f.write(json.dumps(example) + "\n") diff --git a/trials/gpt2.py b/trials/gpt2.py deleted file mode 100644 index d3917af2..00000000 --- a/trials/gpt2.py +++ /dev/null @@ -1,98 +0,0 @@ -# # Approach 1 -# from transformers import GPTNeoForCausalLM, GPT2Tokenizer -# -# model_name = 'EleutherAI/gpt-neo-1.3B' -# tokenizer = GPT2Tokenizer.from_pretrained(model_name) -# model = GPTNeoForCausalLM.from_pretrained(model_name) -# -# conversation = """ -# Summarize the following conversation in 3 key sentences: -# -# We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . -# Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . -# Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . -# Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . -# Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . -# Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas . -# """ -# -# input_ids = tokenizer.encode(conversation, return_tensors='pt') -# -# output = model.generate(input_ids, -# max_length=30, -# num_return_sequences=1) -# -# caption = tokenizer.decode(output[0], skip_special_tokens=True) -# print("Caption:", caption[len(input_ids):]) - -# -# # Approach 2 -# import torch -# from transformers import GPT2LMHeadModel, GPT2Tokenizer -# -# model_name = "gpt2" -# tokenizer = GPT2Tokenizer.from_pretrained(model_name) -# model = GPT2LMHeadModel.from_pretrained(model_name) -# -# model.eval() -# -# text = """ -# You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . 
And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . " -# """ -# -# tokenizer.pad_token = tokenizer.eos_token -# input_ids = tokenizer.encode(text, -# max_length=100, -# truncation=True, -# return_tensors="pt") -# attention_mask = torch.ones(input_ids.shape, dtype=torch.long) -# output = model.generate(input_ids, -# max_new_tokens=20, -# num_return_sequences=1, -# num_beams=2, -# attention_mask=attention_mask) -# -# chapter_titles = [tokenizer.decode(output[i], skip_special_tokens=True) for i in range(output.shape[0])] -# for i, title in enumerate(chapter_titles): -# print("Caption: ", title) - -# Approach 3 - -import torch -from transformers import GPT2Tokenizer, GPT2LMHeadModel - -def generate_response(conversation, max_length=100): - input_text = "" - for entry in conversation: - role = entry["role"] - content = entry["content"] - input_text += f"{role}: {content}\n" - - # Tokenize the entire conversation - input_ids = tokenizer.encode(input_text, return_tensors="pt") - - # Generate text based on the entire conversation - with torch.no_grad(): - output = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id) - - # Decode the generated text and return it - response = tokenizer.decode(output[0], skip_special_tokens=True) - return response - -if __name__ == "__main__": - model_name = "gpt2" - model = GPT2LMHeadModel.from_pretrained(model_name) - tokenizer = GPT2Tokenizer.from_pretrained(model_name) - - sample_chunks = [ - "You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . 
If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . 
" - ] - - conversation = [ - {"role": "system", "content": "Summarize this text" }, - {"role": "user", "content": " text : " + sample_chunks[0]}, - ] - - response = generate_response(conversation) - print("Response:", response) - diff --git a/trials/server/__init__.py b/trials/server/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/server_multithreaded.py b/trials/server/server_multithreaded.py similarity index 98% rename from server_multithreaded.py rename to trials/server/server_multithreaded.py index 2862fa36..1c5e75d7 100644 --- a/server_multithreaded.py +++ b/trials/server/server_multithreaded.py @@ -16,8 +16,8 @@ from av import AudioFifo from sortedcontainers import SortedDict from whisper_jax import FlaxWhisperPipline -from utils.log_utils import logger -from utils.run_utils import config, Mutex +from reflector.utils.log_utils import logger +from reflector.utils.run_utils import config, Mutex WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_REAL_TIME_MODEL_SIZE"] pcs = set() diff --git a/trials/title_summary/__init__.py b/trials/title_summary/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/trials/title_summary/api.py b/trials/title_summary/api.py new file mode 100644 index 00000000..eb6a1fbb --- /dev/null +++ b/trials/title_summary/api.py @@ -0,0 +1,57 @@ +import requests +import spacy + +# Enter the Machine where the LLM is hosted +LLM_MACHINE_IP = "" +# This is the URL of text-generation-webui +URL = f"http://{LLM_MACHINE_IP}:5000/api/v1/generate" + +headers = { + "Content-Type": "application/json" +} + + +def split_text_file(filename, token_count): + nlp = spacy.load('en_core_web_md') + + with open(filename, 'r') as file: + text = file.read() + + doc = nlp(text) + total_tokens = len(doc) + + parts = [] + start_index = 0 + + while start_index < total_tokens: + end_index = start_index + token_count + part_tokens = doc[start_index:end_index - 5] + part = ' '.join(token.text for token in part_tokens) + parts.append(part) + start_index = end_index + + return parts + + +final_summary = "" +parts = split_text_file("transcript.txt", 1600) + +for part in parts: + prompt = f""" + ### Human: + Given the following text, distill the most important information + into a short summary: {part} + + ### Assistant: + """ + data = { + "prompt": prompt + } + try: + response = requests.post(URL, headers=headers, json=data) + print(response.json()) + except Exception as e: + print(str(e)) + +with open("summary.txt", "w") as sum: + sum.write(" ".join(final_summary)) diff --git a/trials/title_summary/bert.py b/trials/title_summary/bert.py new file mode 100644 index 00000000..a79bb76d --- /dev/null +++ b/trials/title_summary/bert.py @@ -0,0 +1,43 @@ +import torch +from transformers import BertTokenizer, BertModel +from sentence_transformers import SentenceTransformer +from sklearn.metrics.pairwise import cosine_similarity + +# Load the pre-trained BERT model and tokenizer +model_name = "bert-base-uncased" +model = BertModel.from_pretrained(model_name) +tokenizer = BertTokenizer.from_pretrained(model_name) + +# Set the device to use +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +# Load the SentenceTransformer model +sentence_transformer_model = SentenceTransformer('average_word_embeddings_glove.6B.300d') + +# Define the input text +text = "Your input text to be summarized goes here." 
+ +# Tokenize the text +tokens = tokenizer.tokenize(text) +input_ids = tokenizer.convert_tokens_to_ids(tokens) +input_ids = torch.tensor([input_ids]).to(device) + +# Get the BERT model output +with torch.no_grad(): + outputs = model(input_ids)[0] # Extract the last hidden states + +# Calculate sentence embeddings +sentence_embeddings = outputs.mean(dim=1).squeeze().cpu().numpy() +input_text_embedding = sentence_transformer_model.encode([text])[0] + +# Calculate cosine similarity between sentences and input text +similarity_scores = cosine_similarity([input_text_embedding], sentence_embeddings) + +# Sort the sentences by similarity scores in descending order +sorted_sentences = [sent for _, sent in sorted(zip(similarity_scores[0], sentences), reverse=True)] + +# Choose the top sentences as the summary +num_summary_sentences = 2 # Adjust as needed +summary = ". ".join(sorted_sentences[:num_summary_sentences]) +print("Summary:", summary) diff --git a/trials/title_summary/gpt2.py b/trials/title_summary/gpt2.py new file mode 100644 index 00000000..1930a2d2 --- /dev/null +++ b/trials/title_summary/gpt2.py @@ -0,0 +1,101 @@ +# Approach 1 +from transformers import GPTNeoForCausalLM, GPT2Tokenizer + +model_name = 'EleutherAI/gpt-neo-1.3B' +tokenizer = GPT2Tokenizer.from_pretrained(model_name) +model = GPTNeoForCausalLM.from_pretrained(model_name) + +conversation = """ +Summarize the following conversation in 3 key sentences: + +We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . +Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . +Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . +Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . +Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . +Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas . +""" + +input_ids = tokenizer.encode(conversation, return_tensors='pt') + +output = model.generate(input_ids, + max_length=30, + num_return_sequences=1) + +caption = tokenizer.decode(output[0], skip_special_tokens=True) +print("Caption:", caption[len(input_ids):]) + + +# Approach 2 +import torch +from transformers import GPT2LMHeadModel, GPT2Tokenizer + +model_name = "gpt2" +tokenizer = GPT2Tokenizer.from_pretrained(model_name) +model = GPT2LMHeadModel.from_pretrained(model_name) + +model.eval() + +text = """ +You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . 
If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . 
" +""" + +tokenizer.pad_token = tokenizer.eos_token +input_ids = tokenizer.encode(text, + max_length=100, + truncation=True, + return_tensors="pt") +attention_mask = torch.ones(input_ids.shape, dtype=torch.long) +output = model.generate(input_ids, + max_new_tokens=20, + num_return_sequences=1, + num_beams=2, + attention_mask=attention_mask) + +chapter_titles = [tokenizer.decode(output[i], skip_special_tokens=True) for i in range(output.shape[0])] +for i, title in enumerate(chapter_titles): + print("Caption: ", title) + +# Approach 3 + +import torch +from transformers import GPT2LMHeadModel, GPT2Tokenizer + + +def generate_response(conversation, max_length=100): + input_text = "" + for entry in conversation: + role = entry["role"] + content = entry["content"] + input_text += f"{role}: {content}\n" + + # Tokenize the entire conversation + input_ids = tokenizer.encode(input_text, return_tensors="pt") + + # Generate text based on the entire conversation + with torch.no_grad(): + output = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id) + + # Decode the generated text and return it + response = tokenizer.decode(output[0], skip_special_tokens=True) + return response + + +if __name__ == "__main__": + + # Call appropriate approach from the main while experimenting + model_name = "gpt2" + model = GPT2LMHeadModel.from_pretrained(model_name) + tokenizer = GPT2Tokenizer.from_pretrained(model_name) + + sample_chunks = [ + "You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . 
So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . " + ] + + conversation = [ + {"role": "system", "content": "Summarize this text"}, + {"role": "user", "content": " text : " + sample_chunks[0]}, + ] + + response = generate_response(conversation) + print("Response:", response) diff --git a/trials/incsum.py b/trials/title_summary/incsum.py similarity index 94% rename from trials/incsum.py rename to trials/title_summary/incsum.py index 41b3d500..5081d16c 100644 --- a/trials/incsum.py +++ b/trials/title_summary/incsum.py @@ -1,9 +1,11 @@ +import spacy +import sys + + # Observe the incremental summaries by performing summaries in chunks with open("transcript.txt") as f: transcription = f.read() -import spacy - def split_text_file(filename, token_count): nlp = spacy.load('en_core_web_md') @@ -26,8 +28,9 @@ def split_text_file(filename, token_count): return parts + # Set the chunk length here to split the transcript and test -MAX_CHUNK_LENGTH=1000 +MAX_CHUNK_LENGTH = 1000 chunks = split_text_file("transcript.txt", MAX_CHUNK_LENGTH) print("Number of chunks", len(chunks)) @@ -41,19 +44,17 @@ with open("chunks" + str(MAX_CHUNK_LENGTH) + ".txt", "a") as f: # ex. python incsum.py 1 => will run approach 1 # If no input, will run all approaches -import sys try: index = sys.argv[1] except: index = None - # Approach 1 : facebook/bart-large-cnn if index == "1" or index is None: - SUMMARY_MODEL="facebook/bart-large-cnn" - MIN_LENGTH=5 - MAX_LENGTH=10 - BEAM_SIZE=2 + SUMMARY_MODEL = "facebook/bart-large-cnn" + MIN_LENGTH = 5 + MAX_LENGTH = 10 + BEAM_SIZE = 2 print("Performing chunk summary : " + SUMMARY_MODEL) @@ -81,7 +82,6 @@ if index == "1" or index is None: for summary in summaries: f.write(summary + "\n\n") - # Approach 2 if index == "2" or index is None: print("Performing chunk summary : " + "gpt-neo-1.3B") @@ -108,14 +108,14 @@ if index == "2" or index is None: max_length=max_length, attention_mask=attention_mask, pad_token_id=model.config.eos_token_id, - num_beams=4, - length_penalty=2.0, - early_stopping=True) + num_beams=4, + length_penalty=2.0, + early_stopping=True) summary_ids = output[0, input_length:] summary = tokenizer.decode(summary_ids, skip_special_tokens=True) summaries.append(summary) with open("gptneo1.3B-summaries.txt", "a") as f: - f.write(summary + "\n\n") + f.write(summary + "\n\n") # Approach 3 if index == "3" or index is None: @@ -155,4 +155,3 @@ if index == "3" or index is None: with open("mpt-7b-summaries.txt", "a") as f: for summary in summaries: f.write(summary + "\n\n") - diff --git a/trials/title_summary/openai_endpoint.py b/trials/title_summary/openai_endpoint.py new file mode 100644 index 00000000..c92856c5 --- /dev/null +++ b/trials/title_summary/openai_endpoint.py @@ -0,0 +1,37 @@ +# Use OpenAI API endpoint to send data to OpenAI +# along with prompts to caption/summarize the conversation + +import openai + +openai.api_key = "" + +# to caption, user prompt used : "caption this conversation" +# max_tokens=20 + +# to incremental summarize, user prompt used : "summarize this conversation in a few sentences by taking key points" +# max_tokens=300 + +sample_chunks = [ + "You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the 
entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ", + " We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . 
Thanks for being here , Thomas ."] + +conversation = [ + {"role": "system", + "content": sample_chunks[1]}, + {"role": "user", + "content": "summarize this conversation in a few sentences by taking key points"} +] + +model = "gpt-3.5-turbo" +response = openai.ChatCompletion.create(model=model, + messages=conversation, + n=1, + max_tokens=300) + +# Try fine tuned model +# model = "davinci:ft-personal-2023-07-14-10-43-51" +# response = openai.Completion.create(model=model, +# prompt=sample_chunks[0] + " -> ") + +caption = response.choices[0] +print(caption) diff --git a/trials/title_summary/pegasus.py b/trials/title_summary/pegasus.py new file mode 100644 index 00000000..884ed3ee --- /dev/null +++ b/trials/title_summary/pegasus.py @@ -0,0 +1,33 @@ +from transformers import PegasusForConditionalGeneration, PegasusTokenizer +import torch +# Load the Pegasus model and tokenizer +model_name = "google/pegasus-large" +model = PegasusForConditionalGeneration.from_pretrained(model_name) +tokenizer = PegasusTokenizer.from_pretrained(model_name) + +# Set the device to use +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) + +sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . 
", + " We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."] + + +# Define the input text for summarization +text = sample_chunks[1] + +inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt").to(device) + +# Generate the summary +summary_ids = model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + max_length=200, + num_beams=4, + length_penalty=2.0, + early_stopping=True, +) + +# Decode and print the summary +summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True) +print("Summary:", summary) diff --git a/trials/openai_endpoint.py b/trials/title_summary/t5.py similarity index 76% rename from trials/openai_endpoint.py rename to trials/title_summary/t5.py index 30e6a900..0c366ac6 100644 --- a/trials/openai_endpoint.py +++ b/trials/title_summary/t5.py @@ -1,36 +1,27 @@ -# Use OpenAI API endpoint to send data to OpenAI -# along with prompts to caption/summarize the conversation +from transformers import T5ForConditionalGeneration, T5Tokenizer +import torch +# Load the T5 model and tokenizer +model_name = "t5-base" +model = T5ForConditionalGeneration.from_pretrained(model_name) +tokenizer = T5Tokenizer.from_pretrained(model_name) -import openai - -openai.api_key = "" - -# to caption, user prompt used : "caption this conversation" -# max_tokens=20 - -# to incremental summarize, user prompt used : "summarize this conversation in a few sentences by taking key points" -# max_tokens=300 +# Set the device to use +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . 
And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ", " We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."] -conversation = [ - {"role": "system", - "content": sample_chunks[1]}, - {"role": "user", - "content": "summarize this conversation in a few sentences by taking key points"} -] -model = "gpt-3.5-turbo" -response = openai.ChatCompletion.create(model=model, - messages=conversation, - n=1, - max_tokens=300) +# Define the input text for summarization +text = "Summarize the following text in 3 key points. 
text : " + sample_chunks[1] -# Try finetuned model -# model = "davinci:ft-personal-2023-07-14-10-43-51" -# response = openai.Completion.create(model=model, -# prompt=sample_chunks[0] + " -> ") +# Tokenize the input text +inputs = tokenizer.encode(text, return_tensors="pt").to(device) -caption = response.choices[0] -print(caption) +# Generate the summary +summary_ids = model.generate(inputs, max_length=1000, num_beams=4, early_stopping=True) + +# Decode and print the summary +summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True) +print("Summary:", summary) diff --git a/trials/transcript.txt b/trials/title_summary/transcript.txt similarity index 100% rename from trials/transcript.txt rename to trials/title_summary/transcript.txt diff --git a/trials/title_summary/vicuna.py b/trials/title_summary/vicuna.py new file mode 100644 index 00000000..588869c0 --- /dev/null +++ b/trials/title_summary/vicuna.py @@ -0,0 +1,44 @@ +from gpt4all import GPT4All + +model = GPT4All("/Users/gokulmohanarangan/Library/Application Support/nomic.ai/GPT4All/ggml-vicuna-13b-1.1-q4_2.bin") + +import spacy + + +def split_text_file(filename, token_count): + nlp = spacy.load('en_core_web_md') + + with open(filename, 'r') as file: + text = file.read() + + doc = nlp(text) + total_tokens = len(doc) + + parts = [] + start_index = 0 + + while start_index < total_tokens: + end_index = start_index + token_count + part_tokens = doc[start_index:end_index] + part = ' '.join(token.text for token in part_tokens) + parts.append(part) + start_index = end_index + + return parts + +parts = split_text_file("transcript.txt", 1800) +final_summary = [] +for part in parts: + prompt = f""" + ### Human: + Summarize the following text without missing any key points and action items. 
+
+    {part}
+    ### Assistant:
+    """
+    output = model.generate(prompt)
+    final_summary.append(output)
+
+
+with open("sum.txt", "w") as sum:
+    sum.write(" ".join(final_summary))
diff --git a/trials/whisper-jax/__init__.py b/trials/whisper-jax/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/whisjax.py b/trials/whisper-jax/whisjax.py
similarity index 94%
rename from whisjax.py
rename to trials/whisper-jax/whisjax.py
index cfc95574..eb87629d 100644
--- a/whisjax.py
+++ b/trials/whisper-jax/whisjax.py
@@ -18,11 +18,11 @@ import nltk
 import yt_dlp as youtube_dl
 from whisper_jax import FlaxWhisperPipline
 
-from utils.file_utils import download_files, upload_files
-from utils.log_utils import logger
-from utils.run_utils import config
-from utils.text_utilities import post_process_transcription, summarize
-from utils.viz_utilities import create_talk_diff_scatter_viz, create_wordcloud
+from ...utils.file_utils import download_files, upload_files
+from ...utils.log_utils import logger
+from ...utils.run_utils import config
+from ...utils.text_utils import post_process_transcription, summarize
+from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud
 
 nltk.download('punkt', quiet=True)
 nltk.download('stopwords', quiet=True)
@@ -30,8 +30,8 @@ nltk.download('stopwords', quiet=True)
 WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
 NOW = datetime.now()
 
-if not os.path.exists('./artefacts'):
-    os.makedirs('./artefacts')
+if not os.path.exists('../../artefacts'):
+    os.makedirs('../../artefacts')
 
 
 def init_argparse() -> argparse.ArgumentParser:
@@ -91,7 +91,7 @@ def main():
         # Download the audio
         with youtube_dl.YoutubeDL(ydl_opts) as ydl:
             ydl.download([args.location])
-        media_file = "./artefacts/audio.mp3"
+        media_file = "../artefacts/audio.mp3"
 
         logger.info("Saved downloaded YouTube video to: " + media_file)
     else:
diff --git a/whisjax_realtime.py b/trials/whisper-jax/whisjax_realtime.py
similarity index 94%
rename from whisjax_realtime.py
rename to trials/whisper-jax/whisjax_realtime.py
index 63eab04d..efb39461 100644
--- a/whisjax_realtime.py
+++ b/trials/whisper-jax/whisjax_realtime.py
@@ -10,11 +10,11 @@ from pynput import keyboard
 from termcolor import colored
 from whisper_jax import FlaxWhisperPipline
 
-from utils.file_utils import upload_files
-from utils.log_utils import logger
-from utils.run_utils import config
-from utils.text_utilities import post_process_transcription, summarize
-from utils.viz_utilities import create_talk_diff_scatter_viz, create_wordcloud
+from ...utils.file_utils import upload_files
+from ...utils.log_utils import logger
+from ...utils.run_utils import config
+from ...utils.text_utils import post_process_transcription, summarize
+from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud
 
 WHISPER_MODEL_SIZE = config['DEFAULT']["WHISPER_MODEL_SIZE"]
 
diff --git a/utils/config.ini b/utils/config.ini
index 976f4a32..001ed7c4 100644
--- a/utils/config.ini
+++ b/utils/config.ini
@@ -1,5 +1,6 @@
 [DEFAULT]
-#SetexceptionruleforOpenMPerrortoallowduplicatelibinitialization
+#Set exception rule for OpenMP error
+#to allow duplicate lib initialization
 KMP_DUPLICATE_LIB_OK=TRUE
 #ExportOpenAIAPIKey
 OPENAI_APIKEY=
@@ -7,8 +8,8 @@ OPENAI_APIKEY=
 WHISPER_MODEL_SIZE=tiny
 WHISPER_REAL_TIME_MODEL_SIZE=tiny
 #AWSconfig
-AWS_ACCESS_KEY=***REMOVED***
-AWS_SECRET_KEY=***REMOVED***
+AWS_ACCESS_KEY=
+AWS_SECRET_KEY=
 BUCKET_NAME=reflector-bucket
 #Summarizerconfig
 SUMMARY_MODEL=facebook/bart-large-cnn
@@ -17,8 +18,9 @@ MAX_LENGTH=2048
 BEAM_SIZE=6
 MAX_CHUNK_LENGTH=1024
 SUMMARIZE_USING_CHUNKS=YES
-#Audiodevice
+# Audio device
 BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME=aggregator
 AV_FOUNDATION_DEVICE_ID=1
-# LLM PATH
-LLM_PATH=
+# LLM configs
+LLM_MACHINE_IP=
+LLM_MACHINE_PORT=
diff --git a/utils/format_output.py b/utils/format_output.py
new file mode 100644
index 00000000..4f026ce2
--- /dev/null
+++ b/utils/format_output.py
@@ -0,0 +1,32 @@
+import json
+
+with open("../artefacts/meeting_titles_and_summaries.txt", "r") as f:
+    outputs = f.read()
+
+outputs = json.loads(outputs)
+
+transcript_file = open("../artefacts/meeting_transcript.txt", "a")
+title_desc_file = open("../artefacts/meeting_title_description.txt", "a")
+summary_file = open("../artefacts/meeting_summary.txt", "a")
+
+for item in outputs["topics"]:
+    transcript_file.write(item["transcript"])
+    summary_file.write(item["description"])
+
+    title_desc_file.write("TITLE: \n")
+    title_desc_file.write(item["title"])
+    title_desc_file.write("\n")
+
+    title_desc_file.write("DESCRIPTION: \n")
+    title_desc_file.write(item["description"])
+    title_desc_file.write("\n")
+
+    title_desc_file.write("TRANSCRIPT: \n")
+    title_desc_file.write(item["transcript"])
+    title_desc_file.write("\n")
+
+    title_desc_file.write("---------------------------------------- \n\n")
+
+transcript_file.close()
+title_desc_file.close()
+summary_file.close()
diff --git a/utils/text_utilities.py b/utils/text_utils.py
similarity index 84%
rename from utils/text_utilities.py
rename to utils/text_utils.py
index ef15c7a3..25126b34 100644
--- a/utils/text_utilities.py
+++ b/utils/text_utils.py
@@ -6,8 +6,8 @@ from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from transformers import BartForConditionalGeneration, BartTokenizer
 
-from utils.log_utils import logger
-from utils.run_utils import config
+from log_utils import logger
+from run_utils import config
 
 nltk.download('punkt', quiet=True)
 
@@ -154,7 +154,7 @@ def chunk_text(text,
 
 
 def summarize(transcript_text, timestamp, real_time=False,
-              summarize_using_chunks=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]):
+              chunk_summarize=config["DEFAULT"]["SUMMARIZE_USING_CHUNKS"]):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     summary_model = config["DEFAULT"]["SUMMARY_MODEL"]
     if not summary_model:
@@ -166,27 +166,35 @@ def summarize(transcript_text, timestamp,
     model = BartForConditionalGeneration.from_pretrained(summary_model)
     model = model.to(device)
 
-    output_filename = "summary_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
+    output_file = "summary_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
     if real_time:
-        output_filename = "real_time_" + output_filename
+        output_file = "real_time_" + output_file
 
-    if summarize_using_chunks != "YES":
-        inputs = tokenizer.\
+    if chunk_summarize != "YES":
+        max_length = int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"])
+        inputs = tokenizer. \
             batch_encode_plus([transcript_text],
                               truncation=True,
                               padding='longest',
-                              max_length=int(config["DEFAULT"]["INPUT_ENCODING_MAX_LENGTH"]),
+                              max_length=max_length,
                               return_tensors='pt')
         inputs = inputs.to(device)
         with torch.no_grad():
+            num_beams = int(config["DEFAULT"]["BEAM_SIZE"])
+            max_length = int(config["DEFAULT"]["MAX_LENGTH"])
             summaries = model.generate(inputs['input_ids'],
-                                       num_beams=int(config["DEFAULT"]["BEAM_SIZE"]), length_penalty=2.0,
-                                       max_length=int(config["DEFAULT"]["MAX_LENGTH"]), early_stopping=True)
+                                       num_beams=num_beams,
+                                       length_penalty=2.0,
+                                       max_length=max_length,
+                                       early_stopping=True)
 
-        decoded_summaries = [tokenizer.decode(summary, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-                             for summary in summaries]
+        decoded_summaries = \
+            [tokenizer.decode(summary,
+                              skip_special_tokens=True,
+                              clean_up_tokenization_spaces=False)
+             for summary in summaries]
         summary = " ".join(decoded_summaries)
-        with open("./artefacts/" + output_filename, 'w') as f:
+        with open("./artefacts/" + output_file, 'w') as f:
             f.write(summary.strip() + "\n")
     else:
         logger.info("Breaking transcript into smaller chunks")
@@ -195,8 +203,8 @@ def summarize(transcript_text, timestamp,
         logger.info(f"Transcript broken into {len(chunks)} "
                     f"chunks of at most 500 words")
 
-        logger.info(f"Writing summary text to: {output_filename}")
-        with open(output_filename, 'w') as f:
+        logger.info(f"Writing summary text to: {output_file}")
+        with open(output_file, 'w') as f:
             summaries = summarize_chunks(chunks, tokenizer, model)
             for summary in summaries:
                 f.write(summary.strip() + " ")
diff --git a/utils/viz_utilities.py b/utils/viz_utils.py
similarity index 91%
rename from utils/viz_utilities.py
rename to utils/viz_utils.py
index 93a9b56f..d7debd0c 100644
--- a/utils/viz_utilities.py
+++ b/utils/viz_utils.py
@@ -13,7 +13,7 @@ from wordcloud import STOPWORDS, WordCloud
 
 en = spacy.load('en_core_web_md')
 spacy_stopwords = en.Defaults.stop_words
 
-STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))).\
+STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))). \
     union(set(spacy_stopwords))
 
@@ -24,7 +24,7 @@ def create_wordcloud(timestamp, real_time=False):
     """
     filename = "transcript"
     if real_time:
-        filename = "real_time_" + filename + "_" +\
+        filename = "real_time_" + filename + "_" + \
             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
     else:
         filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
@@ -45,24 +45,24 @@ def create_wordcloud(timestamp, real_time=False):
     plt.axis("off")
     plt.tight_layout(pad=0)
 
-    wordcloud_name = "wordcloud"
+    wordcloud = "wordcloud"
     if real_time:
-        wordcloud_name = "real_time_" + wordcloud_name + "_" +\
+        wordcloud = "real_time_" + wordcloud + "_" + \
             timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
     else:
-        wordcloud_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
+        wordcloud += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
 
-    plt.savefig("./artefacts/" + wordcloud_name)
+    plt.savefig("./artefacts/" + wordcloud)
 
 
 def create_talk_diff_scatter_viz(timestamp, real_time=False):
     """
-    Perform agenda vs transription diff to see covered topics.
+    Perform agenda vs transcription diff to see covered topics.
     Create a scatter plot of words in topics.
     :return: None. Saved locally.
""" - spaCy_model = "en_core_web_md" - nlp = spacy.load(spaCy_model) + spacy_model = "en_core_web_md" + nlp = spacy.load(spacy_model) nlp.add_pipe('sentencizer') agenda_topics = [] @@ -75,12 +75,11 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): agenda_topics.append(line.split(":")[0]) # Load the transcription with timestamp - filename = "" if real_time: - filename = "./artefacts/real_time_transcript_with_timestamp_" +\ + filename = "./artefacts/real_time_transcript_with_timestamp_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt" else: - filename = "./artefacts/transcript_with_timestamp_" +\ + filename = "./artefacts/transcript_with_timestamp_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt" with open(filename) as f: transcription_timestamp_text = f.read() @@ -142,7 +141,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): df = df.apply(create_new_columns, axis=1) - # Count the number of items covered and calculatre the percentage + # Count the number of items covered and calculate the percentage num_covered_items = sum(covered_items.values()) percentage_covered = num_covered_items / len(agenda) * 100 @@ -158,7 +157,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): # Save df, mappings for further experimentation df_name = "df" if real_time: - df_name = "real_time_" + df_name + "_" +\ + df_name = "real_time_" + df_name + "_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" else: df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" @@ -169,7 +168,7 @@ def create_talk_diff_scatter_viz(timestamp, real_time=False): mappings_name = "mappings" if real_time: - mappings_name = "real_time_" + mappings_name + "_" +\ + mappings_name = "real_time_" + mappings_name + "_" + \ timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl" else: mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"