Mirror of https://github.com/Monadical-SAS/reflector.git, synced 2026-02-04 09:56:47 +00:00

Commit: Moved all server files to server/

176  server/.gitignore  (vendored, new file)
@@ -0,0 +1,176 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

*.mp4
*.html
*.pkl
transcript_*.txt
test_*.txt
wordcloud*.png
utils/config.ini
test_samples/
*.wav
*.mp3
*.m4a
.DS_Store/
.DS_Store
.vscode/
artefacts/
174  server/README.md  (new file)
@@ -0,0 +1,174 @@
# Reflector

This is the code base for the Reflector demo (formerly called agenda-talk-diff), built for the Troy Web Consulting
panel (A Chat with AWS about AI: Real AI/ML AWS projects and what you should know) on 6/14 at 4:30 PM.

The target deliverable is a local-first live transcription and visualization tool that compares a discussion's target
agenda/objectives to the actual discussion, live.
**S3 bucket:**

Everything you need for S3 is already configured in config.ini. Only edit it if you need to change it deliberately.

The S3 bucket name is set in config.ini. All transfers happen between this bucket and the local computer where the
script is run. You need AWS_ACCESS_KEY / AWS_SECRET_KEY to authenticate your calls to S3 (also set in config.ini).

For the AWS S3 web UI:

1) Log in to the AWS Management Console.
2) Search for S3 in the search bar at the top.
3) Navigate to the list of buckets under the current account, if needed, and choose your bucket [```reflector-bucket```].
4) You should be able to see the items in the bucket. You can upload/download files here directly.

For the CLI, refer to the FILE UTIL section below.
**FILE UTIL MODULE:**

A file_util module has been created to upload/download files to/from the AWS S3 bucket pre-configured via config.ini.
Though not needed for the workflow, if you want to upload or download a file separately on your own, apart from the
pipeline workflow in the script, you can do so by:

Upload:

``` python3 file_util.py upload <object_name_in_S3_bucket>```

Download:

``` python3 file_util.py download <object_name_in_S3_bucket>```

If you want to access the S3 artefacts from another machine, you can either use the Python file_util with the commands
mentioned above or simply use the AWS Management Console GUI.
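For orientation, this kind of transfer is a thin wrapper around boto3. The snippet below is only a minimal sketch of the idea, not the actual file_util.py; the config section and key names are assumptions and should be checked against config.ini.

```python
# Minimal sketch only -- NOT the real file_util.py. Assumes boto3 and a
# config.ini section/keys along these lines; adjust to the actual file.
import configparser
import sys

import boto3

config = configparser.ConfigParser()
config.read("config.ini")
aws = config["DEFAULT"]  # assumed section name

s3 = boto3.client(
    "s3",
    aws_access_key_id=aws["AWS_ACCESS_KEY"],
    aws_secret_access_key=aws["AWS_SECRET_KEY"],
)
bucket = aws.get("S3_BUCKET_NAME", "reflector-bucket")  # assumed key name

action, key = sys.argv[1], sys.argv[2]
if action == "upload":
    s3.upload_file(key, bucket, key)    # local path -> bucket object
elif action == "download":
    s3.download_file(bucket, key, key)  # bucket object -> local path
```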
To set up:

1) Check the values in config.ini. In particular, add your OPENAI_APIKEY if you plan to make OpenAI API requests.
2) Run ``` export KMP_DUPLICATE_LIB_OK=True``` in the terminal. [This is also set in code, but is not taking effect; will fix this issue later.]

NOTE: If you don't have portaudio installed already, run ```brew install portaudio```
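One likely reason the in-code setting "does not reflect" is ordering: the variable has to be set before any library that loads OpenMP (numpy/torch/JAX) is imported. A sketch of the required ordering, assuming that diagnosis:

```python
# Sketch: set the OpenMP workaround before anything that links libomp is imported.
import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "True"

import numpy  # noqa: E402  -- OpenMP-backed imports only after the env var is set
```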
3) Run the script setup_dependencies.sh:

``` chmod +x setup_dependencies.sh ```

``` sh setup_dependencies.sh <ENV>```

ENV refers to the intended environment for JAX. JAX is available in several
variants: [CPU | GPU | Colab TPU | Google Cloud TPU]

```ENV``` is one of:

cpu -> JAX CPU installation

cuda11 -> JAX CUDA 11.x version

cuda12 -> JAX CUDA 12.x version (CoreWeave has a CUDA 12 version; you can check with ```nvidia-smi```)

Example: ```sh setup_dependencies.sh cuda12```
4) If not already done, install ffmpeg: ```brew install ffmpeg```

For the NLTK SSL error,
check [here](https://stackoverflow.com/questions/38916452/nltk-download-ssl-certificate-verify-failed)

5) Run the Whisper-JAX pipeline. Currently, the repo can take a YouTube video and transcribe/summarize it:

``` python3 whisjax.py "https://www.youtube.com/watch?v=ihf0S97oxuQ"```

You can also run it on a local file or a file in your configured S3 bucket:

``` python3 whisjax.py "startup.mp4"```

The script takes care of several cases: YouTube link, local file, video file, audio-only file,
file in S3, etc. If the local file is not present, it automatically fetches the file from S3.
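Illustratively, the dispatch described above amounts to something like the following. This is only a sketch (whisjax.py itself is not part of this commit), with the helper behaviour reduced to a classification:

```python
# Illustrative sketch only: mirrors the input dispatch described above.
import os


def classify_source(source: str) -> str:
    """Classify the input as a YouTube link, an existing local file, or an S3 object name."""
    if source.startswith(("http://", "https://")):
        return "youtube"  # downloaded (e.g. via yt-dlp/pytube) in the real pipeline
    if os.path.exists(source):
        return "local"    # used directly
    return "s3"           # fetched from the configured bucket before processing


print(classify_source("https://www.youtube.com/watch?v=ihf0S97oxuQ"))  # youtube
print(classify_source("startup.mp4"))  # local if present, otherwise s3
```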
**OFFLINE WORKFLOW:**

1) Specify the input source file from a local path or YouTube link (or upload it to S3 if needed) and pass it as input to the
script. If the source file is in ```.m4a``` format, it will be converted to ```.mp4``` automatically.
2) Keep the agenda header topics in a local file named ```agenda-headers.txt```. This needs to be present where the
script is run. This version of the pipeline compares covered agenda topics using agenda headers in the following format:
   1) ```agenda_topic : <short description>```
3) Check all the values in ```config.ini```. You need to predefine the 2 categories for which the topic-modelling
visualization is scatter-plotted, in the config file. This is the default visualization. But from the dataframe artefact
called ```df_<timestamp>.pkl``` you can load the df and choose different topics to plot. You can filter the
transcriptions by searching for certain words, and you can see the top influencers and characteristic terms of each topic
chosen for the interactive HTML document. A new Jupyter notebook named ```Viz_experiments.ipynb``` gives the base
template to play around with (see the loading sketch after this list).
4) Run the script. It automatically transcribes, summarizes, creates a scatter plot of words & topics in the
form of an interactive HTML file and a sample word cloud, and uploads them to the S3 bucket.
5) Additional artefacts pushed to S3:
   1) HTML visualization file
   2) pandas df in pickle format, so others can collaborate and make their own visualizations
   3) Summary, transcript, and transcript-with-timestamps files in text format.
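A minimal sketch of reusing the pickled dataframe artefact for custom plots; the filename and column names below are assumptions, so check ```Viz_experiments.ipynb``` for the real schema:

```python
# Minimal sketch: load the pickled dataframe artefact and filter it.
# Column names are assumptions; see Viz_experiments.ipynb for the real schema.
import pandas as pd

df = pd.read_pickle("df_20230614_163000.pkl")  # hypothetical timestamped filename

# Filter transcription rows containing a search word (assumed "text" column).
hits = df[df["text"].str.contains("unit economics", case=False, na=False)]
print(hits.head())
```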
The script also creates 2 types of mappings:
1) Timestamp -> the top 2 matched agenda topics
2) Topic -> all matched timestamps in the transcription

Other visualizations can be planned based on the available artefacts, or new ones can be created. Refer to the
```Visualization experiments``` section below.
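Illustratively, the two mappings have shapes along these lines (values are made up):

```python
# Illustrative shapes only; real values come from a pipeline run.
timestamp_to_topics = {"0:05:30": ["TAM", "Product market fit"]}  # top 2 matched agenda topics
topic_to_timestamps = {"TAM": ["0:05:30", "0:17:10"]}             # all matched timestamps
```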
**Visualization experiments:**

This is a Jupyter notebook playground with template instructions for handling the metadata and data artefacts generated
by the pipeline. Follow the instructions and tweak your own logic into it, or use it as a playground to experiment with
libraries and visualizations on top of the metadata.
**WHISPER-JAX REALTIME TRANSCRIPTION PIPELINE:**

We also support real-time transcription using the Whisper-JAX pipeline. There are
a few prerequisites before you run it on your local machine. The instructions below are for macOS.

We need a way to route both the audio from an application opened in the browser (e.g. "Whereby") and the audio from the local
microphone you will be speaking into. We use [Blackhole](https://github.com/ExistentialAudio/BlackHole).

1) Install Blackhole-2ch (2 channels is enough) by one of the two options listed on that page.
2) Set up an [Aggregate device](https://github.com/ExistentialAudio/BlackHole/wiki/Aggregate-Device) to route web audio and
local microphone input. Be sure to mirror the settings shown in server/images/aggregate_input.png.
3) Set up a [Multi-Output device](https://github.com/ExistentialAudio/BlackHole/wiki/Multi-Output-Device); refer to server/images/multi-output.png.
4) Set the aggregate input device name created in step 2 in config.ini as ```BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME```.
5) Then go to ```System Preferences -> Sound``` and choose the devices created above in the Output and Input tabs.
6) If everything is configured properly, the input from your local microphone and from the browser-run meeting is aggregated into one virtual stream to listen
to, and the output is fed back to your specified output devices. Check this before trying a trial run.
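To verify the aggregate device is visible before running the pipeline, a small check with PyAudio (which is in pipeline-requirements.txt) can be used. This is only a helper sketch, not part of the pipeline; the config section name is an assumption:

```python
# Sketch: confirm the aggregate device named in config.ini is visible to PyAudio.
import configparser

import pyaudio

config = configparser.ConfigParser()
config.read("config.ini")
wanted = config["DEFAULT"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"]  # assumed section name

pa = pyaudio.PyAudio()
names = [pa.get_device_info_by_index(i)["name"] for i in range(pa.get_device_count())]
pa.terminate()

print("found" if wanted in names else f"not found; available devices: {names}")
```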
**Permissions:**

You may have to grant microphone access to "Terminal"/code editors [PyCharm/VSCode, etc.] to record audio, in
```System Preferences -> Privacy & Security -> Microphone```,
```System Preferences -> Privacy & Security -> Accessibility```, and
```System Preferences -> Privacy & Security -> Input Monitoring```.

From the reflector root folder, run ```python3 whisjax_realtime.py```

The transcription text should be written to ```real_time_transcription_<timestamp>.txt```.

NEXT STEPS:

1) Create a RunPod setup for this feature (mentioned in 1 & 2) and test it end-to-end
2) Perform speaker diarization using Whisper-JAX
3) Based on the feasibility of the above points, explore suitable visualizations for transcription & summarization.
0  server/__init__.py  (new file)

8  server/agenda-headers.txt  (new file)
@@ -0,0 +1,8 @@
AGENDA: Most important things to look for in a start up
TAM: Make sure the market is sufficiently large that once they win they can get rewarded
Product market fit: Being in a good market with a product that can satisfy that market
Unit economics: Profit for delivering all-in cost must be attractive (% or $ amount)
LTV CAC: Life-time value (revenue contribution) vs cost to acquire customer must be healthy
Churn: Fits into LTV, low churn leads to higher LTV and helps keep future CAC down
Business: Must have sufficient barriers to entry to ward off copy-cats once established
Founders: Must be religious about their product. Believe they will change the world against all odds.
77  server/client.py  (new file)
@@ -0,0 +1,77 @@
import argparse
import asyncio
import signal

from aiortc.contrib.signaling import add_signaling_arguments, create_signaling

from utils.log_utils import LOGGER
from stream_client import StreamClient
from typing import NoReturn


async def main() -> NoReturn:
    """
    Reflector's entry point to the python client for WebRTC streaming if not
    using the browser based UI-application
    :return:
    """
    parser = argparse.ArgumentParser(description="Data channels ping/pong")

    parser.add_argument(
        "--url", type=str, nargs="?", default="http://0.0.0.0:1250/offer"
    )

    parser.add_argument(
        "--ping-pong",
        help="Benchmark data channel with ping pong",
        # NOTE: eval-based parsing only ever receives the literals True/False
        # below, but a stricter boolean parser would be safer here.
        type=eval,
        choices=[True, False],
        default="False",
    )

    parser.add_argument(
        "--play-from",
        type=str,
        default="",
    )
    add_signaling_arguments(parser)

    args = parser.parse_args()

    signaling = create_signaling(args)

    async def shutdown(signal, loop):
        """Cleanup tasks tied to the service's shutdown."""
        LOGGER.info(f"Received exit signal {signal.name}...")
        LOGGER.info("Closing database connections")
        LOGGER.info("Nacking outstanding messages")
        tasks = [t for t in asyncio.all_tasks() if t is not asyncio.current_task()]

        [task.cancel() for task in tasks]

        LOGGER.info(f"Cancelling {len(tasks)} outstanding tasks")
        await asyncio.gather(*tasks, return_exceptions=True)
        LOGGER.info("Flushing metrics")
        loop.stop()

    signals = (signal.SIGHUP, signal.SIGTERM, signal.SIGINT)
    loop = asyncio.get_event_loop()
    for s in signals:
        loop.add_signal_handler(
            s, lambda s=s: asyncio.create_task(shutdown(s, loop)))

    # Init client
    sc = StreamClient(
        signaling=signaling,
        url=args.url,
        play_from=args.play_from,
        ping_pong=args.ping_pong
    )
    await sc.start()
    async for msg in sc.get_reader():
        print(msg)


if __name__ == "__main__":
    asyncio.run(main())
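Usage note: by default the client posts its WebRTC offer to http://0.0.0.0:1250/offer (see the ```--url``` default above), so assuming the server from server.py later in this diff is listening there, it can be started with ``` python3 client.py``` or pointed elsewhere with ``` python3 client.py --url http://<host>:1250/offer```.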
BIN  server/images/aggregate_input.png  (new file, 124 KiB; binary file not shown)

BIN  server/images/multi-output.png  (new file, 113 KiB; binary file not shown)

860  server/notebooks/Viz-experiments.ipynb  (new file; diff suppressed because one or more lines are too long)

2534  server/notebooks/incsum.ipynb  (new file; diff suppressed because it is too large)

61  server/pipeline-requirements.txt  (new file)
@@ -0,0 +1,61 @@
pyaudio==0.2.13
keyboard==0.13.5
pynput==1.7.6
wave==0.0.2
async-timeout==4.0.2
attrs==23.1.0
certifi==2023.5.7
charset-normalizer==3.1.0
decorator==4.4.2
filelock==3.12.0
frozenlist==1.3.3
idna==3.4
imageio==2.29.0
imageio-ffmpeg==0.4.8
Jinja2==3.1.2
llvmlite==0.40.0
loguru==0.7.0
MarkupSafe==2.1.2
more-itertools==9.1.0
moviepy==1.0.3
mpmath==1.3.0
multidict==6.0.4
networkx==3.1
numba==0.57.0
numpy==1.24.3
openai==0.27.7
openai-whisper@ git+https://github.com/openai/whisper.git@248b6cb124225dd263bb9bd32d060b6517e067f8
Pillow==9.5.0
proglog==0.1.10
pytube==15.0.0
regex==2023.5.5
six==1.16.0
sympy==1.12
tiktoken==0.3.3
torch==2.0.1
tqdm==4.65.0
typing_extensions==4.6.2
urllib3
yarl==1.9.2
boto3==1.26.151
nltk==3.8.1
wordcloud==1.9.2
spacy==3.5.4
scattertext==0.1.19
pandas==2.0.3
jupyter==1.0.0
seaborn==0.12.2
matplotlib==3.7.2
matplotlib-inline==0.1.6
termcolor==2.3.0
ffmpeg==1.4
cached_property==1.5.2
stamina==23.1.0
httpx==0.24.1
https://github.com/yt-dlp/yt-dlp/archive/master.tar.gz
gpt4all==1.0.5
aiohttp==3.8.5
aiohttp-cors==0.7.0
aioice==0.9.0
aiortc==1.5.0
aiosignal==1.3.1
34  server/reflector-local/0-reflector-local.py  (new file)
@@ -0,0 +1,34 @@
import os
import subprocess
import sys

from loguru import logger

# Get the input file name from the command line argument
input_file = sys.argv[1]
# example use: python 0-reflector-local.py input.m4a agenda.txt

# Get the agenda file name from the command line argument if provided
if len(sys.argv) > 2:
    agenda_file = sys.argv[2]
else:
    agenda_file = "agenda.txt"
# example use: python 0-reflector-local.py input.m4a my_agenda.txt

# Check if the agenda file exists; there is no point continuing without it
if not os.path.exists(agenda_file):
    logger.error("agenda_file is missing")
    sys.exit(1)

# Check if the input file is .m4a, if so convert to .mp4
if input_file.endswith(".m4a"):
    subprocess.run(["ffmpeg", "-i", input_file, f"{input_file}.mp4"])
    input_file = f"{input_file}.mp4"

# Run the first script to generate the transcript
subprocess.run(["python3", "1-transcript-generator.py", input_file, f"{input_file}_transcript.txt"])

# Run the second script to compare the transcript to the agenda
subprocess.run(["python3", "2-agenda-transcript-diff.py", agenda_file, f"{input_file}_transcript.txt"])

# Run the third script to summarize the transcript
subprocess.run(["python3", "3-transcript-summarizer.py", f"{input_file}_transcript.txt", f"{input_file}_summary.txt"])
62  server/reflector-local/1-transcript-generator.py  (new executable file)
@@ -0,0 +1,62 @@
import argparse
import os

import moviepy.editor
import whisper
from loguru import logger

WHISPER_MODEL_SIZE = "base"


def init_argparse() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        usage="%(prog)s <LOCATION> <OUTPUT>",
        description="Creates a transcript of a video or audio file using the OpenAI Whisper model"
    )
    parser.add_argument("location", help="Location of the media file")
    parser.add_argument("output", help="Output file path")
    return parser


def main():
    import sys
    sys.setrecursionlimit(10000)

    parser = init_argparse()
    args = parser.parse_args()

    media_file = args.location
    logger.info(f"Processing file: {media_file}")

    # Check if the media file is a valid audio or video file
    if os.path.isfile(media_file) and not media_file.endswith(
            ('.mp3', '.wav', '.ogg', '.flac', '.mp4', '.avi', '.flv')):
        logger.error(f"Invalid file format: {media_file}")
        return

    # If the media file we just retrieved is an audio file then skip the extraction step.
    # The AudioFileClip below is only opened to validate the media; Whisper reads
    # the file directly and handles audio extraction itself.
    audio_filename = media_file
    logger.info("Found audio-only file, skipping audio extraction")

    audio = moviepy.editor.AudioFileClip(audio_filename)

    logger.info("Selected extracted audio")

    # Transcribe the audio file using the OpenAI Whisper model
    logger.info("Loading Whisper speech-to-text model")
    whisper_model = whisper.load_model(WHISPER_MODEL_SIZE)

    logger.info(f"Transcribing file: {media_file}")
    whisper_result = whisper_model.transcribe(media_file)

    logger.info("Finished transcribing file")

    # Save the transcript to the specified file.
    logger.info(f"Saving transcript to: {args.output}")
    with open(args.output, "w") as transcript_file:
        transcript_file.write(whisper_result["text"])


if __name__ == "__main__":
    main()
68  server/reflector-local/2-agenda-transcript-diff.py  (new file)
@@ -0,0 +1,68 @@
import argparse

import spacy
from loguru import logger


# Define the paths for agenda and transcription files
def init_argparse() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        usage="%(prog)s <AGENDA> <TRANSCRIPTION>",
        description="Compares the transcript of a video or audio file to an agenda using the SpaCy model"
    )
    parser.add_argument("agenda", help="Location of the agenda file")
    parser.add_argument("transcription", help="Location of the transcription file")
    return parser


args = init_argparse().parse_args()
agenda_path = args.agenda
transcription_path = args.transcription

# Load the spaCy model and add the sentencizer
spaCy_model = "en_core_web_md"
nlp = spacy.load(spaCy_model)
nlp.add_pipe('sentencizer')
logger.info("Loaded spaCy model " + spaCy_model)

# Load the agenda
with open(agenda_path, "r") as f:
    agenda = [line.strip() for line in f.readlines() if line.strip()]
logger.info("Loaded agenda items")

# Load the transcription
with open(transcription_path, "r") as f:
    transcription = f.read()
logger.info("Loaded transcription")

# Tokenize the transcription using spaCy
doc_transcription = nlp(transcription)
logger.info("Tokenized transcription")

# Find the items covered in the transcription
# Threshold that determines what similarity score is considered a match
similarity_threshold = 0.7
covered_items = {}
for item in agenda:
    item_doc = nlp(item)
    for sent in doc_transcription.sents:
        if not sent or not all(token.has_vector for token in sent):
            # Skip an empty span or one without any word vectors
            continue
        similarity = sent.similarity(item_doc)
        if similarity > similarity_threshold:
            covered_items[item] = True
            break

# Count the number of items covered and calculate the percentage
num_covered_items = sum(covered_items.values())
percentage_covered = num_covered_items / len(agenda) * 100

# Print the results
print("💬 Agenda items covered in the transcription:")
for item in agenda:
    if item in covered_items and covered_items[item]:
        print("✅ ", item)
    else:
        print("❌ ", item)
print("📊 Coverage: {:.2f}%".format(percentage_covered))
logger.info("Finished comparing agenda to transcription with similarity threshold of " + str(similarity_threshold))
94  server/reflector-local/3-transcript-summarizer.py  (new file)
@@ -0,0 +1,94 @@
import argparse

import nltk

# Both the stopword list and the punkt tokenizer models are needed below
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from heapq import nlargest
from loguru import logger


# Function to initialize the argument parser
def init_argparse():
    parser = argparse.ArgumentParser(
        usage="%(prog)s <TRANSCRIPT> <SUMMARY>",
        description="Summarization"
    )
    parser.add_argument("transcript", type=str, default="transcript.txt", help="Path to the input transcript file")
    parser.add_argument("summary", type=str, default="summary.txt", help="Path to the output summary file")
    parser.add_argument("--num_sentences", type=int, default=5, help="Number of sentences to include in the summary")
    return parser


# Function to read the input transcript file
def read_transcript(file_path):
    with open(file_path, "r") as file:
        transcript = file.read()
    return transcript


# Function to preprocess the text by removing stop words and special characters
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    words = [w.lower() for w in words if w.isalpha() and w.lower() not in stop_words]
    return words


# Function to score each sentence based on the frequency of its words and return the top sentences
def summarize_text(text, num_sentences):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)

    # Preprocess the text by removing stop words and special characters
    words = preprocess_text(text)

    # Calculate the frequency of each word in the text
    word_freq = nltk.FreqDist(words)

    # Calculate the score for each sentence based on the frequency of its words
    sentence_scores = {}
    for i, sentence in enumerate(sentences):
        sentence_words = preprocess_text(sentence)
        for word in sentence_words:
            if word in word_freq:
                if i not in sentence_scores:
                    sentence_scores[i] = word_freq[word]
                else:
                    sentence_scores[i] += word_freq[word]

    # Select the top sentences based on their scores
    top_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)

    # Sort the top sentences in the order they appeared in the original text
    summary_sent = sorted(top_sentences)
    summary = [sentences[i] for i in summary_sent]

    return " ".join(summary)


def main():
    # Initialize the argument parser and parse the arguments
    parser = init_argparse()
    args = parser.parse_args()

    # Read the input transcript file
    logger.info(f"Reading transcript from: {args.transcript}")
    transcript = read_transcript(args.transcript)

    # Summarize the transcript using the nltk library
    logger.info("Summarizing transcript")
    summary = summarize_text(transcript, args.num_sentences)

    # Write the summary to the output file
    logger.info(f"Writing summary to: {args.summary}")
    with open(args.summary, "w") as f:
        f.write("Summary of: " + args.transcript + "\n\n")
        f.write(summary)

    logger.info("Summarization completed")


if __name__ == "__main__":
    main()
@@ -0,0 +1,4 @@
# Deloitte HR @ NYS Cybersecurity Conference
- ways to retain and grow your workforce
- how to enable cybersecurity professionals to do their best work
- low-budget activities that can be implemented starting tomorrow

File diff suppressed because one or more lines are too long
@@ -0,0 +1,3 @@
Summary of: 30min-CyberHR/30min-CyberHR.m4a.mp4_transcript.txt

Since the workforce is an organization's most valuable asset, investing in workforce experience activities, we've found has lead to more productive work, more efficient work, more innovative approaches to the work, and more engaged teams which ultimately results in better mission outcomes for your organization. And this one really focuses on not just pulsing a workforce once a year through an annual HR survey of, how do you really feel like, you know, what leadership considerations should we implement or, you know, how can we enhance the performance management process. We've just found that, you know, by investing in this and putting the workforce as, you know, the center part of what you invest in as an organization and leaders, it's not only about retention, talent, you know, the cyber workforce crisis, but people want to do work well and they're able to get more done and achieve more without you, you know, directly supervising and micromanaging or looking at everything because, you know, you know, you know, you're not going to be able to do anything. I hope there was a little bit of, you know, the landscape of the cyber workforce with some practical tips that you can take away for how to just think about, you know, improving the overall workforce experience and investing in your employees. So with this, you know, we know that all of you are in the trenches every day, you're facing this, you're living this, and we are just interested to hear from all of you, you know, just to start, like, what's one thing that has worked well in your organization in terms of enhancing or investing in the workforce experience?
File diff suppressed because one or more lines are too long
@@ -0,0 +1,47 @@
AGENDA: Most important things to look for in a start up

TAM: Make sure the market is sufficiently large that once they win they can get rewarded
- Medium sized markets that should be winner take all can work
- TAM needs to be realistic of direct market size

Product market fit: Being in a good market with a product that can satisfy that market
- Solves a problem
- Builds a solution a customer wants to buy
- Either saves the customer something (time/money/pain) or gives them something (revenue/enjoyment)

Unit economics: Profit for delivering all-in cost must be attractive (% or $ amount)
- Revenue minus direct costs
- Raw input costs (materials, variable labour), direct cost of delivering and servicing the sale
- Attractive as a % of sales so it can contribute to fixed overhead
- Look for high incremental contribution margin

LTV CAC: Life-time value (revenue contribution) vs cost to acquire customer must be healthy
- LTV = Purchase value x number of purchases x customer lifespan
- CAC = All-in costs of sales + marketing over number of new customer additions
- Strong reputation leads to referrals leads to lower CAC. Want customers evangelizing product/service
- Rule of thumb higher than 3

Churn: Fits into LTV, low churn leads to higher LTV and helps keep future CAC down
- Selling to replenish revenue every year is hard
- Can run through entire customer base over time
- Low churn builds strong net dollar retention

Business: Must have sufficient barriers to entry to ward off copy-cats once established
- High switching costs (lock-in)
- Addictive
- Steep learning curve once adopted (form of switching cost)
- Two sided liquidity
- Patents, IP, Branding
- No hyper-scaler who can roll over you quickly
- Scale could be a barrier to entry but works against most start-ups, not for them
- Once developed, answer the question: Could a well funded competitor starting up today easily duplicate this business, or is it cheaper to buy the start up?

Founders: Must be religious about their product. Believe they will change the world against all odds.
- Just money in the bank is not enough to build a successful company. Just good tech is not enough to build a successful company
- Founders must be motivated to build something, not (all) about money. They would be doing this for free because they believe in it. Not looking for a quick score
- Founders must be persuasive. They will be asking others to sacrifice to make their dream come to life. They will need to convince investors this company can work and deserves funding.
- Must understand who the customer is and what problem they are helping to solve.
- Founders aren't expected to know all the preceding points in this document, but should have an understanding of most of this, and be able to offer a vision.
@@ -0,0 +1,8 @@
AGENDA: Most important things to look for in a start up
TAM: Make sure the market is sufficiently large that once they win they can get rewarded
Product market fit: Being in a good market with a product that can satisfy that market
Unit economics: Profit for delivering all-in cost must be attractive (% or $ amount)
LTV CAC: Life-time value (revenue contribution) vs cost to acquire customer must be healthy
Churn: Fits into LTV, low churn leads to higher LTV and helps keep future CAC down
Business: Must have sufficient barriers to entry to ward off copy-cats once established
Founders: Must be religious about their product. Believe they will change the world against all odds.
@@ -0,0 +1,10 @@
Summary of: recordings/42min-StartupsTechTalk.mp4

The speaker discusses their plan to launch an investment company, which will sit on a pool of cash raised from various partners and investors. They will take equity stakes in startups that they believe have the potential to scale and become successful. The speaker emphasizes the importance of investing in companies that have a large total addressable market (TAM) and good product-market fit. They also discuss the concept of unit economics and how it is important to ensure that the profit from selling a product or service outweighs the cost of producing it. The speaker encourages their team to keep an eye out for interesting startups and to send them their way if they come across any.
The conversation is about the importance of unit economics, incremental margin, lifetime value, customer acquisition costs, churn, and barriers to entry in evaluating businesses for investment. The speaker explains that companies with good unit economics and high incremental contribution margins are ideal for investment. Lifetime value measures how much a customer will spend on a business over their entire existence, while customer acquisition costs measure the cost of acquiring a new customer. Churn refers to the rate at which customers leave a business, and businesses with low churn tend to have high lifetime values. High barriers to entry, such as high switching costs, can make it difficult for competitors to enter the market and kill established businesses.
The speaker discusses various factors that can contribute to a company's success and create a competitive advantage. These include making the product addictive, having steep learning curves, creating two-sided liquidity for marketplaces, having patents or intellectual property, strong branding, and scale as a barrier to entry. The speaker also emphasizes the importance of founders having a plan to differentiate themselves from competitors and avoid being rolled over by larger companies. Additionally, the speaker mentions MasterCard and Visa as examples of companies that invented their markets, while Apple was able to build a strong brand despite starting with no developers or users.
The speaker discusses the importance of founders in building successful companies, emphasizing that they must be passionate and believe in their product. They should also be charismatic and able to persuade others to work towards their vision. The speaker cites examples of successful CEOs such as Zuckerberg, Steve Jobs, Elon Musk, Bill Gates, Jeff Bezos, Travis Kalanick, and emphasizes that luck is also a factor in success. The speaker encourages listeners to have a critical eye when evaluating startups and to look for those with a clear understanding of their customers and the problem they are solving.
File diff suppressed because one or more lines are too long
@@ -0,0 +1,3 @@
Summary of: 42min-StartupsTechTalk/42min-StartupsTechTalk.mp4_transcript.txt

If you had perfect knowledge, and you need like one more piece of advertising, drove like 0.2 customers in each customer generates, like let's say you wanted to completely maximize, you'd make it say your contribution margin, on incremental sales, is just over what you're spending on ad revenue. Like if you're, I don't know, well, let's see, I got like you don't really want to advertise a ton in the huge and everywhere, and then getting to ubiquitous, because you grab it, damage your brands, but just like an economic textbook theory, and be like, it'd be that basic math. And the table's like exactly, we're going to be really cautious to like be able to move in a year if we need to, but Google's goal is going to be giving away foundational models, lock everyone in, make them use Google Cloud, make them use Google Tools, and it's going to be very hard to switch off. Like if you were starting to develop Figma, you might say, okay, well Adobe is just gonna eat my lunch, right, like right away. So when you see a startup or talk to a founder and he's saying these things in your head like, man, this isn't gonna work because of, you know, there's no tab or there's, you know, like Amazon's gonna roll these cuts over in like two days or whatever, you know, or the man, this is really interesting because not only they're not doing it and no one else is doing this, but like they're going after a big market.
File diff suppressed because one or more lines are too long
@@ -0,0 +1,4 @@
GitHub
Requirements
Junior Developers
Riding Elephants
@@ -0,0 +1,4 @@
Summary of: https://www.youtube.com/watch?v=DzRoYc2UGKI

Small Developer is a program that creates an entire project for you based on a prompt. It uses the JATGPT API to generate code and files, and it's easy to use. The program can be installed by cloning the GitHub repository and using modalcom. The program can create projects for various languages, including Python and Ruby. You can also create a prompt.md file to input your prompt instead of pasting it into the terminal. The program is useful for creating detailed specs that can be passed on to junior developers. Overall, Small Developer is a helpful tool for quickly generating code and projects.
File diff suppressed because one or more lines are too long
11  server/reflector-local/readme.md  (new file)
@@ -0,0 +1,11 @@
# Record on Voice Memos on iPhone

# Airdrop to MacBook Air

# Run Reflector on .m4a Recording and Agenda

python 0-reflector-local.py voicememo.m4a agenda.txt

OR - using the 30min-CyberHR example:

python 0-reflector-local.py 30min-CyberHR/30min-CyberHR.m4a 30min-CyberHR/30min-CyberHR-agenda.txt
125  server/reflector-local/whisper_summarizer_bart.py  (new file)
@@ -0,0 +1,125 @@
import argparse
import os
import tempfile

import moviepy.editor
import nltk
import whisper
from loguru import logger
from transformers import BartTokenizer, BartForConditionalGeneration

nltk.download('punkt', quiet=True)

WHISPER_MODEL_SIZE = "base"


def init_argparse() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        usage="%(prog)s [OPTIONS] <LOCATION> <OUTPUT>",
        description="Creates a transcript of a video or audio file, then summarizes it using BART."
    )

    parser.add_argument("location", help="Location of the media file")
    parser.add_argument("output", help="Output file path")

    parser.add_argument(
        "-t", "--transcript", help="Save a copy of the intermediary transcript file", type=str)
    parser.add_argument(
        "-l", "--language", help="Language that the summary should be written in",
        type=str, default="english", choices=['english', 'spanish', 'french', 'german', 'romanian'])
    parser.add_argument(
        "-m", "--model_name", help="Name or path of the BART model",
        type=str, default="facebook/bart-large-cnn")

    return parser


# NLTK chunking function
def chunk_text(txt, max_chunk_length=500):
    """Split text into smaller chunks of at most max_chunk_length characters."""
    sentences = nltk.sent_tokenize(txt)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_chunk_length:
            current_chunk += f" {sentence.strip()}"
        else:
            chunks.append(current_chunk.strip())
            current_chunk = f"{sentence.strip()}"
    chunks.append(current_chunk.strip())
    return chunks


# BART summary function
def summarize_chunks(chunks, tokenizer, model):
    summaries = []
    for c in chunks:
        input_ids = tokenizer.encode(c, return_tensors='pt')
        summary_ids = model.generate(
            input_ids, num_beams=4, length_penalty=2.0, max_length=1024, no_repeat_ngram_size=3)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)
    return summaries


def main():
    import sys
    sys.setrecursionlimit(10000)

    parser = init_argparse()
    args = parser.parse_args()

    media_file = args.location
    logger.info(f"Processing file: {media_file}")

    # If the media file we just retrieved is a video, extract its audio stream.
    if os.path.isfile(media_file) and media_file.endswith(('.mp4', '.avi', '.flv')):
        audio_filename = tempfile.NamedTemporaryFile(
            suffix=".mp3", delete=False).name
        logger.info(f"Extracting audio to: {audio_filename}")

        video = moviepy.editor.VideoFileClip(media_file)
        video.audio.write_audiofile(audio_filename, logger=None)

        logger.info("Finished extracting audio")
        media_file = audio_filename

    # Transcribe the audio file using the OpenAI Whisper model
    logger.info("Loading Whisper speech-to-text model")
    whisper_model = whisper.load_model(WHISPER_MODEL_SIZE)

    logger.info(f"Transcribing audio file: {media_file}")
    whisper_result = whisper_model.transcribe(media_file)

    logger.info("Finished transcribing file")

    # If we got the transcript parameter on the command line, save the transcript to the specified file.
    if args.transcript:
        logger.info(f"Saving transcript to: {args.transcript}")
        with open(args.transcript, "w") as transcript_file:
            transcript_file.write(whisper_result["text"])

    # Summarize the generated transcript using the BART model
    logger.info(f"Loading BART model: {args.model_name}")
    tokenizer = BartTokenizer.from_pretrained(args.model_name)
    model = BartForConditionalGeneration.from_pretrained(args.model_name)

    logger.info("Breaking transcript into smaller chunks")
    chunks = chunk_text(whisper_result['text'])

    logger.info(
        f"Transcript broken into {len(chunks)} chunks of at most 500 characters")

    logger.info(f"Writing summary text in {args.language} to: {args.output}")
    with open(args.output, 'w') as f:
        f.write('Summary of: ' + args.location + "\n\n")
        summaries = summarize_chunks(chunks, tokenizer, model)
        for summary in summaries:
            f.write(summary.strip() + "\n\n")

    logger.info("Summarization completed")


if __name__ == "__main__":
    main()
164  server/reflector_dataclasses.py  (new file)
@@ -0,0 +1,164 @@
"""
Collection of data classes for streamlining and rigidly structuring
the input and output parameters of functions
"""

import datetime
from dataclasses import dataclass
from typing import List

import av


@dataclass
class TitleSummaryInput:
    """
    Data class for the input to generate title and summaries.
    The outcome will be used to send a query to the LLM for processing.
    """
    input_text: str
    transcribed_time: float
    prompt: str
    data: dict
    headers: dict

    def __init__(self, transcribed_time, input_text=""):
        self.input_text = input_text
        self.transcribed_time = transcribed_time
        self.prompt = \
            f"""
### Human:
Create a JSON object as response.The JSON object must have 2 fields:
i) title and ii) summary.For the title field,generate a short title
for the given text. For the summary field, summarize the given text
in three sentences.

{self.input_text}

### Assistant:
"""
        self.data = {"data": self.prompt}
        self.headers = {"Content-Type": "application/json"}


@dataclass
class IncrementalResult:
    """
    Data class for the result of generating one title and summary.
    Defines what a single "topic" looks like.
    """
    title: str
    description: str
    transcript: str

    def __init__(self, title, desc, transcript):
        self.title = title
        self.description = desc
        self.transcript = transcript


@dataclass
class TitleSummaryOutput:
    """
    Data class for the result of all generated titles and summaries.
    The result will be sent back to the client
    """
    cmd: str
    topics: List[IncrementalResult]

    def __init__(self, inc_responses):
        # Placeholder command string (mirrors FinalSummaryResult); the original
        # left this attribute unset, which broke JSON serialization.
        self.cmd = ""
        self.topics = inc_responses

    def get_result(self):
        return {
            "cmd": self.cmd,
            "topics": self.topics
        }


@dataclass
class ParseLLMResult:
    """
    Data class to parse the result returned by the LLM while generating title
    and summaries. The result will be sent back to the client.
    """
    description: str
    transcript: str
    timestamp: str

    def __init__(self, param: TitleSummaryInput, output: dict):
        self.transcript = param.input_text
        self.description = output.pop("summary")
        self.timestamp = \
            str(datetime.timedelta(seconds=round(param.transcribed_time)))

    def get_result(self):
        return {
            "description": self.description,
            "transcript": self.transcript,
            "timestamp": self.timestamp
        }


@dataclass
class TranscriptionInput:
    """
    Data class to define the input to the transcription function
    AudioFrames -> input
    """
    frames: List[av.audio.frame.AudioFrame]

    def __init__(self, frames):
        self.frames = frames


@dataclass
class TranscriptionOutput:
    """
    Dataclass to define the result of the transcription function.
    The result will be sent back to the client
    """
    cmd: str
    result_text: str

    def __init__(self, result_text):
        self.cmd = "SHOW_TRANSCRIPTION"
        self.result_text = result_text

    def get_result(self):
        return {
            "cmd": self.cmd,
            "text": self.result_text
        }


@dataclass
class FinalSummaryResult:
    """
    Dataclass to define the result of the final summary function.
    The result will be sent back to the client.
    """
    cmd: str
    final_summary: str
    duration: str

    def __init__(self, final_summary, time):
        self.duration = str(datetime.timedelta(seconds=round(time)))
        self.final_summary = final_summary
        self.cmd = ""

    def get_result(self):
        return {
            "cmd": self.cmd,
            "duration": self.duration,
            "summary": self.final_summary
        }


class BlackListedMessages:
    """
    Class to hold the blacklisted messages. These messages should be filtered
    out and not sent back to the client as part of the transcription.
    """
    messages = [" Thank you.", " See you next time!",
                " Thank you for watching!", " Bye!",
                " And that's what I'm talking about."]
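For orientation, a small usage sketch of these dataclasses (values are illustrative only), showing the dict shapes that get_result() produces:

```python
# Illustrative only: dict shapes produced by get_result().
from reflector_dataclasses import TitleSummaryInput, TranscriptionOutput

prompt_input = TitleSummaryInput(transcribed_time=12.5, input_text="Hello world")
print(prompt_input.data["data"])  # the LLM prompt wrapping the input text

out = TranscriptionOutput("Hello world")
print(out.get_result())  # {'cmd': 'SHOW_TRANSCRIPTION', 'text': 'Hello world'}
```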
30  server/scripts/clear_artefacts.sh  (new executable file)
@@ -0,0 +1,30 @@
#!/bin/bash

# Locate the artefacts directory relative to where the script is run from
cwd=$(pwd)
last_component="${cwd##*/}"

if [ "$last_component" = "reflector" ]; then
    directory="./artefacts"
elif [ "$last_component" = "scripts" ]; then
    directory="../artefacts"
fi

# Patterns matching the generated artefact files to delete
transcript_file_pattern="transcript_*.txt"
summary_file_pattern="summary_*.txt"
pickle_file_pattern="*.pkl"
html_file_pattern="*.html"
png_file_pattern="wordcloud*.png"
mp3_file_pattern="*.mp3"
mp4_file_pattern="*.mp4"
m4a_file_pattern="*.m4a"

find "$directory" -type f -name "$transcript_file_pattern" -delete
find "$directory" -type f -name "$summary_file_pattern" -delete
find "$directory" -type f -name "$pickle_file_pattern" -delete
find "$directory" -type f -name "$html_file_pattern" -delete
find "$directory" -type f -name "$png_file_pattern" -delete
find "$directory" -type f -name "$mp3_file_pattern" -delete
find "$directory" -type f -name "$mp4_file_pattern" -delete
find "$directory" -type f -name "$m4a_file_pattern" -delete
39  server/scripts/setup_pipeline_dependencies.sh  (new file)
@@ -0,0 +1,39 @@
#!/bin/sh

# Upgrade pip
pip install --upgrade pip

# Default to CPU installation of JAX
jax_mode="jax[cpu]"

# Install JAX (plain "=" comparisons, since this is a POSIX sh script)
if [ "$1" = "cpu" ]
then
  jax_mode="jax[cpu]"
elif [ "$1" = "cuda11" ]
then
  jax_mode="jax[cuda11_pip]"
elif [ "$1" = "cuda12" ]
then
  jax_mode="jax[cuda12_pip]"
fi

pip install --upgrade "$jax_mode"

# Install Whisper-JAX base
pip install git+https://github.com/sanchit-gandhi/whisper-jax.git

# Update to latest version
pip install --upgrade --no-deps --force-reinstall git+https://github.com/sanchit-gandhi/whisper-jax.git

cwd=$(pwd)
last_component="${cwd##*/}"
if [ "$last_component" = "reflector" ]; then
  pip install -r pipeline-requirements.txt
elif [ "$last_component" = "scripts" ]; then
  pip install -r ../pipeline-requirements.txt
fi

# Download spaCy models
spacy download en_core_web_sm
spacy download en_core_web_md
11  server/scripts/setup_server_dependencies.sh  (new executable file)
@@ -0,0 +1,11 @@
#!/bin/sh

pip install --upgrade pip

cwd=$(pwd)
last_component="${cwd##*/}"
if [ "$last_component" = "reflector" ]; then
  pip install -r server-requirements.txt
elif [ "$last_component" = "scripts" ]; then
  pip install -r ../server-requirements.txt
fi
50  server/server-requirements.txt  (new file)
@@ -0,0 +1,50 @@
aiohttp==3.8.5
aiohttp-cors==0.7.0
aioice==0.9.0
aiortc==1.5.0
aiosignal==1.3.1
anyio==3.7.1
async-timeout==4.0.2
attrs==23.1.0
av==10.0.0
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
coloredlogs==15.0.1
cryptography==41.0.2
ctranslate2==3.17.1
dnspython==2.4.0
faster-whisper==0.7.1
filelock==3.12.2
flatbuffers==23.5.26
frozenlist==1.4.0
fsspec==2023.6.0
google-crc32c==1.5.0
h11==0.14.0
httpcore==0.17.3
huggingface-hub==0.16.4
humanfriendly==10.0
idna==3.4
ifaddr==0.2.0
loguru==0.7.0
mpmath==1.3.0
multidict==6.0.4
numpy==1.25.1
onnxruntime==1.15.1
packaging==23.1
protobuf==4.23.4
pycparser==2.21
pyee==11.0.0
pylibsrtp==0.8.0
pyOpenSSL==23.2.0
PyYAML==6.0.1
requests==2.31.0
sniffio==1.3.0
sortedcontainers==2.4.0
sympy==1.12
tokenizers==0.13.3
tqdm==4.65.0
typing_extensions==4.7.1
urllib3==2.0.4
yarl==1.9.2
wave==0.0.2
324  server/server.py  (new file)
@@ -0,0 +1,324 @@
import argparse
import asyncio
import datetime
import json
import os
import uuid
import wave
from concurrent.futures import ThreadPoolExecutor
from typing import Union, NoReturn

import aiohttp_cors
import av
import requests
from aiohttp import web
from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription
from aiortc.contrib.media import MediaRelay
from faster_whisper import WhisperModel
from sortedcontainers import SortedDict

from reflector_dataclasses import FinalSummaryResult, ParseLLMResult,\
    TitleSummaryInput, TitleSummaryOutput, TranscriptionInput,\
    TranscriptionOutput, BlackListedMessages
from utils.run_utils import CONFIG, run_in_executor
from utils.log_utils import LOGGER

pcs = set()
relay = MediaRelay()
data_channel = None
model = WhisperModel("tiny", device="cpu",
                     compute_type="float32",
                     num_workers=12)

CHANNELS = 2
RATE = 48000
audio_buffer = av.AudioFifo()
executor = ThreadPoolExecutor()
transcription_text = ""
last_transcribed_time = 0.0
LLM_MACHINE_IP = CONFIG["LLM"]["LLM_MACHINE_IP"]
LLM_MACHINE_PORT = CONFIG["LLM"]["LLM_MACHINE_PORT"]
LLM_URL = f"http://{LLM_MACHINE_IP}:{LLM_MACHINE_PORT}/api/v1/generate"
incremental_responses = []
sorted_transcripts = SortedDict()


def parse_llm_output(param: TitleSummaryInput, response: requests.Response) -> Union[None, ParseLLMResult]:
    try:
        output = json.loads(response.json()["results"][0]["text"])
        return ParseLLMResult(param, output)
    except Exception as e:
        LOGGER.info("Exception" + str(e))
    return None


def get_title_and_summary(param: TitleSummaryInput) -> Union[None, TitleSummaryOutput]:
    LOGGER.info("Generating title and summary")

    # TODO : Handle unexpected output formats from the model
    try:
        response = requests.post(LLM_URL,
                                 headers=param.headers,
                                 json=param.data)
        output = parse_llm_output(param, response)
        if output:
            result = output.get_result()
            incremental_responses.append(result)
            return TitleSummaryOutput(incremental_responses)
    except Exception as e:
        LOGGER.info("Exception" + str(e))
    return None


def channel_log(channel, t: str, message: str) -> NoReturn:
    LOGGER.info("channel(%s) %s %s" % (channel.label, t, message))


def channel_send(channel, message: str) -> NoReturn:
    if channel:
        channel.send(message)


def channel_send_increment(channel, param: Union[FinalSummaryResult, TitleSummaryOutput]) -> NoReturn:
    if channel and param:
        message = param.get_result()
        channel.send(json.dumps(message))


def channel_send_transcript(channel) -> NoReturn:
    # channel_log(channel, ">", message)
    if channel:
        try:
            least_time = next(iter(sorted_transcripts))
            message = sorted_transcripts[least_time].get_result()
            if message:
                del sorted_transcripts[least_time]
                if message["text"] not in BlackListedMessages.messages:
                    channel.send(json.dumps(message))
            # Due to exceptions if one of the earlier batches can't return
            # a transcript, we don't want to be stuck waiting for the result
            # With the threshold size of 3, we pop the first(lost) element
            else:
                if len(sorted_transcripts) >= 3:
                    del sorted_transcripts[least_time]
        except Exception as exception:
            LOGGER.info("Exception" + str(exception))

def get_transcription(input_frames: TranscriptionInput) -> Union[None, TranscriptionOutput]:
    LOGGER.info("Transcribing..")
    sorted_transcripts[input_frames.frames[0].time] = None

    # TODO: Find cleaner way, watch "no transcription" issue below
    # Passing IO objects instead of temporary files throws an error
    # Passing ndarray (type casted with float) does not give any
    # transcription. Refer issue,
    # https://github.com/guillaumekln/faster-whisper/issues/369
    audio_file = "test" + str(datetime.datetime.now())
    wf = wave.open(audio_file, "wb")
    wf.setnchannels(CHANNELS)
    wf.setframerate(RATE)
    wf.setsampwidth(2)

    for frame in input_frames.frames:
        wf.writeframes(b"".join(frame.to_ndarray()))
    wf.close()

    result_text = ""

    try:
        segments, _ = \
            model.transcribe(audio_file,
                             language="en",
                             beam_size=5,
                             vad_filter=True,
                             vad_parameters={"min_silence_duration_ms": 500})
        os.remove(audio_file)
        segments = list(segments)
        result_text = ""
        duration = 0.0
        for segment in segments:
            result_text += segment.text
            start_time = segment.start
            end_time = segment.end
            if not segment.start:
                start_time = 0.0
            if not segment.end:
                end_time = 5.5
            duration += (end_time - start_time)

        global last_transcribed_time, transcription_text
        last_transcribed_time += duration
        transcription_text += result_text

    except Exception as exception:
        LOGGER.info("Exception" + str(exception))

    result = TranscriptionOutput(result_text)
    sorted_transcripts[input_frames.frames[0].time] = result
    return result


def get_final_summary_response() -> FinalSummaryResult:
    """
    Collate the incremental summaries generated so far and return as the final
    summary
    :return:
    """
    final_summary = ""

    # Collate inc summaries
    for topic in incremental_responses:
        final_summary += topic["description"]

    response = FinalSummaryResult(final_summary, last_transcribed_time)

    with open("./artefacts/meeting_titles_and_summaries.txt", "a",
              encoding="utf-8") as file:
        file.write(json.dumps(incremental_responses))

    return response


class AudioStreamTrack(MediaStreamTrack):
    """
    An audio stream track.
    """

    kind = "audio"

    def __init__(self, track):
        super().__init__()
        self.track = track

    async def recv(self) -> av.audio.frame.AudioFrame:
        global transcription_text
        frame = await self.track.recv()
        audio_buffer.write(frame)

        if local_frames := audio_buffer.read_many(256 * 960, partial=False):
            whisper_result = run_in_executor(
                get_transcription,
                TranscriptionInput(local_frames),
                executor=executor
            )
            whisper_result.add_done_callback(
                lambda f: channel_send_transcript(data_channel)
                if f.result()
                else None
            )

            if len(transcription_text) > 25:
                llm_input_text = transcription_text
                transcription_text = ""
                param = TitleSummaryInput(input_text=llm_input_text,
                                          transcribed_time=last_transcribed_time)
                llm_result = run_in_executor(get_title_and_summary,
                                             param,
                                             executor=executor)
                llm_result.add_done_callback(
                    lambda f: channel_send_increment(data_channel,
                                                     llm_result.result())
                    if f.result()
                    else None
                )
        return frame


async def offer(request: requests.Request) -> web.Response:
    """
    Establish the WebRTC connection with the client
    :param request:
    :return:
    """
    params = await request.json()
    offer = RTCSessionDescription(sdp=params["sdp"], type=params["type"])

    pc = RTCPeerConnection()
    pc_id = "PeerConnection(%s)" % uuid.uuid4()
    pcs.add(pc)

    def log_info(msg, *args) -> NoReturn:
        LOGGER.info(pc_id + " " + msg, *args)

    log_info("Created for " + request.remote)

    @pc.on("datachannel")
    def on_datachannel(channel) -> NoReturn:
        global data_channel
        data_channel = channel
        channel_log(channel, "-", "created by remote party")

        @channel.on("message")
        def on_message(message: str) -> NoReturn:
            channel_log(channel, "<", message)
            if json.loads(message)["cmd"] == "STOP":
                # Placeholder final summary
                response = get_final_summary_response()
                channel_send_increment(data_channel, response)
                # To-do Add code to stop connection from server side here
                # But have to handshake with client once

            if isinstance(message, str) and message.startswith("ping"):
                channel_send(channel, "pong" + message[4:])

    @pc.on("connectionstatechange")
    async def on_connectionstatechange() -> NoReturn:
        log_info("Connection state is " + pc.connectionState)
        if pc.connectionState == "failed":
            await pc.close()
            pcs.discard(pc)

    @pc.on("track")
    def on_track(track) -> NoReturn:
        log_info("Track " + track.kind + " received")
        pc.addTrack(AudioStreamTrack(relay.subscribe(track)))

    await pc.setRemoteDescription(offer)

    answer = await pc.createAnswer()
    await pc.setLocalDescription(answer)
    return web.Response(
        content_type="application/json",
        text=json.dumps(
            {
                "sdp": pc.localDescription.sdp,
                "type": pc.localDescription.type
            }
        ),
    )


async def on_shutdown(application: web.Application) -> NoReturn:
    coroutines = [pc.close() for pc in pcs]
    await asyncio.gather(*coroutines)
    pcs.clear()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="WebRTC based server for Reflector"
    )
    parser.add_argument(
        "--host", default="0.0.0.0", help="Server host IP (def: 0.0.0.0)"
    )
    parser.add_argument(
        "--port", type=int, default=1250, help="Server port (def: 1250)"
    )
    args = parser.parse_args()
    app = web.Application()
    cors = aiohttp_cors.setup(
        app,
        defaults={
            "*": aiohttp_cors.ResourceOptions(
                allow_credentials=True,
                expose_headers="*",
                allow_headers="*"
            )
        },
    )

    offer_resource = cors.add(app.router.add_resource("/offer"))
    cors.add(offer_resource.add_route("POST", offer))
    app.on_shutdown.append(on_shutdown)
    web.run_app(app, access_log=None, host=args.host, port=args.port)
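
A note on the data-channel protocol implemented above: besides replying to "ping" messages, the server reacts to a JSON message {"cmd": "STOP"} by collating the incremental summaries and sending back the final summary. A minimal client-side sketch (hypothetical helper name, assuming an already-established aiortc data channel) could look like this:

import json

def request_final_summary(channel):
    # Ask server.py for the collated final summary; its on_message handler
    # answers over the same data channel via channel_send_increment().
    channel.send(json.dumps({"cmd": "STOP"}))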
152
server/stream_client.py
Normal file
@@ -0,0 +1,152 @@
import asyncio
import time
import uuid

import httpx
import pyaudio
import requests
import stamina
from aiortc import (RTCPeerConnection, RTCSessionDescription)
from aiortc.contrib.media import (MediaPlayer, MediaRelay)

from utils.log_utils import LOGGER
from utils.run_utils import CONFIG


class StreamClient:
    def __init__(
            self,
            signaling,
            url="http://0.0.0.0:1250",
            play_from=None,
            ping_pong=False
    ):
        self.signaling = signaling
        self.server_url = url
        self.play_from = play_from
        self.ping_pong = ping_pong
        self.paudio = pyaudio.PyAudio()

        self.pc = RTCPeerConnection()

        self.loop = asyncio.get_event_loop()
        self.relay = None
        self.pcs = set()
        self.time_start = None
        self.queue = asyncio.Queue()
        self.player = MediaPlayer(
            ':' + str(CONFIG['AUDIO']["AV_FOUNDATION_DEVICE_ID"]),
            format='avfoundation',
            options={'channels': '2'})

    def stop(self):
        self.loop.run_until_complete(self.signaling.close())
        self.loop.run_until_complete(self.pc.close())
        # self.loop.close()

    def create_local_tracks(self, play_from):
        if play_from:
            player = MediaPlayer(play_from)
            return player.audio, player.video
        else:
            if self.relay is None:
                self.relay = MediaRelay()
            return self.relay.subscribe(self.player.audio), None

    def channel_log(self, channel, t, message):
        print("channel(%s) %s %s" % (channel.label, t, message))

    def channel_send(self, channel, message):
        # self.channel_log(channel, ">", message)
        channel.send(message)

    def current_stamp(self):
        if self.time_start is None:
            self.time_start = time.time()
            return 0
        else:
            return int((time.time() - self.time_start) * 1000000)

    async def run_offer(self, pc, signaling):
        # microphone
        audio, video = self.create_local_tracks(self.play_from)
        pc_id = "PeerConnection(%s)" % uuid.uuid4()
        self.pcs.add(pc)

        def log_info(msg, *args):
            LOGGER.info(pc_id + " " + msg, *args)

        @pc.on("connectionstatechange")
        async def on_connectionstatechange():
            print("Connection state is %s" % pc.connectionState)
            if pc.connectionState == "failed":
                await pc.close()
                self.pcs.discard(pc)

        @pc.on("track")
        def on_track(track):
            print("Sending %s" % track.kind)
            self.pc.addTrack(track)

            @track.on("ended")
            async def on_ended():
                log_info("Track %s ended", track.kind)

        self.pc.addTrack(audio)

        channel = pc.createDataChannel("data-channel")
        self.channel_log(channel, "-", "created by local party")

        async def send_pings():
            while True:
                self.channel_send(channel, "ping %d" % self.current_stamp())
                await asyncio.sleep(1)

        @channel.on("open")
        def on_open():
            if self.ping_pong:
                asyncio.ensure_future(send_pings())

        @channel.on("message")
        def on_message(message):
            self.queue.put_nowait(message)
            if self.ping_pong:
                self.channel_log(channel, "<", message)

                if isinstance(message, str) and message.startswith("pong"):
                    elapsed_ms = (self.current_stamp() - int(message[5:])) \
                        / 1000
                    print(" RTT %.2f ms" % elapsed_ms)

        await pc.setLocalDescription(await pc.createOffer())

        sdp = {
            "sdp": pc.localDescription.sdp,
            "type": pc.localDescription.type
        }
        @stamina.retry(on=httpx.HTTPError, attempts=5)
        def connect_to_server():
            # post with httpx so that stamina's retry (keyed on httpx.HTTPError
            # above) actually fires on connection/HTTP failures
            response = httpx.post(self.server_url, json=sdp, timeout=10)
            response.raise_for_status()
            return response

        params = connect_to_server().json()
        answer = RTCSessionDescription(sdp=params["sdp"], type=params["type"])
        await pc.setRemoteDescription(answer)

        self.reader = self.worker("worker", self.queue)

    def get_reader(self):
        return self.reader

    async def worker(self, name, queue):
        while True:
            msg = await self.queue.get()
            yield msg
            self.queue.task_done()

    async def start(self):
        coro = self.run_offer(self.pc, self.signaling)
        task = asyncio.create_task(coro)
        await task
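
A minimal way to drive StreamClient (a sketch, not part of the repository: it assumes the server from server.py is reachable at http://0.0.0.0:1250/offer and that CONFIG points at a valid avfoundation capture device):

import asyncio
from stream_client import StreamClient

async def main():
    # signaling is only used by stop(); None is enough for a quick test
    client = StreamClient(signaling=None, url="http://0.0.0.0:1250/offer",
                          ping_pong=True)
    await client.start()
    async for message in client.get_reader():  # transcripts / summaries
        print(message)

asyncio.get_event_loop().run_until_complete(main())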
0
server/trials/__init__.py
Normal file
0
server/trials/finetuning/__init__.py
Normal file
24
server/trials/finetuning/inference_fine_tuned.py
Normal file
@@ -0,0 +1,24 @@
# Steps to prepare data and submit/check OpenAI finetuning
# import subprocess
# subprocess.run("openai tools fine_tunes.prepare_data -f " + "finetuning_dataset.jsonl")
# export OPENAI_API_KEY=
# openai api fine_tunes.create -t <TRAIN_FILE_ID_OR_PATH> -m <BASE_MODEL>
# openai api fine_tunes.list


import openai

# Use your OpenAI API Key
openai.api_key = ""

sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . -> ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas . - > "]

# Give your finetuned model name here
# "davinci:ft-personal-2023-07-14-10-43-51"
model_name = ""
response = openai.Completion.create(
    model=model_name,
    prompt=sample_chunks[0])

print(response)
98
server/trials/finetuning/youtube_scraping.py
Normal file
@@ -0,0 +1,98 @@
import json
import yt_dlp as youtube_dl
from whisper_jax import FlaxWhisperPipline
import jax.numpy as jnp


# Function to extract chapter information from a YouTube video URL
def get_youtube_chapters(video_id):
    video_url = "https://www.youtube.com/watch?v=" + video_id
    ydl_opts = {
        'extract_flat': 'in_playlist',
        'skip_download': True,
        'quiet': True,
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        video_info = ydl.extract_info(video_url, download=False)

    chapters = []

    if 'chapters' in video_info:
        for chapter in video_info['chapters']:
            start_time = chapter['start_time']
            end_time = chapter['end_time']
            title = chapter['title']

            chapters.append({
                'start': start_time,
                'end': end_time,
                'title': title
            })

    return chapters


# Function to extract video transcription using yt_dlp
def get_youtube_transcription(video_id):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': './artefacts/audio',  # Specify output file path and name
    }

    # Download the audio
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download(["https://www.youtube.com/watch?v=" + video_id])
    media_file = "./artefacts/audio.mp3"

    pipeline = FlaxWhisperPipline("openai/whisper-" + "tiny",
                                  dtype=jnp.float16,
                                  batch_size=16)
    whisper_result = pipeline(media_file, return_timestamps=True)
    return whisper_result["chunks"]


# Function to scrape YouTube video transcripts and chapter information
def scrape_youtube_data(video_id):
    transcript_text = get_youtube_transcription(video_id)
    chapters = get_youtube_chapters(video_id)
    print("transcript_text", transcript_text)
    print("chapters", chapters)
    return transcript_text, chapters


# Function to generate fine-tuning dataset from YouTube data
def generate_finetuning_dataset(video_ids):
    prompt_completion_pairs = []
    for video_id in video_ids:
        transcript_text, chapters = scrape_youtube_data(video_id)
        if transcript_text is not None and chapters is not None:
            for chapter in chapters:
                start_time = chapter["start"]
                end_time = chapter["end"]
                chapter_text = chapter["title"]

                prompt = ""
                for transcript in transcript_text:
                    if transcript["timestamp"][0] >= start_time and transcript["timestamp"][1] < end_time:
                        prompt += transcript["text"]

                if prompt is not None:
                    completion = chapter_text
                    prompt_completion_pairs.append({"prompt": prompt, "completion": completion})

    return prompt_completion_pairs


# Add all the video ids here, the videos must have captions [chapters]
video_ids = ["yTnSEZIwnkU"]
dataset = generate_finetuning_dataset(video_ids)

with open("finetuning_dataset.jsonl", "w", encoding="utf-8") as file:
    for example in dataset:
        file.write(json.dumps(example) + "\n")
0
server/trials/server/__init__.py
Normal file
188
server/trials/server/server_multithreaded.py
Normal file
@@ -0,0 +1,188 @@
import asyncio
import datetime
import io
import json
import threading
import uuid
import wave
from concurrent.futures import ThreadPoolExecutor

import jax.numpy as jnp
import requests
from aiohttp import web
from aiortc import MediaStreamTrack, RTCPeerConnection, RTCSessionDescription
from aiortc.contrib.media import MediaRelay
from av import AudioFifo
from sortedcontainers import SortedDict
from whisper_jax import FlaxWhisperPipline

from reflector.utils.log_utils import LOGGER
from reflector.utils.run_utils import CONFIG, Mutex

WHISPER_MODEL_SIZE = CONFIG['WHISPER']["WHISPER_REAL_TIME_MODEL_SIZE"]
pcs = set()
relay = MediaRelay()
data_channel = None
sorted_message_queue = SortedDict()
CHANNELS = 2
RATE = 44100
CHUNK_SIZE = 256
pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
                              dtype=jnp.float16,
                              batch_size=16)
start_time = datetime.datetime.now()
executor = ThreadPoolExecutor()
audio_buffer = AudioFifo()
frame_lock = Mutex(audio_buffer)


def channel_log(channel, t, message):
    print("channel(%s) %s %s" % (channel.label, t, message))


def thread_queue_channel_send():
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        least_time = sorted_message_queue.keys()[0]
        message = sorted_message_queue[least_time]
        if message:
            del sorted_message_queue[least_time]
            data_channel.send(message)
    except Exception as e:
        print("Exception", str(e))
        pass
    loop.run_forever()


def get_transcription():
    while True:
        with frame_lock.lock() as audio_buffer:
            frames = audio_buffer.read_many(CHUNK_SIZE * 960, partial=False)
            if not frames:
                transcribe = False
            else:
                transcribe = True

        if transcribe:
            print("Transcribing..")
            try:
                sorted_message_queue[frames[0].time] = None
                out_file = io.BytesIO()
                wf = wave.open(out_file, "wb")
                wf.setnchannels(CHANNELS)
                wf.setframerate(RATE)
                wf.setsampwidth(2)

                for frame in frames:
                    wf.writeframes(b''.join(frame.to_ndarray()))
                wf.close()

                whisper_result = pipeline(out_file.getvalue())
                item = {
                    'text': whisper_result["text"],
                    'start_time': str(frames[0].time),
                    'time': str(datetime.datetime.now())
                }
                sorted_message_queue[frames[0].time] = str(item)
                start_messaging_thread()
            except Exception as e:
                print("Exception -> ", str(e))


class AudioStreamTrack(MediaStreamTrack):
    """
    An audio stream track to send audio frames.
    """

    kind = "audio"

    def __init__(self, track):
        super().__init__()  # don't forget this!
        self.track = track

    async def recv(self):
        frame = await self.track.recv()
        audio_buffer.write(frame)
        return frame


def start_messaging_thread():
    message_thread = threading.Thread(target=thread_queue_channel_send)
    message_thread.start()


def start_transcription_thread(max_threads: int):
    for i in range(max_threads):
        t_thread = threading.Thread(target=get_transcription)
        t_thread.start()


async def offer(request: requests.Request):
    params = await request.json()
    offer = RTCSessionDescription(sdp=params["sdp"], type=params["type"])

    pc = RTCPeerConnection()
    pc_id = "PeerConnection(%s)" % uuid.uuid4()
    pcs.add(pc)

    def log_info(msg: str, *args):
        LOGGER.info(pc_id + " " + msg, *args)

    log_info("Created for " + request.remote)

    @pc.on("datachannel")
    def on_datachannel(channel):
        global data_channel, start_time
        data_channel = channel
        channel_log(channel, "-", "created by remote party")
        start_time = datetime.datetime.now()

        @channel.on("message")
        def on_message(message: str):
            channel_log(channel, "<", message)
            if isinstance(message, str) and message.startswith("ping"):
                # reply
                channel.send("pong" + message[4:])

    @pc.on("connectionstatechange")
    async def on_connectionstatechange():
        log_info("Connection state is " + pc.connectionState)
        if pc.connectionState == "failed":
            await pc.close()
            pcs.discard(pc)

    @pc.on("track")
    def on_track(track):
        log_info("Track " + track.kind + " received")
        pc.addTrack(AudioStreamTrack(relay.subscribe(track)))

    # handle offer
    await pc.setRemoteDescription(offer)

    # send answer
    answer = await pc.createAnswer()
    await pc.setLocalDescription(answer)
    return web.Response(
        content_type="application/json",
        text=json.dumps({
            "sdp": pc.localDescription.sdp,
            "type": pc.localDescription.type
        }),
    )


async def on_shutdown(app: web.Application):
    coros = [pc.close() for pc in pcs]
    await asyncio.gather(*coros)
    pcs.clear()


if __name__ == "__main__":
    app = web.Application()
    app.on_shutdown.append(on_shutdown)
    start_transcription_thread(6)
    app.router.add_post("/offer", offer)
    web.run_app(
        app, access_log=None, host="127.0.0.1", port=1250
    )
0
server/trials/title_summary/__init__.py
Normal file
57
server/trials/title_summary/api.py
Normal file
@@ -0,0 +1,57 @@
import requests
import spacy

# Enter the Machine where the LLM is hosted
LLM_MACHINE_IP = ""
# This is the URL of text-generation-webui
URL = f"http://{LLM_MACHINE_IP}:5000/api/v1/generate"

headers = {
    "Content-Type": "application/json"
}


def split_text_file(filename, token_count):
    nlp = spacy.load('en_core_web_md')

    with open(filename, 'r') as file:
        text = file.read()

    doc = nlp(text)
    total_tokens = len(doc)

    parts = []
    start_index = 0

    while start_index < total_tokens:
        end_index = start_index + token_count
        part_tokens = doc[start_index:end_index - 5]
        part = ' '.join(token.text for token in part_tokens)
        parts.append(part)
        start_index = end_index

    return parts


final_summary = []
parts = split_text_file("transcript.txt", 1600)

for part in parts:
    prompt = f"""
### Human:
Given the following text, distill the most important information
into a short summary: {part}

### Assistant:
"""
    data = {
        "prompt": prompt
    }
    try:
        response = requests.post(URL, headers=headers, json=data)
        print(response.json())
        # collect the generated text; the response shape assumed here matches
        # the text-generation-webui format parsed in server.py
        final_summary.append(response.json()["results"][0]["text"])
    except Exception as e:
        print(str(e))

with open("summary.txt", "w") as sum:
    sum.write(" ".join(final_summary))
43
server/trials/title_summary/bert.py
Normal file
@@ -0,0 +1,43 @@
import torch
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the SentenceTransformer model
sentence_transformer_model = SentenceTransformer('average_word_embeddings_glove.6B.300d')

# Define the input text
text = "Your input text to be summarized goes here."

# Tokenize the text
tokens = tokenizer.tokenize(text)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([input_ids]).to(device)

# Get the BERT model output
with torch.no_grad():
    outputs = model(input_ids)[0]  # Extract the last hidden states

# Calculate sentence embeddings
sentence_embeddings = outputs.mean(dim=1).squeeze().cpu().numpy()
input_text_embedding = sentence_transformer_model.encode([text])[0]

# Calculate cosine similarity between sentences and input text
similarity_scores = cosine_similarity([input_text_embedding], sentence_embeddings)

# Sort the sentences by similarity scores in descending order
# NOTE: `sentences` (the list of candidate sentences for the summary) is
# assumed to be defined before this point; the snippet does not define it.
sorted_sentences = [sent for _, sent in sorted(zip(similarity_scores[0], sentences), reverse=True)]

# Choose the top sentences as the summary
num_summary_sentences = 2  # Adjust as needed
summary = ". ".join(sorted_sentences[:num_summary_sentences])
print("Summary:", summary)
101
server/trials/title_summary/gpt2.py
Normal file
@@ -0,0 +1,101 @@
# Approach 1
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

model_name = 'EleutherAI/gpt-neo-1.3B'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)

conversation = """
Summarize the following conversation in 3 key sentences:

We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI .
Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development .
Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations .
Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude .
Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council .
Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas .
"""

input_ids = tokenizer.encode(conversation, return_tensors='pt')

output = model.generate(input_ids,
                        max_length=30,
                        num_return_sequences=1)

caption = tokenizer.decode(output[0], skip_special_tokens=True)
print("Caption:", caption[len(input_ids):])


# Approach 2
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

model.eval()

text = """
You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
"""

tokenizer.pad_token = tokenizer.eos_token
input_ids = tokenizer.encode(text,
                             max_length=100,
                             truncation=True,
                             return_tensors="pt")
attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
output = model.generate(input_ids,
                        max_new_tokens=20,
                        num_return_sequences=1,
                        num_beams=2,
                        attention_mask=attention_mask)

chapter_titles = [tokenizer.decode(output[i], skip_special_tokens=True) for i in range(output.shape[0])]
for i, title in enumerate(chapter_titles):
    print("Caption: ", title)

# Approach 3

import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer


def generate_response(conversation, max_length=100):
    input_text = ""
    for entry in conversation:
        role = entry["role"]
        content = entry["content"]
        input_text += f"{role}: {content}\n"

    # Tokenize the entire conversation
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate text based on the entire conversation
    with torch.no_grad():
        output = model.generate(input_ids, pad_token_id=tokenizer.eos_token_id)

    # Decode the generated text and return it
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response


if __name__ == "__main__":

    # Call appropriate approach from the main while experimenting
    model_name = "gpt2"
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    sample_chunks = [
"You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . "
    ]

    conversation = [
        {"role": "system", "content": "Summarize this text"},
        {"role": "user", "content": " text : " + sample_chunks[0]},
    ]

    response = generate_response(conversation)
    print("Response:", response)
157
server/trials/title_summary/incsum.py
Normal file
@@ -0,0 +1,157 @@
import spacy
import sys


# Observe the incremental summaries by performing summaries in chunks
with open("transcript.txt", "r", encoding="utf-8") as file:
    transcription = file.read()


def split_text_file(filename, token_count):
    nlp = spacy.load('en_core_web_md')

    with open(filename, 'r', encoding="utf-8") as file:
        text = file.read()

    doc = nlp(text)
    total_tokens = len(doc)

    parts = []
    start_index = 0

    while start_index < total_tokens:
        end_index = start_index + token_count
        part_tokens = doc[start_index:end_index]
        part = ' '.join(token.text for token in part_tokens)
        parts.append(part)
        start_index = end_index

    return parts


# Set the chunk length here to split the transcript and test
MAX_CHUNK_LENGTH = 1000

chunks = split_text_file("transcript.txt", MAX_CHUNK_LENGTH)
print("Number of chunks", len(chunks))

# Write chunks to file to refer to input vs output, separated by blank lines
with open("chunks" + str(MAX_CHUNK_LENGTH) + ".txt", "a", encoding="utf-8") as file:
    for c in chunks:
        file.write(c + "\n\n")

# If we want to run only a certain model, type the option while running
# ex. python incsum.py 1 => will run approach 1
# If no input, will run all approaches

try:
    index = sys.argv[1]
except:
    index = None

# Approach 1 : facebook/bart-large-cnn
if index == "1" or index is None:
    SUMMARY_MODEL = "facebook/bart-large-cnn"
    MIN_LENGTH = 5
    MAX_LENGTH = 10
    BEAM_SIZE = 2

    print("Performing chunk summary : " + SUMMARY_MODEL)

    from transformers import BartTokenizer, BartForConditionalGeneration

    tokenizer = BartTokenizer.from_pretrained(SUMMARY_MODEL)
    model = BartForConditionalGeneration.from_pretrained(SUMMARY_MODEL)
    summaries = []
    for c in chunks:
        input_ids = tokenizer.encode(c,
                                     truncation=True,
                                     max_length=MAX_CHUNK_LENGTH,
                                     padding="max_length",
                                     return_tensors='pt')
        summary_ids = model.generate(
            input_ids,
            num_beams=BEAM_SIZE,
            max_length=56,
            early_stopping=True,
            length_penalty=1.0)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    with open("bart-summaries.txt", "a", encoding="utf-8") as file:
        for summary in summaries:
            file.write(summary + "\n\n")

# Approach 2
if index == "2" or index is None:
    print("Performing chunk summary : " + "gpt-neo-1.3B")

    import torch
    from transformers import GPTNeoForCausalLM, GPT2Tokenizer

    model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
    tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    summaries = []

    for c in chunks:
        input_ids = tokenizer.encode(c,
                                     truncation=True,
                                     return_tensors='pt')
        input_length = input_ids.shape[1]
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

        max_summary_length = 100
        max_length = input_length + max_summary_length

        output = model.generate(input_ids,
                                max_length=max_length,
                                attention_mask=attention_mask,
                                pad_token_id=model.config.eos_token_id,
                                num_beams=4,
                                length_penalty=2.0,
                                early_stopping=True)
        summary_ids = output[0, input_length:]
        summary = tokenizer.decode(summary_ids, skip_special_tokens=True)
        summaries.append(summary)
        with open("gptneo1.3B-summaries.txt", "a", encoding="utf-8") as file:
            file.write(summary + "\n\n")

# Approach 3
if index == "3" or index is None:
    print("Performing chunk summary : " + "mpt-7B")

    import torch
    import transformers
    from transformers import AutoTokenizer

    config = transformers.AutoConfig.from_pretrained('mosaicml/mpt-7b',
                                                     trust_remote_code=True)
    config.attn_config['attn_impl'] = 'triton'
    config.max_seq_len = 1024
    config.init_device = "meta"

    model = transformers.AutoModelForCausalLM.from_pretrained(
        'mosaicml/mpt-7b',
        trust_remote_code=True,
        torch_dtype=torch.bfloat16
    )

    tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

    summaries = []
    for c in chunks:
        input_ids = tokenizer.encode(c, return_tensors="pt")
        attention_mask = torch.ones(input_ids.shape, dtype=torch.long)
        output = model.generate(input_ids,
                                max_new_tokens=25,
                                attention_mask=attention_mask,
                                pad_token_id=model.config.eos_token_id,
                                num_return_sequences=1)
        summary = tokenizer.decode(output[0],
                                   skip_special_tokens=True)
        summaries.append(summary)

    with open("mpt-7b-summaries.txt", "a", encoding="utf-8") as file:
        for summary in summaries:
            file.write(summary + "\n\n")
37
server/trials/title_summary/openai_endpoint.py
Normal file
@@ -0,0 +1,37 @@
# Use OpenAI API endpoint to send data to OpenAI
# along with prompts to caption/summarize the conversation

import openai

openai.api_key = ""

# to caption, user prompt used : "caption this conversation"
# max_tokens=20

# to incremental summarize, user prompt used : "summarize this conversation in a few sentences by taking key points"
# max_tokens=300

sample_chunks = [
"You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]

conversation = [
    {"role": "system",
     "content": sample_chunks[1]},
    {"role": "user",
     "content": "summarize this conversation in a few sentences by taking key points"}
]

model = "gpt-3.5-turbo"
response = openai.ChatCompletion.create(model=model,
                                        messages=conversation,
                                        n=1,
                                        max_tokens=300)

# Try fine tuned model
# model = "davinci:ft-personal-2023-07-14-10-43-51"
# response = openai.Completion.create(model=model,
#                                     prompt=sample_chunks[0] + " -> ")

caption = response.choices[0]
print(caption)
33
server/trials/title_summary/pegasus.py
Normal file
@@ -0,0 +1,33 @@
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# Load the Pegasus model and tokenizer
model_name = "google/pegasus-large"
model = PegasusForConditionalGeneration.from_pretrained(model_name)
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]


# Define the input text for summarization
text = sample_chunks[1]

inputs = tokenizer(text, truncation=True, padding="longest", return_tensors="pt").to(device)

# Generate the summary
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=200,
    num_beams=4,
    length_penalty=2.0,
    early_stopping=True,
)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Summary:", summary)
27
server/trials/title_summary/t5.py
Normal file
@@ -0,0 +1,27 @@
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Load the T5 model and tokenizer
model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Set the device to use
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

sample_chunks = ["You all just came off of your incredible Google Cloud next conference where you released a wide variety of functionality and features and new products across artisan television and also across the entire sort of cloud ecosystem . You want to just first by walking through , first start by walking through all the innovations that you sort of released and what you 're excited about when you come to Google Cloud ? Now our vision is super simple . If you look at what smartphones did for a consumer , you know they took a computer and internet browser , a communication device , and a camera , and made it so that it 's in everybody 's pocket , so it really brought computation to every person . We feel that , you know , our , what we 're trying to do is take all the technological innovation that Google 's doing , but make it super simple so that everyone can consume it . And so that includes our global data center footprint , all the new types of hardware and large-scale systems we work on , the software that we 're making available for people to do high-scale computation , tools for data processing , tools for cybersecurity , processing , tools for cyber security , tools for machine learning , but make it so simple that everyone can use it . And every step that we do to simplify things for people , we think adoption can grow . And so that 's a lot of what we 've done these last three , four years , and we made a number of announcements that next in machine learning and AI in particular , you know , we look at our work as four elements , how we take our large-scale compute systems that were building for AI and how we make that available to everybody . Second , what we 're doing with the software stacks and top of it , things like jacks and other things and how we 're making those available to everybody . Third is advances because different people have different levels of expertise . Some people say I need the hardware to build my own large language model or algorithm . Other people say , look , I really need to use a building block . You guys give me . So , 30s we 've done a lot with AutoML and we announce new capability for image , video , and translation to make it available to everybody . And then lastly , we 're also building completely packaged solutions for some areas and we announce some new stuff . ",
" We 're joined next by Thomas Curian , CEO of Google Cloud , and Alexander Wang , CEO and founder of Scale AI . Thomas joined Google in November 2018 as the CEO of Google Cloud . Prior to Google , Thomas spent 22 years at Oracle , where most recently he was president of product development . Before that , Thomas worked at McKinsey as a business analyst and engagement manager . His nearly 30 years of experience have given him a deep knowledge of engineering enterprise relationships and leadership of large organizations . Thomas 's degrees include an MBA in administration and management from Stanford University , as an RJ Miller scholar and a BSEE in electrical engineering and computer science from Princeton University , where he graduated suma cum laude . Thomas serves as a member of the Stanford graduate School of Business Advisory Council and Princeton University School of Engineering Advisory Council . Please welcome to the stage , Thomas Curian and Alexander Wang . This is a super exciting conversation . Thanks for being here , Thomas ."]

# Define the input text for summarization
text = "Summarize the following text in 3 key points. text : " + sample_chunks[1]

# Tokenize the input text
inputs = tokenizer.encode(text, return_tensors="pt").to(device)

# Generate the summary
summary_ids = model.generate(inputs, max_length=1000, num_beams=4, early_stopping=True)

# Decode and print the summary
summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
print("Summary:", summary)
1
server/trials/title_summary/transcript.txt
Normal file
File diff suppressed because one or more lines are too long
44
server/trials/title_summary/vicuna.py
Normal file
@@ -0,0 +1,44 @@
from gpt4all import GPT4All

model = GPT4All("/Users/gokulmohanarangan/Library/Application Support/nomic.ai/GPT4All/ggml-vicuna-13b-1.1-q4_2.bin")

import spacy


def split_text_file(filename, token_count):
    nlp = spacy.load('en_core_web_md')

    with open(filename, 'r') as file:
        text = file.read()

    doc = nlp(text)
    total_tokens = len(doc)

    parts = []
    start_index = 0

    while start_index < total_tokens:
        end_index = start_index + token_count
        part_tokens = doc[start_index:end_index]
        part = ' '.join(token.text for token in part_tokens)
        parts.append(part)
        start_index = end_index

    return parts


parts = split_text_file("transcript.txt", 1800)
final_summary = []
for part in parts:
    prompt = f"""
### Human:
Summarize the following text without missing any key points and action items.

{part}
### Assistant:
"""
    output = model.generate(prompt)
    final_summary.append(output)


with open("sum.txt", "w") as sum:
    sum.write(" ".join(final_summary))
0
server/trials/whisper-jax/__init__.py
Normal file
183
server/trials/whisper-jax/whisjax.py
Normal file
@@ -0,0 +1,183 @@
#!/usr/bin/env python3

# summarize https://www.youtube.com/watch?v=imzTxoEDH_g
# summarize https://www.sprocket.org/video/cheesemaking.mp4 summary.txt
# summarize podcast.mp3 summary.txt

import argparse
import os
import re
import subprocess
import tempfile
from datetime import datetime
from urllib.parse import urlparse

import jax.numpy as jnp
import moviepy.editor
import nltk
import yt_dlp as youtube_dl
from whisper_jax import FlaxWhisperPipline

from ...utils.file_utils import download_files, upload_files
from ...utils.log_utils import LOGGER
from ...utils.run_utils import CONFIG
from ...utils.text_utils import post_process_transcription, summarize
from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

WHISPER_MODEL_SIZE = CONFIG['WHISPER']["WHISPER_MODEL_SIZE"]
NOW = datetime.now()

if not os.path.exists('../../artefacts'):
    os.makedirs('../../artefacts')


def init_argparse() -> argparse.ArgumentParser:
    """
    Parse the CLI arguments
    :return: parser object
    """
    parser = argparse.ArgumentParser(
        usage="%(prog)s [OPTIONS] <LOCATION> <OUTPUT>",
        description="Creates a transcript of a video or audio file, then"
                    " summarizes it using ChatGPT."
    )

    parser.add_argument("-l", "--language",
                        help="Language that the summary should be written in",
                        type=str,
                        default="english",
                        choices=['english', 'spanish', 'french', 'german',
                                 'romanian'])
    parser.add_argument("location")
    return parser


def main():
    parser = init_argparse()
    args = parser.parse_args()

    # Parse the location string that was given to us, and figure out if it's a
    # local file (audio or video), a YouTube URL, or a URL referencing an
    # audio or video file.
    url = urlparse(args.location)

    # S3 : Pull artefacts to S3 bucket ?

    media_file = ""
    if url.scheme == 'http' or url.scheme == 'https':
        # Check if we're being asked to retrieve a YouTube URL, which is
        # handled differently, as we'll use a secondary site to download
        # the video first.
        if re.search('youtube.com', url.netloc, re.IGNORECASE):
            # Download the lowest resolution YouTube video
            # (since we're just interested in the audio).
            # It will be saved to the current directory.
            LOGGER.info("Downloading YouTube video at url: " + args.location)

            # Create options for the download
            ydl_opts = {
                'format': 'bestaudio/best',
                'postprocessors': [{
                    'key': 'FFmpegExtractAudio',
                    'preferredcodec': 'mp3',
                    'preferredquality': '192',
                }],
                'outtmpl': './artefacts/audio',  # Specify output file path and name
            }

            # Download the audio
            with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                ydl.download([args.location])
            media_file = "../artefacts/audio.mp3"

            LOGGER.info("Saved downloaded YouTube video to: " + media_file)
        else:
            # XXX - Download file using urllib, check if file is
            # audio/video using python-magic
            LOGGER.info(f"Downloading file at url: {args.location}")
            LOGGER.info(" XXX - This method hasn't been implemented yet.")
    elif url.scheme == '':
        media_file = url.path
        # If file is not present locally, take it from S3 bucket
        if not os.path.exists(media_file):
            download_files([media_file])

        if media_file.endswith(".m4a"):
            subprocess.run(["ffmpeg", "-i", media_file, f"./artefacts/{media_file}.mp4"])
            media_file = f"./artefacts/{media_file}.mp4"
    else:
        print("Unsupported URL scheme: " + url.scheme)
        quit()

    # Handle video
    if not media_file.endswith(".mp3"):
        try:
            video = moviepy.editor.VideoFileClip(media_file)
            audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3",
                                                         delete=False).name
            video.audio.write_audiofile(audio_filename, logger=None)
            LOGGER.info(f"Extracting audio to: {audio_filename}")
        # Handle audio only file
        except Exception:
            audio = moviepy.editor.AudioFileClip(media_file)
            audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3",
                                                         delete=False).name
            audio.write_audiofile(audio_filename, logger=None)
    else:
        audio_filename = media_file

    LOGGER.info("Finished extracting audio")
    LOGGER.info("Transcribing")
    # Convert the audio to text using the OpenAI Whisper model
    pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
                                  dtype=jnp.float16,
                                  batch_size=16)
    whisper_result = pipeline(audio_filename, return_timestamps=True)
    LOGGER.info("Finished transcribing file")

    whisper_result = post_process_transcription(whisper_result)

    transcript_text = ""
    for chunk in whisper_result["chunks"]:
        transcript_text += chunk["text"]

    with open("./artefacts/transcript_" + NOW.strftime("%m-%d-%Y_%H:%M:%S") +
              ".txt", "w") as transcript_file:
        transcript_file.write(transcript_text)

    with open("./artefacts/transcript_with_timestamp_" +
              NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt",
              "w") as transcript_file_timestamps:
        transcript_file_timestamps.write(str(whisper_result))

    LOGGER.info("Creating word cloud")
    create_wordcloud(NOW)

    LOGGER.info("Performing talk-diff and talk-diff visualization")
    create_talk_diff_scatter_viz(NOW)

    # S3 : Push artefacts to S3 bucket
    prefix = "./artefacts/"
    suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")
    files_to_upload = [prefix + "transcript_" + suffix + ".txt",
                       prefix + "transcript_with_timestamp_" + suffix + ".txt",
                       prefix + "df_" + suffix + ".pkl",
                       prefix + "wordcloud_" + suffix + ".png",
                       prefix + "mappings_" + suffix + ".pkl",
                       prefix + "scatter_" + suffix + ".html"]
    upload_files(files_to_upload)

    summarize(transcript_text, NOW, False, False)

    LOGGER.info("Summarization completed")

    # Summarization takes a lot of time, so do this separately at the end
    files_to_upload = [prefix + "summary_" + suffix + ".txt"]
    upload_files(files_to_upload)


if __name__ == "__main__":
    main()
151
server/trials/whisper-jax/whisjax_realtime.py
Normal file
@@ -0,0 +1,151 @@
#!/usr/bin/env python3

import time
import wave
from datetime import datetime

import jax.numpy as jnp
import pyaudio
from pynput import keyboard
from termcolor import colored
from whisper_jax import FlaxWhisperPipline

from ...utils.file_utils import upload_files
from ...utils.log_utils import LOGGER
from ...utils.run_utils import CONFIG
from ...utils.text_utils import post_process_transcription, summarize
from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud

WHISPER_MODEL_SIZE = CONFIG['WHISPER']["WHISPER_MODEL_SIZE"]

FRAMES_PER_BUFFER = 8000
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 96000
RECORD_SECONDS = 15
NOW = datetime.now()


def main():
    p = pyaudio.PyAudio()
    AUDIO_DEVICE_ID = -1
    for i in range(p.get_device_count()):
        if p.get_device_info_by_index(i)["name"] == \
                CONFIG["AUDIO"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"]:
            AUDIO_DEVICE_ID = i
    audio_devices = p.get_device_info_by_index(AUDIO_DEVICE_ID)
    stream = p.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=FRAMES_PER_BUFFER,
        input_device_index=int(audio_devices['index'])
    )

    pipeline = FlaxWhisperPipline("openai/whisper-" +
                                  CONFIG["WHISPER"]["WHISPER_REAL_TIME_MODEL_SIZE"],
                                  dtype=jnp.float16,
                                  batch_size=16)

    transcription = ""

    TEMP_AUDIO_FILE = "temp_audio.wav"
    global proceed
    proceed = True

    def on_press(key):
        if key == keyboard.Key.esc:
            global proceed
            proceed = False

    transcript_with_timestamp = {"text": "", "chunks": []}
    last_transcribed_time = 0.0

    listener = keyboard.Listener(on_press=on_press)
    listener.start()
    print("Attempting real-time transcription.. Listening...")

    try:
        while proceed:
            frames = []
            start_time = time.time()
            for i in range(0, int(RATE / FRAMES_PER_BUFFER * RECORD_SECONDS)):
                data = stream.read(FRAMES_PER_BUFFER,
                                   exception_on_overflow=False)
                frames.append(data)
            end_time = time.time()

            wf = wave.open(TEMP_AUDIO_FILE, 'wb')
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(p.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(frames))
            wf.close()

            whisper_result = pipeline(TEMP_AUDIO_FILE, return_timestamps=True)
            timestamp = whisper_result["chunks"][0]["timestamp"]
            start = timestamp[0]
            end = timestamp[1]
            if end is None:
                end = start + 15.0
            duration = end - start
            item = {'timestamp': (last_transcribed_time,
                                  last_transcribed_time + duration),
                    'text': whisper_result['text'],
                    'stats': (str(end_time - start_time), str(duration))
                    }
            last_transcribed_time = last_transcribed_time + duration
            transcript_with_timestamp["chunks"].append(item)
            transcription += whisper_result['text']

            print(colored("<START>", "yellow"))
            print(colored(whisper_result['text'], 'green'))
            print(colored("<END> Recorded duration: " +
                          str(end_time - start_time) +
                          " | Transcribed duration: " +
                          str(duration), "yellow"))

    except Exception as exception:
        print(str(exception))
    finally:
        with open("real_time_transcript_" + NOW.strftime("%m-%d-%Y_%H:%M:%S")
                  + ".txt", "w", encoding="utf-8") as file:
            file.write(transcription)

        with open("real_time_transcript_with_timestamp_" +
                  NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w",
                  encoding="utf-8") as file:
            transcript_with_timestamp["text"] = transcription
            file.write(str(transcript_with_timestamp))

        transcript_with_timestamp = \
            post_process_transcription(transcript_with_timestamp)

        LOGGER.info("Creating word cloud")
        create_wordcloud(NOW, True)

        LOGGER.info("Performing talk-diff and talk-diff visualization")
        create_talk_diff_scatter_viz(NOW, True)

        # S3 : Push artefacts to S3 bucket
        suffix = NOW.strftime("%m-%d-%Y_%H:%M:%S")
        files_to_upload = ["real_time_transcript_" + suffix + ".txt",
                           "real_time_transcript_with_timestamp_" + suffix + ".txt",
                           "real_time_df_" + suffix + ".pkl",
                           "real_time_wordcloud_" + suffix + ".png",
                           "real_time_mappings_" + suffix + ".pkl",
                           "real_time_scatter_" + suffix + ".html"]
        upload_files(files_to_upload)

        summarize(transcript_with_timestamp["text"], NOW, True, True)

        LOGGER.info("Summarization completed")

        # Summarization takes a lot of time, so do this separately at the end
        files_to_upload = ["real_time_summary_" + suffix + ".txt"]
        upload_files(files_to_upload)


if __name__ == "__main__":
    main()
0
server/utils/__init__.py
Normal file
56
server/utils/file_utils.py
Normal file
@@ -0,0 +1,56 @@
"""
Utility file for file handling related functions, including file downloads and
uploads to cloud storage
"""

import sys

import boto3
import botocore

from .log_utils import LOGGER
from .run_utils import CONFIG

BUCKET_NAME = CONFIG["AWS"]["BUCKET_NAME"]

s3 = boto3.client('s3',
                  aws_access_key_id=CONFIG["AWS"]["AWS_ACCESS_KEY"],
                  aws_secret_access_key=CONFIG["AWS"]["AWS_SECRET_KEY"])


def upload_files(files_to_upload):
    """
    Upload a list of files to the configured S3 bucket
    :param files_to_upload: List of files to upload
    :return: None
    """
    for key in files_to_upload:
        LOGGER.info("Uploading file " + key)
        try:
            s3.upload_file(key, BUCKET_NAME, key)
        except botocore.exceptions.ClientError as exception:
            print(exception.response)


def download_files(files_to_download):
    """
    Download a list of files from the configured S3 bucket
    :param files_to_download: List of files to download
    :return: None
    """
    for key in files_to_download:
        LOGGER.info("Downloading file " + key)
        try:
            s3.download_file(BUCKET_NAME, key, key)
        except botocore.exceptions.ClientError as exception:
            if exception.response['Error']['Code'] == "404":
                print("The object does not exist.")
            else:
                raise


if __name__ == "__main__":
    if sys.argv[1] == "download":
        download_files([sys.argv[2]])
    elif sys.argv[1] == "upload":
        upload_files([sys.argv[2]])
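A minimal usage sketch for the two helpers above, assuming the server/utils directory is importable as the utils package and that config.ini provides the AWS keys and bucket name; the file paths are illustrative placeholders, not files from this commit.

# Sketch: round-trip a couple of artefacts through the configured S3 bucket.
# The artefact names below are placeholders.
from utils.file_utils import download_files, upload_files

# Each local path is uploaded under the same string as its S3 key.
upload_files(["./artefacts/transcript_07-01-2023_10:00:00.txt",
              "./artefacts/summary_07-01-2023_10:00:00.txt"])

# Later, fetch an artefact back by the same key; a missing key prints a 404 message.
download_files(["./artefacts/transcript_07-01-2023_10:00:00.txt"])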
43
server/utils/format_output.py
Normal file
@@ -0,0 +1,43 @@
"""
Utility function to format the artefacts created during Reflector run
"""

import json

with open("../artefacts/meeting_titles_and_summaries.txt", "r",
          encoding='utf-8') as f:
    outputs = f.read()

outputs = json.loads(outputs)

transcript_file = open("../artefacts/meeting_transcript.txt",
                       "a",
                       encoding='utf-8')
title_desc_file = open("../artefacts/meeting_title_description.txt",
                       "a",
                       encoding='utf-8')
summary_file = open("../artefacts/meeting_summary.txt",
                    "a",
                    encoding='utf-8')

for item in outputs["topics"]:
    transcript_file.write(item["transcript"])
    summary_file.write(item["description"])

    title_desc_file.write("TITLE: \n")
    title_desc_file.write(item["title"])
    title_desc_file.write("\n")

    title_desc_file.write("DESCRIPTION: \n")
    title_desc_file.write(item["description"])
    title_desc_file.write("\n")

    title_desc_file.write("TRANSCRIPT: \n")
    title_desc_file.write(item["transcript"])
    title_desc_file.write("\n")

    title_desc_file.write("---------------------------------------- \n\n")

transcript_file.close()
title_desc_file.close()
summary_file.close()
26
server/utils/log_utils.py
Normal file
@@ -0,0 +1,26 @@
"""
Utility file for logging
"""

import loguru


class SingletonLogger:
    """
    Use Singleton design pattern to create a logger object and share it
    across the entire project
    """
    __instance = None

    @staticmethod
    def get_logger():
        """
        Create or return the singleton instance for the SingletonLogger class
        :return: SingletonLogger instance
        """
        if not SingletonLogger.__instance:
            SingletonLogger.__instance = loguru.logger
        return SingletonLogger.__instance


LOGGER = SingletonLogger.get_logger()
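A short sketch of how the shared logger above is meant to be consumed from other modules, assuming the server/utils directory is importable as the utils package.

# Sketch: every importer receives the same loguru instance, so sinks and
# formatting only need to be configured once per process.
from utils.log_utils import LOGGER

LOGGER.info("Starting transcription run")
LOGGER.warning("CUDA not available, falling back to CPU inference")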
73
server/utils/run_utils.py
Normal file
@@ -0,0 +1,73 @@
"""
Utility file for server side asynchronous task running and config objects
"""

import asyncio
import configparser
import contextlib
from functools import partial
from threading import Lock
from typing import ContextManager, Generic, TypeVar


class ReflectorConfig:
    """
    Create a single config object to share across the project
    """
    __config = None

    @staticmethod
    def get_config():
        if ReflectorConfig.__config is None:
            ReflectorConfig.__config = configparser.ConfigParser()
            ReflectorConfig.__config.read('utils/config.ini')
        return ReflectorConfig.__config


CONFIG = ReflectorConfig.get_config()


def run_in_executor(func, *args, executor=None, **kwargs):
    """
    Run the function in an executor, unblocking the main loop
    :param func: Function to be run in executor
    :param args: function parameters
    :param executor: executor instance [Thread | Process]
    :param kwargs: Additional parameters
    :return: Future of function result upon completion
    """
    callback = partial(func, *args, **kwargs)
    loop = asyncio.get_event_loop()
    return loop.run_in_executor(executor, callback)


# Generic type template
T = TypeVar("T")


class Mutex(Generic[T]):
    """
    Mutex class to implement lock/release of a shared
    protected variable
    """

    def __init__(self, value: T):
        """
        Create an instance of Mutex wrapper for the given resource
        :param value: Shared resources to be thread protected
        """
        self.__value = value
        self.__lock = Lock()

    @contextlib.contextmanager
    def lock(self) -> ContextManager[T]:
        """
        Lock the resource with a mutex to be used within a context block
        The lock is automatically released on context exit
        :return: Shared resource
        """
        self.__lock.acquire()
        try:
            yield self.__value
        finally:
            self.__lock.release()
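A minimal sketch of the two concurrency helpers above, assuming the server/utils directory is importable as the utils package; the worker function and the shared dict are placeholders for illustration only.

import asyncio
import time

from utils.run_utils import Mutex, run_in_executor

# Mutex-protected shared state: lock() yields the wrapped value and releases
# the lock automatically when the with-block exits.
progress = Mutex({"chunks_done": 0})


def transcribe_chunk(chunk_id):
    # Placeholder for a blocking, CPU/GPU-heavy call.
    time.sleep(0.1)
    with progress.lock() as state:
        state["chunks_done"] += 1
    return chunk_id


async def main():
    # run_in_executor pushes the blocking work onto the default thread pool,
    # keeping the event loop free while the chunks are processed.
    results = await asyncio.gather(*(run_in_executor(transcribe_chunk, i)
                                     for i in range(4)))
    with progress.lock() as state:
        print(results, state["chunks_done"])


asyncio.run(main())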
244
server/utils/text_utils.py
Normal file
@@ -0,0 +1,244 @@
"""
Utility file for all text processing related functionalities
"""

import nltk
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer

from log_utils import LOGGER
from run_utils import CONFIG

nltk.download('punkt', quiet=True)


def preprocess_sentence(sentence):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(sentence.lower())
    tokens = [token for token in tokens
              if token.isalnum() and token not in stop_words]
    return ' '.join(tokens)


def compute_similarity(sent1, sent2):
    """
    Compute the similarity
    """
    tfidf_vectorizer = TfidfVectorizer()
    if sent1 is not None and sent2 is not None:
        tfidf_matrix = tfidf_vectorizer.fit_transform([sent1, sent2])
        return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
    return 0.0


def remove_almost_alike_sentences(sentences, threshold=0.7):
    """
    Filter sentences that are similar beyond a set threshold
    :param sentences:
    :param threshold:
    :return:
    """
    num_sentences = len(sentences)
    removed_indices = set()

    for i in range(num_sentences):
        if i not in removed_indices:
            for j in range(i + 1, num_sentences):
                if j not in removed_indices:
                    l_i = len(sentences[i])
                    l_j = len(sentences[j])
                    if l_i == 0 or l_j == 0:
                        if l_i == 0:
                            removed_indices.add(i)
                        if l_j == 0:
                            removed_indices.add(j)
                    else:
                        sentence1 = preprocess_sentence(sentences[i])
                        sentence2 = preprocess_sentence(sentences[j])
                        if len(sentence1) != 0 and len(sentence2) != 0:
                            similarity = compute_similarity(sentence1,
                                                            sentence2)

                            if similarity >= threshold:
                                removed_indices.add(max(i, j))

    filtered_sentences = [sentences[i] for i in range(num_sentences)
                          if i not in removed_indices]
    return filtered_sentences


def remove_outright_duplicate_sentences_from_chunk(chunk):
    """
    Remove repetitive sentences
    :param chunk:
    :return:
    """
    chunk_text = chunk["text"]
    sentences = nltk.sent_tokenize(chunk_text)
    nonduplicate_sentences = list(dict.fromkeys(sentences))
    return nonduplicate_sentences


def remove_whisper_repetitive_hallucination(nonduplicate_sentences):
    """
    Remove sentences that are repeated as a result of Whisper
    hallucinations
    :param nonduplicate_sentences:
    :return:
    """
    chunk_sentences = []

    for sent in nonduplicate_sentences:
        temp_result = ""
        seen = {}
        words = nltk.word_tokenize(sent)
        n_gram_filter = 3
        for i in range(len(words)):
            if str(words[i:i + n_gram_filter]) in seen and \
                    seen[str(words[i:i + n_gram_filter])] == \
                    words[i + 1:i + n_gram_filter + 2]:
                pass
            else:
                seen[str(words[i:i + n_gram_filter])] = \
                    words[i + 1:i + n_gram_filter + 2]
                temp_result += words[i]
                temp_result += " "
        chunk_sentences.append(temp_result)
    return chunk_sentences


def post_process_transcription(whisper_result):
    """
    Parent function to perform post-processing on the transcription result
    :param whisper_result:
    :return:
    """
    transcript_text = ""
    for chunk in whisper_result["chunks"]:
        nonduplicate_sentences = \
            remove_outright_duplicate_sentences_from_chunk(chunk)
        chunk_sentences = \
            remove_whisper_repetitive_hallucination(nonduplicate_sentences)
        similarity_matched_sentences = \
            remove_almost_alike_sentences(chunk_sentences)
        chunk["text"] = " ".join(similarity_matched_sentences)
        transcript_text += chunk["text"]
    whisper_result["text"] = transcript_text
    return whisper_result


def summarize_chunks(chunks, tokenizer, model):
    """
    Summarize each chunk using a summarizer model
    :param chunks:
    :param tokenizer:
    :param model:
    :return:
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    summaries = []
    for c in chunks:
        input_ids = tokenizer.encode(c, return_tensors='pt')
        input_ids = input_ids.to(device)
        with torch.no_grad():
            summary_ids = \
                model.generate(input_ids,
                               num_beams=int(CONFIG["SUMMARIZER"]["BEAM_SIZE"]),
                               length_penalty=2.0,
                               max_length=int(CONFIG["SUMMARIZER"]["MAX_LENGTH"]),
                               early_stopping=True)
        summary = tokenizer.decode(summary_ids[0],
                                   skip_special_tokens=True)
        summaries.append(summary)
    return summaries


def chunk_text(text,
               max_chunk_length=int(CONFIG["SUMMARIZER"]["MAX_CHUNK_LENGTH"])):
    """
    Split text into smaller chunks.
    :param text: Text to be chunked
    :param max_chunk_length: length of chunk
    :return: chunked texts
    """
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) < max_chunk_length:
            current_chunk += f" {sentence.strip()}"
        else:
            chunks.append(current_chunk.strip())
            current_chunk = f"{sentence.strip()}"
    chunks.append(current_chunk.strip())
    return chunks


def summarize(transcript_text, timestamp,
              real_time=False,
              chunk_summarize=CONFIG["SUMMARIZER"]["SUMMARIZE_USING_CHUNKS"]):
    """
    Summarize the given text either as a whole or as chunks as needed
    :param transcript_text:
    :param timestamp:
    :param real_time:
    :param chunk_summarize:
    :return:
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    summary_model = CONFIG["SUMMARIZER"]["SUMMARY_MODEL"]
    if not summary_model:
        summary_model = "facebook/bart-large-cnn"

    # Summarize the generated transcript using the BART model
    LOGGER.info(f"Loading BART model: {summary_model}")
    tokenizer = BartTokenizer.from_pretrained(summary_model)
    model = BartForConditionalGeneration.from_pretrained(summary_model)
    model = model.to(device)

    output_file = "summary_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
    if real_time:
        output_file = "real_time_" + output_file

    if chunk_summarize != "YES":
        max_length = int(CONFIG["SUMMARIZER"]["INPUT_ENCODING_MAX_LENGTH"])
        inputs = tokenizer. \
            batch_encode_plus([transcript_text], truncation=True,
                              padding='longest',
                              max_length=max_length,
                              return_tensors='pt')
        inputs = inputs.to(device)

        with torch.no_grad():
            num_beams = int(CONFIG["SUMMARIZER"]["BEAM_SIZE"])
            max_length = int(CONFIG["SUMMARIZER"]["MAX_LENGTH"])
            summaries = model.generate(inputs['input_ids'],
                                       num_beams=num_beams,
                                       length_penalty=2.0,
                                       max_length=max_length,
                                       early_stopping=True)

        decoded_summaries = \
            [tokenizer.decode(summary,
                              skip_special_tokens=True,
                              clean_up_tokenization_spaces=False)
             for summary in summaries]
        summary = " ".join(decoded_summaries)
        with open("./artefacts/" + output_file, 'w', encoding="utf-8") as file:
            file.write(summary.strip() + "\n")
    else:
        LOGGER.info("Breaking transcript into smaller chunks")
        chunks = chunk_text(transcript_text)

        LOGGER.info(f"Transcript broken into {len(chunks)} "
                    f"chunks of at most 500 words")

        LOGGER.info(f"Writing summary text to: {output_file}")
        with open(output_file, 'w') as f:
            summaries = summarize_chunks(chunks, tokenizer, model)
            for summary in summaries:
                f.write(summary.strip() + " ")
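A small sketch of how the transcription post-processing and chunking helpers above fit together, assuming the server/utils directory is on the import path and a config.ini with the SUMMARIZER settings is present (chunk_text reads MAX_CHUNK_LENGTH at import time); the Whisper-style result dict below is a placeholder.

# Sketch: clean a Whisper-style result, then split the text for summarization.
from text_utils import chunk_text, post_process_transcription

# Whisper-style output: timestamped chunks plus the full text.
whisper_result = {
    "text": "",
    "chunks": [
        {"timestamp": (0.0, 15.0),
         "text": "Welcome everyone. Welcome everyone. Today we cover the roadmap."},
    ],
}

# Removes duplicate sentences and repeated n-grams per chunk,
# then rebuilds whisper_result["text"] from the cleaned chunks.
cleaned = post_process_transcription(whisper_result)
print(cleaned["text"])

# Split the cleaned transcript into pieces sized for the BART summarizer.
for piece in chunk_text(cleaned["text"], max_chunk_length=500):
    print(len(piece), piece[:60])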
217
server/utils/viz_utils.py
Normal file
@@ -0,0 +1,217 @@
"""
Utility file for all visualization related functions
"""

import ast
import collections
import os
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import scattertext as st
import spacy
from nltk.corpus import stopwords
from wordcloud import STOPWORDS, WordCloud

en = spacy.load('en_core_web_md')
spacy_stopwords = en.Defaults.stop_words

STOPWORDS = set(STOPWORDS).union(set(stopwords.words("english"))). \
    union(set(spacy_stopwords))


def create_wordcloud(timestamp, real_time=False):
    """
    Create a basic word cloud visualization of transcribed text
    :return: None. The wordcloud image is saved locally
    """
    filename = "transcript"
    if real_time:
        filename = "real_time_" + filename + "_" + \
                   timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
    else:
        filename += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"

    with open("./artefacts/" + filename, "r") as f:
        transcription_text = f.read()

    # python_mask = np.array(PIL.Image.open("download1.png"))

    wordcloud = WordCloud(height=800, width=800,
                          background_color='white',
                          stopwords=STOPWORDS,
                          min_font_size=8).generate(transcription_text)

    # Plot wordcloud and save image
    plt.figure(facecolor=None)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)

    wordcloud = "wordcloud"
    if real_time:
        wordcloud = "real_time_" + wordcloud + "_" + \
                    timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"
    else:
        wordcloud += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".png"

    plt.savefig("./artefacts/" + wordcloud)


def create_talk_diff_scatter_viz(timestamp, real_time=False):
    """
    Perform agenda vs transcription diff to see covered topics.
    Create a scatter plot of words in topics.
    :return: None. Saved locally.
    """
    spacy_model = "en_core_web_md"
    nlp = spacy.load(spacy_model)
    nlp.add_pipe('sentencizer')

    agenda_topics = []
    agenda = []
    # Load the agenda
    with open(os.path.join(os.getcwd(), "agenda-headers.txt"), "r") as f:
        for line in f.readlines():
            if line.strip():
                agenda.append(line.strip())
                agenda_topics.append(line.split(":")[0])

    # Load the transcription with timestamp
    if real_time:
        filename = "./artefacts/real_time_transcript_with_timestamp_" + \
                   timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
    else:
        filename = "./artefacts/transcript_with_timestamp_" + \
                   timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".txt"
    with open(filename) as file:
        transcription_timestamp_text = file.read()

    res = ast.literal_eval(transcription_timestamp_text)
    chunks = res["chunks"]

    # create df for processing
    df = pd.DataFrame.from_dict(res["chunks"])

    covered_items = {}
    # ts: timestamp
    # Map each timestamped chunk with top1 and top2 matched agenda
    ts_to_topic_mapping_top_1 = {}
    ts_to_topic_mapping_top_2 = {}

    # Also create a mapping of the different timestamps
    # in which each topic was covered
    topic_to_ts_mapping_top_1 = collections.defaultdict(list)
    topic_to_ts_mapping_top_2 = collections.defaultdict(list)

    similarity_threshold = 0.7

    for c in chunks:
        doc_transcription = nlp(c["text"])
        topic_similarities = []
        for item in range(len(agenda)):
            item_doc = nlp(agenda[item])
            # if not doc_transcription or not all
            # (token.has_vector for token in doc_transcription):
            if not doc_transcription:
                continue
            similarity = doc_transcription.similarity(item_doc)
            topic_similarities.append((item, similarity))
        topic_similarities.sort(key=lambda x: x[1], reverse=True)
        for i in range(2):
            if topic_similarities[i][1] >= similarity_threshold:
                covered_items[agenda[topic_similarities[i][0]]] = True
                # top1 match
                if i == 0:
                    ts_to_topic_mapping_top_1[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
                    topic_to_ts_mapping_top_1[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])
                # top2 match
                else:
                    ts_to_topic_mapping_top_2[c["timestamp"]] = agenda_topics[topic_similarities[i][0]]
                    topic_to_ts_mapping_top_2[agenda_topics[topic_similarities[i][0]]].append(c["timestamp"])

    def create_new_columns(record):
        """
        Accumulate the mapping information into the df
        :param record:
        :return:
        """
        record["ts_to_topic_mapping_top_1"] = \
            ts_to_topic_mapping_top_1[record["timestamp"]]
        record["ts_to_topic_mapping_top_2"] = \
            ts_to_topic_mapping_top_2[record["timestamp"]]
        return record

    df = df.apply(create_new_columns, axis=1)

    # Count the number of items covered and calculate the percentage
    num_covered_items = sum(covered_items.values())
    percentage_covered = num_covered_items / len(agenda) * 100

    # Print the results
    print("💬 Agenda items covered in the transcription:")
    for item in agenda:
        if item in covered_items and covered_items[item]:
            print("✅ ", item)
        else:
            print("❌ ", item)
    print("📊 Coverage: {:.2f}%".format(percentage_covered))

    # Save df, mappings for further experimentation
    df_name = "df"
    if real_time:
        df_name = "real_time_" + df_name + "_" + \
                  timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
    else:
        df_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
    df.to_pickle("./artefacts/" + df_name)

    my_mappings = [ts_to_topic_mapping_top_1, ts_to_topic_mapping_top_2,
                   topic_to_ts_mapping_top_1, topic_to_ts_mapping_top_2]

    mappings_name = "mappings"
    if real_time:
        mappings_name = "real_time_" + mappings_name + "_" + \
                        timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
    else:
        mappings_name += "_" + timestamp.strftime("%m-%d-%Y_%H:%M:%S") + ".pkl"
    pickle.dump(my_mappings, open("./artefacts/" + mappings_name, "wb"))

    # to load, my_mappings = pickle.load( open ("mappings.pkl", "rb") )

    # pick the 2 most matched topic to be used for plotting
    topic_times = collections.defaultdict(int)
    for key in ts_to_topic_mapping_top_1.keys():
        if key[0] is None or key[1] is None:
            continue
        duration = key[1] - key[0]
        topic_times[ts_to_topic_mapping_top_1[key]] += duration

    topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True)

    if len(topic_times) > 1:
        cat_1 = topic_times[0][0]
        cat_1_name = topic_times[0][0]
        cat_2_name = topic_times[1][0]

        # Scatter plot of topics
        df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
        corpus = st.CorpusFromParsedDocuments(
            df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'
        ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
        html = st.produce_scattertext_explorer(
            corpus,
            category=cat_1,
            category_name=cat_1_name,
            not_category_name=cat_2_name,
            minimum_term_frequency=0, pmi_threshold_coefficient=0,
            width_in_pixels=1000,
            transform=st.Scalers.dense_rank
        )
        if real_time:
            open('./artefacts/real_time_scatter_' +
                 timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)
        else:
            open('./artefacts/scatter_' +
                 timestamp.strftime("%m-%d-%Y_%H:%M:%S") + '.html', 'w').write(html)