mirror of
https://github.com/Monadical-SAS/reflector.git
synced 2025-12-22 13:19:05 +00:00
flake8 / pylint updates
This commit is contained in:
@@ -93,6 +93,6 @@ def generate_finetuning_dataset(video_ids):
|
||||
video_ids = ["yTnSEZIwnkU"]
|
||||
dataset = generate_finetuning_dataset(video_ids)
|
||||
|
||||
with open("finetuning_dataset.jsonl", "w") as f:
|
||||
with open("finetuning_dataset.jsonl", "w", encoding="utf-8") as file:
|
||||
for example in dataset:
|
||||
f.write(json.dumps(example) + "\n")
|
||||
file.write(json.dumps(example) + "\n")
|
||||
|
||||
@@ -16,10 +16,10 @@ from av import AudioFifo
|
||||
from sortedcontainers import SortedDict
|
||||
from whisper_jax import FlaxWhisperPipline
|
||||
|
||||
from reflector.utils.log_utils import logger
|
||||
from reflector.utils.run_utils import config, Mutex
|
||||
from reflector.utils.log_utils import LOGGER
|
||||
from reflector.utils.run_utils import CONFIG, Mutex
|
||||
|
||||
WHISPER_MODEL_SIZE = config['WHISPER']["WHISPER_REAL_TIME_MODEL_SIZE"]
|
||||
WHISPER_MODEL_SIZE = CONFIG['WHISPER']["WHISPER_REAL_TIME_MODEL_SIZE"]
|
||||
pcs = set()
|
||||
relay = MediaRelay()
|
||||
data_channel = None
|
||||
@@ -127,7 +127,7 @@ async def offer(request: requests.Request):
|
||||
pcs.add(pc)
|
||||
|
||||
def log_info(msg: str, *args):
|
||||
logger.info(pc_id + " " + msg, *args)
|
||||
LOGGER.info(pc_id + " " + msg, *args)
|
||||
|
||||
log_info("Created for " + request.remote)
|
||||
|
||||
|
||||
@@ -3,14 +3,14 @@ import sys
|
||||
|
||||
|
||||
# Observe the incremental summaries by performing summaries in chunks
|
||||
with open("transcript.txt") as f:
|
||||
transcription = f.read()
|
||||
with open("transcript.txt", "r", encoding="utf-8") as file:
|
||||
transcription = file.read()
|
||||
|
||||
|
||||
def split_text_file(filename, token_count):
|
||||
nlp = spacy.load('en_core_web_md')
|
||||
|
||||
with open(filename, 'r') as file:
|
||||
with open(filename, 'r', encoding="utf-8") as file:
|
||||
text = file.read()
|
||||
|
||||
doc = nlp(text)
|
||||
@@ -36,9 +36,9 @@ chunks = split_text_file("transcript.txt", MAX_CHUNK_LENGTH)
|
||||
print("Number of chunks", len(chunks))
|
||||
|
||||
# Write chunks to file to refer to input vs output, separated by blank lines
|
||||
with open("chunks" + str(MAX_CHUNK_LENGTH) + ".txt", "a") as f:
|
||||
with open("chunks" + str(MAX_CHUNK_LENGTH) + ".txt", "a", encoding="utf-8") as file:
|
||||
for c in chunks:
|
||||
f.write(c + "\n\n")
|
||||
file.write(c + "\n\n")
|
||||
|
||||
# If we want to run only a certain model, type the option while running
|
||||
# ex. python incsum.py 1 => will run approach 1
|
||||
@@ -78,9 +78,9 @@ if index == "1" or index is None:
|
||||
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
|
||||
summaries.append(summary)
|
||||
|
||||
with open("bart-summaries.txt", "a") as f:
|
||||
with open("bart-summaries.txt", "a", encoding="utf-8") as file:
|
||||
for summary in summaries:
|
||||
f.write(summary + "\n\n")
|
||||
file.write(summary + "\n\n")
|
||||
|
||||
# Approach 2
|
||||
if index == "2" or index is None:
|
||||
@@ -114,8 +114,8 @@ if index == "2" or index is None:
|
||||
summary_ids = output[0, input_length:]
|
||||
summary = tokenizer.decode(summary_ids, skip_special_tokens=True)
|
||||
summaries.append(summary)
|
||||
with open("gptneo1.3B-summaries.txt", "a") as f:
|
||||
f.write(summary + "\n\n")
|
||||
with open("gptneo1.3B-summaries.txt", "a", encoding="utf-8") as file:
|
||||
file.write(summary + "\n\n")
|
||||
|
||||
# Approach 3
|
||||
if index == "3" or index is None:
|
||||
@@ -152,6 +152,6 @@ if index == "3" or index is None:
|
||||
skip_special_tokens=True)
|
||||
summaries.append(summary)
|
||||
|
||||
with open("mpt-7b-summaries.txt", "a") as f:
|
||||
with open("mpt-7b-summaries.txt", "a", encoding="utf-8") as file:
|
||||
for summary in summaries:
|
||||
f.write(summary + "\n\n")
|
||||
file.write(summary + "\n\n")
|
||||
|
||||
@@ -19,15 +19,15 @@ import yt_dlp as youtube_dl
|
||||
from whisper_jax import FlaxWhisperPipline
|
||||
|
||||
from ...utils.file_utils import download_files, upload_files
|
||||
from ...utils.log_utils import logger
|
||||
from ...utils.run_utils import config
|
||||
from ...utils.log_utils import LOGGER
|
||||
from ...utils.run_utils import CONFIG
|
||||
from ...utils.text_utils import post_process_transcription, summarize
|
||||
from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud
|
||||
|
||||
nltk.download('punkt', quiet=True)
|
||||
nltk.download('stopwords', quiet=True)
|
||||
|
||||
WHISPER_MODEL_SIZE = config['WHISPER']["WHISPER_MODEL_SIZE"]
|
||||
WHISPER_MODEL_SIZE = CONFIG['WHISPER']["WHISPER_MODEL_SIZE"]
|
||||
NOW = datetime.now()
|
||||
|
||||
if not os.path.exists('../../artefacts'):
|
||||
@@ -75,7 +75,7 @@ def main():
|
||||
# Download the lowest resolution YouTube video
|
||||
# (since we're just interested in the audio).
|
||||
# It will be saved to the current directory.
|
||||
logger.info("Downloading YouTube video at url: " + args.location)
|
||||
LOGGER.info("Downloading YouTube video at url: " + args.location)
|
||||
|
||||
# Create options for the download
|
||||
ydl_opts = {
|
||||
@@ -93,12 +93,12 @@ def main():
|
||||
ydl.download([args.location])
|
||||
media_file = "../artefacts/audio.mp3"
|
||||
|
||||
logger.info("Saved downloaded YouTube video to: " + media_file)
|
||||
LOGGER.info("Saved downloaded YouTube video to: " + media_file)
|
||||
else:
|
||||
# XXX - Download file using urllib, check if file is
|
||||
# audio/video using python-magic
|
||||
logger.info(f"Downloading file at url: {args.location}")
|
||||
logger.info(" XXX - This method hasn't been implemented yet.")
|
||||
LOGGER.info(f"Downloading file at url: {args.location}")
|
||||
LOGGER.info(" XXX - This method hasn't been implemented yet.")
|
||||
elif url.scheme == '':
|
||||
media_file = url.path
|
||||
# If file is not present locally, take it from S3 bucket
|
||||
@@ -119,7 +119,7 @@ def main():
|
||||
audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3",
|
||||
delete=False).name
|
||||
video.audio.write_audiofile(audio_filename, logger=None)
|
||||
logger.info(f"Extracting audio to: {audio_filename}")
|
||||
LOGGER.info(f"Extracting audio to: {audio_filename}")
|
||||
# Handle audio only file
|
||||
except Exception:
|
||||
audio = moviepy.editor.AudioFileClip(media_file)
|
||||
@@ -129,14 +129,14 @@ def main():
|
||||
else:
|
||||
audio_filename = media_file
|
||||
|
||||
logger.info("Finished extracting audio")
|
||||
logger.info("Transcribing")
|
||||
LOGGER.info("Finished extracting audio")
|
||||
LOGGER.info("Transcribing")
|
||||
# Convert the audio to text using the OpenAI Whisper model
|
||||
pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
|
||||
dtype=jnp.float16,
|
||||
batch_size=16)
|
||||
whisper_result = pipeline(audio_filename, return_timestamps=True)
|
||||
logger.info("Finished transcribing file")
|
||||
LOGGER.info("Finished transcribing file")
|
||||
|
||||
whisper_result = post_process_transcription(whisper_result)
|
||||
|
||||
@@ -153,10 +153,10 @@ def main():
|
||||
"w") as transcript_file_timestamps:
|
||||
transcript_file_timestamps.write(str(whisper_result))
|
||||
|
||||
logger.info("Creating word cloud")
|
||||
LOGGER.info("Creating word cloud")
|
||||
create_wordcloud(NOW)
|
||||
|
||||
logger.info("Performing talk-diff and talk-diff visualization")
|
||||
LOGGER.info("Performing talk-diff and talk-diff visualization")
|
||||
create_talk_diff_scatter_viz(NOW)
|
||||
|
||||
# S3 : Push artefacts to S3 bucket
|
||||
@@ -172,7 +172,7 @@ def main():
|
||||
|
||||
summarize(transcript_text, NOW, False, False)
|
||||
|
||||
logger.info("Summarization completed")
|
||||
LOGGER.info("Summarization completed")
|
||||
|
||||
# Summarization takes a lot of time, so do this separately at the end
|
||||
files_to_upload = [prefix + "summary_" + suffix + ".txt"]
|
||||
|
||||
@@ -11,12 +11,12 @@ from termcolor import colored
|
||||
from whisper_jax import FlaxWhisperPipline
|
||||
|
||||
from ...utils.file_utils import upload_files
|
||||
from ...utils.log_utils import logger
|
||||
from ...utils.run_utils import config
|
||||
from ...utils.log_utils import LOGGER
|
||||
from ...utils.run_utils import CONFIG
|
||||
from ...utils.text_utils import post_process_transcription, summarize
|
||||
from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud
|
||||
|
||||
WHISPER_MODEL_SIZE = config['WHISPER']["WHISPER_MODEL_SIZE"]
|
||||
WHISPER_MODEL_SIZE = CONFIG['WHISPER']["WHISPER_MODEL_SIZE"]
|
||||
|
||||
FRAMES_PER_BUFFER = 8000
|
||||
FORMAT = pyaudio.paInt16
|
||||
@@ -31,7 +31,7 @@ def main():
|
||||
AUDIO_DEVICE_ID = -1
|
||||
for i in range(p.get_device_count()):
|
||||
if p.get_device_info_by_index(i)["name"] == \
|
||||
config["AUDIO"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"]:
|
||||
CONFIG["AUDIO"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"]:
|
||||
AUDIO_DEVICE_ID = i
|
||||
audio_devices = p.get_device_info_by_index(AUDIO_DEVICE_ID)
|
||||
stream = p.open(
|
||||
@@ -44,7 +44,7 @@ def main():
|
||||
)
|
||||
|
||||
pipeline = FlaxWhisperPipline("openai/whisper-" +
|
||||
config["WHISPER"]["WHISPER_REAL_TIME_MODEL_SIZE"],
|
||||
CONFIG["WHISPER"]["WHISPER_REAL_TIME_MODEL_SIZE"],
|
||||
dtype=jnp.float16,
|
||||
batch_size=16)
|
||||
|
||||
@@ -106,23 +106,26 @@ def main():
|
||||
" | Transcribed duration: " +
|
||||
str(duration), "yellow"))
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
except Exception as exception:
|
||||
print(str(exception))
|
||||
finally:
|
||||
with open("real_time_transcript_" +
|
||||
NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w") as f:
|
||||
f.write(transcription)
|
||||
with open("real_time_transcript_" + NOW.strftime("%m-%d-%Y_%H:%M:%S")
|
||||
+ ".txt", "w", encoding="utf-8") as file:
|
||||
file.write(transcription)
|
||||
|
||||
with open("real_time_transcript_with_timestamp_" +
|
||||
NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w") as f:
|
||||
NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w",
|
||||
encoding="utf-8") as file:
|
||||
transcript_with_timestamp["text"] = transcription
|
||||
f.write(str(transcript_with_timestamp))
|
||||
file.write(str(transcript_with_timestamp))
|
||||
|
||||
transcript_with_timestamp = post_process_transcription(transcript_with_timestamp)
|
||||
transcript_with_timestamp = \
|
||||
post_process_transcription(transcript_with_timestamp)
|
||||
|
||||
logger.info("Creating word cloud")
|
||||
LOGGER.info("Creating word cloud")
|
||||
create_wordcloud(NOW, True)
|
||||
|
||||
logger.info("Performing talk-diff and talk-diff visualization")
|
||||
LOGGER.info("Performing talk-diff and talk-diff visualization")
|
||||
create_talk_diff_scatter_viz(NOW, True)
|
||||
|
||||
# S3 : Push artefacts to S3 bucket
|
||||
@@ -137,7 +140,7 @@ def main():
|
||||
|
||||
summarize(transcript_with_timestamp["text"], NOW, True, True)
|
||||
|
||||
logger.info("Summarization completed")
|
||||
LOGGER.info("Summarization completed")
|
||||
|
||||
# Summarization takes a lot of time, so do this separately at the end
|
||||
files_to_upload = ["real_time_summary_" + suffix + ".txt"]
|
||||
|
||||
Reference in New Issue
Block a user