flake8 / pylint updates

2026-04-16 18:26:54 +00:00 · 2023-07-26 11:28:14 +05:30
parent c970fc89dd
commit e512b4dca5
15 changed files with 279 additions and 146 deletions
--- a/trials/finetuning/youtube_scraping.py
+++ b/trials/finetuning/youtube_scraping.py
@@ -93,6 +93,6 @@ def generate_finetuning_dataset(video_ids):
 video_ids = ["yTnSEZIwnkU"]
 dataset = generate_finetuning_dataset(video_ids)

-with open("finetuning_dataset.jsonl", "w") as f:
+with open("finetuning_dataset.jsonl", "w", encoding="utf-8") as file:
    for example in dataset:
-        f.write(json.dumps(example) + "\n")
+        file.write(json.dumps(example) + "\n")
--- a/trials/server/server_multithreaded.py
+++ b/trials/server/server_multithreaded.py
@@ -16,10 +16,10 @@ from av import AudioFifo
 from sortedcontainers import SortedDict
 from whisper_jax import FlaxWhisperPipline

-from reflector.utils.log_utils import logger
-from reflector.utils.run_utils import config, Mutex
+from reflector.utils.log_utils import LOGGER
+from reflector.utils.run_utils import CONFIG, Mutex

-WHISPER_MODEL_SIZE = config['WHISPER']["WHISPER_REAL_TIME_MODEL_SIZE"]
+WHISPER_MODEL_SIZE = CONFIG['WHISPER']["WHISPER_REAL_TIME_MODEL_SIZE"]
 pcs = set()
 relay = MediaRelay()
 data_channel = None
@@ -127,7 +127,7 @@ async def offer(request: requests.Request):
    pcs.add(pc)

    def log_info(msg: str, *args):
-        logger.info(pc_id + " " + msg, *args)
+        LOGGER.info(pc_id + " " + msg, *args)

    log_info("Created for " + request.remote)

--- a/trials/title_summary/incsum.py
+++ b/trials/title_summary/incsum.py
@@ -3,14 +3,14 @@ import sys


 # Observe the incremental summaries by performing summaries in chunks
-with open("transcript.txt") as f:
-    transcription = f.read()
+with open("transcript.txt", "r", encoding="utf-8") as file:
+    transcription = file.read()


 def split_text_file(filename, token_count):
    nlp = spacy.load('en_core_web_md')

-    with open(filename, 'r') as file:
+    with open(filename, 'r', encoding="utf-8") as file:
        text = file.read()

    doc = nlp(text)
@@ -36,9 +36,9 @@ chunks = split_text_file("transcript.txt", MAX_CHUNK_LENGTH)
 print("Number of chunks", len(chunks))

 # Write chunks to file to refer to input vs output, separated by blank lines
-with open("chunks" + str(MAX_CHUNK_LENGTH) + ".txt", "a") as f:
+with open("chunks" + str(MAX_CHUNK_LENGTH) + ".txt", "a", encoding="utf-8") as file:
    for c in chunks:
-        f.write(c + "\n\n")
+        file.write(c + "\n\n")

 # If we want to run only a certain model, type the option while running
 # ex. python incsum.py 1 => will run approach 1
@@ -78,9 +78,9 @@ if index == "1" or index is None:
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

-    with open("bart-summaries.txt", "a") as f:
+    with open("bart-summaries.txt", "a", encoding="utf-8") as file:
        for summary in summaries:
-            f.write(summary + "\n\n")
+            file.write(summary + "\n\n")

 # Approach 2
 if index == "2" or index is None:
@@ -114,8 +114,8 @@ if index == "2" or index is None:
        summary_ids = output[0, input_length:]
        summary = tokenizer.decode(summary_ids, skip_special_tokens=True)
        summaries.append(summary)
-        with open("gptneo1.3B-summaries.txt", "a") as f:
-            f.write(summary + "\n\n")
+        with open("gptneo1.3B-summaries.txt", "a", encoding="utf-8") as file:
+            file.write(summary + "\n\n")

 # Approach 3
 if index == "3" or index is None:
@@ -152,6 +152,6 @@ if index == "3" or index is None:
                                   skip_special_tokens=True)
        summaries.append(summary)

-    with open("mpt-7b-summaries.txt", "a") as f:
+    with open("mpt-7b-summaries.txt", "a", encoding="utf-8") as file:
        for summary in summaries:
-            f.write(summary + "\n\n")
+            file.write(summary + "\n\n")
--- a/trials/whisper-jax/whisjax.py
+++ b/trials/whisper-jax/whisjax.py
@@ -19,15 +19,15 @@ import yt_dlp as youtube_dl
 from whisper_jax import FlaxWhisperPipline

 from ...utils.file_utils import download_files, upload_files
-from ...utils.log_utils import logger
-from ...utils.run_utils import config
+from ...utils.log_utils import LOGGER
+from ...utils.run_utils import CONFIG
 from ...utils.text_utils import post_process_transcription, summarize
 from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud

 nltk.download('punkt', quiet=True)
 nltk.download('stopwords', quiet=True)

-WHISPER_MODEL_SIZE = config['WHISPER']["WHISPER_MODEL_SIZE"]
+WHISPER_MODEL_SIZE = CONFIG['WHISPER']["WHISPER_MODEL_SIZE"]
 NOW = datetime.now()

 if not os.path.exists('../../artefacts'):
@@ -75,7 +75,7 @@ def main():
            # Download the lowest resolution YouTube video
            # (since we're just interested in the audio).
            # It will be saved to the current directory.
-            logger.info("Downloading YouTube video at url: " + args.location)
+            LOGGER.info("Downloading YouTube video at url: " + args.location)

            # Create options for the download
            ydl_opts = {
@@ -93,12 +93,12 @@ def main():
                ydl.download([args.location])
            media_file = "../artefacts/audio.mp3"

-            logger.info("Saved downloaded YouTube video to: " + media_file)
+            LOGGER.info("Saved downloaded YouTube video to: " + media_file)
        else:
            # XXX - Download file using urllib, check if file is
            # audio/video using python-magic
-            logger.info(f"Downloading file at url: {args.location}")
-            logger.info("  XXX - This method hasn't been implemented yet.")
+            LOGGER.info(f"Downloading file at url: {args.location}")
+            LOGGER.info("  XXX - This method hasn't been implemented yet.")
    elif url.scheme == '':
        media_file = url.path
        # If file is not present locally, take it from S3 bucket
@@ -119,7 +119,7 @@ def main():
            audio_filename = tempfile.NamedTemporaryFile(suffix=".mp3",
                                                         delete=False).name
            video.audio.write_audiofile(audio_filename, logger=None)
-            logger.info(f"Extracting audio to: {audio_filename}")
+            LOGGER.info(f"Extracting audio to: {audio_filename}")
        # Handle audio only file
        except Exception:
            audio = moviepy.editor.AudioFileClip(media_file)
@@ -129,14 +129,14 @@ def main():
    else:
        audio_filename = media_file

-    logger.info("Finished extracting audio")
-    logger.info("Transcribing")
+    LOGGER.info("Finished extracting audio")
+    LOGGER.info("Transcribing")
    # Convert the audio to text using the OpenAI Whisper model
    pipeline = FlaxWhisperPipline("openai/whisper-" + WHISPER_MODEL_SIZE,
                                  dtype=jnp.float16,
                                  batch_size=16)
    whisper_result = pipeline(audio_filename, return_timestamps=True)
-    logger.info("Finished transcribing file")
+    LOGGER.info("Finished transcribing file")

    whisper_result = post_process_transcription(whisper_result)

@@ -153,10 +153,10 @@ def main():
              "w") as transcript_file_timestamps:
        transcript_file_timestamps.write(str(whisper_result))

-    logger.info("Creating word cloud")
+    LOGGER.info("Creating word cloud")
    create_wordcloud(NOW)

-    logger.info("Performing talk-diff and talk-diff visualization")
+    LOGGER.info("Performing talk-diff and talk-diff visualization")
    create_talk_diff_scatter_viz(NOW)

    # S3 : Push artefacts to S3 bucket
@@ -172,7 +172,7 @@ def main():

    summarize(transcript_text, NOW, False, False)

-    logger.info("Summarization completed")
+    LOGGER.info("Summarization completed")

    # Summarization takes a lot of time, so do this separately at the end
    files_to_upload = [prefix + "summary_" + suffix + ".txt"]
--- a/trials/whisper-jax/whisjax_realtime.py
+++ b/trials/whisper-jax/whisjax_realtime.py
@@ -11,12 +11,12 @@ from termcolor import colored
 from whisper_jax import FlaxWhisperPipline

 from ...utils.file_utils import upload_files
-from ...utils.log_utils import logger
-from ...utils.run_utils import config
+from ...utils.log_utils import LOGGER
+from ...utils.run_utils import CONFIG
 from ...utils.text_utils import post_process_transcription, summarize
 from ...utils.viz_utils import create_talk_diff_scatter_viz, create_wordcloud

-WHISPER_MODEL_SIZE = config['WHISPER']["WHISPER_MODEL_SIZE"]
+WHISPER_MODEL_SIZE = CONFIG['WHISPER']["WHISPER_MODEL_SIZE"]

 FRAMES_PER_BUFFER = 8000
 FORMAT = pyaudio.paInt16
@@ -31,7 +31,7 @@ def main():
    AUDIO_DEVICE_ID = -1
    for i in range(p.get_device_count()):
        if p.get_device_info_by_index(i)["name"] == \
-                config["AUDIO"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"]:
+                CONFIG["AUDIO"]["BLACKHOLE_INPUT_AGGREGATOR_DEVICE_NAME"]:
            AUDIO_DEVICE_ID = i
    audio_devices = p.get_device_info_by_index(AUDIO_DEVICE_ID)
    stream = p.open(
@@ -44,7 +44,7 @@ def main():
    )

    pipeline = FlaxWhisperPipline("openai/whisper-" +
-                                  config["WHISPER"]["WHISPER_REAL_TIME_MODEL_SIZE"],
+                                  CONFIG["WHISPER"]["WHISPER_REAL_TIME_MODEL_SIZE"],
                                  dtype=jnp.float16,
                                  batch_size=16)

@@ -106,23 +106,26 @@ def main():
                          " | Transcribed duration: " +
                          str(duration), "yellow"))

-    except Exception as e:
-        print(e)
+    except Exception as exception:
+        print(str(exception))
    finally:
-        with open("real_time_transcript_" +
-                  NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w") as f:
-            f.write(transcription)
+        with open("real_time_transcript_" + NOW.strftime("%m-%d-%Y_%H:%M:%S")
+                  + ".txt", "w", encoding="utf-8") as file:
+            file.write(transcription)
+
        with open("real_time_transcript_with_timestamp_" +
-                  NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w") as f:
+                  NOW.strftime("%m-%d-%Y_%H:%M:%S") + ".txt", "w",
+                  encoding="utf-8") as file:
            transcript_with_timestamp["text"] = transcription
-            f.write(str(transcript_with_timestamp))
+            file.write(str(transcript_with_timestamp))

-    transcript_with_timestamp = post_process_transcription(transcript_with_timestamp)
+    transcript_with_timestamp = \
+        post_process_transcription(transcript_with_timestamp)

-    logger.info("Creating word cloud")
+    LOGGER.info("Creating word cloud")
    create_wordcloud(NOW, True)

-    logger.info("Performing talk-diff and talk-diff visualization")
+    LOGGER.info("Performing talk-diff and talk-diff visualization")
    create_talk_diff_scatter_viz(NOW, True)

    # S3 : Push artefacts to S3 bucket
@@ -137,7 +140,7 @@ def main():

    summarize(transcript_with_timestamp["text"], NOW, True, True)

-    logger.info("Summarization completed")
+    LOGGER.info("Summarization completed")

    # Summarization takes a lot of time, so do this separately at the end
    files_to_upload = ["real_time_summary_" + suffix + ".txt"]