Merge pull request #12 from Monadical-SAS/whisper-jax-gokul

update
2026-02-04 09:56:47 +00:00 · 2023-06-14 23:13:33 +05:30
parent 088c0a224b 05dac39d4e
commit eab83c67ef
3 changed files with 109 additions and 98 deletions
--- a/Viz-experiments.ipynb
+++ b/Viz-experiments.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "markdown",
-   "id": "5f8209ec",
+   "id": "a5ace857",
   "metadata": {},
   "source": [
    "# Visualization Experiments"
@@ -10,7 +10,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "f20e9fa1",
+   "id": "9bfc569d",
   "metadata": {},
   "source": [
    "Let's load the data artefacts into local memory. These files are downloaded from S3, since the pipeline automatically uploads them to the pre-configured S3 bucket."
@@ -18,17 +18,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 73,
-   "id": "16178ad6",
+   "execution_count": 5,
+   "id": "edc584b2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "\u001b[32m2023-06-14 19:24:02.274\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file df.pkl\u001b[0m\n",
-      "\u001b[32m2023-06-14 19:24:04.697\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file mappings.pkl\u001b[0m\n",
-      "\u001b[32m2023-06-14 19:24:05.518\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file transcript_timestamps.txt\u001b[0m\n"
+      "\u001b[32m2023-06-14 22:52:15.596\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file df.pkl\u001b[0m\n",
+      "\u001b[32m2023-06-14 22:52:19.079\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file mappings.pkl\u001b[0m\n",
+      "\u001b[32m2023-06-14 22:52:19.659\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file transcript_timestamps.txt\u001b[0m\n"
     ]
    }
   ],
@@ -38,59 +38,15 @@
    "\n",
    "# Download files from S3 bucket. You can download multiple files at a time by passing a list of names\n",
    "files_to_download = [\"df.pkl\", \"mappings.pkl\", 'transcript_timestamps.txt']\n",
-    "download_files(files_to_download)\n",
-    "\n"
+    "download_files(files_to_download)"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 59,
-   "id": "b03033e1",
+   "execution_count": null,
+   "id": "5027fe25",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u001b[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl#egg=en_core_web_md==3.2.0 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617\u001b[0m\u001b[33m\n",
-      "\u001b[0mCollecting en-core-web-md==3.2.0\n",
-      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)\n",
-      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 MB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
-      "\u001b[?25hRequirement already satisfied: spacy<3.3.0,>=3.2.0 in /opt/anaconda3/lib/python3.8/site-packages (from en-core-web-md==3.2.0) (3.2.3)\n",
-      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (3.0.9)\n",
-      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.0.1)\n",
-      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.0.6)\n",
-      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.0.6)\n",
-      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (3.0.6)\n",
-      "Requirement already satisfied: thinc<8.1.0,>=8.0.12 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (8.0.15)\n",
-      "Requirement already satisfied: blis<0.8.0,>=0.4.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (0.7.7)\n",
-      "Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (0.9.0)\n",
-      "Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.4.2)\n",
-      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.0.7)\n",
-      "Requirement already satisfied: typer<0.5.0,>=0.3.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (0.3.2)\n",
-      "Requirement already satisfied: pathy>=0.3.5 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (0.6.1)\n",
-      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (4.64.1)\n",
-      "Requirement already satisfied: numpy>=1.15.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.23.5)\n",
-      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.27.1)\n",
-      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.8.2)\n",
-      "Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.11.3)\n",
-      "Requirement already satisfied: setuptools in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (67.7.2)\n",
-      "Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (21.3)\n",
-      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (3.3.0)\n",
-      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/anaconda3/lib/python3.8/site-packages (from packaging>=20.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.4.7)\n",
-      "Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from pathy>=0.3.5->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (5.2.1)\n",
-      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/anaconda3/lib/python3.8/site-packages (from pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (4.5.0)\n",
-      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/anaconda3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.26.4)\n",
-      "Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2020.12.5)\n",
-      "Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.0.10)\n",
-      "Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.10)\n",
-      "Requirement already satisfied: click<7.2.0,>=7.1.1 in /opt/anaconda3/lib/python3.8/site-packages (from typer<0.5.0,>=0.3.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (7.1.2)\n",
-      "Requirement already satisfied: MarkupSafe>=0.23 in /opt/anaconda3/lib/python3.8/site-packages (from jinja2->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.0.1)\n",
-      "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
-      "You can now load the package via spacy.load('en_core_web_md')\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "# Download spacy model for the first time\n",
    "!spacy download en_core_web_md\n"
@@ -98,8 +54,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 60,
-   "id": "ee3a7ad9",
+   "execution_count": 6,
+   "id": "a1fc2846",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -112,7 +68,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "9ba072d9",
+   "id": "8abc435d",
   "metadata": {},
   "source": [
    "## Example template 1"
@@ -120,7 +76,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "21d5dcd5",
+   "id": "2b1a4834",
   "metadata": {},
   "source": [
    "## Scatter plot of transcription with Topic modelling"
@@ -128,7 +84,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "ff6acd05",
+   "id": "a795137e",
   "metadata": {},
   "source": [
    "Change the values of \"category\", \"category_name\" to one agenda topic and change the value of \"not_category_name\" and see different plots."
@@ -136,7 +92,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 61,
+   "execution_count": 21,
   "id": "43e01074",
   "metadata": {},
   "outputs": [],
@@ -144,6 +100,7 @@
    "import pandas as pd\n",
    "import scattertext as st\n",
    "\n",
+    "df = pd.read_pickle(\"df.pkl\")\n",
    "\n",
    "def plot_topic_modelling_and_word_to_sentence_search(df, cat_1, cat_1_name, cat_2_name):\n",
    "    df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))\n",
@@ -162,16 +119,16 @@
    "    open('./demo_compact.html', 'w').write(html)\n",
    "\n",
    "plot_topic_modelling_and_word_to_sentence_search(df,\n",
-    "                                                cat_1=\"TAM\",\n",
-    "                                                cat_1_name=\"TAM\",\n",
-    "                                                cat_2_name=\"Churn\")\n",
+    "                                                cat_1=\"Founders\",\n",
+    "                                                cat_1_name=\"Founders\",\n",
+    "                                                cat_2_name=\"TAM\")\n",
    "\n",
    "# once you are done, check the generated HTML file\n"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "e0610165",
+   "id": "e9994c87",
   "metadata": {},
   "source": [
    "## Example template 2"
@@ -179,7 +136,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "8b1684df",
+   "id": "35c4f7fd",
   "metadata": {},
   "source": [
    "## Time driven Insights"
@@ -187,8 +144,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 62,
-   "id": "68eb04f7",
+   "execution_count": 9,
+   "id": "7cdcd66f",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -201,8 +158,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 63,
-   "id": "eaf9c5ed",
+   "execution_count": 10,
+   "id": "11221022",
   "metadata": {},
   "outputs": [
    {
@@ -252,9 +209,64 @@
    "plot_time_spent_for_topic(timestamp_to_topic_second_match, \"second\")"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "a691664f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import collections\n",
+    "\n",
+    "# Sum the time attributed to each topic. Every key of\n",
+    "# timestamp_to_topic_first_match is indexed as (start, end) and maps to a topic.\n",
+    "topic_durations = collections.defaultdict(int)\n",
+    "for span, topic in timestamp_to_topic_first_match.items():\n",
+    "    topic_durations[topic] += span[1] - span[0]\n",
+    "\n",
+    "# Rank topics by accumulated duration, longest first.\n",
+    "topic_times = sorted(topic_durations.items(), key=lambda item: item[1], reverse=True)\n",
+    "\n",
+    "# Two topics are read below, so fail with a clear message instead of a\n",
+    "# bare IndexError when fewer than two topics were matched.\n",
+    "if len(topic_times) < 2:\n",
+    "    raise ValueError(\n",
+    "        \"Need at least two matched topics to plot, found {}\".format(len(topic_times))\n",
+    "    )\n",
+    "\n",
+    "cat_1 = topic_times[0][0]\n",
+    "cat_1_name = cat_1\n",
+    "cat_2_name = topic_times[1][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "6451e86d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import scattertext as st\n",
+    "\n",
+    "# NOTE: unpickling can execute arbitrary code; only load a df.pkl that came from a trusted source.\n",
+    "df = pd.read_pickle(\"df.pkl\")\n",
+    "\n",
+    "def plot_topic_modelling_and_word_to_sentence_search(df, cat_1, cat_1_name, cat_2_name):\n",
+    "    \"\"\"Build a scattertext explorer for one topic and save it as ./demo_compact.html.\n",
+    "\n",
+    "    Args:\n",
+    "        df: DataFrame with a `text` column and a `ts_to_topic_mapping_top_1`\n",
+    "            column used as the category of each row.\n",
+    "        cat_1: value of `ts_to_topic_mapping_top_1` passed as `category`.\n",
+    "        cat_1_name: display name passed as `category_name`.\n",
+    "        cat_2_name: display name passed as `not_category_name`.\n",
+    "\n",
+    "    Relies on a `stopwords` collection being defined before the call\n",
+    "    (presumably by an earlier cell - confirm).\n",
+    "    \"\"\"\n",
+    "    # assign() returns a new frame, so the caller's df does not gain the parse column.\n",
+    "    parsed_df = df.assign(parse=lambda frame: frame.text.apply(st.whitespace_nlp_with_sentences))\n",
+    "\n",
+    "    # Unigram corpus without stopwords, compacted with AssociationCompactor(2000).\n",
+    "    corpus = (\n",
+    "        st.CorpusFromParsedDocuments(\n",
+    "            parsed_df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'\n",
+    "        )\n",
+    "        .build()\n",
+    "        .get_unigram_corpus()\n",
+    "        .remove_terms(stopwords, ignore_absences=True)\n",
+    "        .compact(st.AssociationCompactor(2000))\n",
+    "    )\n",
+    "\n",
+    "    html = st.produce_scattertext_explorer(\n",
+    "        corpus,\n",
+    "        category=cat_1, category_name=cat_1_name, not_category_name=cat_2_name,\n",
+    "        minimum_term_frequency=0, pmi_threshold_coefficient=0,\n",
+    "        width_in_pixels=1000,\n",
+    "        transform=st.Scalers.dense_rank\n",
+    "    )\n",
+    "\n",
+    "    # Context manager closes the file deterministically; explicit UTF-8 keeps\n",
+    "    # non-ASCII transcript text from failing on platforms with another default encoding.\n",
+    "    with open('./demo_compact.html', 'w', encoding='utf-8') as html_file:\n",
+    "        html_file.write(html)\n",
+    "\n",
+    "plot_topic_modelling_and_word_to_sentence_search(df,\n",
+    "                                                cat_1=cat_1,\n",
+    "                                                cat_1_name=cat_1_name,\n",
+    "                                                cat_2_name=cat_2_name)\n",
+    "\n",
+    "# once you are done, check the generated HTML file\n"
+   ]
+  },
  {
   "cell_type": "markdown",
-   "id": "60bda970",
+   "id": "e9ae6e25",
   "metadata": {},
   "source": [
    "## Example template 3"
@@ -262,7 +274,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "e1707621",
+   "id": "69be38ce",
   "metadata": {},
   "source": [
    "## Enhanced search for timelines"
@@ -270,7 +282,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "d2d574de",
+   "id": "f8a47348",
   "metadata": {},
   "source": [
    "We can already search for a particular word in the interactive HTML document from example 1 to see a list of all transcribed sentences having an occurrence of the word (in the context of the chosen topic). \n",
@@ -288,8 +300,8 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 86,
-   "id": "a5d1ea29",
+   "execution_count": 13,
+   "id": "69d814c9",
   "metadata": {},
   "outputs": [
    {
@@ -325,7 +337,7 @@
       " (2472.44, 2474.96)]"
      ]
     },
-     "execution_count": 86,
+     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -342,7 +354,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "10478204",
+   "id": "b587da79",
   "metadata": {},
   "source": [
    "## Selective segregation of content"
@@ -351,7 +363,7 @@
  {
   "cell_type": "code",
   "execution_count": 122,
-   "id": "9c0ee0a2",
+   "id": "5dc2014f",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -374,7 +386,7 @@
  {
   "cell_type": "code",
   "execution_count": 121,
-   "id": "2501c721",
+   "id": "caeff7f1",
   "metadata": {},
   "outputs": [
    {
@@ -394,7 +406,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "1a61a12e",
+   "id": "a20896b4",
   "metadata": {},
   "source": [
    "## Selective topic summarization"
@@ -402,7 +414,7 @@
  },
  {
   "cell_type": "markdown",
-   "id": "490da9a0",
+   "id": "6f8ab415",
   "metadata": {},
   "source": [
    "We can now summarize this selected content using the already available pipeline!"
@@ -410,19 +422,11 @@
  },
  {
   "cell_type": "markdown",
-   "id": "53b525e3",
+   "id": "06f009d5",
   "metadata": {},
   "source": [
    "# And Much More !!"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "46b4730a",
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {
--- a/config.ini
+++ b/config.ini
@@ -9,8 +9,3 @@ AWS_ACCESS_KEY=***REMOVED***
 AWS_SECRET_KEY=***REMOVED***
 BUCKET_NAME='reflector-bucket'

-# For the topic modelling viz chart
-CATEGORY_1=TAM
-CATEGORY_1_NAME=TAM
-CATEGORY_2_NAME=Churn
-
--- a/whisjax.py
+++ b/whisjax.py
@@ -222,6 +222,18 @@ def create_talk_diff_scatter_viz():

    # to load,  my_mappings = pickle.load( open ("mappings.pkl", "rb") )

+    # pick the 2 topics with the longest total duration to be used for plotting
+    topic_times = collections.defaultdict(int)
+    for key in ts_to_topic_mapping_top_1.keys():
+        duration = key[1] - key[0]
+        topic_times[ts_to_topic_mapping_top_1[key]] += duration
+
+    topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True)
+
+    cat_1 = topic_times[0][0]
+    cat_1_name = topic_times[0][0]
+    cat_2_name = topic_times[1][0]
+
    # Scatter plot of topics
    df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
    corpus = st.CorpusFromParsedDocuments(
@@ -229,9 +241,9 @@ def create_talk_diff_scatter_viz():
    ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
    html = st.produce_scattertext_explorer(
        corpus,
-        category=config["DEFAULT"]["CATEGORY_1"],
-        category_name=config["DEFAULT"]["CATEGORY_1_NAME"],
-        not_category_name=config["DEFAULT"]["CATEGORY_2_NAME"],
+        category=cat_1,
+        category_name=cat_1_name,
+        not_category_name=cat_2_name,
        minimum_term_frequency=0, pmi_threshold_coefficient=0,
        width_in_pixels=1000,
        transform=st.Scalers.dense_rank