From 05dac39d4e0f60d1d2050594efe10d36421321d9 Mon Sep 17 00:00:00 2001 From: gokul Date: Wed, 14 Jun 2023 23:12:42 +0530 Subject: [PATCH] update --- Viz-experiments.ipynb | 184 +++++++++++++++++++++--------------------- config.ini | 5 -- whisjax.py | 18 ++++- 3 files changed, 109 insertions(+), 98 deletions(-) diff --git a/Viz-experiments.ipynb b/Viz-experiments.ipynb index 5ee28973..150d5f0e 100644 --- a/Viz-experiments.ipynb +++ b/Viz-experiments.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "5f8209ec", + "id": "a5ace857", "metadata": {}, "source": [ "# Visualization Experiments" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "f20e9fa1", + "id": "9bfc569d", "metadata": {}, "source": [ "Lets load the data artefacts to local memory. These files are to be downloaded from S3 as the pipeline automatically uploads them to the pre-configured S3 bucket." @@ -18,17 +18,17 @@ }, { "cell_type": "code", - "execution_count": 73, - "id": "16178ad6", + "execution_count": 5, + "id": "edc584b2", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2023-06-14 19:24:02.274\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file df.pkl\u001b[0m\n", - "\u001b[32m2023-06-14 19:24:04.697\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file mappings.pkl\u001b[0m\n", - "\u001b[32m2023-06-14 19:24:05.518\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file transcript_timestamps.txt\u001b[0m\n" + "\u001b[32m2023-06-14 22:52:15.596\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file df.pkl\u001b[0m\n", + "\u001b[32m2023-06-14 22:52:19.079\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file mappings.pkl\u001b[0m\n", + "\u001b[32m2023-06-14 22:52:19.659\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file transcript_timestamps.txt\u001b[0m\n" ] } ], @@ -38,59 +38,15 @@ "\n", "# Download files from S3 bucket. You can download multiple files at a time by passing a list of names\n", "files_to_download = [\"df.pkl\", \"mappings.pkl\", 'transcript_timestamps.txt']\n", - "download_files(files_to_download)\n", - "\n" + "download_files(files_to_download)" ] }, { "cell_type": "code", - "execution_count": 59, - "id": "b03033e1", + "execution_count": null, + "id": "5027fe25", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl#egg=en_core_web_md==3.2.0 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617\u001b[0m\u001b[33m\n", - "\u001b[0mCollecting en-core-web-md==3.2.0\n", - " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 MB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: spacy<3.3.0,>=3.2.0 in /opt/anaconda3/lib/python3.8/site-packages (from en-core-web-md==3.2.0) (3.2.3)\n", - "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (3.0.9)\n", - "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.0.1)\n", - "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.0.6)\n", - "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.0.6)\n", - "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (3.0.6)\n", - "Requirement already satisfied: thinc<8.1.0,>=8.0.12 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (8.0.15)\n", - "Requirement already satisfied: blis<0.8.0,>=0.4.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (0.7.7)\n", - "Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (0.9.0)\n", - "Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.4.2)\n", - "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.0.7)\n", - "Requirement already satisfied: typer<0.5.0,>=0.3.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (0.3.2)\n", - "Requirement already satisfied: pathy>=0.3.5 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (0.6.1)\n", - "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (4.64.1)\n", - "Requirement already satisfied: numpy>=1.15.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.23.5)\n", - "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.27.1)\n", - "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.8.2)\n", - "Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.11.3)\n", - "Requirement already satisfied: setuptools in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (67.7.2)\n", - "Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (21.3)\n", - "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (3.3.0)\n", - "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/anaconda3/lib/python3.8/site-packages (from packaging>=20.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.4.7)\n", - "Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from pathy>=0.3.5->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (5.2.1)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/anaconda3/lib/python3.8/site-packages (from pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (4.5.0)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/anaconda3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.26.4)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2020.12.5)\n", - "Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.0.10)\n", - "Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.10)\n", - "Requirement already satisfied: click<7.2.0,>=7.1.1 in /opt/anaconda3/lib/python3.8/site-packages (from typer<0.5.0,>=0.3.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (7.1.2)\n", - "Requirement already satisfied: MarkupSafe>=0.23 in /opt/anaconda3/lib/python3.8/site-packages (from jinja2->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.0.1)\n", - "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n", - "You can now load the package via spacy.load('en_core_web_md')\n" - ] - } - ], + "outputs": [], "source": [ "# Download spacy model for the first time\n", "!spacy download en_core_web_md\n" @@ -98,8 +54,8 @@ }, { "cell_type": "code", - "execution_count": 60, - "id": "ee3a7ad9", + "execution_count": 6, + "id": "a1fc2846", "metadata": {}, "outputs": [], "source": [ @@ -112,7 +68,7 @@ }, { "cell_type": "markdown", - "id": "9ba072d9", + "id": "8abc435d", "metadata": {}, "source": [ "## Example template 1" @@ -120,7 +76,7 @@ }, { "cell_type": "markdown", - "id": "21d5dcd5", + "id": "2b1a4834", "metadata": {}, "source": [ "## Scatter plot of transcription with Topic modelling" @@ -128,7 +84,7 @@ }, { "cell_type": "markdown", - "id": "ff6acd05", + "id": "a795137e", "metadata": {}, "source": [ "Change the values of \"category\", \"category_name\" to one agenda topic and change the value of \"not_category_name\" and see different plots." @@ -136,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 21, "id": "43e01074", "metadata": {}, "outputs": [], @@ -144,6 +100,7 @@ "import pandas as pd\n", "import scattertext as st\n", "\n", + "df = pd.read_pickle(\"df.pkl\")\n", "\n", "def plot_topic_modelling_and_word_to_sentence_search(df, cat_1, cat_1_name, cat_2_name):\n", " df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))\n", @@ -162,16 +119,16 @@ " open('./demo_compact.html', 'w').write(html)\n", "\n", "plot_topic_modelling_and_word_to_sentence_search(df,\n", - " cat_1=\"TAM\",\n", - " cat_1_name=\"TAM\",\n", - " cat_2_name=\"Churn\")\n", + " cat_1=\"Founders\",\n", + " cat_1_name=\"Founders\",\n", + " cat_2_name=\"TAM\")\n", "\n", "# once you are done, check the generated HTML file\n" ] }, { "cell_type": "markdown", - "id": "e0610165", + "id": "e9994c87", "metadata": {}, "source": [ "## Example template 2" @@ -179,7 +136,7 @@ }, { "cell_type": "markdown", - "id": "8b1684df", + "id": "35c4f7fd", "metadata": {}, "source": [ "## Time driven Insights" @@ -187,8 +144,8 @@ }, { "cell_type": "code", - "execution_count": 62, - "id": "68eb04f7", + "execution_count": 9, + "id": "7cdcd66f", "metadata": {}, "outputs": [], "source": [ @@ -201,8 +158,8 @@ }, { "cell_type": "code", - "execution_count": 63, - "id": "eaf9c5ed", + "execution_count": 10, + "id": "11221022", "metadata": {}, "outputs": [ { @@ -252,9 +209,64 @@ "plot_time_spent_for_topic(timestamp_to_topic_second_match, \"second\")" ] }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a691664f", + "metadata": {}, + "outputs": [], + "source": [ + "import collections\n", + "topic_times = collections.defaultdict(int)\n", + "for key in timestamp_to_topic_first_match.keys():\n", + " duration = key[1] - key[0]\n", + " topic_times[timestamp_to_topic_first_match[key]] += duration\n", + "\n", + "topic_times = sorted(topic_times.items(), key=lambda x:x[1], reverse=True)\n", + "cat_1 = topic_times[0][0]\n", + "cat_1_name = topic_times[0][0]\n", + "cat_2_name = topic_times[1][0]" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "6451e86d", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import scattertext as st\n", + "\n", + "df = pd.read_pickle(\"df.pkl\")\n", + "\n", + "def plot_topic_modelling_and_word_to_sentence_search(df, cat_1, cat_1_name, cat_2_name):\n", + " df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))\n", + "\n", + " corpus = st.CorpusFromParsedDocuments(\n", + " df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'\n", + " ).build().get_unigram_corpus().remove_terms(stopwords, ignore_absences=True).compact(st.AssociationCompactor(2000))\n", + " \n", + " html = st.produce_scattertext_explorer(\n", + " corpus,\n", + " category=cat_1, category_name=cat_1_name, not_category_name=cat_2_name,\n", + " minimum_term_frequency=0, pmi_threshold_coefficient=0,\n", + " width_in_pixels=1000,\n", + " transform=st.Scalers.dense_rank\n", + " )\n", + " open('./demo_compact.html', 'w').write(html)\n", + "\n", + "plot_topic_modelling_and_word_to_sentence_search(df,\n", + " cat_1=cat_1,\n", + " cat_1_name=cat_1_name,\n", + " cat_2_name=cat_2_name)\n", + "\n", + "# once you are done, check the generated HTML file\n" + ] + }, { "cell_type": "markdown", - "id": "60bda970", + "id": "e9ae6e25", "metadata": {}, "source": [ "## Example template 3" @@ -262,7 +274,7 @@ }, { "cell_type": "markdown", - "id": "e1707621", + "id": "69be38ce", "metadata": {}, "source": [ "## Enhanced search for timelines" @@ -270,7 +282,7 @@ }, { "cell_type": "markdown", - "id": "d2d574de", + "id": "f8a47348", "metadata": {}, "source": [ "We can already search for a particular word in the interactive HTML document from example 1 to see a list of all transcribed sentences having an occurence of the word (in the context of the chosen topic). \n", @@ -288,8 +300,8 @@ }, { "cell_type": "code", - "execution_count": 86, - "id": "a5d1ea29", + "execution_count": 13, + "id": "69d814c9", "metadata": {}, "outputs": [ { @@ -325,7 +337,7 @@ " (2472.44, 2474.96)]" ] }, - "execution_count": 86, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -342,7 +354,7 @@ }, { "cell_type": "markdown", - "id": "10478204", + "id": "b587da79", "metadata": {}, "source": [ "## Selective segregation of content" @@ -351,7 +363,7 @@ { "cell_type": "code", "execution_count": 122, - "id": "9c0ee0a2", + "id": "5dc2014f", "metadata": {}, "outputs": [], "source": [ @@ -374,7 +386,7 @@ { "cell_type": "code", "execution_count": 121, - "id": "2501c721", + "id": "caeff7f1", "metadata": {}, "outputs": [ { @@ -394,7 +406,7 @@ }, { "cell_type": "markdown", - "id": "1a61a12e", + "id": "a20896b4", "metadata": {}, "source": [ "## Selective topic summarization" @@ -402,7 +414,7 @@ }, { "cell_type": "markdown", - "id": "490da9a0", + "id": "6f8ab415", "metadata": {}, "source": [ "We can use this selective content to now summarize using the already available pipeline !" @@ -410,19 +422,11 @@ }, { "cell_type": "markdown", - "id": "53b525e3", + "id": "06f009d5", "metadata": {}, "source": [ "# And Much More !!" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46b4730a", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/config.ini b/config.ini index 937b3f99..9c96931a 100644 --- a/config.ini +++ b/config.ini @@ -9,8 +9,3 @@ AWS_ACCESS_KEY=***REMOVED*** AWS_SECRET_KEY=***REMOVED*** BUCKET_NAME='reflector-bucket' -# For the topic modelling viz chart -CATEGORY_1=TAM -CATEGORY_1_NAME=TAM -CATEGORY_2_NAME=Churn - diff --git a/whisjax.py b/whisjax.py index 2f086e64..38ddec9f 100644 --- a/whisjax.py +++ b/whisjax.py @@ -222,6 +222,18 @@ def create_talk_diff_scatter_viz(): # to load, my_mappings = pickle.load( open ("mappings.pkl", "rb") ) + # pick the 2 most matched topic to be used for plotting + topic_times = collections.defaultdict(int) + for key in ts_to_topic_mapping_top_1.keys(): + duration = key[1] - key[0] + topic_times[ts_to_topic_mapping_top_1[key]] += duration + + topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True) + + cat_1 = topic_times[0][0] + cat_1_name = topic_times[0][0] + cat_2_name = topic_times[1][0] + # Scatter plot of topics df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences)) corpus = st.CorpusFromParsedDocuments( @@ -229,9 +241,9 @@ def create_talk_diff_scatter_viz(): ).build().get_unigram_corpus().compact(st.AssociationCompactor(2000)) html = st.produce_scattertext_explorer( corpus, - category=config["DEFAULT"]["CATEGORY_1"], - category_name=config["DEFAULT"]["CATEGORY_1_NAME"], - not_category_name=config["DEFAULT"]["CATEGORY_2_NAME"], + category=cat_1, + category_name=cat_1_name, + not_category_name=cat_2_name, minimum_term_frequency=0, pmi_threshold_coefficient=0, width_in_pixels=1000, transform=st.Scalers.dense_rank