This commit is contained in:
gokul
2023-06-14 23:12:42 +05:30
parent a63c201858
commit 05dac39d4e
3 changed files with 109 additions and 98 deletions

View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "markdown",
"id": "5f8209ec",
"id": "a5ace857",
"metadata": {},
"source": [
"# Visualization Experiments"
@@ -10,7 +10,7 @@
},
{
"cell_type": "markdown",
"id": "f20e9fa1",
"id": "9bfc569d",
"metadata": {},
"source": [
"Let's load the data artefacts to local memory. These files are to be downloaded from S3 as the pipeline automatically uploads them to the pre-configured S3 bucket."
@@ -18,17 +18,17 @@
},
{
"cell_type": "code",
"execution_count": 73,
"id": "16178ad6",
"execution_count": 5,
"id": "edc584b2",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[32m2023-06-14 19:24:02.274\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file df.pkl\u001b[0m\n",
"\u001b[32m2023-06-14 19:24:04.697\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file mappings.pkl\u001b[0m\n",
"\u001b[32m2023-06-14 19:24:05.518\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file transcript_timestamps.txt\u001b[0m\n"
"\u001b[32m2023-06-14 22:52:15.596\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file df.pkl\u001b[0m\n",
"\u001b[32m2023-06-14 22:52:19.079\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file mappings.pkl\u001b[0m\n",
"\u001b[32m2023-06-14 22:52:19.659\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mfile_util\u001b[0m:\u001b[36mdownload_files\u001b[0m:\u001b[36m36\u001b[0m - \u001b[1mDownloading file transcript_timestamps.txt\u001b[0m\n"
]
}
],
@@ -38,59 +38,15 @@
"\n",
"# Download files from S3 bucket. You can download multiple files at a time by passing a list of names\n",
"files_to_download = [\"df.pkl\", \"mappings.pkl\", 'transcript_timestamps.txt']\n",
"download_files(files_to_download)\n",
"\n"
"download_files(files_to_download)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "b03033e1",
"execution_count": null,
"id": "5027fe25",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mDEPRECATION: https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl#egg=en_core_web_md==3.2.0 contains an egg fragment with a non-PEP 508 name pip 25.0 will enforce this behaviour change. A possible replacement is to use the req @ url syntax, and remove the egg fragment. Discussion can be found at https://github.com/pypa/pip/issues/11617\u001b[0m\u001b[33m\n",
"\u001b[0mCollecting en-core-web-md==3.2.0\n",
" Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 MB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: spacy<3.3.0,>=3.2.0 in /opt/anaconda3/lib/python3.8/site-packages (from en-core-web-md==3.2.0) (3.2.3)\n",
"Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (3.0.9)\n",
"Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.0.1)\n",
"Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.0.6)\n",
"Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.0.6)\n",
"Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (3.0.6)\n",
"Requirement already satisfied: thinc<8.1.0,>=8.0.12 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (8.0.15)\n",
"Requirement already satisfied: blis<0.8.0,>=0.4.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (0.7.7)\n",
"Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (0.9.0)\n",
"Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.4.2)\n",
"Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.0.7)\n",
"Requirement already satisfied: typer<0.5.0,>=0.3.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (0.3.2)\n",
"Requirement already satisfied: pathy>=0.3.5 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (0.6.1)\n",
"Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (4.64.1)\n",
"Requirement already satisfied: numpy>=1.15.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.23.5)\n",
"Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.27.1)\n",
"Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.8.2)\n",
"Requirement already satisfied: jinja2 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.11.3)\n",
"Requirement already satisfied: setuptools in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (67.7.2)\n",
"Requirement already satisfied: packaging>=20.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (21.3)\n",
"Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/anaconda3/lib/python3.8/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (3.3.0)\n",
"Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/anaconda3/lib/python3.8/site-packages (from packaging>=20.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.4.7)\n",
"Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from pathy>=0.3.5->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (5.2.1)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/anaconda3/lib/python3.8/site-packages (from pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (4.5.0)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/anaconda3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (1.26.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2020.12.5)\n",
"Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/anaconda3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.0.10)\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.8/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.10)\n",
"Requirement already satisfied: click<7.2.0,>=7.1.1 in /opt/anaconda3/lib/python3.8/site-packages (from typer<0.5.0,>=0.3.0->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (7.1.2)\n",
"Requirement already satisfied: MarkupSafe>=0.23 in /opt/anaconda3/lib/python3.8/site-packages (from jinja2->spacy<3.3.0,>=3.2.0->en-core-web-md==3.2.0) (2.0.1)\n",
"\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
"You can now load the package via spacy.load('en_core_web_md')\n"
]
}
],
"outputs": [],
"source": [
"# Download spacy model for the first time\n",
"!spacy download en_core_web_md\n"
@@ -98,8 +54,8 @@
},
{
"cell_type": "code",
"execution_count": 60,
"id": "ee3a7ad9",
"execution_count": 6,
"id": "a1fc2846",
"metadata": {},
"outputs": [],
"source": [
@@ -112,7 +68,7 @@
},
{
"cell_type": "markdown",
"id": "9ba072d9",
"id": "8abc435d",
"metadata": {},
"source": [
"## Example template 1"
@@ -120,7 +76,7 @@
},
{
"cell_type": "markdown",
"id": "21d5dcd5",
"id": "2b1a4834",
"metadata": {},
"source": [
"## Scatter plot of transcription with Topic modelling"
@@ -128,7 +84,7 @@
},
{
"cell_type": "markdown",
"id": "ff6acd05",
"id": "a795137e",
"metadata": {},
"source": [
"Change the values of \"category\", \"category_name\" to one agenda topic and change the value of \"not_category_name\" and see different plots."
@@ -136,7 +92,7 @@
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 21,
"id": "43e01074",
"metadata": {},
"outputs": [],
@@ -144,6 +100,7 @@
"import pandas as pd\n",
"import scattertext as st\n",
"\n",
"df = pd.read_pickle(\"df.pkl\")\n",
"\n",
"def plot_topic_modelling_and_word_to_sentence_search(df, cat_1, cat_1_name, cat_2_name):\n",
" df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))\n",
@@ -162,16 +119,16 @@
" open('./demo_compact.html', 'w').write(html)\n",
"\n",
"plot_topic_modelling_and_word_to_sentence_search(df,\n",
" cat_1=\"TAM\",\n",
" cat_1_name=\"TAM\",\n",
" cat_2_name=\"Churn\")\n",
" cat_1=\"Founders\",\n",
" cat_1_name=\"Founders\",\n",
" cat_2_name=\"TAM\")\n",
"\n",
"# once you are done, check the generated HTML file\n"
]
},
{
"cell_type": "markdown",
"id": "e0610165",
"id": "e9994c87",
"metadata": {},
"source": [
"## Example template 2"
@@ -179,7 +136,7 @@
},
{
"cell_type": "markdown",
"id": "8b1684df",
"id": "35c4f7fd",
"metadata": {},
"source": [
"## Time driven Insights"
@@ -187,8 +144,8 @@
},
{
"cell_type": "code",
"execution_count": 62,
"id": "68eb04f7",
"execution_count": 9,
"id": "7cdcd66f",
"metadata": {},
"outputs": [],
"source": [
@@ -201,8 +158,8 @@
},
{
"cell_type": "code",
"execution_count": 63,
"id": "eaf9c5ed",
"execution_count": 10,
"id": "11221022",
"metadata": {},
"outputs": [
{
@@ -252,9 +209,64 @@
"plot_time_spent_for_topic(timestamp_to_topic_second_match, \"second\")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "a691664f",
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"topic_times = collections.defaultdict(int)\n",
"for key in timestamp_to_topic_first_match.keys():\n",
" duration = key[1] - key[0]\n",
" topic_times[timestamp_to_topic_first_match[key]] += duration\n",
"\n",
"topic_times = sorted(topic_times.items(), key=lambda x:x[1], reverse=True)\n",
"cat_1 = topic_times[0][0]\n",
"cat_1_name = topic_times[0][0]\n",
"cat_2_name = topic_times[1][0]"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "6451e86d",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import scattertext as st\n",
"\n",
"df = pd.read_pickle(\"df.pkl\")\n",
"\n",
"def plot_topic_modelling_and_word_to_sentence_search(df, cat_1, cat_1_name, cat_2_name):\n",
" df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))\n",
"\n",
" corpus = st.CorpusFromParsedDocuments(\n",
" df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'\n",
" ).build().get_unigram_corpus().remove_terms(stopwords, ignore_absences=True).compact(st.AssociationCompactor(2000))\n",
" \n",
" html = st.produce_scattertext_explorer(\n",
" corpus,\n",
" category=cat_1, category_name=cat_1_name, not_category_name=cat_2_name,\n",
" minimum_term_frequency=0, pmi_threshold_coefficient=0,\n",
" width_in_pixels=1000,\n",
" transform=st.Scalers.dense_rank\n",
" )\n",
" open('./demo_compact.html', 'w').write(html)\n",
"\n",
"plot_topic_modelling_and_word_to_sentence_search(df,\n",
" cat_1=cat_1,\n",
" cat_1_name=cat_1_name,\n",
" cat_2_name=cat_2_name)\n",
"\n",
"# once you are done, check the generated HTML file\n"
]
},
{
"cell_type": "markdown",
"id": "60bda970",
"id": "e9ae6e25",
"metadata": {},
"source": [
"## Example template 3"
@@ -262,7 +274,7 @@
},
{
"cell_type": "markdown",
"id": "e1707621",
"id": "69be38ce",
"metadata": {},
"source": [
"## Enhanced search for timelines"
@@ -270,7 +282,7 @@
},
{
"cell_type": "markdown",
"id": "d2d574de",
"id": "f8a47348",
"metadata": {},
"source": [
"We can already search for a particular word in the interactive HTML document from example 1 to see a list of all transcribed sentences having an occurrence of the word (in the context of the chosen topic). \n",
@@ -288,8 +300,8 @@
},
{
"cell_type": "code",
"execution_count": 86,
"id": "a5d1ea29",
"execution_count": 13,
"id": "69d814c9",
"metadata": {},
"outputs": [
{
@@ -325,7 +337,7 @@
" (2472.44, 2474.96)]"
]
},
"execution_count": 86,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@@ -342,7 +354,7 @@
},
{
"cell_type": "markdown",
"id": "10478204",
"id": "b587da79",
"metadata": {},
"source": [
"## Selective segregation of content"
@@ -351,7 +363,7 @@
{
"cell_type": "code",
"execution_count": 122,
"id": "9c0ee0a2",
"id": "5dc2014f",
"metadata": {},
"outputs": [],
"source": [
@@ -374,7 +386,7 @@
{
"cell_type": "code",
"execution_count": 121,
"id": "2501c721",
"id": "caeff7f1",
"metadata": {},
"outputs": [
{
@@ -394,7 +406,7 @@
},
{
"cell_type": "markdown",
"id": "1a61a12e",
"id": "a20896b4",
"metadata": {},
"source": [
"## Selective topic summarization"
@@ -402,7 +414,7 @@
},
{
"cell_type": "markdown",
"id": "490da9a0",
"id": "6f8ab415",
"metadata": {},
"source": [
"We can use this selective content to now summarize using the already available pipeline !"
@@ -410,19 +422,11 @@
},
{
"cell_type": "markdown",
"id": "53b525e3",
"id": "06f009d5",
"metadata": {},
"source": [
"# And Much More !!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46b4730a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {

View File

@@ -9,8 +9,3 @@ AWS_ACCESS_KEY=***REMOVED***
AWS_SECRET_KEY=***REMOVED***
BUCKET_NAME='reflector-bucket'
# For the topic modelling viz chart
CATEGORY_1=TAM
CATEGORY_1_NAME=TAM
CATEGORY_2_NAME=Churn

View File

@@ -222,6 +222,18 @@ def create_talk_diff_scatter_viz():
# to load, my_mappings = pickle.load( open ("mappings.pkl", "rb") )
# pick the 2 most matched topic to be used for plotting
topic_times = collections.defaultdict(int)
for key in ts_to_topic_mapping_top_1.keys():
duration = key[1] - key[0]
topic_times[ts_to_topic_mapping_top_1[key]] += duration
topic_times = sorted(topic_times.items(), key=lambda x: x[1], reverse=True)
cat_1 = topic_times[0][0]
cat_1_name = topic_times[0][0]
cat_2_name = topic_times[1][0]
# Scatter plot of topics
df = df.assign(parse=lambda df: df.text.apply(st.whitespace_nlp_with_sentences))
corpus = st.CorpusFromParsedDocuments(
@@ -229,9 +241,9 @@ def create_talk_diff_scatter_viz():
).build().get_unigram_corpus().compact(st.AssociationCompactor(2000))
html = st.produce_scattertext_explorer(
corpus,
category=config["DEFAULT"]["CATEGORY_1"],
category_name=config["DEFAULT"]["CATEGORY_1_NAME"],
not_category_name=config["DEFAULT"]["CATEGORY_2_NAME"],
category=cat_1,
category_name=cat_1_name,
not_category_name=cat_2_name,
minimum_term_frequency=0, pmi_threshold_coefficient=0,
width_in_pixels=1000,
transform=st.Scalers.dense_rank