Files
reflector/Viz-experiments.ipynb
2023-06-14 17:49:59 +05:30

148 lines
3.8 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"id": "f604fe38",
"metadata": {},
"source": [
"# Visualization Experiments"
]
},
{
"cell_type": "markdown",
"id": "cad594ed",
"metadata": {},
"source": [
"Let's load the data artefacts into local memory. These files have to be downloaded from S3, because the pipeline automatically uploads them to the pre-configured S3 bucket."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbd7b93d",
"metadata": {},
"outputs": [],
"source": [
"from file_util import download_files\n",
"import pickle\n",
"\n",
"# Download files from S3 bucket. You can download multiple files at a time by passing a list of names\n",
"files_to_download = [\"df.pkl\", \"mapping.pkl\"]\n",
"download_files(files_to_download)\n",
"\n",
"# NOTE(review): assumes download_files accepts the list directly and saves each file\n",
"# under its own name in the current working directory -- TODO confirm against file_util.\n",
"# Only unpickle artefacts from a bucket you trust: pickle.load can execute arbitrary code.\n",
"with open(\"df.pkl\", \"rb\") as df_file:\n",
"    df = pickle.load(df_file)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f59ff46b",
"metadata": {},
"outputs": [],
"source": [
"# Download spacy model for the first time.\n",
"# Invoke spaCy through the kernel's own interpreter so the model is installed into the\n",
"# environment this notebook runs in, not whichever `spacy` happens to be first on PATH.\n",
"import sys\n",
"\n",
"!\"{sys.executable}\" -m spacy download en_core_web_md\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "61aee352",
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"\n",
"# Name of the spaCy pipeline to load.\n",
"spaCy_model = \"en_core_web_md\"\n",
"\n",
"# Load the pipeline once, then keep a handle on its default stop words.\n",
"nlp = spacy.load(name=spaCy_model)\n",
"stopwords = nlp.Defaults.stop_words\n"
]
},
{
"cell_type": "markdown",
"id": "5584c887",
"metadata": {},
"source": [
"## Scatter plot of transcription with Topic modelling"
]
},
{
"cell_type": "markdown",
"id": "5fae1776",
"metadata": {},
"source": [
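"Change the values of \"category\" and \"category_name\" (the `cat_1` and `cat_1_name` arguments in the cell below) to one agenda topic, change the value of \"not_category_name\" (the `cat_2_name` argument), and re-run the cell to see different plots."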
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "43e01074",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import scattertext as st\n",
"\n",
"\n",
"def plot_topic_modelling_and_word_to_sentence_search(df, cat_1, cat_1_name, cat_2_name,\n",
"                                                     output_path='./demo_compact.html'):\n",
"    \"\"\"Build a scattertext explorer for one topic and write it to an HTML file.\n",
"\n",
"    Args:\n",
"        df: DataFrame with a `text` column (parsed into documents) and a\n",
"            `ts_to_topic_mapping_top_1` column (used as the category column).\n",
"        cat_1: Passed to scattertext as `category`.\n",
"        cat_1_name: Passed to scattertext as `category_name`.\n",
"        cat_2_name: Passed to scattertext as `not_category_name`.\n",
"        output_path: File the generated HTML is written to.\n",
"\n",
"    Relies on the notebook-level `stopwords` defined in an earlier cell.\n",
"    \"\"\"\n",
"    parsed_df = df.assign(parse=lambda frame: frame.text.apply(st.whitespace_nlp_with_sentences))\n",
"\n",
"    corpus = (\n",
"        st.CorpusFromParsedDocuments(\n",
"            parsed_df, category_col='ts_to_topic_mapping_top_1', parsed_col='parse'\n",
"        )\n",
"        .build()\n",
"        .get_unigram_corpus()\n",
"        .remove_terms(stopwords, ignore_absences=True)\n",
"        .compact(st.AssociationCompactor(2000))\n",
"    )\n",
"\n",
"    html = st.produce_scattertext_explorer(\n",
"        corpus,\n",
"        category=cat_1, category_name=cat_1_name, not_category_name=cat_2_name,\n",
"        minimum_term_frequency=0, pmi_threshold_coefficient=0,\n",
"        width_in_pixels=1000,\n",
"        transform=st.Scalers.dense_rank\n",
"    )\n",
"\n",
"    # Use a context manager so the handle is closed (and the HTML flushed) right away;\n",
"    # write UTF-8 explicitly so non-ASCII terms do not depend on the platform default.\n",
"    with open(output_path, 'w', encoding='utf-8') as html_file:\n",
"        html_file.write(html)\n",
"\n",
"plot_topic_modelling_and_word_to_sentence_search(\n",
"    df,\n",
"    cat_1=\"TAM\",\n",
"    cat_1_name=\"TAM\",\n",
"    cat_2_name=\"Churn\",\n",
")\n",
"\n",
"# When the cell finishes, open the HTML file it generated.\n"
]
},
{
"cell_type": "markdown",
"id": "e2d6ec49",
"metadata": {},
"source": [
"## Timeline visualizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08e83128",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}