{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f604fe38",
   "metadata": {},
   "source": [
    "# Visualization Experiments"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cad594ed",
   "metadata": {},
   "source": [
    "Let's load the data artefacts into local memory. The pipeline automatically uploads these files to the pre-configured S3 bucket, so they first have to be downloaded from S3."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbd7b93d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from file_util import download_files\n",
    "import pickle\n",
    "\n",
    "# Download files from S3 bucket. You can download multiple files at a time by passing a list of names\n",
    "files_to_download = [\"df.pkl\", \"mapping.pkl\"]\n",
    "download_files(files_to_download)\n",
    "\n",
    "# NOTE(review): assumes download_files saves each file into the current working\n",
    "# directory under its original name -- confirm against file_util.\n",
    "# Only unpickle artefacts produced by our own pipeline: pickle.load can execute arbitrary code.\n",
    "with open(\"df.pkl\", \"rb\") as df_file:\n",
    "    df = pickle.load(df_file)  # `df` is the frame passed to the scattertext plot further down\n",
    "with open(\"mapping.pkl\", \"rb\") as mapping_file:\n",
    "    mapping = pickle.load(mapping_file)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f59ff46b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download spacy model for the first time\n",
    "import sys\n",
    "\n",
    "# Run the downloader through the kernel's own interpreter so the model is installed\n",
    "# into the same environment this notebook imports spacy from (a bare `spacy` command\n",
    "# is resolved via the shell PATH and may belong to a different environment).\n",
    "!\"{sys.executable}\" -m spacy download en_core_web_md\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "61aee352",
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "\n",
    "# Name of the spaCy pipeline downloaded by the previous cell.\n",
    "spaCy_model = \"en_core_web_md\"\n",
    "nlp = spacy.load(name=spaCy_model)\n",
    "\n",
    "# Stop words taken from the loaded pipeline's language defaults; the scattertext\n",
    "# cell further down removes these terms from the corpus.\n",
    "stopwords = nlp.Defaults.stop_words\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5584c887",
   "metadata": {},
   "source": [
    "## Scatter plot of transcription with Topic modelling"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5fae1776",
   "metadata": {},
   "source": [
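    "Change the values of \"cat_1\" and \"cat_1_name\" (passed to scattertext as \"category\" and \"category_name\") to one agenda topic, change the value of \"cat_2_name\" (passed as \"not_category_name\"), then re-run the cell to see a different plot."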
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "43e01074",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import scattertext as st\n",
    "\n",
    "\n",
    "def plot_topic_modelling_and_word_to_sentence_search(df, cat_1, cat_1_name, cat_2_name,\n",
    "                                                     category_col='ts_to_topic_mapping_top_1',\n",
    "                                                     output_path='./demo_compact.html'):\n",
    "    \"\"\"Render a scattertext explorer for one topic and save it as an HTML file.\n",
    "\n",
    "    Relies on the notebook-level `stopwords` collection (defined in the spaCy cell)\n",
    "    and on `scattertext` being imported as `st`.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with a `text` column and the column named by `category_col`.\n",
    "        cat_1: Passed to scattertext as `category`.\n",
    "        cat_1_name: Passed to scattertext as `category_name`.\n",
    "        cat_2_name: Passed to scattertext as `not_category_name`.\n",
    "        category_col: Column used as the corpus category column.\n",
    "            Defaults to 'ts_to_topic_mapping_top_1'.\n",
    "        output_path: File the generated HTML is written to.\n",
    "            Defaults to './demo_compact.html'.\n",
    "\n",
    "    Returns:\n",
    "        None. The explorer HTML is written to `output_path`.\n",
    "    \"\"\"\n",
    "    # Parse into a new frame so the caller's DataFrame is left untouched.\n",
    "    parsed_df = df.assign(parse=lambda frame: frame.text.apply(st.whitespace_nlp_with_sentences))\n",
    "\n",
    "    # Unigram corpus with the stop words removed, then compacted with AssociationCompactor(2000).\n",
    "    corpus = (\n",
    "        st.CorpusFromParsedDocuments(parsed_df, category_col=category_col, parsed_col='parse')\n",
    "        .build()\n",
    "        .get_unigram_corpus()\n",
    "        .remove_terms(stopwords, ignore_absences=True)\n",
    "        .compact(st.AssociationCompactor(2000))\n",
    "    )\n",
    "\n",
    "    html = st.produce_scattertext_explorer(\n",
    "        corpus,\n",
    "        category=cat_1, category_name=cat_1_name, not_category_name=cat_2_name,\n",
    "        minimum_term_frequency=0, pmi_threshold_coefficient=0,\n",
    "        width_in_pixels=1000,\n",
    "        transform=st.Scalers.dense_rank\n",
    "    )\n",
    "\n",
    "    # The context manager closes the file even if the write fails; explicit UTF-8 keeps\n",
    "    # the output independent of the platform's default encoding.\n",
    "    with open(output_path, 'w', encoding='utf-8') as html_file:\n",
    "        html_file.write(html)\n",
    "\n",
    "plot_topic_modelling_and_word_to_sentence_search(\n",
    "    df, cat_1=\"TAM\", cat_1_name=\"TAM\", cat_2_name=\"Churn\"\n",
    ")\n",
    "\n",
    "# When the call has finished, open the generated ./demo_compact.html file to explore the plot\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e2d6ec49",
   "metadata": {},
   "source": [
    "## Timeline visualizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08e83128",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}