{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f604fe38",
   "metadata": {},
   "source": [
    "# Visualization Experiments"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cad594ed",
   "metadata": {},
   "source": [
    "Let's load the data artefacts to local memory. These files are to be downloaded from S3 as the pipeline automatically uploads them to the pre-configured S3 bucket."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dbd7b93d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from file_util import download_files\n",
    "import pickle\n",
    "\n",
    "# Download files from S3 bucket. You can download multiple files at a time by passing a list of names\n",
    "files_to_download = [\"df.pkl\", \"mapping.pkl\"]\n",
    "download_files(files_to_download)\n",
    "\n",
    "# NOTE(review): assumes download_files saves each file into the working directory\n",
    "# under the same name - confirm against file_util.\n",
    "# Only unpickle artefacts that come from the trusted pipeline bucket: pickle.load can run arbitrary code.\n",
    "df_path, mapping_path = files_to_download\n",
    "with open(df_path, \"rb\") as df_file:\n",
    "    df = pickle.load(df_file)\n",
    "with open(mapping_path, \"rb\") as mapping_file:\n",
    "    mapping = pickle.load(mapping_file)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f59ff46b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download spacy model for the first time\n",
    "!spacy download en_core_web_md\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "61aee352",
   "metadata": {},
   "outputs": [],
   "source": [
    "import spacy\n",
    "\n",
    "spaCy_model = \"en_core_web_md\"\n",
    "nlp = spacy.load(spaCy_model)\n",
    "stopwords = nlp.Defaults.stop_words\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5584c887",
   "metadata": {},
   "source": [
    "## Scatter plot of transcription with Topic modelling"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5fae1776",
   "metadata": {},
   "source": [
    "Change the values of \"cat_1\" and \"cat_1_name\" to one agenda topic, change the value of \"cat_2_name\", and compare the resulting plots."
] },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "43e01074",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import scattertext as st\n",
    "\n",
    "\n",
    "def plot_topic_modelling_and_word_to_sentence_search(\n",
    "    df, cat_1, cat_1_name, cat_2_name,\n",
    "    category_col='ts_to_topic_mapping_top_1',\n",
    "    output_path='./demo_compact.html',\n",
    "):\n",
    "    \"\"\"Build a scattertext explorer page for one category and write it to disk.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with a `text` column and the column named by `category_col`.\n",
    "        cat_1: Value of `category_col` passed to scattertext as `category`.\n",
    "        cat_1_name: Label passed to scattertext as `category_name`.\n",
    "        cat_2_name: Label passed to scattertext as `not_category_name`.\n",
    "        category_col: Column used as the corpus category; defaults to the\n",
    "            previously hard-coded `ts_to_topic_mapping_top_1`.\n",
    "        output_path: File the generated HTML is written to; defaults to the\n",
    "            previously hard-coded `./demo_compact.html`.\n",
    "\n",
    "    Relies on the notebook-level `stopwords` defined in the spaCy cell.\n",
    "    \"\"\"\n",
    "    # Hand the parsed Series to assign() directly rather than through a lambda\n",
    "    # whose parameter shadowed `df`.\n",
    "    parsed_df = df.assign(parse=df['text'].apply(st.whitespace_nlp_with_sentences))\n",
    "\n",
    "    corpus = (\n",
    "        st.CorpusFromParsedDocuments(parsed_df, category_col=category_col, parsed_col='parse')\n",
    "        .build()\n",
    "        .get_unigram_corpus()\n",
    "        .remove_terms(stopwords, ignore_absences=True)\n",
    "        .compact(st.AssociationCompactor(2000))\n",
    "    )\n",
    "\n",
    "    html = st.produce_scattertext_explorer(\n",
    "        corpus,\n",
    "        category=cat_1, category_name=cat_1_name, not_category_name=cat_2_name,\n",
    "        minimum_term_frequency=0, pmi_threshold_coefficient=0,\n",
    "        width_in_pixels=1000,\n",
    "        transform=st.Scalers.dense_rank\n",
    "    )\n",
    "\n",
    "    # The context manager closes the handle deterministically; explicit UTF-8 avoids\n",
    "    # encode errors on platforms whose default encoding cannot represent every term.\n",
    "    with open(output_path, 'w', encoding='utf-8') as html_file:\n",
    "        html_file.write(html)\n",
    "\n",
    "\n",
    "plot_topic_modelling_and_word_to_sentence_search(\n",
    "    df,\n",
    "    cat_1=\"TAM\",\n",
    "    cat_1_name=\"TAM\",\n",
    "    cat_2_name=\"Churn\",\n",
    ")\n",
    "\n",
    "# once you are done, check the generated HTML file\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e2d6ec49",
   "metadata": {},
   "source": [
    "## Timeline visualizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08e83128",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}