{ "metadata": { "kernelspec": { "display_name": "Streamlit Notebook", "name": "streamlit" } }, "nbformat_minor": 5, "nbformat": 4, "cells": [ { "cell_type": "code", "id": "3775908f-ca36-4846-8f38-5adca39217f2", "metadata": { "language": "python", "name": "cell1", "collapsed": false }, "source": "# Import python packages\nimport streamlit as st\nimport pandas as pd\n\n# We can also use Snowpark for our analyses!\nfrom snowflake.snowpark.context import get_active_session\nsession = get_active_session()\n", "execution_count": null, "outputs": [] }, { "cell_type": "code", "id": "8d50cbf4-0c8d-4950-86cb-114990437ac9", "metadata": { "language": "python", "name": "cell2", "codeCollapsed": false, "collapsed": false }, "source": "df_orig = session.table('MOVIE_REVIEWS').to_pandas()", "execution_count": null, "outputs": [] }, { "cell_type": "code", "id": "072873b9-1f70-4ed8-8db6-244e099c5b38", "metadata": { "language": "python", "name": "cell4", "codeCollapsed": false, "collapsed": false }, "outputs": [], "source": "df = df_orig.copy()\ndf.head()", "execution_count": null }, { "cell_type": "code", "id": "c695373e-ac74-4b62-a1f1-08206cbd5c81", "metadata": { "language": "python", "name": "cell3", "codeCollapsed": false, "collapsed": false }, "source": "# convert the 'Date' column to datetime format\ndf['DATE'] = pd.to_datetime(df['DATE'],errors='coerce').dt.date\n\ndf.info()", "execution_count": null, "outputs": [] }, { "cell_type": "code", "id": "2ef53de4-fb2c-4413-9c5e-9e7eba3c0124", "metadata": { "language": "python", "name": "cell5", "codeCollapsed": false, "collapsed": false }, "outputs": [], "source": "df.describe(include='all')", "execution_count": null }, { "cell_type": "code", "id": "544c767b-39af-4d26-adea-b216dbe9fee7", "metadata": { "language": "python", "name": "cell6", "collapsed": false, "codeCollapsed": false }, "outputs": [], "source": "#create bar plot to visualize frequency of each team\nimport matplotlib\ndf['MOVIE'].value_counts()[0:100].plot(kind='bar', \n xlabel='Category', \n ylabel='Articles Count', \n rot=90, \n title='Number of articles per category',\n figsize=(15,5))", "execution_count": null }, { "cell_type": "code", "id": "18be417a-4685-44e5-a933-593b0e879258", "metadata": { "language": "python", "name": "cell7", "collapsed": false, "codeCollapsed": false }, "outputs": [], "source": "df[df['REVIEW']=='click for full review']", "execution_count": null }, { "cell_type": "code", "id": "6052d406-b0a5-494a-8a05-9afafdafdf02", "metadata": { "language": "python", "name": "cell8", "collapsed": false, "codeCollapsed": false }, "outputs": [], "source": "df['REVIEW'].value_counts()[0:10]", "execution_count": null }, { "cell_type": "code", "id": "cb8d341a-3893-41ae-b278-bfcdf7b14a51", "metadata": { "language": "python", "name": "cell9", "collapsed": false, "codeCollapsed": false }, "outputs": [], "source": "df[\"WordCount\"] = df[\"REVIEW\"].apply(lambda n: len(n.split()))\ndf.head(100)", "execution_count": null }, { "cell_type": "code", "id": "f5b8e767-4cca-4cc4-9fd2-fd6c9a34cca7", "metadata": { "language": "python", "name": "cell10", "codeCollapsed": false }, "outputs": [], "source": "from matplotlib import pyplot as plt\nimport seaborn as sns\n\n# plot word counts for reviews of first 100 movies\ndf_plot = df[df['MOVIE'].isin(df['MOVIE'].value_counts()[0:100].index.to_list())]\n_ = sns.catplot(data=df_plot, x=\"MOVIE\", y=\"WordCount\", kind=\"box\", height=4, aspect=4)\n_ = plt.xticks(rotation=90)", "execution_count": null }, { "cell_type": "code", "id": "6dceba71-678b-447d-862a-6ed212e20613", "metadata": { "language": "python", "name": "cell11", "codeCollapsed": false }, "outputs": [], "source": "movie_list = df['MOVIE'].value_counts()[0:20].index.to_list()\n\ndf_sub = df[df['MOVIE'].isin(movie_list)].head(10)\ndf_sub", "execution_count": null }, { "cell_type": "code", "id": "da0b19f0-f480-463c-bd5b-fed54bec8f81", "metadata": { "language": "sql", "name": "cell13", "codeCollapsed": false }, "outputs": [], "source": "CREATE OR REPLACE TABLE MOVIE_REVIEWS_WC\n (\n MOVIE STRING,\n PUBLISH STRING,\n REVIEW STRING,\n DATE DATE,\n SCORE INT,\n \"WordCount\" INT);\n", "execution_count": null }, { "cell_type": "code", "id": "27c5313d-ac1a-48e1-a47c-3b7c901e0163", "metadata": { "language": "python", "name": "cell12", "codeCollapsed": false }, "outputs": [], "source": "session.write_pandas(df_sub, \"MOVIE_REVIEWS_WC\")\n", "execution_count": null } ] }