import findspark
findspark.init()
!/mnt/miniconda/bin/pip install sparknlp
Collecting sparknlp
  Downloading sparknlp-1.0.0-py3-none-any.whl (1.4 kB)
Requirement already satisfied: numpy in /mnt/miniconda/lib/python3.7/site-packages (from sparknlp) (1.21.2)
Collecting spark-nlp
  Downloading spark_nlp-3.4.3-py2.py3-none-any.whl (144 kB)
Installing collected packages: spark-nlp, sparknlp
Successfully installed spark-nlp-3.4.3 sparknlp-1.0.0
import pyspark.sql.functions as F
from pyspark.sql.functions import col, lit, size
from pyspark.sql import SparkSession
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
spark = SparkSession.builder \
.appName("reddit") \
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.2") \
.master('yarn') \
.getOrCreate()
Ivy Default Cache set to: /home/hadoop/.ivy2/cache
The jars for the packages stored in: /home/hadoop/.ivy2/jars
:: loading settings :: url = jar:file:/usr/lib/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c342d107-ad2a-4bf6-85b2-decf3f433d62;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.4.2 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found org.slf4j#slf4j-api;1.7.21 in central
	found com.navigamez#greex;1.0 in central
	found dk.brics.automaton#automaton;1.11-8 in central
	found org.json4s#json4s-ext_2.12;3.5.3 in central
	found joda-time#joda-time;2.9.5 in central
	found org.joda#joda-convert;1.8.1 in central
	found com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.3.3 in central
	found net.sf.trove4j#trove4j;3.0.3 in central
:: resolution report :: resolve 2937ms :: artifacts dl 2912ms
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   21  |   21  |   21  |   0   ||   21  |   21  |
	---------------------------------------------------------------------
:: problems summary :: :::: ERRORS
	SERVER ERROR: Bad Gateway for a number of optional poms/sources/javadocs on the retired dl.bintray.com/spark-packages mirror (non-fatal; all required jars resolved from Maven Central)
:: retrieving :: org.apache.spark#spark-submit-parent-c342d107-ad2a-4bf6-85b2-decf3f433d62
	confs: [default]
	21 artifacts copied, 0 already retrieved (399905kB/258ms)
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/16 23:39:22 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
22/04/16 23:39:27 WARN Client: Same path resource file:///home/hadoop/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-3.4.2.jar added multiple times to distributed cache. (the same warning repeats for each of the 21 resolved jars)
22/04/16 23:39:36 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!
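Aside: Spark NLP also ships a one-line starter that builds an equivalent session. A minimal sketch (note that sparknlp.start() targets local mode by default, which is why the explicit builder with .master('yarn') is used on this cluster):
import sparknlp
# sparknlp.start() returns a SparkSession preconfigured with the Kryo
# serializer and the spark-nlp jar matching the installed Python package.
spark_local = sparknlp.start()
print(sparknlp.version())  # should line up with the jar version requested above (3.4.x)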
spark
SparkSession - in-memory
df_full = spark.read.parquet('s3://ssp88-labdata2/eda_df_full')
df_full.printSchema()
root
 |-- all_awardings: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_created_utc: double (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- awarders: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: string (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- quarantined: boolean (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- treatment_tags: string (nullable = true)
 |-- editable: boolean (nullable = true)
 |-- ym_partition: integer (nullable = true)
 |-- comment_date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- original_post: string (nullable = true)
# Most Common Words
common_word = df_full.withColumn('word', F.explode(F.split(F.col('body'), ' '))) \
.groupBy('word') \
.count() \
.sort('count', ascending=False) \
.limit(10)
common_word.show()
22/04/16 23:39:50 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+----+--------+
|word|   count|
+----+--------+
| the|14798077|
|  to|10378945|
|   a| 9822211|
| and| 7735354|
|  of| 6655809|
|   I| 5741511|
|  is| 5584017|
|that| 5065040|
| you| 5022835|
|  in| 4913229|
+----+--------+
Cleaning is required: the ten most frequent words are all stopwords, so the counts say nothing about the content until those are removed.
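To confirm, the counts can be recomputed with stopwords stripped first; a minimal sketch using Spark ML's StopWordsRemover (the intermediate column names are illustrative):
from pyspark.ml.feature import StopWordsRemover

# Tokenize on whitespace, lowercase so the default English stopword list
# matches, drop stopwords, then recount the most frequent words.
words_df = df_full.withColumn('words', F.split(F.lower(F.col('body')), r'\s+'))
remover = StopWordsRemover(inputCol='words', outputCol='words_clean')
remover.transform(words_df) \
    .withColumn('word', F.explode('words_clean')) \
    .filter(F.col('word') != '') \
    .groupBy('word').count() \
    .sort('count', ascending=False) \
    .show(10)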
# Distribution of Text Lengths
df_full = df_full.withColumn("comment_length", F.length(col('body')))
df_full.select('body','comment_length').sort('comment_length', ascending=False).show(10)
+--------------------+--------------+
|                body|comment_length|
+--------------------+--------------+
|**Money in Electi...|         11252|
|           > > > ...|         10271|
|Original comment ...|         10192|
|Original comment ...|         10190|
|LOL wait didnt yo...|         10166|
|Biases in Stops, ...|         10145|
|    Part 2. > You...|         10081|
|   >I do understa...|         10079|
|UNDELETED comment...|         10074|
|UNDELETED comment...|         10074|
+--------------------+--------------+
only showing top 10 rows
Some of the longest comments are clearly not useful for analysis (deletion notices, reposted "UNDELETED/Original comment" bodies, and so on); these can be filtered out later in the cleaning step.
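As a sketch of that later step, placeholder bodies could be dropped up front; the exact marker strings are an assumption about how this dump labels removed comments:
# Hypothetical filter (not applied here): drop comments whose body is only a
# deletion marker; Reddit dumps typically use '[deleted]' or '[removed]'.
df_no_placeholders = df_full.filter(~F.col('body').isin('[deleted]', '[removed]'))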
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
sentenceData = df_full.select(df_full["body"])
tokenizer = Tokenizer(inputCol="body", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
for features_label in rescaledData.select("features", "words").take(3):
print(features_label)
Row(features=SparseVector(20, {3: 0.6267, 7: 0.7122, 10: 1.7161, 19: 0.6819}), words=['so,', 'theyre', 'like', 'australian', 'geese?'])
Row(features=SparseVector(20, {0: 4.6762, 1: 3.639, 2: 1.1016, 3: 3.7599, 4: 3.136, 5: 3.5421, 6: 3.0408, 7: 4.2731, 8: 2.0492, 9: 4.9559, 10: 1.7161, 11: 3.14, 12: 2.6444, 13: 1.6626, 15: 3.1381, 16: 2.78, 17: 3.8672, 18: 4.4873, 19: 0.6819}), words=['by', 'that', 'definition', 'literally', 'any', 'food', 'is', 'a', 'drug.', "there's", 'nothing', 'special', 'about', "sugar's", 'effect', 'on', 'the', 'reward', 'system.', 'you', 'eat', 'food,', 'you', 'feel', 'good.', 'the', 'tongue', 'enjoys', 'the', 'sweetness', 'and', 'you', 'get', 'a', 'hit', 'of', 'dopamine', 'from', 'your', 'reward', 'system.', 'the', 'sugar', "isn't", 'binding', 'to', 'any', 'receptors', 'in', 'your', 'brain', "it's", 'entirely', 'your', "brain's", 'own', 'response', 'to', 'positive', 'stimulus.', 'other', 'examples', 'are', 'finishing', 'paperwork', 'or', 'getting', 'a', 'massage.', 'sugar', 'is', 'not', 'even', 'close', 'to', 'a', 'drug.'])
Row(features=SparseVector(20, {3: 1.2533, 6: 1.5204, 7: 1.4244, 11: 0.785, 13: 1.6626, 15: 2.3536, 18: 0.8975, 19: 1.3638}), words=["i'm", 'a', 'follower', 'of', 'beau', 'in', 'yt', 'and', 'twitter.', 'he', 'makes', 'a', 'lot', 'of', 'sense.'])
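Note that with numFeatures=20 many unrelated words collide into the same hash bucket, so these features are not interpretable. If interpretable features were needed, CountVectorizer keeps an explicit vocabulary instead of hashing; a minimal sketch (vocabSize is illustrative):
from pyspark.ml.feature import CountVectorizer, IDF

# CountVectorizer learns a real vocabulary, so each feature index maps back
# to a word (unlike HashingTF, whose buckets are anonymous and collide).
cv_model = CountVectorizer(inputCol='words', outputCol='rawFeatures',
                           vocabSize=10000).fit(wordsData)
cv_df = cv_model.transform(wordsData)
tfidf_df = IDF(inputCol='rawFeatures', outputCol='features').fit(cv_df).transform(cv_df)
print(cv_model.vocabulary[:10])  # the words behind the first feature indices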
df_full_reddit = df_full.withColumn("Pandemic_Freakout", F.regexp_extract('body', \
    r'(?i)\bcovid\b|\bpandemic\b|\bcovid-19\b|\bcorona\b|\bvirus\b|\bmasks\b|\bhospital\b', 0))
df_full_reddit = df_full_reddit.withColumn("Arrest_Freakout", F.regexp_extract('body', \
r'(?i)\barrest\b|(?i)\bofficer\b|(?i)\bpolice\b|(?i)\bcop\b|(?i)\bstab\b|(?i)\billegal\b|(?i)\brutal\b',0))
df_full_reddit = df_full_reddit.withColumn("Pandemic_Freakout",F.lower(F.col('Pandemic_Freakout')))
df_full_reddit = df_full_reddit.withColumn("Arrest_Freakout",F.lower(F.col('Arrest_Freakout')))
import nltk
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
True
from nltk.corpus import stopwords
eng_stopwords = stopwords.words('english')
# eng_stopwords.append('xxxx')  # uncomment and edit to add custom stopwords
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline
documentAssembler = DocumentAssembler() \
.setInputCol('body') \
.setOutputCol('document')
tokenizer = Tokenizer() \
.setInputCols(['document']) \
.setOutputCol('token')
# note normalizer defaults to changing all words to lowercase.
# Use .setLowercase(False) to maintain input case.
normalizer = Normalizer() \
.setInputCols(['token']) \
.setOutputCol('normalized') \
.setLowercase(True)
# note that lemmatizer needs a dictionary. So I used the pre-trained
# model (note that it defaults to english)
lemmatizer = LemmatizerModel.pretrained() \
.setInputCols(['normalized']) \
.setOutputCol('lemma')
stopwords_cleaner = StopWordsCleaner() \
.setInputCols(['lemma']) \
.setOutputCol('clean_lemma') \
.setCaseSensitive(False) \
.setStopWords(eng_stopwords)
# finisher converts tokens to human-readable output
finisher = Finisher() \
.setInputCols(['clean_lemma']) \
.setCleanAnnotations(False)
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.
[OK!]
pipeline = Pipeline() \
.setStages([
documentAssembler,
tokenizer,
normalizer,
lemmatizer,
stopwords_cleaner,
finisher
])
df_full_clean = pipeline.fit(df_full_reddit).transform(df_full_reddit)
df_full_clean.printSchema()
root
 |-- all_awardings: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_created_utc: double (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- awarders: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: string (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- quarantined: boolean (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- treatment_tags: string (nullable = true)
 |-- editable: boolean (nullable = true)
 |-- ym_partition: integer (nullable = true)
 |-- comment_date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- original_post: string (nullable = true)
 |-- comment_length: integer (nullable = true)
 |-- Pandemic_Freakout: string (nullable = true)
 |-- Arrest_Freakout: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- normalized: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- lemma: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- clean_lemma: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- finished_clean_lemma: array (nullable = true)
 |    |-- element: string (containsNull = true)
from pyspark.sql.functions import concat_ws
df_full_new = df_full_clean.withColumn("text", concat_ws(" ", "clean_lemma.result"))
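Before modeling it is worth spot-checking that the lemmatized, stopword-free text looks sensible; a minimal sketch:
# Compare raw bodies with the cleaned text produced by the pipeline.
df_full_new.select('body', 'text').show(5, truncate=80)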
# Sentiment
document_t = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
use_t = UniversalSentenceEncoder.pretrained() \
.setInputCols(["document"])\
.setOutputCol("use_embeddings")
docClassifier_t = SentimentDLModel.pretrained('sentimentdl_use_twitter', lang = 'en') \
.setInputCols(["use_embeddings"])\
.setOutputCol("sentiment")
pipeline_t = Pipeline(
stages = [
document_t,
use_t,
docClassifier_t
])
tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
Download done! Loading the resource.
2022-04-16 23:41:30.869956: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-16 23:41:31.300613: I external/org_tensorflow/tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2499995000 Hz
2022-04-16 23:41:35.912778: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 60236800 exceeds 10% of free system memory. (this warning repeats four more times)
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
Download done! Loading the resource.
[OK!]
pipelineModel_t = pipeline_t.fit(df_full_new)
result_t = pipelineModel_t.transform(df_full_new)
result_t.printSchema()
root
 |-- all_awardings: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_created_utc: double (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- awarders: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: string (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- quarantined: boolean (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- treatment_tags: string (nullable = true)
 |-- editable: boolean (nullable = true)
 |-- ym_partition: integer (nullable = true)
 |-- comment_date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- original_post: string (nullable = true)
 |-- comment_length: integer (nullable = true)
 |-- Pandemic_Freakout: string (nullable = true)
 |-- Arrest_Freakout: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- normalized: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- lemma: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- clean_lemma: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- finished_clean_lemma: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- text: string (nullable = false)
 |-- use_embeddings: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentiment: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
Selecting the required columns and storing them in S3
sentiment_df = result_t.select(
        F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols"),
        "author", "controversiality", "score", "total_awards_received",
        "comment_date", "year", "month", "hour", "comment_length",
        "Arrest_Freakout", "Pandemic_Freakout") \
    .select(F.expr("cols['0']").alias("document"),
            F.expr("cols['1']").alias("sentiment"),
            "author", "controversiality", "score", "total_awards_received",
            "comment_date", "year", "month", "hour", "comment_length",
            "Arrest_Freakout", "Pandemic_Freakout")
sentiment_df.write.parquet("s3://ssp88-labdata2/sentiment_df/")
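A possible refinement, since the charts below slice by year and month: partitioning the output on those columns would let Spark prune S3 prefixes on read. A sketch of the alternative write (the destination path is hypothetical, not the one used here):
# Hypothetical partitioned write: downstream year/month filters would then
# read only the matching S3 prefixes instead of scanning every file.
sentiment_df.write.partitionBy('year', 'month') \
    .mode('overwrite') \
    .parquet('s3://ssp88-labdata2/sentiment_df_partitioned/')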
# Read parquet
sentiment_read = spark.read.parquet('s3://ssp88-labdata2/sentiment_df')
sentiment_read.printSchema()
root
 |-- document: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- author: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- score: long (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- comment_date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- comment_length: integer (nullable = true)
 |-- Arrest_Freakout: string (nullable = true)
 |-- Pandemic_Freakout: string (nullable = true)
!/mnt/miniconda/bin/pip install altair
Requirement already satisfied: altair in /mnt/miniconda/lib/python3.7/site-packages (4.2.0)
import altair as alt
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
sentiment_count = sentiment_read.groupby('sentiment').agg(F.count('document'))
sentiment_count = sentiment_count.withColumnRenamed('count(document)','sentiment_count')
sentiment_count.show()
+---------+---------------+
|sentiment|sentiment_count|
+---------+---------------+
| positive|        9880051|
|     null|         108800|
|  neutral|         968815|
| negative|        7163286|
+---------+---------------+
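The null rows are presumably comments whose cleaned text ended up empty (bodies that were all stopwords or markup), leaving the model nothing to score. They are dropped in pandas below; an equivalent Spark-side filter would be:
# Alternative to the pandas dropna below: drop unlabeled rows before
# collecting the aggregates to the driver.
sentiment_count_nonnull = sentiment_count.filter(F.col('sentiment').isNotNull())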
sentiment_count = sentiment_count.toPandas()
sentiment_count_altered = sentiment_count.dropna(subset=['sentiment'])
sentiment_count_altered.head()
|   | sentiment | sentiment_count |
|---|-----------|-----------------|
| 0 | positive  |         9880051 |
| 2 | neutral   |          968815 |
| 3 | negative  |         7163286 |
fig = (alt.Chart(sentiment_count_altered).mark_bar().encode(
y=alt.Y('sentiment_count', axis = alt.Axis(title = "Count")),
x=alt.X('sentiment', axis = alt.Axis(title = "Sentiment"),sort='-y'),
color=alt.value('#7fc97f'),
tooltip=['sentiment','sentiment_count']
)).properties(title={"text":'Sentiment Count',"subtitle" : "Sentiment of authors through each comment"},width = 500, height = 500)
fig.save("fig1.html")
fig
Very few comments are neutral compared to the other two classes, and the majority of comments carry positive sentiment. This may be because authors leave encouraging or amused comments under freakout videos, or because comparatively more videos fall into light-hearted freakout categories.
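To put numbers on that claim, the shares can be computed from the counts above; a minimal pandas sketch:
# Share of each sentiment among labeled comments: roughly 55% positive,
# 40% negative, and 5% neutral for the counts shown above.
shares = sentiment_count_altered.set_index('sentiment')['sentiment_count']
print((shares / shares.sum()).round(3))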
sentiment_time = sentiment_read.groupby('year','month','sentiment').agg(F.count('document'))
sentiment_time = sentiment_time.withColumnRenamed('count(document)','count')
sentiment_time.show(5)
+----+-----+---------+------+
|year|month|sentiment| count|
+----+-----+---------+------+
|2020|   04| negative|236243|
|2019|   08|  neutral| 22363|
|2020|   01| positive|231719|
|2020|   11| negative|379104|
|2019|   09|     null|  2636|
+----+-----+---------+------+
only showing top 5 rows
sentiment_time_df = sentiment_time.toPandas()
sentiment_time_df = sentiment_time_df.dropna(subset=['sentiment'])
sentiment_time_df
|     | year | month | sentiment | count  |
|-----|------|-------|-----------|--------|
| 0   | 2020 | 11    | negative  | 379104 |
| 1   | 2020 | 04    | negative  | 236243 |
| 2   | 2019 | 08    | neutral   |  22363 |
| 4   | 2020 | 01    | positive  | 231719 |
| 5   | 2020 | 12    | neutral   |  41181 |
| ... | ...  | ...   | ...       | ...    |
| 90  | 2019 | 08    | positive  | 240089 |
| 91  | 2020 | 01    | neutral   |  22618 |
| 92  | 2021 | 01    | neutral   |  61535 |
| 93  | 2020 | 10    | positive  | 473585 |
| 94  | 2020 | 01    | negative  | 153705 |

72 rows × 4 columns
sentiment_time_df['time_concat'] = sentiment_time_df["year"] + "_" + sentiment_time_df["month"]
sentiment_time_df
|     | year | month | sentiment | count  | time_concat |
|-----|------|-------|-----------|--------|-------------|
| 0   | 2020 | 11    | negative  | 379104 | 2020_11     |
| 1   | 2020 | 04    | negative  | 236243 | 2020_04     |
| 2   | 2019 | 08    | neutral   |  22363 | 2019_08     |
| 4   | 2020 | 01    | positive  | 231719 | 2020_01     |
| 5   | 2020 | 12    | neutral   |  41181 | 2020_12     |
| ... | ...  | ...   | ...       | ...    | ...         |
| 90  | 2019 | 08    | positive  | 240089 | 2019_08     |
| 91  | 2020 | 01    | neutral   |  22618 | 2020_01     |
| 92  | 2021 | 01    | neutral   |  61535 | 2021_01     |
| 93  | 2020 | 10    | positive  | 473585 | 2020_10     |
| 94  | 2020 | 01    | negative  | 153705 | 2020_01     |

72 rows × 5 columns
sentiment_time_df = sentiment_time_df.sort_values(["year","month"]).reset_index().drop('index',axis = 1)
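One caveat: time_concat is a plain string, so the x axis sorts lexicographically. That happens to be chronological here because the months are zero-padded, but a real temporal type is safer; a sketch of the alternative (the period column is illustrative and not used by fig2):
# Parse year/month into a real date so Altair can use a temporal axis,
# e.g. x=alt.X('period:T', ...) instead of the string column.
sentiment_time_df['period'] = pd.to_datetime(
    sentiment_time_df['year'] + '-' + sentiment_time_df['month'], format='%Y-%m')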
fig2 = (alt.Chart(sentiment_time_df).mark_line().encode(
x=alt.X('time_concat', axis = alt.Axis(title = "Timeframe")),
y=alt.Y('count', axis = alt.Axis(title = "Count of Comments")),
color='sentiment',
tooltip=['time_concat','count']
)).resolve_scale(x='independent').properties(title={"text":'Count of Comments',"subtitle" : "Relationship between Time Period and Frequency of Comments for each Sentiment"},width = 500, height = 500).interactive()
fig2.save("fig2.html")
fig2
sentiment_covid = sentiment_read.groupby('Pandemic_Freakout','sentiment').agg(F.avg('score'))
sentiment_covid = sentiment_covid.withColumnRenamed('avg(score)','average_score')
sentiment_covid.show()
+-----------------+---------+------------------+
|Pandemic_Freakout|sentiment|     average_score|
+-----------------+---------+------------------+
|           corona|  neutral|18.661516853932586|
|            virus|  neutral| 11.61215932914046|
|            virus| negative|11.208408528841655|
|            masks| positive|15.556957011851445|
|         pandemic| positive| 15.93591145121618|
|            covid|  neutral| 12.82067415730337|
|                 |     null|           5.79875|
|                 |  neutral|14.973012614589047|
|            masks| negative|14.785119574844996|
|         pandemic|  neutral| 24.19124087591241|
|           corona| negative|13.306956201693044|
|         pandemic| negative| 18.04567284132118|
|                 | negative|13.945360261043453|
|            virus| positive|10.075320849989481|
|            covid| positive| 17.26737085805238|
|                 | positive|14.725523070829738|
|            covid| negative|19.250790647384783|
|           corona| positive|21.394075229000194|
|            masks|  neutral| 20.07885791978246|
+-----------------+---------+------------------+
sentiment_covid = sentiment_covid.toPandas()
sentiment_covid = sentiment_covid.dropna(subset=['sentiment'])
sentiment_covid.head(20)
|    | Pandemic_Freakout | sentiment | average_score |
|----|-------------------|-----------|---------------|
| 0  | corona            | neutral   | 18.661517     |
| 1  | virus             | neutral   | 11.612159     |
| 2  | virus             | negative  | 11.208409     |
| 3  | masks             | positive  | 15.556957     |
| 4  | pandemic          | positive  | 15.935911     |
| 5  | covid             | neutral   | 12.820674     |
| 7  |                   | neutral   | 14.973013     |
| 8  | masks             | negative  | 14.785120     |
| 9  | pandemic          | neutral   | 24.191241     |
| 10 | corona            | negative  | 13.306956     |
| 11 | pandemic          | negative  | 18.045673     |
| 12 |                   | negative  | 13.945360     |
| 13 | virus             | positive  | 10.075321     |
| 14 | covid             | positive  | 17.267371     |
| 15 |                   | positive  | 14.725523     |
| 16 | covid             | negative  | 19.250791     |
| 17 | corona            | positive  | 21.394075     |
| 18 | masks             | neutral   | 20.078858     |
fig3 = (alt.Chart(sentiment_covid).mark_bar().encode(
x=alt.X('Pandemic_Freakout', axis = alt.Axis(title = "Covid Terms"),sort='-y'),
y=alt.Y('average_score', axis = alt.Axis(title = "Average Score")),
color='sentiment',
tooltip=['Pandemic_Freakout','average_score','sentiment']
)).properties(title={"text":'Covid Sentiment',"subtitle" : "Sentiment of authors through each comment revolving around covid"},width = 500, height = 500)
fig.save("fig3.html")
fig3
sentiment_bar = sentiment_read.groupby('controversiality','sentiment').agg(F.avg('score'))
sentiment_bar = sentiment_bar.withColumnRenamed('avg(score)','average_score')
sentiment_bar.show()
+----------------+---------+-------------------+
|controversiality|sentiment|      average_score|
+----------------+---------+-------------------+
|               0|  neutral| 15.731455497859258|
|               1| negative| 0.6144336834713404|
|               1| positive| 0.3619774973596332|
|               0| negative| 14.649875335629206|
|               1|     null|0.27723321620122066|
|               0|     null|  6.087501088081398|
|               1|  neutral| 0.5500021304699817|
|               0| positive| 15.488053330822497|
+----------------+---------+-------------------+
sentiment_bar = sentiment_bar.toPandas()
sentiment_bar = sentiment_bar.dropna(subset=['sentiment']).reset_index().drop('index',axis = 1)
sentiment_bar.head(10)
|   | controversiality | sentiment | average_score |
|---|------------------|-----------|---------------|
| 0 | 0                | neutral   | 15.731455     |
| 1 | 1                | negative  | 0.614434      |
| 2 | 1                | positive  | 0.361977      |
| 3 | 0                | negative  | 14.649875     |
| 4 | 1                | neutral   | 0.550002      |
| 5 | 0                | positive  | 15.488053     |
sentiment_bar["controversiality"] = sentiment_bar["controversiality"].astype("category")
fig4 = (alt.Chart(sentiment_bar).mark_bar().encode(
x=alt.X('controversiality', axis = alt.Axis(title = "Controversiality")),
y=alt.Y('average_score', axis = alt.Axis(title = "Score")),
color='sentiment',
tooltip=['controversiality','average_score','sentiment']
)).properties(title={"text":'Covid Sentiment',"subtitle" : "Sentiment of authors through each comment revolving around covid"},width = 500, height = 500)
fig4
spark.stop()