import findspark
findspark.init()
!/mnt/miniconda/bin/pip install sparknlp
Collecting sparknlp
  Downloading sparknlp-1.0.0-py3-none-any.whl (1.4 kB)
Requirement already satisfied: numpy in /mnt/miniconda/lib/python3.7/site-packages (from sparknlp) (1.21.2)
Collecting spark-nlp
  Downloading spark_nlp-3.4.3-py2.py3-none-any.whl (144 kB)
Installing collected packages: spark-nlp, sparknlp
Successfully installed spark-nlp-3.4.3 sparknlp-1.0.0
import pyspark.sql.functions as F
from pyspark.sql.functions import col, lit, size
from pyspark.sql import SparkSession
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import json
from pyspark.ml import Pipeline
from sparknlp.annotator import *
from sparknlp.base import *
import sparknlp
from sparknlp.pretrained import PretrainedPipeline
spark = SparkSession.builder \
.appName("reddit") \
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
.config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.4.2") \
.master('yarn') \
.getOrCreate()
Ivy Default Cache set to: /home/hadoop/.ivy2/cache
The jars for the packages stored in: /home/hadoop/.ivy2/jars
:: loading settings :: url = jar:file:/usr/lib/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
com.johnsnowlabs.nlp#spark-nlp_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c342d107-ad2a-4bf6-85b2-decf3f433d62;1.0
	confs: [default]
	found com.johnsnowlabs.nlp#spark-nlp_2.12;3.4.2 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found org.slf4j#slf4j-api;1.7.21 in central
	found com.navigamez#greex;1.0 in central
	found dk.brics.automaton#automaton;1.11-8 in central
	found org.json4s#json4s-ext_2.12;3.5.3 in central
	found joda-time#joda-time;2.9.5 in central
	found org.joda#joda-convert;1.8.1 in central
	found com.johnsnowlabs.nlp#tensorflow-cpu_2.12;0.3.3 in central
	found net.sf.trove4j#trove4j;3.0.3 in central
:: resolution report :: resolve 2937ms :: artifacts dl 2912ms
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   21  |   21  |   21  |   0   ||   21  |   21  |
	---------------------------------------------------------------------
:: problems summary :: :::: ERRORS
	SERVER ERROR: Bad Gateway for a number of optional poms/sources/javadocs on the retired dl.bintray.com/spark-packages mirror (non-fatal; all required jars resolved from Maven Central)
:: retrieving :: org.apache.spark#spark-submit-parent-c342d107-ad2a-4bf6-85b2-decf3f433d62
	confs: [default]
	21 artifacts copied, 0 already retrieved (399905kB/258ms)
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/16 23:39:22 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
22/04/16 23:39:27 WARN Client: Same path resource file:///home/hadoop/.ivy2/jars/com.johnsnowlabs.nlp_spark-nlp_2.12-3.4.2.jar added multiple times to distributed cache. (the same warning repeats for each of the 21 resolved jars)
22/04/16 23:39:36 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!
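Aside: Spark NLP also ships a one-line starter that builds an equivalent session. A minimal sketch (note that sparknlp.start() targets local mode by default, which is why the explicit builder with .master('yarn') is used on this cluster):
import sparknlp
# sparknlp.start() returns a SparkSession preconfigured with the Kryo
# serializer and the spark-nlp jar matching the installed Python package.
spark_local = sparknlp.start()
print(sparknlp.version())  # should line up with the jar version requested above (3.4.x)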
spark
SparkSession - in-memory
df_full = spark.read.parquet('s3://ssp88-labdata2/eda_df_full')
df_full.printSchema()
root
 |-- all_awardings: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_created_utc: double (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- awarders: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: string (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- quarantined: boolean (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- treatment_tags: string (nullable = true)
 |-- editable: boolean (nullable = true)
 |-- ym_partition: integer (nullable = true)
 |-- comment_date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- original_post: string (nullable = true)
# Most Common Words
common_word = df_full.withColumn('word', F.explode(F.split(F.col('body'), ' '))) \
.groupBy('word') \
.count() \
.sort('count', ascending=False) \
.limit(10)
common_word.show()
22/04/16 23:39:50 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+----+--------+
|word|   count|
+----+--------+
| the|14798077|
|  to|10378945|
|   a| 9822211|
| and| 7735354|
|  of| 6655809|
|   I| 5741511|
|  is| 5584017|
|that| 5065040|
| you| 5022835|
|  in| 4913229|
+----+--------+
Cleaning is required: the ten most frequent words are all stopwords, so the counts say nothing about the content until those are removed.
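To confirm, the counts can be recomputed with stopwords stripped first; a minimal sketch using Spark ML's StopWordsRemover (the intermediate column names are illustrative):
from pyspark.ml.feature import StopWordsRemover

# Tokenize on whitespace, lowercase so the default English stopword list
# matches, drop stopwords, then recount the most frequent words.
words_df = df_full.withColumn('words', F.split(F.lower(F.col('body')), r'\s+'))
remover = StopWordsRemover(inputCol='words', outputCol='words_clean')
remover.transform(words_df) \
    .withColumn('word', F.explode('words_clean')) \
    .filter(F.col('word') != '') \
    .groupBy('word').count() \
    .sort('count', ascending=False) \
    .show(10)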
# Distribution of Text Lengths
df_full = df_full.withColumn("comment_length", F.length(col('body')))
df_full.select('body','comment_length').sort('comment_length', ascending=False).show(10)
+--------------------+--------------+
|                body|comment_length|
+--------------------+--------------+
|**Money in Electi...|         11252|
|           > > > ...|         10271|
|Original comment ...|         10192|
|Original comment ...|         10190|
|LOL wait didnt yo...|         10166|
|Biases in Stops, ...|         10145|
|    Part 2. > You...|         10081|
|   >I do understa...|         10079|
|UNDELETED comment...|         10074|
|UNDELETED comment...|         10074|
+--------------------+--------------+
only showing top 10 rows
Some of the longest comments are clearly not useful for analysis (deletion notices, reposted "UNDELETED/Original comment" bodies, and so on); these can be filtered out later in the cleaning step.
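As a sketch of that later step, placeholder bodies could be dropped up front; the exact marker strings are an assumption about how this dump labels removed comments:
# Hypothetical filter (not applied here): drop comments whose body is only a
# deletion marker; Reddit dumps typically use '[deleted]' or '[removed]'.
df_no_placeholders = df_full.filter(~F.col('body').isin('[deleted]', '[removed]'))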
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
sentenceData = df_full.select(df_full["body"])
tokenizer = Tokenizer(inputCol="body", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
for features_label in rescaledData.select("features", "words").take(3):
print(features_label)
Row(features=SparseVector(20, {3: 0.6267, 7: 0.7122, 10: 1.7161, 19: 0.6819}), words=['so,', 'theyre', 'like', 'australian', 'geese?'])
Row(features=SparseVector(20, {0: 4.6762, 1: 3.639, 2: 1.1016, 3: 3.7599, 4: 3.136, 5: 3.5421, 6: 3.0408, 7: 4.2731, 8: 2.0492, 9: 4.9559, 10: 1.7161, 11: 3.14, 12: 2.6444, 13: 1.6626, 15: 3.1381, 16: 2.78, 17: 3.8672, 18: 4.4873, 19: 0.6819}), words=['by', 'that', 'definition', 'literally', 'any', 'food', 'is', 'a', 'drug.', "there's", 'nothing', 'special', 'about', "sugar's", 'effect', 'on', 'the', 'reward', 'system.', 'you', 'eat', 'food,', 'you', 'feel', 'good.', 'the', 'tongue', 'enjoys', 'the', 'sweetness', 'and', 'you', 'get', 'a', 'hit', 'of', 'dopamine', 'from', 'your', 'reward', 'system.', 'the', 'sugar', "isn't", 'binding', 'to', 'any', 'receptors', 'in', 'your', 'brain', "it's", 'entirely', 'your', "brain's", 'own', 'response', 'to', 'positive', 'stimulus.', 'other', 'examples', 'are', 'finishing', 'paperwork', 'or', 'getting', 'a', 'massage.', 'sugar', 'is', 'not', 'even', 'close', 'to', 'a', 'drug.'])
Row(features=SparseVector(20, {3: 1.2533, 6: 1.5204, 7: 1.4244, 11: 0.785, 13: 1.6626, 15: 2.3536, 18: 0.8975, 19: 1.3638}), words=["i'm", 'a', 'follower', 'of', 'beau', 'in', 'yt', 'and', 'twitter.', 'he', 'makes', 'a', 'lot', 'of', 'sense.'])
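Note that with numFeatures=20 many unrelated words collide into the same hash bucket, so these features are not interpretable. If interpretable features were needed, CountVectorizer keeps an explicit vocabulary instead of hashing; a minimal sketch (vocabSize is illustrative):
from pyspark.ml.feature import CountVectorizer, IDF

# CountVectorizer learns a real vocabulary, so each feature index maps back
# to a word (unlike HashingTF, whose buckets are anonymous and collide).
cv_model = CountVectorizer(inputCol='words', outputCol='rawFeatures',
                           vocabSize=10000).fit(wordsData)
cv_df = cv_model.transform(wordsData)
tfidf_df = IDF(inputCol='rawFeatures', outputCol='features').fit(cv_df).transform(cv_df)
print(cv_model.vocabulary[:10])  # the words behind the first feature indices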
df_full_reddit = df_full.withColumn("Pandemic_Freakout", F.regexp_extract('body', \
    r'(?i)\bcovid\b|\bpandemic\b|\bcovid-19\b|\bcorona\b|\bvirus\b|\bmasks\b|\bhospital\b', 0))
df_full_reddit = df_full_reddit.withColumn("Arrest_Freakout", F.regexp_extract('body', \
r'(?i)\barrest\b|(?i)\bofficer\b|(?i)\bpolice\b|(?i)\bcop\b|(?i)\bstab\b|(?i)\billegal\b|(?i)\brutal\b',0))
df_full_reddit = df_full_reddit.withColumn("Pandemic_Freakout",F.lower(F.col('Pandemic_Freakout')))
df_full_reddit = df_full_reddit.withColumn("Arrest_Freakout",F.lower(F.col('Arrest_Freakout')))
import nltk
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data... [nltk_data] Unzipping corpora/stopwords.zip.
True
from nltk.corpus import stopwords
eng_stopwords = stopwords.words('english')
# eng_stopwords.append('xxxx')  # uncomment and edit to add custom stopwords
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer,
LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline
documentAssembler = DocumentAssembler() \
.setInputCol('body') \
.setOutputCol('document')
tokenizer = Tokenizer() \
.setInputCols(['document']) \
.setOutputCol('token')
# note normalizer defaults to changing all words to lowercase.
# Use .setLowercase(False) to maintain input case.
normalizer = Normalizer() \
.setInputCols(['token']) \
.setOutputCol('normalized') \
.setLowercase(True)
# note that lemmatizer needs a dictionary. So I used the pre-trained
# model (note that it defaults to english)
lemmatizer = LemmatizerModel.pretrained() \
.setInputCols(['normalized']) \
.setOutputCol('lemma')
stopwords_cleaner = StopWordsCleaner() \
.setInputCols(['lemma']) \
.setOutputCol('clean_lemma') \
.setCaseSensitive(False) \
.setStopWords(eng_stopwords)
# finisher converts tokens to human-readable output
finisher = Finisher() \
.setInputCols(['clean_lemma']) \
.setCleanAnnotations(False)
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
Download done! Loading the resource.
[OK!]
pipeline = Pipeline() \
.setStages([
documentAssembler,
tokenizer,
normalizer,
lemmatizer,
stopwords_cleaner,
finisher
])
df_full_clean = pipeline.fit(df_full_reddit).transform(df_full_reddit)
df_full_clean.printSchema()
root
 |-- all_awardings: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_created_utc: double (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- awarders: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: string (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- quarantined: boolean (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- treatment_tags: string (nullable = true)
 |-- editable: boolean (nullable = true)
 |-- ym_partition: integer (nullable = true)
 |-- comment_date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- original_post: string (nullable = true)
 |-- comment_length: integer (nullable = true)
 |-- Pandemic_Freakout: string (nullable = true)
 |-- Arrest_Freakout: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- normalized: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- lemma: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- clean_lemma: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- finished_clean_lemma: array (nullable = true)
 |    |-- element: string (containsNull = true)
from pyspark.sql.functions import concat_ws
df_full_new = df_full_clean.withColumn("text", concat_ws(" ", "clean_lemma.result"))
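Before modeling it is worth spot-checking that the lemmatized, stopword-free text looks sensible; a minimal sketch:
# Compare raw bodies with the cleaned text produced by the pipeline.
df_full_new.select('body', 'text').show(5, truncate=80)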
# Sentiment
document_t = DocumentAssembler()\
.setInputCol("text")\
.setOutputCol("document")
use_t = UniversalSentenceEncoder.pretrained() \
.setInputCols(["document"])\
.setOutputCol("use_embeddings")
docClassifier_t = SentimentDLModel.pretrained('sentimentdl_use_twitter', lang = 'en') \
.setInputCols(["use_embeddings"])\
.setOutputCol("sentiment")
pipeline_t = Pipeline(
stages = [
document_t,
use_t,
docClassifier_t
])
tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
Download done! Loading the resource.
2022-04-16 23:41:30.869956: I external/org_tensorflow/tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-16 23:41:31.300613: I external/org_tensorflow/tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2499995000 Hz
2022-04-16 23:41:35.912778: W external/org_tensorflow/tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 60236800 exceeds 10% of free system memory. (this warning repeats four more times)
[OK!]
sentimentdl_use_twitter download started this may take some time.
Approximate size to download 11.4 MB
Download done! Loading the resource.
[OK!]
pipelineModel_t = pipeline_t.fit(df_full_new)
result_t = pipelineModel_t.transform(df_full_new)
result_t.printSchema()
root
 |-- all_awardings: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_created_utc: double (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- awarders: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: string (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- quarantined: boolean (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- treatment_tags: string (nullable = true)
 |-- editable: boolean (nullable = true)
 |-- ym_partition: integer (nullable = true)
 |-- comment_date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- original_post: string (nullable = true)
 |-- comment_length: integer (nullable = true)
 |-- Pandemic_Freakout: string (nullable = true)
 |-- Arrest_Freakout: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- normalized: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- lemma: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- clean_lemma: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- finished_clean_lemma: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- text: string (nullable = false)
 |-- use_embeddings: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- sentiment: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
Selecting the required columns and storing them in S3
sentiment_df = result_t.select(
        F.explode(F.arrays_zip('document.result', 'sentiment.result')).alias("cols"),
        "author", "controversiality", "score", "total_awards_received",
        "comment_date", "year", "month", "hour", "comment_length",
        "Arrest_Freakout", "Pandemic_Freakout") \
    .select(F.expr("cols['0']").alias("document"),
            F.expr("cols['1']").alias("sentiment"),
            "author", "controversiality", "score", "total_awards_received",
            "comment_date", "year", "month", "hour", "comment_length",
            "Arrest_Freakout", "Pandemic_Freakout")
sentiment_df.write.parquet("s3://ssp88-labdata2/sentiment_df/")
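A possible refinement, since the charts below slice by year and month: partitioning the output on those columns would let Spark prune S3 prefixes on read. A sketch of the alternative write (the destination path is hypothetical, not the one used here):
# Hypothetical partitioned write: downstream year/month filters would then
# read only the matching S3 prefixes instead of scanning every file.
sentiment_df.write.partitionBy('year', 'month') \
    .mode('overwrite') \
    .parquet('s3://ssp88-labdata2/sentiment_df_partitioned/')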
# Read parquet
sentiment_read = spark.read.parquet('s3://ssp88-labdata2/sentiment_df')
sentiment_read.printSchema()
root
 |-- document: string (nullable = true)
 |-- sentiment: string (nullable = true)
 |-- author: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- score: long (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- comment_date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- comment_length: integer (nullable = true)
 |-- Arrest_Freakout: string (nullable = true)
 |-- Pandemic_Freakout: string (nullable = true)
!/mnt/miniconda/bin/pip install altair
Requirement already satisfied: altair in /mnt/miniconda/lib/python3.7/site-packages (4.2.0)
import altair as alt
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
sentiment_count = sentiment_read.groupby('sentiment').agg(F.count('document'))
sentiment_count = sentiment_count.withColumnRenamed('count(document)','sentiment_count')
sentiment_count.show()
+---------+---------------+
|sentiment|sentiment_count|
+---------+---------------+
| positive|        9880051|
|     null|         108800|
|  neutral|         968815|
| negative|        7163286|
+---------+---------------+
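The null rows are presumably comments whose cleaned text ended up empty (bodies that were all stopwords or markup), leaving the model nothing to score. They are dropped in pandas below; an equivalent Spark-side filter would be:
# Alternative to the pandas dropna below: drop unlabeled rows before
# collecting the aggregates to the driver.
sentiment_count_nonnull = sentiment_count.filter(F.col('sentiment').isNotNull())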
sentiment_count = sentiment_count.toPandas()
sentiment_count_altered = sentiment_count.dropna(subset=['sentiment'])
sentiment_count_altered.head()
|   | sentiment | sentiment_count |
|---|-----------|-----------------|
| 0 | positive  |         9880051 |
| 2 | neutral   |          968815 |
| 3 | negative  |         7163286 |
fig = (alt.Chart(sentiment_count_altered).mark_bar().encode(
y=alt.Y('sentiment_count', axis = alt.Axis(title = "Count")),
x=alt.X('sentiment', axis = alt.Axis(title = "Sentiment"),sort='-y'),
color=alt.value('#7fc97f'),
tooltip=['sentiment','sentiment_count']
)).properties(title={"text":'Sentiment Count',"subtitle" : "Sentiment of authors through each comment"},width = 500, height = 500)
fig.save("fig1.html")
fig
Very few comments are neutral compared to the other two classes, and the majority of comments carry positive sentiment. This may be because authors leave encouraging or amused comments under freakout videos, or because comparatively more videos fall into light-hearted freakout categories.
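To put numbers on that claim, the shares can be computed from the counts above; a minimal pandas sketch:
# Share of each sentiment among labeled comments: roughly 55% positive,
# 40% negative, and 5% neutral for the counts shown above.
shares = sentiment_count_altered.set_index('sentiment')['sentiment_count']
print((shares / shares.sum()).round(3))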
sentiment_time = sentiment_read.groupby('year','month','sentiment').agg(F.count('document'))
sentiment_time = sentiment_time.withColumnRenamed('count(document)','count')
sentiment_time.show(5)
+----+-----+---------+------+
|year|month|sentiment| count|
+----+-----+---------+------+
|2020|   04| negative|236243|
|2019|   08|  neutral| 22363|
|2020|   01| positive|231719|
|2020|   11| negative|379104|
|2019|   09|     null|  2636|
+----+-----+---------+------+
only showing top 5 rows
sentiment_time_df = sentiment_time.toPandas()
sentiment_time_df = sentiment_time_df.dropna(subset=['sentiment'])
sentiment_time_df
|     | year | month | sentiment | count  |
|-----|------|-------|-----------|--------|
| 0   | 2020 | 11    | negative  | 379104 |
| 1   | 2020 | 04    | negative  | 236243 |
| 2   | 2019 | 08    | neutral   |  22363 |
| 4   | 2020 | 01    | positive  | 231719 |
| 5   | 2020 | 12    | neutral   |  41181 |
| ... | ...  | ...   | ...       | ...    |
| 90  | 2019 | 08    | positive  | 240089 |
| 91  | 2020 | 01    | neutral   |  22618 |
| 92  | 2021 | 01    | neutral   |  61535 |
| 93  | 2020 | 10    | positive  | 473585 |
| 94  | 2020 | 01    | negative  | 153705 |

72 rows × 4 columns
sentiment_time_df['time_concat'] = sentiment_time_df["year"] + "_" + sentiment_time_df["month"]
sentiment_time_df
|     | year | month | sentiment | count  | time_concat |
|-----|------|-------|-----------|--------|-------------|
| 0   | 2020 | 11    | negative  | 379104 | 2020_11     |
| 1   | 2020 | 04    | negative  | 236243 | 2020_04     |
| 2   | 2019 | 08    | neutral   |  22363 | 2019_08     |
| 4   | 2020 | 01    | positive  | 231719 | 2020_01     |
| 5   | 2020 | 12    | neutral   |  41181 | 2020_12     |
| ... | ...  | ...   | ...       | ...    | ...         |
| 90  | 2019 | 08    | positive  | 240089 | 2019_08     |
| 91  | 2020 | 01    | neutral   |  22618 | 2020_01     |
| 92  | 2021 | 01    | neutral   |  61535 | 2021_01     |
| 93  | 2020 | 10    | positive  | 473585 | 2020_10     |
| 94  | 2020 | 01    | negative  | 153705 | 2020_01     |

72 rows × 5 columns
sentiment_time_df = sentiment_time_df.sort_values(["year","month"]).reset_index().drop('index',axis = 1)
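One caveat: time_concat is a plain string, so the x axis sorts lexicographically. That happens to be chronological here because the months are zero-padded, but a real temporal type is safer; a sketch of the alternative (the period column is illustrative and not used by fig2):
# Parse year/month into a real date so Altair can use a temporal axis,
# e.g. x=alt.X('period:T', ...) instead of the string column.
sentiment_time_df['period'] = pd.to_datetime(
    sentiment_time_df['year'] + '-' + sentiment_time_df['month'], format='%Y-%m')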
fig2 = (alt.Chart(sentiment_time_df).mark_line().encode(
x=alt.X('time_concat', axis = alt.Axis(title = "Timeframe")),
y=alt.Y('count', axis = alt.Axis(title = "Count of Comments")),
color='sentiment',
tooltip=['time_concat','count']
)).resolve_scale(x='independent').properties(title={"text":'Count of Comments',"subtitle" : "Relationship between Time Period and Frequency of Comments for each Sentiment"},width = 500, height = 500).interactive()
fig2.save("fig2.html")
fig2
sentiment_covid = sentiment_read.groupby('Pandemic_Freakout','sentiment').agg(F.avg('score'))
sentiment_covid = sentiment_covid.withColumnRenamed('avg(score)','average_score')
sentiment_covid.show()
+-----------------+---------+------------------+
|Pandemic_Freakout|sentiment|     average_score|
+-----------------+---------+------------------+
|           corona|  neutral|18.661516853932586|
|            virus|  neutral| 11.61215932914046|
|            virus| negative|11.208408528841655|
|            masks| positive|15.556957011851445|
|         pandemic| positive| 15.93591145121618|
|            covid|  neutral| 12.82067415730337|
|                 |     null|           5.79875|
|                 |  neutral|14.973012614589047|
|            masks| negative|14.785119574844996|
|         pandemic|  neutral| 24.19124087591241|
|           corona| negative|13.306956201693044|
|         pandemic| negative| 18.04567284132118|
|                 | negative|13.945360261043453|
|            virus| positive|10.075320849989481|
|            covid| positive| 17.26737085805238|
|                 | positive|14.725523070829738|
|            covid| negative|19.250790647384783|
|           corona| positive|21.394075229000194|
|            masks|  neutral| 20.07885791978246|
+-----------------+---------+------------------+
sentiment_covid = sentiment_covid.toPandas()
sentiment_covid = sentiment_covid.dropna(subset=['sentiment'])
sentiment_covid.head(20)
|    | Pandemic_Freakout | sentiment | average_score |
|----|-------------------|-----------|---------------|
| 0  | corona            | neutral   | 18.661517     |
| 1  | virus             | neutral   | 11.612159     |
| 2  | virus             | negative  | 11.208409     |
| 3  | masks             | positive  | 15.556957     |
| 4  | pandemic          | positive  | 15.935911     |
| 5  | covid             | neutral   | 12.820674     |
| 7  |                   | neutral   | 14.973013     |
| 8  | masks             | negative  | 14.785120     |
| 9  | pandemic          | neutral   | 24.191241     |
| 10 | corona            | negative  | 13.306956     |
| 11 | pandemic          | negative  | 18.045673     |
| 12 |                   | negative  | 13.945360     |
| 13 | virus             | positive  | 10.075321     |
| 14 | covid             | positive  | 17.267371     |
| 15 |                   | positive  | 14.725523     |
| 16 | covid             | negative  | 19.250791     |
| 17 | corona            | positive  | 21.394075     |
| 18 | masks             | neutral   | 20.078858     |
fig3 = (alt.Chart(sentiment_covid).mark_bar().encode(
x=alt.X('Pandemic_Freakout', axis = alt.Axis(title = "Covid Terms"),sort='-y'),
y=alt.Y('average_score', axis = alt.Axis(title = "Average Score")),
color='sentiment',
tooltip=['Pandemic_Freakout','average_score','sentiment']
)).properties(title={"text":'Covid Sentiment',"subtitle" : "Sentiment of authors through each comment revolving around covid"},width = 500, height = 500)
fig.save("fig3.html")
fig3
sentiment_bar = sentiment_read.groupby('controversiality','sentiment').agg(F.avg('score'))
sentiment_bar = sentiment_bar.withColumnRenamed('avg(score)','average_score')
sentiment_bar.show()
+----------------+---------+-------------------+
|controversiality|sentiment|      average_score|
+----------------+---------+-------------------+
|               0|  neutral| 15.731455497859258|
|               1| negative| 0.6144336834713404|
|               1| positive| 0.3619774973596332|
|               0| negative| 14.649875335629206|
|               1|     null|0.27723321620122066|
|               0|     null|  6.087501088081398|
|               1|  neutral| 0.5500021304699817|
|               0| positive| 15.488053330822497|
+----------------+---------+-------------------+
sentiment_bar = sentiment_bar.toPandas()
sentiment_bar = sentiment_bar.dropna(subset=['sentiment']).reset_index().drop('index',axis = 1)
sentiment_bar.head(10)
|   | controversiality | sentiment | average_score |
|---|------------------|-----------|---------------|
| 0 | 0                | neutral   | 15.731455     |
| 1 | 1                | negative  | 0.614434      |
| 2 | 1                | positive  | 0.361977      |
| 3 | 0                | negative  | 14.649875     |
| 4 | 1                | neutral   | 0.550002      |
| 5 | 0                | positive  | 15.488053     |
sentiment_bar["controversiality"] = sentiment_bar["controversiality"].astype("category")
fig4 = (alt.Chart(sentiment_bar).mark_bar().encode(
x=alt.X('controversiality', axis = alt.Axis(title = "Controversiality")),
y=alt.Y('average_score', axis = alt.Axis(title = "Score")),
color='sentiment',
tooltip=['controversiality','average_score','sentiment']
)).properties(title={"text":'Covid Sentiment',"subtitle" : "Sentiment of authors through each comment revolving around covid"},width = 500, height = 500)
fig4
spark.stop()