import findspark
findspark.init()
import pyspark.sql.functions as F
from pyspark.sql.functions import col, lit, size
from pyspark.sql import SparkSession
import warnings
warnings.filterwarnings('ignore')
spark = SparkSession.builder.appName("reddit").getOrCreate()
spark
SparkSession - in-memory
df_full = spark.read.parquet('s3://ssp88-labdata2/reddit')
#Number of rows of the data
rows = df_full.count()
rows
18120952
#Number of columns
columns = len(df_full.columns)
columns
51
df_full.printSchema()
root
 |-- all_awardings: string (nullable = true)
 |-- associated_award: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_created_utc: double (nullable = true)
 |-- author_flair_background_color: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_template_id: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_flair_text_color: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- awarders: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- collapsed_because_crowd_control: string (nullable = true)
 |-- collapsed_reason: string (nullable = true)
 |-- comment_type: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- edited: double (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: string (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- quarantined: boolean (nullable = true)
 |-- removal_reason: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- top_awarded_type: string (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- treatment_tags: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- editable: boolean (nullable = true)
 |-- media_metadata: string (nullable = true)
 |-- ym_partition: integer (nullable = true)
df = spark.read.parquet('s3://ssp88-labdata2/reddit/ym_partition=201910')
df_small = df.limit(10000)
#Checking the missing values
#Counts NULLs, NaNs, the literal strings 'None'/'NULL', and empty strings per column;
#boolean, timestamp and date columns only support the NULL check.
missing_val = df_full.select(*[
    (
        F.count(F.when((F.isnan(c) |
                        F.col(c).isNull() |
                        F.col(c).contains('None') |
                        F.col(c).contains('NULL') |
                        (F.col(c) == '')), c))
        if t not in ("boolean", "timestamp", "date")
        else F.count(F.when(F.col(c).isNull(), c))
    ).alias(c)
    for c, t in df_full.dtypes
]).toPandas()
missing_val
all_awardings | associated_award | author | author_created_utc | author_flair_background_color | author_flair_css_class | author_flair_richtext | author_flair_template_id | author_flair_text | author_flair_text_color | ... | subreddit_id | subreddit_name_prefixed | subreddit_type | top_awarded_type | total_awards_received | treatment_tags | author_cakeday | editable | media_metadata | ym_partition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 79493 | 18120952 | 635 | 6850551 | 18120952 | 18119403 | 3093369 | 18120952 | 18036426 | 14943013 | ... | 0 | 4146560 | 0 | 18120952 | 0 | 2437 | 18054621 | 11416494 | 17925218 | 0 |
1 rows × 51 columns
import pandas as pd
missing_val = pd.melt(missing_val)
missing_val
variable | value | |
---|---|---|
0 | all_awardings | 79493 |
1 | associated_award | 18120952 |
2 | author | 635 |
3 | author_created_utc | 6850551 |
4 | author_flair_background_color | 18120952 |
5 | author_flair_css_class | 18119403 |
6 | author_flair_richtext | 3093369 |
7 | author_flair_template_id | 18120952 |
8 | author_flair_text | 18036426 |
9 | author_flair_text_color | 14943013 |
10 | author_flair_type | 3093369 |
11 | author_fullname | 3093369 |
12 | author_patreon_flair | 3093369 |
13 | author_premium | 2988543 |
14 | awarders | 638049 |
15 | body | 16312 |
16 | can_gild | 0 |
17 | can_mod_post | 0 |
18 | collapsed | 0 |
19 | collapsed_because_crowd_control | 18120952 |
20 | collapsed_reason | 17406962 |
21 | comment_type | 18120952 |
22 | controversiality | 0 |
23 | created_utc | 0 |
24 | distinguished | 18038164 |
25 | edited | 17457460 |
26 | gilded | 0 |
27 | gildings | 0 |
28 | id | 0 |
29 | is_submitter | 0 |
30 | link_id | 0 |
31 | locked | 0 |
32 | no_follow | 0 |
33 | parent_id | 0 |
34 | permalink | 0 |
35 | quarantined | 0 |
36 | removal_reason | 18120904 |
37 | retrieved_on | 4146560 |
38 | score | 0 |
39 | send_replies | 0 |
40 | stickied | 0 |
41 | subreddit_id | 0 |
42 | subreddit_name_prefixed | 4146560 |
43 | subreddit_type | 0 |
44 | top_awarded_type | 18120952 |
45 | total_awards_received | 0 |
46 | treatment_tags | 2437 |
47 | author_cakeday | 18054621 |
48 | editable | 11416494 |
49 | media_metadata | 17925218 |
50 | ym_partition | 0 |
missing_val['percentage'] = (missing_val['value'] / rows) * 100
missing_val
variable | value | percentage | |
---|---|---|---|
0 | all_awardings | 79493 | 0.438680 |
1 | associated_award | 18120952 | 100.000000 |
2 | author | 635 | 0.003504 |
3 | author_created_utc | 6850551 | 37.804587 |
4 | author_flair_background_color | 18120952 | 100.000000 |
5 | author_flair_css_class | 18119403 | 99.991452 |
6 | author_flair_richtext | 3093369 | 17.070676 |
7 | author_flair_template_id | 18120952 | 100.000000 |
8 | author_flair_text | 18036426 | 99.533545 |
9 | author_flair_text_color | 14943013 | 82.462627 |
10 | author_flair_type | 3093369 | 17.070676 |
11 | author_fullname | 3093369 | 17.070676 |
12 | author_patreon_flair | 3093369 | 17.070676 |
13 | author_premium | 2988543 | 16.492196 |
14 | awarders | 638049 | 3.521057 |
15 | body | 16312 | 0.090017 |
16 | can_gild | 0 | 0.000000 |
17 | can_mod_post | 0 | 0.000000 |
18 | collapsed | 0 | 0.000000 |
19 | collapsed_because_crowd_control | 18120952 | 100.000000 |
20 | collapsed_reason | 17406962 | 96.059865 |
21 | comment_type | 18120952 | 100.000000 |
22 | controversiality | 0 | 0.000000 |
23 | created_utc | 0 | 0.000000 |
24 | distinguished | 18038164 | 99.543137 |
25 | edited | 17457460 | 96.338537 |
26 | gilded | 0 | 0.000000 |
27 | gildings | 0 | 0.000000 |
28 | id | 0 | 0.000000 |
29 | is_submitter | 0 | 0.000000 |
30 | link_id | 0 | 0.000000 |
31 | locked | 0 | 0.000000 |
32 | no_follow | 0 | 0.000000 |
33 | parent_id | 0 | 0.000000 |
34 | permalink | 0 | 0.000000 |
35 | quarantined | 0 | 0.000000 |
36 | removal_reason | 18120904 | 99.999735 |
37 | retrieved_on | 4146560 | 22.882683 |
38 | score | 0 | 0.000000 |
39 | send_replies | 0 | 0.000000 |
40 | stickied | 0 | 0.000000 |
41 | subreddit_id | 0 | 0.000000 |
42 | subreddit_name_prefixed | 4146560 | 22.882683 |
43 | subreddit_type | 0 | 0.000000 |
44 | top_awarded_type | 18120952 | 100.000000 |
45 | total_awards_received | 0 | 0.000000 |
46 | treatment_tags | 2437 | 0.013449 |
47 | author_cakeday | 18054621 | 99.633954 |
48 | editable | 11416494 | 63.001624 |
49 | media_metadata | 17925218 | 98.919847 |
50 | ym_partition | 0 | 0.000000 |
As we can see, some columns are missing values for nearly all rows. We can drop such columns directly, as they will not help the analysis.
threshold = missing_val[missing_val['percentage'] > 70]
threshold = threshold.reset_index()
threshold
index | variable | value | percentage | |
---|---|---|---|---|
0 | 1 | associated_award | 18120952 | 100.000000 |
1 | 4 | author_flair_background_color | 18120952 | 100.000000 |
2 | 5 | author_flair_css_class | 18119403 | 99.991452 |
3 | 7 | author_flair_template_id | 18120952 | 100.000000 |
4 | 8 | author_flair_text | 18036426 | 99.533545 |
5 | 9 | author_flair_text_color | 14943013 | 82.462627 |
6 | 19 | collapsed_because_crowd_control | 18120952 | 100.000000 |
7 | 20 | collapsed_reason | 17406962 | 96.059865 |
8 | 21 | comment_type | 18120952 | 100.000000 |
9 | 24 | distinguished | 18038164 | 99.543137 |
10 | 25 | edited | 17457460 | 96.338537 |
11 | 36 | removal_reason | 18120904 | 99.999735 |
12 | 44 | top_awarded_type | 18120952 | 100.000000 |
13 | 47 | author_cakeday | 18054621 | 99.633954 |
14 | 49 | media_metadata | 17925218 | 98.919847 |
columns_to_drop = threshold.variable.tolist()
df_full = df_full.drop(*columns_to_drop)
#New count of columns
columns = len(df_full.columns)
columns
36
df_full.printSchema()
root
 |-- all_awardings: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_created_utc: double (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- awarders: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: string (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- quarantined: boolean (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- treatment_tags: string (nullable = true)
 |-- editable: boolean (nullable = true)
 |-- ym_partition: integer (nullable = true)
#Checking the length of the comment
df_full = df_full.withColumn("comment_length", F.length(col('body')))
df_full.select('body','comment_length').show(10)
+--------------------+--------------+
|                body|comment_length|
+--------------------+--------------+
|so, theyre like a...|            33|
|By that definitio...|           444|
|I'm a follower of...|            66|
|Nowadays MAGA is ...|            50|
|     MORGAN FREEMAN.|            15|
|           [deleted]|             9|
|This is really sc...|            21|
|He should be on “...|            72|
|Yesss, and someti...|           129|
|This is why malls...|            31|
+--------------------+--------------+
only showing top 10 rows
#Maximum length
from pyspark.sql.functions import max
df_full.select([max("comment_length")]).show()
+-------------------+
|max(comment_length)|
+-------------------+
|              11252|
+-------------------+
#Minimum Length
from pyspark.sql.functions import min
df_full.select([min("comment_length")]).show()
+-------------------+
|min(comment_length)|
+-------------------+
|                  0|
+-------------------+
#Checking for blank missing values in body
blank_body = df_full.filter(df_full['body'] == '')
blank_body.select('body').show(10)
+----+
|body|
+----+
|    |
|    |
|    |
|    |
|    |
|    |
|    |
|    |
|    |
|    |
+----+
only showing top 10 rows
blank_body.count()
117
from pyspark.sql.functions import date_format
df_full = df_full.withColumn("comment_date",F.from_unixtime(F.col("created_utc")))
df_full.select('created_utc','comment_date').show(10)
+-----------+-------------------+
|created_utc|       comment_date|
+-----------+-------------------+
| 1611578421|2021-01-25 12:40:21|
| 1611551895|2021-01-25 05:18:15|
| 1610352926|2021-01-11 08:15:26|
| 1610343356|2021-01-11 05:35:56|
| 1610333628|2021-01-11 02:53:48|
| 1610336206|2021-01-11 03:36:46|
| 1610343951|2021-01-11 05:45:51|
| 1610355160|2021-01-11 08:52:40|
| 1610343826|2021-01-11 05:43:46|
| 1610333461|2021-01-11 02:51:01|
+-----------+-------------------+
only showing top 10 rows
from pyspark.sql.functions import regexp_extract
df_full = df_full.withColumn("year",regexp_extract(col('comment_date'),r'(.*?)-',1))
df_full = df_full.withColumn("month",regexp_extract(col('comment_date'),r'-(.*?)-',1))
df_full = df_full.withColumn("hour",regexp_extract(col('comment_date'),r'\s(.+?):',1))
df_full.select('comment_date','year','month','hour').show(10)
+-------------------+----+-----+----+
|       comment_date|year|month|hour|
+-------------------+----+-----+----+
|2021-01-25 12:40:21|2021|   01|  12|
|2021-01-25 05:18:15|2021|   01|  05|
|2021-01-11 08:15:26|2021|   01|  08|
|2021-01-11 05:35:56|2021|   01|  05|
|2021-01-11 02:53:48|2021|   01|  02|
|2021-01-11 03:36:46|2021|   01|  03|
|2021-01-11 05:45:51|2021|   01|  05|
|2021-01-11 08:52:40|2021|   01|  08|
|2021-01-11 05:43:46|2021|   01|  05|
|2021-01-11 02:51:01|2021|   01|  02|
+-------------------+----+-----+----+
only showing top 10 rows
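As a cross-check (not used below), the same year, month, and hour fields could also be derived with Spark's built-in datetime functions instead of regular expressions. A minimal sketch, assuming comment_date parses as a timestamp:
#Built-in alternative to the regex extraction; date_format keeps the zero-padded
#string form ('01', '05', ...) produced above
df_ts = df_full.withColumn("comment_ts", F.to_timestamp("comment_date"))
df_ts = (df_ts.withColumn("year", F.date_format("comment_ts", "yyyy"))
              .withColumn("month", F.date_format("comment_ts", "MM"))
              .withColumn("hour", F.date_format("comment_ts", "HH")))
df_ts.select('comment_date', 'year', 'month', 'hour').show(5)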
df_full = df_full.withColumn("original_post",regexp_extract(col('permalink'),r'comments/(.*)/(.*)/(.*)/',2))
df_full.select('original_post').show(10)
+--------------------+
|       original_post|
+--------------------+
|some_say_hes_stil...|
|babies_first_tast...|
|corey_forrester_r...|
|trump_terrorists_...|
|watch_governor_sc...|
|watch_governor_sc...|
|my_mom_just_sent_...|
|trump_terrorists_...|
|airport_freakout_...|
|please_god_omg_in...|
+--------------------+
only showing top 10 rows
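The pattern above assumes comment permalinks of the form /r/&lt;subreddit&gt;/comments/&lt;post_id&gt;/&lt;post_slug&gt;/&lt;comment_id&gt;/ and keeps the second capture group, i.e. the post slug. A quick illustration with Python's re module on a made-up permalink:
import re
#Hypothetical permalink in the shape this dataset appears to use
permalink = "/r/PublicFreakout/comments/kv0a1b/some_say_hes_still_running/gix2c3/"
match = re.search(r'comments/(.*)/(.*)/(.*)/', permalink)
print(match.group(2))   #some_say_hes_still_running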
#Replacing underscore with spaces
from pyspark.sql.functions import regexp_replace
df_full = df_full.withColumn("original_post",regexp_replace('original_post','_',' '))
df_full.select('original_post').show(10)
+--------------------+
|       original_post|
+--------------------+
|some say hes stil...|
|babies first tast...|
|corey forrester r...|
|trump terrorists ...|
|watch governor sc...|
|watch governor sc...|
|my mom just sent ...|
|trump terrorists ...|
|airport freakout ...|
|please god omg in...|
+--------------------+
only showing top 10 rows
df_full.select('score','comment_length').show(5)
+-----+--------------+
|score|comment_length|
+-----+--------------+
|   15|            33|
|   -5|           444|
|    5|            66|
|    9|            50|
|    1|            15|
+-----+--------------+
only showing top 5 rows
rel_score_length = df_full.groupby('comment_length').agg(F.mean('score'))
rel_score_length = rel_score_length.withColumnRenamed('avg(score)','avg_score')
rel_score_length.show(10)
+--------------+------------------+
|comment_length|         avg_score|
+--------------+------------------+
|           148|15.415712849777893|
|           496|14.188170183327568|
|          1591| 55.39772727272727|
|           833| 8.625352112676056|
|          1088| 13.56675749318801|
|          1342| 26.13855421686747|
|           243| 15.41590752507081|
|           540|13.578768577494692|
|          1395| 9.030864197530864|
|          1303|          44.55625|
+--------------+------------------+
only showing top 10 rows
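The same result can be obtained in one step by aliasing the aggregate inside agg, which makes the separate withColumnRenamed call unnecessary; a small equivalent sketch:
#Equivalent to the two cells above, with the alias applied directly
rel_score_length = df_full.groupBy('comment_length').agg(F.avg('score').alias('avg_score'))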
rel_score_length_df = rel_score_length.toPandas()
The length (number of characters) and average score of the comments
rel_score_length_df
comment_length | avg_score | |
---|---|---|
0 | 496 | 14.188170 |
1 | 148 | 15.415713 |
2 | 833 | 8.625352 |
3 | 1088 | 13.566757 |
4 | 1342 | 26.138554 |
... | ... | ... |
6718 | 7635 | 0.000000 |
6719 | 6634 | 1.000000 |
6720 | 8882 | 1.000000 |
6721 | 5318 | 2.000000 |
6722 | 5636 | 1.000000 |
6723 rows × 2 columns
#!/mnt/miniconda/bin/pip install altair
Successfully installed altair-4.2.0
import altair as alt
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
fig = (alt.Chart(rel_score_length_df).mark_line().encode(
x=alt.X('comment_length', axis = alt.Axis(title = "Length of the Comment (Characters)")),
y=alt.Y('avg_score', axis = alt.Axis(title = "Average Score")),
color=alt.value('#beaed4'),
tooltip=['avg_score','comment_length']
)).resolve_scale(x='independent').properties(title={"text":'Should Authors write Long Comments to increase their Score?',"subtitle" : "Relationship between Comment Score and Length"},width = 500, height = 500).interactive()
fig
fig.save("fig1.html")
There are a few comment lengths for which the average score exceeds 2000. Subsetting the data to exclude these gives a clearer view of the overall relationship.
rel_score_length_df_score_filter = rel_score_length_df[rel_score_length_df['avg_score'] < 2000]
fig = (alt.Chart(rel_score_length_df_score_filter).mark_line().encode(
x=alt.X('comment_length', axis = alt.Axis(title = "Length of the Comment (Characters)")),
y=alt.Y('avg_score', axis = alt.Axis(title = "Average Score")),
color=alt.value('#beaed4'),
tooltip=['avg_score','comment_length']
)).resolve_scale(x='independent').properties(title={"text":'Should Authors write Long Comments to increase their Score?',"subtitle" : "Relationship between Comment Score and Length"},width = 500, height = 500).interactive()
fig
fig.save("fig1.1.html")
The relationship between the two variables is very erratic. Comparatively, the average score is low when comments are short, and the highest peaks occur at greater lengths. An author aiming to increase their score should target comments of more than roughly 3000 characters.
Hottest comment time
comment_time = df_full.groupby('hour').agg(F.count('body'))
comment_time = comment_time.withColumnRenamed('count(body)','comment_count')
comment_time.show(5)
+----+-------------+
|hour|comment_count|
+----+-------------+
|  01|       847773|
|  18|       929931|
|  06|       593932|
|  15|       906672|
|  23|       907571|
+----+-------------+
only showing top 5 rows
comment_time_df = comment_time.toPandas()
comment_time_df
hour | comment_count | |
---|---|---|
0 | 01 | 847773 |
1 | 18 | 929931 |
2 | 06 | 593932 |
3 | 15 | 906672 |
4 | 23 | 907571 |
5 | 03 | 793605 |
6 | 13 | 724590 |
7 | 14 | 841923 |
8 | 22 | 917424 |
9 | 07 | 517727 |
10 | 16 | 927490 |
11 | 00 | 870388 |
12 | 17 | 934621 |
13 | 09 | 422781 |
14 | 20 | 913494 |
15 | 21 | 915739 |
16 | 11 | 495848 |
17 | 05 | 673001 |
18 | 19 | 916244 |
19 | 08 | 457140 |
20 | 02 | 829869 |
21 | 10 | 430960 |
22 | 12 | 609438 |
23 | 04 | 742791 |
comment_time_df['hot_start'] = '15'
comment_time_df['hot_end'] = '19'
comment_time_df['text_x'] = '17'
comment_time_df['text_y'] = 300000
fig = (alt.Chart(comment_time_df).mark_line().encode(
x=alt.X('hour', axis = alt.Axis(title = "Time of the day (Military Time)")),
y=alt.Y('comment_count', axis = alt.Axis(title = "Count of Comments")),
color=alt.value('#fdc086'),
tooltip=['hour','comment_count']
)).properties(title={"text":'Hottest Comment Time',"subtitle" : "Relationship between Time of the day and Frequency of Comments"},width = 500, height = 500).interactive()
threshold =(
alt.Chart(comment_time_df)
.mark_rule(opacity = 0.01)
.encode(
x = 'hot_start'
)
)
threshold2 =(
alt.Chart(comment_time_df)
.mark_rule(opacity = 0.01)
.encode(
x = 'hot_end'
)
)
text =(
alt.Chart(comment_time_df)
.mark_text(text = "Hottest \nComment \nTime",lineBreak = ' ',opacity = 0.1)
.encode(
x = 'text_x',
y = 'text_y'
)
)
fig + threshold + threshold2 + text
fig.save("fig2.html")
This graph depicts the number of comments at different times of the day. The window with the highest activity (the hottest comment time) is between 3pm and 7pm; Reddit users are very active during these hours. As the night progresses, the number of comments dips, reaching its lowest point around 9-10am, when people are generally just waking up and are not yet active on Reddit.
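A quick programmatic check of the peak and the trough, using the comment_time_df built above:
#Hour with the most and the fewest comments (military time)
peak_hour = comment_time_df.loc[comment_time_df['comment_count'].idxmax(), 'hour']
low_hour = comment_time_df.loc[comment_time_df['comment_count'].idxmin(), 'hour']
print(peak_hour, low_hour)   #expected '17' and '09' for this data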
Having looked at the time of day, we now zoom out for a bigger picture and examine the number of comments for each month of our data's timeframe (2019/07 - 2021/06).
comment_time_ym = df_full.groupby('year','month').agg(F.count('body'))
comment_time_ym = comment_time_ym.withColumnRenamed('count(body)','comment_count')
comment_time_ym.show(5)
+----+-----+-------------+
|year|month|comment_count|
+----+-----+-------------+
|2019|   09|       401982|
|2020|   03|       472989|
|2021|   04|       834271|
|2019|   10|       341954|
|2021|   03|       719236|
+----+-----+-------------+
only showing top 5 rows
comment_time_ym_df = comment_time_ym.toPandas()
comment_time_ym_df
year | month | comment_count | |
---|---|---|---|
0 | 2019 | 09 | 401982 |
1 | 2020 | 03 | 472989 |
2 | 2021 | 04 | 834271 |
3 | 2019 | 10 | 341954 |
4 | 2021 | 03 | 719236 |
5 | 2020 | 06 | 1663388 |
6 | 2019 | 12 | 380552 |
7 | 2021 | 01 | 1081665 |
8 | 2020 | 11 | 989285 |
9 | 2020 | 10 | 850087 |
10 | 2019 | 07 | 374620 |
11 | 2019 | 08 | 417608 |
12 | 2021 | 02 | 631160 |
13 | 2020 | 05 | 1021888 |
14 | 2020 | 07 | 1343656 |
15 | 2020 | 02 | 383739 |
16 | 2021 | 06 | 979521 |
17 | 2020 | 08 | 1105279 |
18 | 2019 | 11 | 389697 |
19 | 2020 | 12 | 807404 |
20 | 2021 | 05 | 982372 |
21 | 2020 | 01 | 410716 |
22 | 2020 | 04 | 597946 |
23 | 2020 | 09 | 939937 |
comment_time_ym_df['time_concat'] = comment_time_ym_df["year"] + "_" + comment_time_ym_df["month"]
comment_time_ym_df
year | month | comment_count | time_concat | |
---|---|---|---|---|
0 | 2019 | 09 | 401982 | 2019_09 |
1 | 2020 | 03 | 472989 | 2020_03 |
2 | 2021 | 04 | 834271 | 2021_04 |
3 | 2019 | 10 | 341954 | 2019_10 |
4 | 2021 | 03 | 719236 | 2021_03 |
5 | 2020 | 06 | 1663388 | 2020_06 |
6 | 2019 | 12 | 380552 | 2019_12 |
7 | 2021 | 01 | 1081665 | 2021_01 |
8 | 2020 | 11 | 989285 | 2020_11 |
9 | 2020 | 10 | 850087 | 2020_10 |
10 | 2019 | 07 | 374620 | 2019_07 |
11 | 2019 | 08 | 417608 | 2019_08 |
12 | 2021 | 02 | 631160 | 2021_02 |
13 | 2020 | 05 | 1021888 | 2020_05 |
14 | 2020 | 07 | 1343656 | 2020_07 |
15 | 2020 | 02 | 383739 | 2020_02 |
16 | 2021 | 06 | 979521 | 2021_06 |
17 | 2020 | 08 | 1105279 | 2020_08 |
18 | 2019 | 11 | 389697 | 2019_11 |
19 | 2020 | 12 | 807404 | 2020_12 |
20 | 2021 | 05 | 982372 | 2021_05 |
21 | 2020 | 01 | 410716 | 2020_01 |
22 | 2020 | 04 | 597946 | 2020_04 |
23 | 2020 | 09 | 939937 | 2020_09 |
comment_time_ym_df['year'] = comment_time_ym_df['year'].apply(pd.to_numeric)
comment_time_ym_df = comment_time_ym_df.sort_values(["year","month"]).reset_index().drop('index',axis = 1)
The count of comments for each month of the timeframe
comment_time_ym_df
year | month | comment_count | time_concat | |
---|---|---|---|---|
0 | 2019 | 07 | 374620 | 2019_07 |
1 | 2019 | 08 | 417608 | 2019_08 |
2 | 2019 | 09 | 401982 | 2019_09 |
3 | 2019 | 10 | 341954 | 2019_10 |
4 | 2019 | 11 | 389697 | 2019_11 |
5 | 2019 | 12 | 380552 | 2019_12 |
6 | 2020 | 01 | 410716 | 2020_01 |
7 | 2020 | 02 | 383739 | 2020_02 |
8 | 2020 | 03 | 472989 | 2020_03 |
9 | 2020 | 04 | 597946 | 2020_04 |
10 | 2020 | 05 | 1021888 | 2020_05 |
11 | 2020 | 06 | 1663388 | 2020_06 |
12 | 2020 | 07 | 1343656 | 2020_07 |
13 | 2020 | 08 | 1105279 | 2020_08 |
14 | 2020 | 09 | 939937 | 2020_09 |
15 | 2020 | 10 | 850087 | 2020_10 |
16 | 2020 | 11 | 989285 | 2020_11 |
17 | 2020 | 12 | 807404 | 2020_12 |
18 | 2021 | 01 | 1081665 | 2021_01 |
19 | 2021 | 02 | 631160 | 2021_02 |
20 | 2021 | 03 | 719236 | 2021_03 |
21 | 2021 | 04 | 834271 | 2021_04 |
22 | 2021 | 05 | 982372 | 2021_05 |
23 | 2021 | 06 | 979521 | 2021_06 |
fig = (alt.Chart(comment_time_ym_df).mark_line().encode(
x=alt.X('time_concat', axis = alt.Axis(title = "Timeframe")),
y=alt.Y('comment_count', axis = alt.Axis(title = "Count of Comments")),
color=alt.value('#fdc086'),
tooltip=['time_concat','comment_count']
)).resolve_scale(x='independent').properties(title={"text":'Count of Comments',"subtitle" : "Relationship between Time Period and Frequency of Comments "},width = 500, height = 500).interactive()
fig
fig.save("fig3.html")
At the beginning of the time period, the number of comments is fairly consistent. There is a spike in comment frequency after March 2020, exactly when COVID hit. The spike makes sense: people were unsure about the situation and may have found Reddit the perfect anonymous platform to vent their emotions and share what they were seeing. After October 2020 the volume of comments drops sharply and settles back into a steadier pattern, though it remains higher than before COVID, possibly because the Reddit user base had grown.
top_author = df_full.groupby('author').agg(F.count('body'))
top_author = top_author.withColumnRenamed('count(body)','author_freq')
top_author.show(10)
+----------------+-----------+
|          author|author_freq|
+----------------+-----------+
|peasant-overlord|         62|
|       BBresulla|        123|
|    sound_defect|        145|
|     LoachLounge|         17|
|   FuckYouTikTok|         84|
|     FeroxNumine|          4|
|    Doctor_Sauce|          2|
| FantaToTheKnees|         38|
|     Trespasserz|         14|
|anonymous_potato|        342|
+----------------+-----------+
only showing top 10 rows
top_author.count()
1438703
top_author_df = top_author.toPandas()
top_author_df['author_freq'] = top_author_df['author_freq'].apply(pd.to_numeric)
top_author_df = top_author_df.sort_values('author_freq',ascending=False).reset_index().drop('index',axis = 1)
top_author_df
author | author_freq | |
---|---|---|
0 | [deleted] | 3093369 |
1 | a-mirror-bot | 66720 |
2 | VredditDownloader | 20055 |
3 | SaveVideo | 13328 |
4 | SajuPacapu | 10619 |
... | ... | ... |
1438698 | Adampnad | 1 |
1438699 | Styzix1 | 1 |
1438700 | Fbeezy | 1 |
1438701 | ruwheele | 1 |
1438702 | Wretinpo | 1 |
1438703 rows × 2 columns
#Removing the first row, as the author is [deleted] and the name is not available
top_author_df = top_author_df.iloc[1: , :].reset_index().drop('index',axis = 1)
top_author_df
author | author_freq | |
---|---|---|
0 | a-mirror-bot | 66720 |
1 | VredditDownloader | 20055 |
2 | SaveVideo | 13328 |
3 | SajuPacapu | 10619 |
4 | PF_Mirror_Bot | 8276 |
... | ... | ... |
1438697 | Adampnad | 1 |
1438698 | Styzix1 | 1 |
1438699 | Fbeezy | 1 |
1438700 | ruwheele | 1 |
1438701 | Wretinpo | 1 |
1438702 rows × 2 columns
top_author_df_top10 = top_author_df.head(10)
top_author_df_top10.head(10)
author | author_freq | |
---|---|---|
0 | a-mirror-bot | 66720 |
1 | VredditDownloader | 20055 |
2 | SaveVideo | 13328 |
3 | SajuPacapu | 10619 |
4 | PF_Mirror_Bot | 8276 |
5 | Dolt-Dragoman | 7284 |
6 | felixjawesome | 6639 |
7 | CantStopPoppin | 5853 |
8 | Pardusco | 5397 |
9 | AutoModerator | 5239 |
fig = (alt.Chart(top_author_df_top10).mark_bar().encode(
x=alt.X('author_freq', axis = alt.Axis(title = "Total comments of each author")),
y=alt.Y('author', axis = alt.Axis(title = "Author"),sort='-x'),
color=alt.value('#7fc97f'),
tooltip=['author','author_freq']
)).resolve_scale(x='independent').properties(title={"text":'Most Active User',"subtitle" : "Top 10 authors with highest frequency of the comment"},width = 500, height = 500)
fig
fig.save("fig4.html")
This graph depicts the top 10 authors with the highest frequency of comments over our timeframe. The most active user is a-mirror-bot, which posted 66720 comments over the course of the two years.
authors_to_select = top_author_df_top10.author.tolist()
df_controversial = df_full[df_full['author'].isin(authors_to_select)]
df_controversial = df_controversial.toPandas()
author_contro_df = top_author_df_top10.merge(df_controversial, how='right', left_on='author', right_on='author')
author_contro_df.head()
author | author_freq | all_awardings | author_created_utc | author_flair_richtext | author_flair_type | author_fullname | author_patreon_flair | author_premium | awarders | ... | total_awards_received | treatment_tags | editable | ym_partition | comment_length | comment_date | year | month | hour | original_post | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Pardusco | 5397 | [] | 1.537556e+09 | [] | text | t2_29e64gkm | False | True | [] | ... | 0 | [] | None | 202101 | 10 | 2021-01-08 19:49:53 | 2021 | 01 | 19 | trump crowd hurls slurs at lindsey graham |
1 | SaveVideo | 13328 | [] | NaN | [] | text | t2_8gveco3a | False | True | [] | ... | 0 | [] | None | 202101 | 524 | 2021-01-07 06:40:47 | 2021 | 01 | 06 | heres the scary moment when protesters initially |
2 | a-mirror-bot | 66720 | [] | 1.531009e+09 | [] | text | t2_1q5xnz7f | False | False | [] | ... | 0 | [] | None | 202101 | 425 | 2021-01-10 22:25:02 | 2021 | 01 | 22 | another video from the turmoil in san diego today |
3 | SaveVideo | 13328 | [] | NaN | [] | text | t2_8gveco3a | False | False | [] | ... | 0 | [] | None | 202101 | 493 | 2021-01-23 22:08:56 | 2021 | 01 | 22 | 23 january russia |
4 | a-mirror-bot | 66720 | [] | 1.531009e+09 | [] | text | t2_1q5xnz7f | False | False | [] | ... | 0 | [] | None | 202101 | 425 | 2021-01-21 02:24:02 | 2021 | 01 | 02 | i cant believe this is going even more viral he |
5 rows × 43 columns
author_contro_df = author_contro_df[['author','controversiality','author_freq']]
author_contro_df.head()
author | controversiality | author_freq | |
---|---|---|---|
0 | Pardusco | 0 | 5397 |
1 | SaveVideo | 0 | 13328 |
2 | a-mirror-bot | 0 | 66720 |
3 | SaveVideo | 0 | 13328 |
4 | a-mirror-bot | 0 | 66720 |
author_contro_df_stat = author_contro_df.groupby(['author','author_freq'])['controversiality'].agg(['sum','mean']).reset_index()
author_contro_df_stat = author_contro_df_stat.rename(columns={"sum": "controversiality_sum", "mean": "controversiality_mean"})
author_contro_df_stat = author_contro_df_stat.sort_values('author_freq',ascending=False).reset_index().drop('index',axis = 1)
The controversiality mean and sum of the top 10 users
author_contro_df_stat
author | author_freq | controversiality_sum | controversiality_mean | |
---|---|---|---|---|
0 | a-mirror-bot | 66720 | 576 | 0.008633 |
1 | VredditDownloader | 20055 | 4 | 0.000199 |
2 | SaveVideo | 13328 | 10 | 0.000750 |
3 | SajuPacapu | 10619 | 846 | 0.079669 |
4 | PF_Mirror_Bot | 8276 | 87 | 0.010512 |
5 | Dolt-Dragoman | 7284 | 522 | 0.071664 |
6 | felixjawesome | 6639 | 510 | 0.076819 |
7 | CantStopPoppin | 5853 | 710 | 0.121305 |
8 | Pardusco | 5397 | 644 | 0.119326 |
9 | AutoModerator | 5239 | 0 | 0.000000 |
fig = (alt.Chart(author_contro_df_stat).mark_bar().encode(
x=alt.X('controversiality_mean', axis = alt.Axis(title = "Mean of Controversiality for a user")),
y=alt.Y('author', axis = alt.Axis(title = "Author"), sort = '-x'),
color=alt.value('#7fc97f'),
tooltip=['author','controversiality_mean']
)).resolve_scale(x='independent').properties(title={"text":'Is the top user bound to be more controversial?',"subtitle" : "Relationship between Top Users and Controversiality"},width = 500, height = 500)
fig
fig.save("fig5.html")
The user with the highest number of comments is a-mirror-bot, but the graph shows it is not the user with the highest controversiality. Being more active, or having more comments, does not by itself make an author more likely to be tagged as controversial. CantStopPoppin is one of the least active users among the top 10, yet it has the highest mean controversiality.
mirror_st = df_full[df_full['author'].isin('a-mirror-bot')]
mirror_st_subset = mirror_st.select('score','total_awards_received','year','month')
mirror_st_subset.show(5)
+-----+---------------------+----+-----+
|score|total_awards_received|year|month|
+-----+---------------------+----+-----+
|    1|                    0|2021|   01|
|    1|                    0|2021|   01|
|    1|                    0|2021|   01|
|    1|                    0|2021|   01|
|    1|                    0|2021|   01|
+-----+---------------------+----+-----+
only showing top 5 rows
mirror_st_subset_df = mirror_st_subset.toPandas()
mirror_st_subset_df.head(5)
score | total_awards_received | year | month | |
---|---|---|---|---|
0 | 1 | 0 | 2021 | 01 |
1 | 1 | 0 | 2021 | 01 |
2 | 1 | 0 | 2021 | 01 |
3 | 1 | 0 | 2021 | 01 |
4 | 1 | 0 | 2021 | 01 |
fig = (alt.Chart(mirror_st_subset_df).mark_point().encode(
x=alt.X('total_awards_received', axis = alt.Axis(title = "Total Awards Received for each Comment"),scale=alt.Scale(domain=[-0.2, 2.5])),
y=alt.Y('score', axis = alt.Axis(title = "Score of each Comment")),
color=alt.value('#e31a1c'),
tooltip=['score','total_awards_received']
)).resolve_scale(x='independent').properties(title={"text":'Is there a relationship between Score and Total Awards for a User ?',"subtitle" : "A-Mirror-Bot (Most active author) Statistics"},width = 500, height = 500).interactive()
fig
fig.save("fig6.html")
From the graph, it can be seen that even the most active user does not attract many awards: even as comment scores increase, the number of awards received stays consistently at zero.
#Tag each comment with the first matching keyword for each freakout theme (case-insensitive, whole words)
df_full_reddit = df_full.withColumn("Pandemic_Freakout", F.regexp_extract('body',
    r'(?i)\bcovid\b|\bpandemic\b|\bcovid-19\b|\bcorona\b|\bvirus\b|\bmasks\b|\bhospital\b', 0))
df_full_reddit = df_full_reddit.withColumn("Happy_Freakout", F.regexp_extract('body',
    r'(?i)\bsupport\b|\bwholesome\b|\bsurprise\b|\bhappy\b|\bfun\b|\bexcited\b|\bpositive\b', 0))
df_full_reddit = df_full_reddit.withColumn("Karen_Freakout", F.regexp_extract('body',
    r'(?i)\bkaren\b|\bmad\b|\battack\b|\bwild\b|\bmanager\b|\bargue\b|\bmeltdown\b', 0))
df_full_reddit = df_full_reddit.withColumn("Drunk_Freakout", F.regexp_extract('body',
    r'(?i)\bdrunk\b|\bliquor\b|\bbar\b|\bspill\b|\bbouncer\b|\bbeer\b|\balcohol\b', 0))
df_full_reddit = df_full_reddit.withColumn("Arrest_Freakout", F.regexp_extract('body',
    r'(?i)\barrest\b|\bofficer\b|\bpolice\b|\bcop\b|\bstab\b|\billegal\b|\bbrutal\b', 0))
df_full_reddit = df_full_reddit.withColumn("Pandemic_Freakout",F.lower(F.col('Pandemic_Freakout')))
df_full_reddit = df_full_reddit.withColumn("Happy_Freakout",F.lower(F.col('Happy_Freakout')))
df_full_reddit = df_full_reddit.withColumn("Karen_Freakout",F.lower(F.col('Karen_Freakout')))
df_full_reddit = df_full_reddit.withColumn("Drunk_Freakout",F.lower(F.col('Drunk_Freakout')))
df_full_reddit = df_full_reddit.withColumn("Arrest_Freakout",F.lower(F.col('Arrest_Freakout')))
#Rows with a blank tag are dropped. Note that chaining all five filters keeps only comments that
#match every theme at once; the pandemic analysis below therefore rebuilds Pandemic_Freakout from
#df_full and filters on that column alone.
df_full_reddit = df_full_reddit.filter(df_full_reddit.Pandemic_Freakout != "")
df_full_reddit = df_full_reddit.filter(df_full_reddit.Happy_Freakout != "")
df_full_reddit = df_full_reddit.filter(df_full_reddit.Karen_Freakout != "")
df_full_reddit = df_full_reddit.filter(df_full_reddit.Drunk_Freakout != "")
df_full_reddit = df_full_reddit.filter(df_full_reddit.Arrest_Freakout != "")
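Hand-typing these alternation patterns is error-prone (a missing \b quietly changes what is matched), so one option is to build each pattern from a plain keyword list. A minimal sketch, using a hypothetical build_pattern helper and the pandemic keywords as an example:
import re
def build_pattern(words):
    #Case-insensitive, whole-word alternation built from a keyword list
    return r'(?i)' + '|'.join(r'\b' + re.escape(w) + r'\b' for w in words)
pandemic_words = ['covid', 'pandemic', 'covid-19', 'corona', 'virus', 'masks', 'hospital']
#Equivalent to the hand-written Pandemic_Freakout pattern above
df_keywords = df_full.withColumn("Pandemic_Freakout",
                                 F.regexp_extract('body', build_pattern(pandemic_words), 0))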
df_full_reddit = df_full.withColumn("Pandemic_Freakout", F.regexp_extract('body',
    r'(?i)\bcovid\b|\bpandemic\b|\bcovid-19\b|\bcorona\b|\bvirus\b|\bmasks\b|\bhospital\b', 0))
df_full_reddit = df_full_reddit.withColumn("Pandemic_Freakout",F.lower(F.col('Pandemic_Freakout')))
reddit_df = df_full_reddit.select('body','Pandemic_Freakout','score','controversiality','comment_date','year','month','hour')
reddit_df.show(5)
+--------------------+-----------------+-----+----------------+-------------------+----+-----+----+
|                body|Pandemic_Freakout|score|controversiality|       comment_date|year|month|hour|
+--------------------+-----------------+-----+----------------+-------------------+----+-----+----+
|so, theyre like a...|                 |   15|               0|2021-01-25 12:40:21|2021|   01|  12|
|By that definitio...|                 |   -5|               0|2021-01-25 05:18:15|2021|   01|  05|
|I'm a follower of...|                 |    5|               0|2021-01-11 08:15:26|2021|   01|  08|
|Nowadays MAGA is ...|                 |    9|               0|2021-01-11 05:35:56|2021|   01|  05|
|     MORGAN FREEMAN.|                 |    1|               0|2021-01-11 02:53:48|2021|   01|  02|
+--------------------+-----------------+-----+----------------+-------------------+----+-----+----+
only showing top 5 rows
reddit_df.groupby("Pandemic_Freakout").count().collect()
[Row(Pandemic_Freakout='masks', count=52700), Row(Pandemic_Freakout='pandemic', count=37097), Row(Pandemic_Freakout='', count=17913227), Row(Pandemic_Freakout='covid', count=76000), Row(Pandemic_Freakout='corona', count=11277), Row(Pandemic_Freakout='virus', count=30651)]
#Rows with blank type are dropped
reddit_df = reddit_df.filter(reddit_df.Pandemic_Freakout != "")
reddit_pddf = reddit_df.toPandas()
reddit_summary_pandemic = reddit_pddf.groupby(['Pandemic_Freakout','year'])[['controversiality','score']].agg(['sum','mean']).reset_index()
The sum and mean of controversiality and score, per year, for comments containing each pandemic-related keyword
reddit_summary_pandemic.head(15)
Pandemic_Freakout | year | controversiality_sum | controversiality_mean | score_sum | score_mean | |
---|---|---|---|---|---|---|
0 | corona | 2019 | 2 | 0.052632 | 924 | 24.315789 |
1 | corona | 2020 | 512 | 0.051525 | 166631 | 16.768743 |
2 | corona | 2021 | 64 | 0.049155 | 27815 | 21.363287 |
3 | covid | 2020 | 2429 | 0.048284 | 934836 | 18.582992 |
4 | covid | 2021 | 1306 | 0.050829 | 431073 | 16.777185 |
5 | masks | 2019 | 88 | 0.129602 | 6381 | 9.397644 |
6 | masks | 2020 | 1624 | 0.044522 | 589466 | 16.160379 |
7 | masks | 2021 | 694 | 0.044645 | 224237 | 14.425024 |
8 | pandemic | 2019 | 0 | 0.000000 | 88 | 4.631579 |
9 | pandemic | 2020 | 1555 | 0.058593 | 488145 | 18.393496 |
10 | pandemic | 2021 | 568 | 0.053895 | 162958 | 15.462378 |
11 | virus | 2019 | 12 | 0.092308 | 1655 | 12.730769 |
12 | virus | 2020 | 1303 | 0.053253 | 285935 | 11.686080 |
13 | virus | 2021 | 388 | 0.064100 | 40380 | 6.671072 |
covid_stats = pd.read_csv("worldometer_coronavirus_daily_data.csv")
covid_stats.head()
date | country | cumulative_total_cases | daily_new_cases | active_cases | cumulative_total_deaths | daily_new_deaths | |
---|---|---|---|---|---|---|---|
0 | 2020-2-15 | Afghanistan | 0.0 | NaN | 0.0 | 0.0 | NaN |
1 | 2020-2-16 | Afghanistan | 0.0 | NaN | 0.0 | 0.0 | NaN |
2 | 2020-2-17 | Afghanistan | 0.0 | NaN | 0.0 | 0.0 | NaN |
3 | 2020-2-18 | Afghanistan | 0.0 | NaN | 0.0 | 0.0 | NaN |
4 | 2020-2-19 | Afghanistan | 0.0 | NaN | 0.0 | 0.0 | NaN |
covid_stats_case = covid_stats.groupby('date')['cumulative_total_cases'].agg(['sum']).reset_index()
covid_stats_case['year'] = pd.DatetimeIndex(covid_stats_case['date']).year
covid_stats_case['month'] = pd.DatetimeIndex(covid_stats_case['date']).month
covid_stats_case.head()
date | sum | year | month | |
---|---|---|---|---|
0 | 2020-1-22 | 571.0 | 2020 | 1 |
1 | 2020-1-23 | 830.0 | 2020 | 1 |
2 | 2020-1-24 | 1287.0 | 2020 | 1 |
3 | 2020-1-25 | 1975.0 | 2020 | 1 |
4 | 2020-1-26 | 2744.0 | 2020 | 1 |
covid_stats_case['date'] = pd.to_datetime(covid_stats_case['date'])
reddit_pddf['comment_date'] = pd.to_datetime(reddit_pddf['comment_date'])
reddit_pddf['month'] = reddit_pddf['month'].astype('int')
reddit_pddf['year'] = reddit_pddf['year'].astype('int')
covid_stats_case['year'] = covid_stats_case['year'].astype('int')
covid_stats_case['month'] = covid_stats_case['month'].astype('int')
covid_merge = pd.merge(reddit_pddf,covid_stats_case,left_on = ['year','month'],right_on = ["year","month"],how = "inner")
covid_merge.head()
body | Pandemic_Freakout | score | controversiality | comment_date | year | month | hour | date | sum | |
---|---|---|---|---|---|---|---|---|---|---|
0 | I don't believe they were anti mask, so much a... | masks | 3 | 0 | 2021-01-31 13:27:37 | 2021 | 1 | 13 | 2021-01-01 | 84893057.0 |
1 | I don't believe they were anti mask, so much a... | masks | 3 | 0 | 2021-01-31 13:27:37 | 2021 | 1 | 13 | 2021-01-02 | 85443850.0 |
2 | I don't believe they were anti mask, so much a... | masks | 3 | 0 | 2021-01-31 13:27:37 | 2021 | 1 | 13 | 2021-01-03 | 85993623.0 |
3 | I don't believe they were anti mask, so much a... | masks | 3 | 0 | 2021-01-31 13:27:37 | 2021 | 1 | 13 | 2021-01-04 | 86551281.0 |
4 | I don't believe they were anti mask, so much a... | masks | 3 | 0 | 2021-01-31 13:27:37 | 2021 | 1 | 13 | 2021-01-05 | 87292032.0 |
covid_merge = covid_merge.rename(columns = {'sum':'covid_cases'})
covid_merge_stat = covid_merge.groupby(['Pandemic_Freakout','year','month'])[['controversiality','covid_cases','score']].agg(['sum']).reset_index()
This shows how much people were talking about the pandemic relative to the number of COVID cases in each month. Note that covid_cases here is the worldwide cumulative case count summed over every day of the month and over every matching comment row, so only its relative trend is meaningful.
covid_merge_stat
Pandemic_Freakout | year | month | controversiality_sum | covid_cases_sum | score_sum | |
---|---|---|---|---|---|---|
0 | corona | 2020 | 1 | 60 | 4.332280e+06 | 5570 |
1 | corona | 2020 | 2 | 667 | 5.290302e+08 | 37961 |
2 | corona | 2020 | 3 | 2666 | 1.811681e+10 | 937347 |
3 | corona | 2020 | 4 | 1980 | 1.032122e+11 | 837060 |
4 | corona | 2020 | 5 | 1767 | 1.788242e+11 | 917786 |
... | ... | ... | ... | ... | ... | ... |
84 | virus | 2021 | 2 | 1288 | 2.469863e+12 | 314832 |
85 | virus | 2021 | 3 | 2046 | 4.557631e+12 | 212970 |
86 | virus | 2021 | 4 | 2070 | 3.798920e+12 | 112290 |
87 | virus | 2021 | 5 | 1736 | 3.407792e+12 | 91884 |
88 | virus | 2021 | 6 | 1860 | 4.005785e+12 | 126870 |
89 rows × 6 columns
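Because the merge above joins on year and month only, every comment is paired with every daily row of that month, which inflates the summed covid_cases values. A sketch of an alternative that first reduces the covid data to one row per month (the month-end worldwide cumulative total) before merging:
#One row per (year, month): the latest daily worldwide cumulative total in that month
covid_monthly = (covid_stats_case.sort_values('date')
                 .groupby(['year', 'month'], as_index=False)
                 .last()[['year', 'month', 'sum']]
                 .rename(columns={'sum': 'covid_cases'}))
covid_merge_alt = pd.merge(reddit_pddf, covid_monthly, on=['year', 'month'], how='inner')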
spark.stop()