import findspark
findspark.init()
import pyspark.sql.functions as F
from pyspark.sql.functions import col, lit, size
from pyspark.sql import SparkSession
import warnings
warnings.filterwarnings('ignore')
spark = SparkSession.builder.appName("reddit").getOrCreate()
spark
SparkSession - in-memory
df_full = spark.read.parquet('s3://ssp88-labdata2/reddit')
#Number of rows of the data
rows = df_full.count()
rows
18120952
#Number of columns
columns = len(df_full.columns)
columns
51
df_full.printSchema()
root
 |-- all_awardings: string (nullable = true)
 |-- associated_award: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_created_utc: double (nullable = true)
 |-- author_flair_background_color: string (nullable = true)
 |-- author_flair_css_class: string (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_template_id: string (nullable = true)
 |-- author_flair_text: string (nullable = true)
 |-- author_flair_text_color: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- awarders: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- collapsed_because_crowd_control: string (nullable = true)
 |-- collapsed_reason: string (nullable = true)
 |-- comment_type: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- distinguished: string (nullable = true)
 |-- edited: double (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: string (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- quarantined: boolean (nullable = true)
 |-- removal_reason: string (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- top_awarded_type: string (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- treatment_tags: string (nullable = true)
 |-- author_cakeday: boolean (nullable = true)
 |-- editable: boolean (nullable = true)
 |-- media_metadata: string (nullable = true)
 |-- ym_partition: integer (nullable = true)
df = spark.read.parquet('s3://ssp88-labdata2/reddit/ym_partition=201910')
df_small = df.limit(10000)
#Checking the missing values
#Counts NULLs, NaNs, the literal strings 'None'/'NULL', and empty strings per column;
#boolean, timestamp and date columns only support the NULL check.
missing_val = df_full.select(*[
    (
        F.count(F.when((F.isnan(c) |
                        F.col(c).isNull() |
                        F.col(c).contains('None') |
                        F.col(c).contains('NULL') |
                        (F.col(c) == '')), c))
        if t not in ("boolean", "timestamp", "date")
        else F.count(F.when(F.col(c).isNull(), c))
    ).alias(c)
    for c, t in df_full.dtypes
]).toPandas()
missing_val
all_awardings | associated_award | author | author_created_utc | author_flair_background_color | author_flair_css_class | author_flair_richtext | author_flair_template_id | author_flair_text | author_flair_text_color | ... | subreddit_id | subreddit_name_prefixed | subreddit_type | top_awarded_type | total_awards_received | treatment_tags | author_cakeday | editable | media_metadata | ym_partition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 79493 | 18120952 | 635 | 6850551 | 18120952 | 18119403 | 3093369 | 18120952 | 18036426 | 14943013 | ... | 0 | 4146560 | 0 | 18120952 | 0 | 2437 | 18054621 | 11416494 | 17925218 | 0 |
1 rows × 51 columns
import pandas as pd
missing_val = pd.melt(missing_val)
missing_val
variable | value | |
---|---|---|
0 | all_awardings | 79493 |
1 | associated_award | 18120952 |
2 | author | 635 |
3 | author_created_utc | 6850551 |
4 | author_flair_background_color | 18120952 |
5 | author_flair_css_class | 18119403 |
6 | author_flair_richtext | 3093369 |
7 | author_flair_template_id | 18120952 |
8 | author_flair_text | 18036426 |
9 | author_flair_text_color | 14943013 |
10 | author_flair_type | 3093369 |
11 | author_fullname | 3093369 |
12 | author_patreon_flair | 3093369 |
13 | author_premium | 2988543 |
14 | awarders | 638049 |
15 | body | 16312 |
16 | can_gild | 0 |
17 | can_mod_post | 0 |
18 | collapsed | 0 |
19 | collapsed_because_crowd_control | 18120952 |
20 | collapsed_reason | 17406962 |
21 | comment_type | 18120952 |
22 | controversiality | 0 |
23 | created_utc | 0 |
24 | distinguished | 18038164 |
25 | edited | 17457460 |
26 | gilded | 0 |
27 | gildings | 0 |
28 | id | 0 |
29 | is_submitter | 0 |
30 | link_id | 0 |
31 | locked | 0 |
32 | no_follow | 0 |
33 | parent_id | 0 |
34 | permalink | 0 |
35 | quarantined | 0 |
36 | removal_reason | 18120904 |
37 | retrieved_on | 4146560 |
38 | score | 0 |
39 | send_replies | 0 |
40 | stickied | 0 |
41 | subreddit_id | 0 |
42 | subreddit_name_prefixed | 4146560 |
43 | subreddit_type | 0 |
44 | top_awarded_type | 18120952 |
45 | total_awards_received | 0 |
46 | treatment_tags | 2437 |
47 | author_cakeday | 18054621 |
48 | editable | 11416494 |
49 | media_metadata | 17925218 |
50 | ym_partition | 0 |
missing_val['percentage'] = (missing_val['value'] / rows) * 100
missing_val
variable | value | percentage | |
---|---|---|---|
0 | all_awardings | 79493 | 0.438680 |
1 | associated_award | 18120952 | 100.000000 |
2 | author | 635 | 0.003504 |
3 | author_created_utc | 6850551 | 37.804587 |
4 | author_flair_background_color | 18120952 | 100.000000 |
5 | author_flair_css_class | 18119403 | 99.991452 |
6 | author_flair_richtext | 3093369 | 17.070676 |
7 | author_flair_template_id | 18120952 | 100.000000 |
8 | author_flair_text | 18036426 | 99.533545 |
9 | author_flair_text_color | 14943013 | 82.462627 |
10 | author_flair_type | 3093369 | 17.070676 |
11 | author_fullname | 3093369 | 17.070676 |
12 | author_patreon_flair | 3093369 | 17.070676 |
13 | author_premium | 2988543 | 16.492196 |
14 | awarders | 638049 | 3.521057 |
15 | body | 16312 | 0.090017 |
16 | can_gild | 0 | 0.000000 |
17 | can_mod_post | 0 | 0.000000 |
18 | collapsed | 0 | 0.000000 |
19 | collapsed_because_crowd_control | 18120952 | 100.000000 |
20 | collapsed_reason | 17406962 | 96.059865 |
21 | comment_type | 18120952 | 100.000000 |
22 | controversiality | 0 | 0.000000 |
23 | created_utc | 0 | 0.000000 |
24 | distinguished | 18038164 | 99.543137 |
25 | edited | 17457460 | 96.338537 |
26 | gilded | 0 | 0.000000 |
27 | gildings | 0 | 0.000000 |
28 | id | 0 | 0.000000 |
29 | is_submitter | 0 | 0.000000 |
30 | link_id | 0 | 0.000000 |
31 | locked | 0 | 0.000000 |
32 | no_follow | 0 | 0.000000 |
33 | parent_id | 0 | 0.000000 |
34 | permalink | 0 | 0.000000 |
35 | quarantined | 0 | 0.000000 |
36 | removal_reason | 18120904 | 99.999735 |
37 | retrieved_on | 4146560 | 22.882683 |
38 | score | 0 | 0.000000 |
39 | send_replies | 0 | 0.000000 |
40 | stickied | 0 | 0.000000 |
41 | subreddit_id | 0 | 0.000000 |
42 | subreddit_name_prefixed | 4146560 | 22.882683 |
43 | subreddit_type | 0 | 0.000000 |
44 | top_awarded_type | 18120952 | 100.000000 |
45 | total_awards_received | 0 | 0.000000 |
46 | treatment_tags | 2437 | 0.013449 |
47 | author_cakeday | 18054621 | 99.633954 |
48 | editable | 11416494 | 63.001624 |
49 | media_metadata | 17925218 | 98.919847 |
50 | ym_partition | 0 | 0.000000 |
As we can see, some columns are missing values for nearly all rows. We can drop such columns directly, as they will not help the analysis.
threshold = missing_val[missing_val['percentage'] > 70]
threshold = threshold.reset_index()
threshold
index | variable | value | percentage | |
---|---|---|---|---|
0 | 1 | associated_award | 18120952 | 100.000000 |
1 | 4 | author_flair_background_color | 18120952 | 100.000000 |
2 | 5 | author_flair_css_class | 18119403 | 99.991452 |
3 | 7 | author_flair_template_id | 18120952 | 100.000000 |
4 | 8 | author_flair_text | 18036426 | 99.533545 |
5 | 9 | author_flair_text_color | 14943013 | 82.462627 |
6 | 19 | collapsed_because_crowd_control | 18120952 | 100.000000 |
7 | 20 | collapsed_reason | 17406962 | 96.059865 |
8 | 21 | comment_type | 18120952 | 100.000000 |
9 | 24 | distinguished | 18038164 | 99.543137 |
10 | 25 | edited | 17457460 | 96.338537 |
11 | 36 | removal_reason | 18120904 | 99.999735 |
12 | 44 | top_awarded_type | 18120952 | 100.000000 |
13 | 47 | author_cakeday | 18054621 | 99.633954 |
14 | 49 | media_metadata | 17925218 | 98.919847 |
columns_to_drop = threshold.variable.tolist()
df_full = df_full.drop(*columns_to_drop)
#New count of columns
columns = len(df_full.columns)
columns
36
df_full.printSchema()
root
 |-- all_awardings: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_created_utc: double (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- awarders: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: string (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- quarantined: boolean (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- treatment_tags: string (nullable = true)
 |-- editable: boolean (nullable = true)
 |-- ym_partition: integer (nullable = true)
#Checking the length of the comment
df_full = df_full.withColumn("comment_length", F.length(col('body')))
df_full.select('body','comment_length').show(10)
+--------------------+--------------+
|                body|comment_length|
+--------------------+--------------+
|so, theyre like a...|            33|
|By that definitio...|           444|
|I'm a follower of...|            66|
|Nowadays MAGA is ...|            50|
|     MORGAN FREEMAN.|            15|
|           [deleted]|             9|
|This is really sc...|            21|
|He should be on “...|            72|
|Yesss, and someti...|           129|
|This is why malls...|            31|
+--------------------+--------------+
only showing top 10 rows
#Maximum length
from pyspark.sql.functions import max
df_full.select([max("comment_length")]).show()
+-------------------+
|max(comment_length)|
+-------------------+
|              11252|
+-------------------+
#Minimum Length
from pyspark.sql.functions import min
df_full.select([min("comment_length")]).show()
+-------------------+
|min(comment_length)|
+-------------------+
|                  0|
+-------------------+
#Checking for blank missing values in body
blank_body = df_full.filter(df_full['body'] == '')
blank_body.select('body').show(10)
+----+
|body|
+----+
|    |
|    |
|    |
|    |
|    |
|    |
|    |
|    |
|    |
|    |
+----+
only showing top 10 rows
blank_body.count()
117
from pyspark.sql.functions import date_format
df_full = df_full.withColumn("comment_date",F.from_unixtime(F.col("created_utc")))
df_full.select('created_utc','comment_date').show(10)
+-----------+-------------------+
|created_utc|       comment_date|
+-----------+-------------------+
| 1611578421|2021-01-25 12:40:21|
| 1611551895|2021-01-25 05:18:15|
| 1610352926|2021-01-11 08:15:26|
| 1610343356|2021-01-11 05:35:56|
| 1610333628|2021-01-11 02:53:48|
| 1610336206|2021-01-11 03:36:46|
| 1610343951|2021-01-11 05:45:51|
| 1610355160|2021-01-11 08:52:40|
| 1610343826|2021-01-11 05:43:46|
| 1610333461|2021-01-11 02:51:01|
+-----------+-------------------+
only showing top 10 rows
from pyspark.sql.functions import regexp_extract
df_full = df_full.withColumn("year",regexp_extract(col('comment_date'),r'(.*?)-',1))
df_full = df_full.withColumn("month",regexp_extract(col('comment_date'),r'-(.*?)-',1))
df_full = df_full.withColumn("hour",regexp_extract(col('comment_date'),r'\s(.+?):',1))
df_full.select('comment_date','year','month','hour').show(10)
+-------------------+----+-----+----+
|       comment_date|year|month|hour|
+-------------------+----+-----+----+
|2021-01-25 12:40:21|2021|   01|  12|
|2021-01-25 05:18:15|2021|   01|  05|
|2021-01-11 08:15:26|2021|   01|  08|
|2021-01-11 05:35:56|2021|   01|  05|
|2021-01-11 02:53:48|2021|   01|  02|
|2021-01-11 03:36:46|2021|   01|  03|
|2021-01-11 05:45:51|2021|   01|  05|
|2021-01-11 08:52:40|2021|   01|  08|
|2021-01-11 05:43:46|2021|   01|  05|
|2021-01-11 02:51:01|2021|   01|  02|
+-------------------+----+-----+----+
only showing top 10 rows
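As a cross-check (not used below), the same year, month, and hour fields could also be derived with Spark's built-in datetime functions instead of regular expressions. A minimal sketch, assuming comment_date parses as a timestamp:
#Built-in alternative to the regex extraction; date_format keeps the zero-padded
#string form ('01', '05', ...) produced above
df_ts = df_full.withColumn("comment_ts", F.to_timestamp("comment_date"))
df_ts = (df_ts.withColumn("year", F.date_format("comment_ts", "yyyy"))
              .withColumn("month", F.date_format("comment_ts", "MM"))
              .withColumn("hour", F.date_format("comment_ts", "HH")))
df_ts.select('comment_date', 'year', 'month', 'hour').show(5)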
df_full = df_full.withColumn("original_post",regexp_extract(col('permalink'),r'comments/(.*)/(.*)/(.*)/',2))
df_full.select('original_post').show(10)
+--------------------+
|       original_post|
+--------------------+
|some_say_hes_stil...|
|babies_first_tast...|
|corey_forrester_r...|
|trump_terrorists_...|
|watch_governor_sc...|
|watch_governor_sc...|
|my_mom_just_sent_...|
|trump_terrorists_...|
|airport_freakout_...|
|please_god_omg_in...|
+--------------------+
only showing top 10 rows
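The pattern above assumes comment permalinks of the form /r/&lt;subreddit&gt;/comments/&lt;post_id&gt;/&lt;post_slug&gt;/&lt;comment_id&gt;/ and keeps the second capture group, i.e. the post slug. A quick illustration with Python's re module on a made-up permalink:
import re
#Hypothetical permalink in the shape this dataset appears to use
permalink = "/r/PublicFreakout/comments/kv0a1b/some_say_hes_still_running/gix2c3/"
match = re.search(r'comments/(.*)/(.*)/(.*)/', permalink)
print(match.group(2))   #some_say_hes_still_running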
#Replacing underscore with spaces
from pyspark.sql.functions import regexp_replace
df_full = df_full.withColumn("original_post",regexp_replace('original_post','_',' '))
df_full.select('original_post').show(10)
+--------------------+
|       original_post|
+--------------------+
|some say hes stil...|
|babies first tast...|
|corey forrester r...|
|trump terrorists ...|
|watch governor sc...|
|watch governor sc...|
|my mom just sent ...|
|trump terrorists ...|
|airport freakout ...|
|please god omg in...|
+--------------------+
only showing top 10 rows
df_full.select('score','comment_length').show(5)
+-----+--------------+
|score|comment_length|
+-----+--------------+
|   15|            33|
|   -5|           444|
|    5|            66|
|    9|            50|
|    1|            15|
+-----+--------------+
only showing top 5 rows
rel_score_length = df_full.groupby('comment_length').agg(F.mean('score'))
rel_score_length = rel_score_length.withColumnRenamed('avg(score)','avg_score')
rel_score_length.show(10)
+--------------+------------------+
|comment_length|         avg_score|
+--------------+------------------+
|           148|15.415712849777893|
|           496|14.188170183327568|
|          1591| 55.39772727272727|
|           833| 8.625352112676056|
|          1088| 13.56675749318801|
|          1342| 26.13855421686747|
|           243| 15.41590752507081|
|           540|13.578768577494692|
|          1395| 9.030864197530864|
|          1303|          44.55625|
+--------------+------------------+
only showing top 10 rows
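The same result can be obtained in one step by aliasing the aggregate inside agg, which makes the separate withColumnRenamed call unnecessary; a small equivalent sketch:
#Equivalent to the two cells above, with the alias applied directly
rel_score_length = df_full.groupBy('comment_length').agg(F.avg('score').alias('avg_score'))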
rel_score_length_df = rel_score_length.toPandas()
The length (number of characters) and average score of the comments
rel_score_length_df
comment_length | avg_score | |
---|---|---|
0 | 496 | 14.188170 |
1 | 148 | 15.415713 |
2 | 833 | 8.625352 |
3 | 1088 | 13.566757 |
4 | 1342 | 26.138554 |
... | ... | ... |
6718 | 7635 | 0.000000 |
6719 | 6634 | 1.000000 |
6720 | 8882 | 1.000000 |
6721 | 5318 | 2.000000 |
6722 | 5636 | 1.000000 |
6723 rows × 2 columns
#!/mnt/miniconda/bin/pip install altair
Successfully installed altair-4.2.0
import altair as alt
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
fig = (alt.Chart(rel_score_length_df).mark_line().encode(
x=alt.X('comment_length', axis = alt.Axis(title = "Length of the Comment (Characters)")),
y=alt.Y('avg_score', axis = alt.Axis(title = "Average Score")),
color=alt.value('#beaed4'),
tooltip=['avg_score','comment_length']
)).resolve_scale(x='independent').properties(title={"text":'Should Authors write Long Comments to increase their Score?',"subtitle" : "Relationship between Comment Score and Length"},width = 500, height = 500).interactive()
fig
fig.save("fig1.html")
There are a few comment lengths for which the average score exceeds 2000. Subsetting the data to exclude these gives a clearer view of the overall relationship.
rel_score_length_df_score_filter = rel_score_length_df[rel_score_length_df['avg_score'] < 2000]
fig = (alt.Chart(rel_score_length_df_score_filter).mark_line().encode(
x=alt.X('comment_length', axis = alt.Axis(title = "Length of the Comment (Characters)")),
y=alt.Y('avg_score', axis = alt.Axis(title = "Average Score")),
color=alt.value('#beaed4'),
tooltip=['avg_score','comment_length']
)).resolve_scale(x='independent').properties(title={"text":'Should Authors write Long Comments to increase their Score?',"subtitle" : "Relationship between Comment Score and Length"},width = 500, height = 500).interactive()
fig
fig.save("fig1.1.html")
The relationship between the two variables is very erratic. Comparatively, the average score is low when comments are short, and the highest peaks occur at greater lengths. An author aiming to increase their score should target comments of more than roughly 3000 characters.
Hottest comment time
comment_time = df_full.groupby('hour').agg(F.count('body'))
comment_time = comment_time.withColumnRenamed('count(body)','comment_count')
comment_time.show(5)
+----+-------------+
|hour|comment_count|
+----+-------------+
|  01|       847773|
|  18|       929931|
|  06|       593932|
|  15|       906672|
|  23|       907571|
+----+-------------+
only showing top 5 rows
comment_time_df = comment_time.toPandas()
comment_time_df
hour | comment_count | |
---|---|---|
0 | 01 | 847773 |
1 | 18 | 929931 |
2 | 06 | 593932 |
3 | 15 | 906672 |
4 | 23 | 907571 |
5 | 03 | 793605 |
6 | 13 | 724590 |
7 | 14 | 841923 |
8 | 22 | 917424 |
9 | 07 | 517727 |
10 | 16 | 927490 |
11 | 00 | 870388 |
12 | 17 | 934621 |
13 | 09 | 422781 |
14 | 20 | 913494 |
15 | 21 | 915739 |
16 | 11 | 495848 |
17 | 05 | 673001 |
18 | 19 | 916244 |
19 | 08 | 457140 |
20 | 02 | 829869 |
21 | 10 | 430960 |
22 | 12 | 609438 |
23 | 04 | 742791 |
comment_time_df['hot_start'] = '15'
comment_time_df['hot_end'] = '19'
comment_time_df['text_x'] = '17'
comment_time_df['text_y'] = 300000
fig = (alt.Chart(comment_time_df).mark_line().encode(
x=alt.X('hour', axis = alt.Axis(title = "Time of the day (Military Time)")),
y=alt.Y('comment_count', axis = alt.Axis(title = "Count of Comments")),
color=alt.value('#fdc086'),
tooltip=['hour','comment_count']
)).properties(title={"text":'Hottest Comment Time',"subtitle" : "Relationship between Time of the day and Frequency of Comments"},width = 500, height = 500).interactive()
threshold =(
alt.Chart(comment_time_df)
.mark_rule(opacity = 0.01)
.encode(
x = 'hot_start'
)
)
threshold2 =(
alt.Chart(comment_time_df)
.mark_rule(opacity = 0.01)
.encode(
x = 'hot_end'
)
)
text =(
alt.Chart(comment_time_df)
.mark_text(text = "Hottest \nComment \nTime",lineBreak = ' ',opacity = 0.1)
.encode(
x = 'text_x',
y = 'text_y'
)
)
fig + threshold + threshold2 + text
fig.save("fig2.html")
This graph depicts the number of comments at different times of the day. The window with the highest activity (the hottest comment time) is between 3pm and 7pm; Reddit users are very active during these hours. As the night progresses, the number of comments dips, reaching its lowest point around 9-10am, when people are generally just waking up and are not yet active on Reddit.
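A quick programmatic check of the peak and the trough, using the comment_time_df built above:
#Hour with the most and the fewest comments (military time)
peak_hour = comment_time_df.loc[comment_time_df['comment_count'].idxmax(), 'hour']
low_hour = comment_time_df.loc[comment_time_df['comment_count'].idxmin(), 'hour']
print(peak_hour, low_hour)   #expected '17' and '09' for this data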
Having looked at the time of day, we now zoom out for a bigger picture and examine the number of comments for each month of our data's timeframe (2019/07 - 2021/06).
comment_time_ym = df_full.groupby('year','month').agg(F.count('body'))
comment_time_ym = comment_time_ym.withColumnRenamed('count(body)','comment_count')
comment_time_ym.show(5)
+----+-----+-------------+
|year|month|comment_count|
+----+-----+-------------+
|2019|   09|       401982|
|2020|   03|       472989|
|2021|   04|       834271|
|2019|   10|       341954|
|2021|   03|       719236|
+----+-----+-------------+
only showing top 5 rows
comment_time_ym_df = comment_time_ym.toPandas()
comment_time_ym_df
year | month | comment_count | |
---|---|---|---|
0 | 2019 | 09 | 401982 |
1 | 2020 | 03 | 472989 |
2 | 2021 | 04 | 834271 |
3 | 2019 | 10 | 341954 |
4 | 2021 | 03 | 719236 |
5 | 2020 | 06 | 1663388 |
6 | 2019 | 12 | 380552 |
7 | 2021 | 01 | 1081665 |
8 | 2020 | 11 | 989285 |
9 | 2020 | 10 | 850087 |
10 | 2019 | 07 | 374620 |
11 | 2019 | 08 | 417608 |
12 | 2021 | 02 | 631160 |
13 | 2020 | 05 | 1021888 |
14 | 2020 | 07 | 1343656 |
15 | 2020 | 02 | 383739 |
16 | 2021 | 06 | 979521 |
17 | 2020 | 08 | 1105279 |
18 | 2019 | 11 | 389697 |
19 | 2020 | 12 | 807404 |
20 | 2021 | 05 | 982372 |
21 | 2020 | 01 | 410716 |
22 | 2020 | 04 | 597946 |
23 | 2020 | 09 | 939937 |
comment_time_ym_df['time_concat'] = comment_time_ym_df["year"] + "_" + comment_time_ym_df["month"]
comment_time_ym_df
year | month | comment_count | time_concat | |
---|---|---|---|---|
0 | 2019 | 09 | 401982 | 2019_09 |
1 | 2020 | 03 | 472989 | 2020_03 |
2 | 2021 | 04 | 834271 | 2021_04 |
3 | 2019 | 10 | 341954 | 2019_10 |
4 | 2021 | 03 | 719236 | 2021_03 |
5 | 2020 | 06 | 1663388 | 2020_06 |
6 | 2019 | 12 | 380552 | 2019_12 |
7 | 2021 | 01 | 1081665 | 2021_01 |
8 | 2020 | 11 | 989285 | 2020_11 |
9 | 2020 | 10 | 850087 | 2020_10 |
10 | 2019 | 07 | 374620 | 2019_07 |
11 | 2019 | 08 | 417608 | 2019_08 |
12 | 2021 | 02 | 631160 | 2021_02 |
13 | 2020 | 05 | 1021888 | 2020_05 |
14 | 2020 | 07 | 1343656 | 2020_07 |
15 | 2020 | 02 | 383739 | 2020_02 |
16 | 2021 | 06 | 979521 | 2021_06 |
17 | 2020 | 08 | 1105279 | 2020_08 |
18 | 2019 | 11 | 389697 | 2019_11 |
19 | 2020 | 12 | 807404 | 2020_12 |
20 | 2021 | 05 | 982372 | 2021_05 |
21 | 2020 | 01 | 410716 | 2020_01 |
22 | 2020 | 04 | 597946 | 2020_04 |
23 | 2020 | 09 | 939937 | 2020_09 |
comment_time_ym_df['year'] = comment_time_ym_df['year'].apply(pd.to_numeric)
comment_time_ym_df = comment_time_ym_df.sort_values(["year","month"]).reset_index().drop('index',axis = 1)
The count of comments for each month of the timeframe
comment_time_ym_df
year | month | comment_count | time_concat | |
---|---|---|---|---|
0 | 2019 | 07 | 374620 | 2019_07 |
1 | 2019 | 08 | 417608 | 2019_08 |
2 | 2019 | 09 | 401982 | 2019_09 |
3 | 2019 | 10 | 341954 | 2019_10 |
4 | 2019 | 11 | 389697 | 2019_11 |
5 | 2019 | 12 | 380552 | 2019_12 |
6 | 2020 | 01 | 410716 | 2020_01 |
7 | 2020 | 02 | 383739 | 2020_02 |
8 | 2020 | 03 | 472989 | 2020_03 |
9 | 2020 | 04 | 597946 | 2020_04 |
10 | 2020 | 05 | 1021888 | 2020_05 |
11 | 2020 | 06 | 1663388 | 2020_06 |
12 | 2020 | 07 | 1343656 | 2020_07 |
13 | 2020 | 08 | 1105279 | 2020_08 |
14 | 2020 | 09 | 939937 | 2020_09 |
15 | 2020 | 10 | 850087 | 2020_10 |
16 | 2020 | 11 | 989285 | 2020_11 |
17 | 2020 | 12 | 807404 | 2020_12 |
18 | 2021 | 01 | 1081665 | 2021_01 |
19 | 2021 | 02 | 631160 | 2021_02 |
20 | 2021 | 03 | 719236 | 2021_03 |
21 | 2021 | 04 | 834271 | 2021_04 |
22 | 2021 | 05 | 982372 | 2021_05 |
23 | 2021 | 06 | 979521 | 2021_06 |
fig = (alt.Chart(comment_time_ym_df).mark_line().encode(
x=alt.X('time_concat', axis = alt.Axis(title = "Timeframe")),
y=alt.Y('comment_count', axis = alt.Axis(title = "Count of Comments")),
color=alt.value('#fdc086'),
tooltip=['time_concat','comment_count']
)).resolve_scale(x='independent').properties(title={"text":'Count of Comments',"subtitle" : "Relationship between Time Period and Frequency of Comments "},width = 500, height = 500).interactive()
fig
fig.save("fig3.html")
At the beginning of the time period, the number of comments is fairly consistent. There is a spike in comment frequency after March 2020, exactly when COVID hit. The spike makes sense: people were unsure about the situation and may have found Reddit the perfect anonymous platform to vent their emotions and share what they were seeing. After October 2020 the volume of comments drops sharply and settles back into a steadier pattern, though it remains higher than before COVID, possibly because the Reddit user base had grown.
top_author = df_full.groupby('author').agg(F.count('body'))
top_author = top_author.withColumnRenamed('count(body)','author_freq')
top_author.show(10)
+----------------+-----------+
|          author|author_freq|
+----------------+-----------+
|peasant-overlord|         62|
|       BBresulla|        123|
|    sound_defect|        145|
|     LoachLounge|         17|
|   FuckYouTikTok|         84|
|     FeroxNumine|          4|
|    Doctor_Sauce|          2|
| FantaToTheKnees|         38|
|     Trespasserz|         14|
|anonymous_potato|        342|
+----------------+-----------+
only showing top 10 rows
top_author.count()
1438703
top_author_df = top_author.toPandas()
top_author_df['author_freq'] = top_author_df['author_freq'].apply(pd.to_numeric)
top_author_df = top_author_df.sort_values('author_freq',ascending=False).reset_index().drop('index',axis = 1)
top_author_df
author | author_freq | |
---|---|---|
0 | [deleted] | 3093369 |
1 | a-mirror-bot | 66720 |
2 | VredditDownloader | 20055 |
3 | SaveVideo | 13328 |
4 | SajuPacapu | 10619 |
... | ... | ... |
1438698 | Adampnad | 1 |
1438699 | Styzix1 | 1 |
1438700 | Fbeezy | 1 |
1438701 | ruwheele | 1 |
1438702 | Wretinpo | 1 |
1438703 rows × 2 columns
#Removing the first row, as the author is [deleted] and the name is not available
top_author_df = top_author_df.iloc[1: , :].reset_index().drop('index',axis = 1)
top_author_df
author | author_freq | |
---|---|---|
0 | a-mirror-bot | 66720 |
1 | VredditDownloader | 20055 |
2 | SaveVideo | 13328 |
3 | SajuPacapu | 10619 |
4 | PF_Mirror_Bot | 8276 |
... | ... | ... |
1438697 | Adampnad | 1 |
1438698 | Styzix1 | 1 |
1438699 | Fbeezy | 1 |
1438700 | ruwheele | 1 |
1438701 | Wretinpo | 1 |
1438702 rows × 2 columns
top_author_df_top10 = top_author_df.head(10)
top_author_df_top10.head(10)
author | author_freq | |
---|---|---|
0 | a-mirror-bot | 66720 |
1 | VredditDownloader | 20055 |
2 | SaveVideo | 13328 |
3 | SajuPacapu | 10619 |
4 | PF_Mirror_Bot | 8276 |
5 | Dolt-Dragoman | 7284 |
6 | felixjawesome | 6639 |
7 | CantStopPoppin | 5853 |
8 | Pardusco | 5397 |
9 | AutoModerator | 5239 |
fig = (alt.Chart(top_author_df_top10).mark_bar().encode(
x=alt.X('author_freq', axis = alt.Axis(title = "Total comments of each author")),
y=alt.Y('author', axis = alt.Axis(title = "Author"),sort='-x'),
color=alt.value('#7fc97f'),
tooltip=['author','author_freq']
)).resolve_scale(x='independent').properties(title={"text":'Most Active User',"subtitle" : "Top 10 authors with highest frequency of the comment"},width = 500, height = 500)
fig
fig.save("fig4.html")
This graph depicts the top 10 authors with the highest frequency of comments over our timeframe. The most active user is a-mirror-bot, which posted 66720 comments over the course of the two years.
authors_to_select = top_author_df_top10.author.tolist()
df_controversial = df_full[df_full['author'].isin(authors_to_select)]
df_controversial = df_controversial.toPandas()
author_contro_df = top_author_df_top10.merge(df_controversial, how='right', left_on='author', right_on='author')
author_contro_df.head()
author | author_freq | all_awardings | author_created_utc | author_flair_richtext | author_flair_type | author_fullname | author_patreon_flair | author_premium | awarders | ... | total_awards_received | treatment_tags | editable | ym_partition | comment_length | comment_date | year | month | hour | original_post | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Pardusco | 5397 | [] | 1.537556e+09 | [] | text | t2_29e64gkm | False | True | [] | ... | 0 | [] | None | 202101 | 10 | 2021-01-08 19:49:53 | 2021 | 01 | 19 | trump crowd hurls slurs at lindsey graham |
1 | SaveVideo | 13328 | [] | NaN | [] | text | t2_8gveco3a | False | True | [] | ... | 0 | [] | None | 202101 | 524 | 2021-01-07 06:40:47 | 2021 | 01 | 06 | heres the scary moment when protesters initially |
2 | a-mirror-bot | 66720 | [] | 1.531009e+09 | [] | text | t2_1q5xnz7f | False | False | [] | ... | 0 | [] | None | 202101 | 425 | 2021-01-10 22:25:02 | 2021 | 01 | 22 | another video from the turmoil in san diego today |
3 | SaveVideo | 13328 | [] | NaN | [] | text | t2_8gveco3a | False | False | [] | ... | 0 | [] | None | 202101 | 493 | 2021-01-23 22:08:56 | 2021 | 01 | 22 | 23 january russia |
4 | a-mirror-bot | 66720 | [] | 1.531009e+09 | [] | text | t2_1q5xnz7f | False | False | [] | ... | 0 | [] | None | 202101 | 425 | 2021-01-21 02:24:02 | 2021 | 01 | 02 | i cant believe this is going even more viral he |
5 rows × 43 columns
author_contro_df = author_contro_df[['author','controversiality','author_freq']]
author_contro_df.head()
author | controversiality | author_freq | |
---|---|---|---|
0 | Pardusco | 0 | 5397 |
1 | SaveVideo | 0 | 13328 |
2 | a-mirror-bot | 0 | 66720 |
3 | SaveVideo | 0 | 13328 |
4 | a-mirror-bot | 0 | 66720 |
author_contro_df_stat = author_contro_df.groupby(['author','author_freq'])['controversiality'].agg(['sum','mean']).reset_index()
author_contro_df_stat = author_contro_df_stat.rename(columns={"sum": "controversiality_sum", "mean": "controversiality_mean"})
author_contro_df_stat = author_contro_df_stat.sort_values('author_freq',ascending=False).reset_index().drop('index',axis = 1)
The controversiality mean and sum of the top 10 users
author_contro_df_stat
author | author_freq | controversiality_sum | controversiality_mean | |
---|---|---|---|---|
0 | a-mirror-bot | 66720 | 576 | 0.008633 |
1 | VredditDownloader | 20055 | 4 | 0.000199 |
2 | SaveVideo | 13328 | 10 | 0.000750 |
3 | SajuPacapu | 10619 | 846 | 0.079669 |
4 | PF_Mirror_Bot | 8276 | 87 | 0.010512 |
5 | Dolt-Dragoman | 7284 | 522 | 0.071664 |
6 | felixjawesome | 6639 | 510 | 0.076819 |
7 | CantStopPoppin | 5853 | 710 | 0.121305 |
8 | Pardusco | 5397 | 644 | 0.119326 |
9 | AutoModerator | 5239 | 0 | 0.000000 |
fig = (alt.Chart(author_contro_df_stat).mark_bar().encode(
x=alt.X('controversiality_mean', axis = alt.Axis(title = "Mean of Controversiality for a user")),
y=alt.Y('author', axis = alt.Axis(title = "Author"), sort = '-x'),
color=alt.value('#7fc97f'),
tooltip=['author','controversiality_mean']
)).resolve_scale(x='independent').properties(title={"text":'Is the top user bound to be more controversial?',"subtitle" : "Relationship between Top Users and Controversiality"},width = 500, height = 500)
fig
fig.save("fig5.html")
The user with the highest number of comments is a-mirror-bot, but the graph shows it is not the user with the highest controversiality. Being more active, or having more comments, does not by itself make an author more likely to be tagged as controversial. CantStopPoppin is one of the least active users among the top 10, yet it has the highest mean controversiality.
mirror_st = df_full[df_full['author'].isin('a-mirror-bot')]
mirror_st_subset = mirror_st.select('score','total_awards_received','year','month')
mirror_st_subset.show(5)
+-----+---------------------+----+-----+
|score|total_awards_received|year|month|
+-----+---------------------+----+-----+
|    1|                    0|2021|   01|
|    1|                    0|2021|   01|
|    1|                    0|2021|   01|
|    1|                    0|2021|   01|
|    1|                    0|2021|   01|
+-----+---------------------+----+-----+
only showing top 5 rows
mirror_st_subset_df = mirror_st_subset.toPandas()
mirror_st_subset_df.head(5)
score | total_awards_received | year | month | |
---|---|---|---|---|
0 | 1 | 0 | 2021 | 01 |
1 | 1 | 0 | 2021 | 01 |
2 | 1 | 0 | 2021 | 01 |
3 | 1 | 0 | 2021 | 01 |
4 | 1 | 0 | 2021 | 01 |
fig = (alt.Chart(mirror_st_subset_df).mark_point().encode(
x=alt.X('total_awards_received', axis = alt.Axis(title = "Total Awards Received for each Comment"),scale=alt.Scale(domain=[-0.2, 2.5])),
y=alt.Y('score', axis = alt.Axis(title = "Score of each Comment")),
color=alt.value('#e31a1c'),
tooltip=['score','total_awards_received']
)).resolve_scale(x='independent').properties(title={"text":'Is there a relationship between Score and Total Awards for a User ?',"subtitle" : "A-Mirror-Bot (Most active author) Statistics"},width = 500, height = 500).interactive()
fig
fig.save("fig6.html")
From the graph, it can be seen that even the most active user does not attract many awards: even as comment scores increase, the number of awards received stays consistently at zero.
#Tag each comment with the first matching keyword for each freakout theme (case-insensitive, whole words)
df_full_reddit = df_full.withColumn("Pandemic_Freakout", F.regexp_extract('body',
    r'(?i)\bcovid\b|\bpandemic\b|\bcovid-19\b|\bcorona\b|\bvirus\b|\bmasks\b|\bhospital\b', 0))
df_full_reddit = df_full_reddit.withColumn("Happy_Freakout", F.regexp_extract('body',
    r'(?i)\bsupport\b|\bwholesome\b|\bsurprise\b|\bhappy\b|\bfun\b|\bexcited\b|\bpositive\b', 0))
df_full_reddit = df_full_reddit.withColumn("Karen_Freakout", F.regexp_extract('body',
    r'(?i)\bkaren\b|\bmad\b|\battack\b|\bwild\b|\bmanager\b|\bargue\b|\bmeltdown\b', 0))
df_full_reddit = df_full_reddit.withColumn("Drunk_Freakout", F.regexp_extract('body',
    r'(?i)\bdrunk\b|\bliquor\b|\bbar\b|\bspill\b|\bbouncer\b|\bbeer\b|\balcohol\b', 0))
df_full_reddit = df_full_reddit.withColumn("Arrest_Freakout", F.regexp_extract('body',
    r'(?i)\barrest\b|\bofficer\b|\bpolice\b|\bcop\b|\bstab\b|\billegal\b|\bbrutal\b', 0))
df_full_reddit = df_full_reddit.withColumn("Pandemic_Freakout",F.lower(F.col('Pandemic_Freakout')))
df_full_reddit = df_full_reddit.withColumn("Happy_Freakout",F.lower(F.col('Happy_Freakout')))
df_full_reddit = df_full_reddit.withColumn("Karen_Freakout",F.lower(F.col('Karen_Freakout')))
df_full_reddit = df_full_reddit.withColumn("Drunk_Freakout",F.lower(F.col('Drunk_Freakout')))
df_full_reddit = df_full_reddit.withColumn("Arrest_Freakout",F.lower(F.col('Arrest_Freakout')))
#Rows with a blank tag are dropped. Note that chaining all five filters keeps only comments that
#match every theme at once; the pandemic analysis below therefore rebuilds Pandemic_Freakout from
#df_full and filters on that column alone.
df_full_reddit = df_full_reddit.filter(df_full_reddit.Pandemic_Freakout != "")
df_full_reddit = df_full_reddit.filter(df_full_reddit.Happy_Freakout != "")
df_full_reddit = df_full_reddit.filter(df_full_reddit.Karen_Freakout != "")
df_full_reddit = df_full_reddit.filter(df_full_reddit.Drunk_Freakout != "")
df_full_reddit = df_full_reddit.filter(df_full_reddit.Arrest_Freakout != "")
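Hand-typing these alternation patterns is error-prone (a missing \b quietly changes what is matched), so one option is to build each pattern from a plain keyword list. A minimal sketch, using a hypothetical build_pattern helper and the pandemic keywords as an example:
import re
def build_pattern(words):
    #Case-insensitive, whole-word alternation built from a keyword list
    return r'(?i)' + '|'.join(r'\b' + re.escape(w) + r'\b' for w in words)
pandemic_words = ['covid', 'pandemic', 'covid-19', 'corona', 'virus', 'masks', 'hospital']
#Equivalent to the hand-written Pandemic_Freakout pattern above
df_keywords = df_full.withColumn("Pandemic_Freakout",
                                 F.regexp_extract('body', build_pattern(pandemic_words), 0))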
df_full_reddit = df_full.withColumn("Pandemic_Freakout", F.regexp_extract('body',
    r'(?i)\bcovid\b|\bpandemic\b|\bcovid-19\b|\bcorona\b|\bvirus\b|\bmasks\b|\bhospital\b', 0))
df_full_reddit = df_full_reddit.withColumn("Pandemic_Freakout",F.lower(F.col('Pandemic_Freakout')))
reddit_df = df_full_reddit.select('body','Pandemic_Freakout','score','controversiality','comment_date','year','month','hour')
reddit_df.show(5)
+--------------------+-----------------+-----+----------------+-------------------+----+-----+----+
|                body|Pandemic_Freakout|score|controversiality|       comment_date|year|month|hour|
+--------------------+-----------------+-----+----------------+-------------------+----+-----+----+
|so, theyre like a...|                 |   15|               0|2021-01-25 12:40:21|2021|   01|  12|
|By that definitio...|                 |   -5|               0|2021-01-25 05:18:15|2021|   01|  05|
|I'm a follower of...|                 |    5|               0|2021-01-11 08:15:26|2021|   01|  08|
|Nowadays MAGA is ...|                 |    9|               0|2021-01-11 05:35:56|2021|   01|  05|
|     MORGAN FREEMAN.|                 |    1|               0|2021-01-11 02:53:48|2021|   01|  02|
+--------------------+-----------------+-----+----------------+-------------------+----+-----+----+
only showing top 5 rows
reddit_df.groupby("Pandemic_Freakout").count().collect()
[Row(Pandemic_Freakout='masks', count=52700), Row(Pandemic_Freakout='pandemic', count=37097), Row(Pandemic_Freakout='', count=17913227), Row(Pandemic_Freakout='covid', count=76000), Row(Pandemic_Freakout='corona', count=11277), Row(Pandemic_Freakout='virus', count=30651)]
#Rows with blank type are dropped
reddit_df = reddit_df.filter(reddit_df.Pandemic_Freakout != "")
reddit_pddf = reddit_df.toPandas()
reddit_summary_pandemic = reddit_pddf.groupby(['Pandemic_Freakout','year'])[['controversiality','score']].agg(['sum','mean']).reset_index()
The sum and mean of controversiality and score, per year, for comments containing each pandemic-related keyword
reddit_summary_pandemic.head(15)
Pandemic_Freakout | year | controversiality_sum | controversiality_mean | score_sum | score_mean | |
---|---|---|---|---|---|---|
0 | corona | 2019 | 2 | 0.052632 | 924 | 24.315789 |
1 | corona | 2020 | 512 | 0.051525 | 166631 | 16.768743 |
2 | corona | 2021 | 64 | 0.049155 | 27815 | 21.363287 |
3 | covid | 2020 | 2429 | 0.048284 | 934836 | 18.582992 |
4 | covid | 2021 | 1306 | 0.050829 | 431073 | 16.777185 |
5 | masks | 2019 | 88 | 0.129602 | 6381 | 9.397644 |
6 | masks | 2020 | 1624 | 0.044522 | 589466 | 16.160379 |
7 | masks | 2021 | 694 | 0.044645 | 224237 | 14.425024 |
8 | pandemic | 2019 | 0 | 0.000000 | 88 | 4.631579 |
9 | pandemic | 2020 | 1555 | 0.058593 | 488145 | 18.393496 |
10 | pandemic | 2021 | 568 | 0.053895 | 162958 | 15.462378 |
11 | virus | 2019 | 12 | 0.092308 | 1655 | 12.730769 |
12 | virus | 2020 | 1303 | 0.053253 | 285935 | 11.686080 |
13 | virus | 2021 | 388 | 0.064100 | 40380 | 6.671072 |
covid_stats = pd.read_csv("worldometer_coronavirus_daily_data.csv")
covid_stats.head()
date | country | cumulative_total_cases | daily_new_cases | active_cases | cumulative_total_deaths | daily_new_deaths | |
---|---|---|---|---|---|---|---|
0 | 2020-2-15 | Afghanistan | 0.0 | NaN | 0.0 | 0.0 | NaN |
1 | 2020-2-16 | Afghanistan | 0.0 | NaN | 0.0 | 0.0 | NaN |
2 | 2020-2-17 | Afghanistan | 0.0 | NaN | 0.0 | 0.0 | NaN |
3 | 2020-2-18 | Afghanistan | 0.0 | NaN | 0.0 | 0.0 | NaN |
4 | 2020-2-19 | Afghanistan | 0.0 | NaN | 0.0 | 0.0 | NaN |
covid_stats_case = covid_stats.groupby('date')['cumulative_total_cases'].agg(['sum']).reset_index()
covid_stats_case['year'] = pd.DatetimeIndex(covid_stats_case['date']).year
covid_stats_case['month'] = pd.DatetimeIndex(covid_stats_case['date']).month
covid_stats_case.head()
date | sum | year | month | |
---|---|---|---|---|
0 | 2020-1-22 | 571.0 | 2020 | 1 |
1 | 2020-1-23 | 830.0 | 2020 | 1 |
2 | 2020-1-24 | 1287.0 | 2020 | 1 |
3 | 2020-1-25 | 1975.0 | 2020 | 1 |
4 | 2020-1-26 | 2744.0 | 2020 | 1 |
covid_stats_case['date'] = pd.to_datetime(covid_stats_case['date'])
reddit_pddf['comment_date'] = pd.to_datetime(reddit_pddf['comment_date'])
reddit_pddf['month'] = reddit_pddf['month'].astype('int')
reddit_pddf['year'] = reddit_pddf['year'].astype('int')
covid_stats_case['year'] = covid_stats_case['year'].astype('int')
covid_stats_case['month'] = covid_stats_case['month'].astype('int')
covid_merge = pd.merge(reddit_pddf,covid_stats_case,left_on = ['year','month'],right_on = ["year","month"],how = "inner")
covid_merge.head()
body | Pandemic_Freakout | score | controversiality | comment_date | year | month | hour | date | sum | |
---|---|---|---|---|---|---|---|---|---|---|
0 | I don't believe they were anti mask, so much a... | masks | 3 | 0 | 2021-01-31 13:27:37 | 2021 | 1 | 13 | 2021-01-01 | 84893057.0 |
1 | I don't believe they were anti mask, so much a... | masks | 3 | 0 | 2021-01-31 13:27:37 | 2021 | 1 | 13 | 2021-01-02 | 85443850.0 |
2 | I don't believe they were anti mask, so much a... | masks | 3 | 0 | 2021-01-31 13:27:37 | 2021 | 1 | 13 | 2021-01-03 | 85993623.0 |
3 | I don't believe they were anti mask, so much a... | masks | 3 | 0 | 2021-01-31 13:27:37 | 2021 | 1 | 13 | 2021-01-04 | 86551281.0 |
4 | I don't believe they were anti mask, so much a... | masks | 3 | 0 | 2021-01-31 13:27:37 | 2021 | 1 | 13 | 2021-01-05 | 87292032.0 |
covid_merge = covid_merge.rename(columns = {'sum':'covid_cases'})
covid_merge_stat = covid_merge.groupby(['Pandemic_Freakout','year','month'])[['controversiality','covid_cases','score']].agg(['sum']).reset_index()
This shows how much people were talking about the pandemic relative to the number of COVID cases in each month. Note that covid_cases here is the worldwide cumulative case count summed over every day of the month and over every matching comment row, so only its relative trend is meaningful.
covid_merge_stat
Pandemic_Freakout | year | month | controversiality_sum | covid_cases_sum | score_sum | |
---|---|---|---|---|---|---|
0 | corona | 2020 | 1 | 60 | 4.332280e+06 | 5570 |
1 | corona | 2020 | 2 | 667 | 5.290302e+08 | 37961 |
2 | corona | 2020 | 3 | 2666 | 1.811681e+10 | 937347 |
3 | corona | 2020 | 4 | 1980 | 1.032122e+11 | 837060 |
4 | corona | 2020 | 5 | 1767 | 1.788242e+11 | 917786 |
... | ... | ... | ... | ... | ... | ... |
84 | virus | 2021 | 2 | 1288 | 2.469863e+12 | 314832 |
85 | virus | 2021 | 3 | 2046 | 4.557631e+12 | 212970 |
86 | virus | 2021 | 4 | 2070 | 3.798920e+12 | 112290 |
87 | virus | 2021 | 5 | 1736 | 3.407792e+12 | 91884 |
88 | virus | 2021 | 6 | 1860 | 4.005785e+12 | 126870 |
89 rows × 6 columns
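Because the merge above joins on year and month only, every comment is paired with every daily row of that month, which inflates the summed covid_cases values. A sketch of an alternative that first reduces the covid data to one row per month (the month-end worldwide cumulative total) before merging:
#One row per (year, month): the latest daily worldwide cumulative total in that month
covid_monthly = (covid_stats_case.sort_values('date')
                 .groupby(['year', 'month'], as_index=False)
                 .last()[['year', 'month', 'sum']]
                 .rename(columns={'sum': 'covid_cases'}))
covid_merge_alt = pd.merge(reddit_pddf, covid_monthly, on=['year', 'month'], how='inner')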
spark.stop()