In [1]:
import findspark
findspark.init()
In [2]:
#Importing libraries
import pyspark.sql.functions as f
from pyspark.sql.functions import col, lit
from pyspark.sql import SparkSession
In [3]:
spark = SparkSession.builder.appName("machinelearning").getOrCreate()
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/29 20:16:22 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.
22/04/29 20:16:33 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!
In [4]:
#Checking the status of spark session
spark
Out[4]:

SparkSession - in-memory

SparkContext

Spark UI

Version
v3.0.0-amzn-0
Master
yarn
AppName
machinelearning

Reading the dataset from s3¶

In [7]:
df_full = spark.read.parquet('s3://ssp88-labdata2/eda_df_full')
In [8]:
df_full.printSchema()
root
 |-- all_awardings: string (nullable = true)
 |-- author: string (nullable = true)
 |-- author_created_utc: double (nullable = true)
 |-- author_flair_richtext: string (nullable = true)
 |-- author_flair_type: string (nullable = true)
 |-- author_fullname: string (nullable = true)
 |-- author_patreon_flair: boolean (nullable = true)
 |-- author_premium: boolean (nullable = true)
 |-- awarders: string (nullable = true)
 |-- body: string (nullable = true)
 |-- can_gild: boolean (nullable = true)
 |-- can_mod_post: boolean (nullable = true)
 |-- collapsed: boolean (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- created_utc: long (nullable = true)
 |-- gilded: long (nullable = true)
 |-- gildings: string (nullable = true)
 |-- id: string (nullable = true)
 |-- is_submitter: boolean (nullable = true)
 |-- link_id: string (nullable = true)
 |-- locked: boolean (nullable = true)
 |-- no_follow: boolean (nullable = true)
 |-- parent_id: string (nullable = true)
 |-- permalink: string (nullable = true)
 |-- quarantined: boolean (nullable = true)
 |-- retrieved_on: long (nullable = true)
 |-- score: long (nullable = true)
 |-- send_replies: boolean (nullable = true)
 |-- stickied: boolean (nullable = true)
 |-- subreddit_id: string (nullable = true)
 |-- subreddit_name_prefixed: string (nullable = true)
 |-- subreddit_type: string (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- treatment_tags: string (nullable = true)
 |-- editable: boolean (nullable = true)
 |-- ym_partition: integer (nullable = true)
 |-- comment_date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- hour: string (nullable = true)
 |-- original_post: string (nullable = true)

In [9]:
#Checking the length of the comment 
df_full = df_full.withColumn("comment_length", f.length(col('body')))

Question 1 : Predict Score of the comment¶

In [188]:
#Selecting necessary columns
df_score = df_sentiment.select(['sentiment', 'controversiality','total_awards_received','comment_length','hour', 'score'] )
In [189]:
df_score.printSchema()
root
 |-- sentiment: string (nullable = true)
 |-- controversiality: long (nullable = true)
 |-- total_awards_received: long (nullable = true)
 |-- comment_length: integer (nullable = true)
 |-- hour: string (nullable = true)
 |-- score: long (nullable = true)

ML Tranformations¶

In [92]:
df_score.groupBy('sentiment').count().show()
                                                                                
+---------+-------+
|sentiment|  count|
+---------+-------+
| positive|9880051|
|     null| 108800|
|  neutral| 968815|
| negative|7163286|
+---------+-------+

In [93]:
#Keeping the rows with only True, False as author_premium
df_score = df_score.filter((df_score.sentiment == 'positive') | (df_score.sentiment == 'neutral') | (df_score.sentiment == 'negative'))
In [56]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import roc_curve, roc_auc_score
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml import Pipeline, Model
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
In [174]:
#Splitting the data into train test and predict for our creating the model
train_data, test_data = df_score.randomSplit([0.8, 0.2], 24)
In [175]:
stringIndexer_sentiment = StringIndexer(inputCol="sentiment", outputCol="sentiment_idx")
stringIndexer_hour = StringIndexer(inputCol="hour", outputCol="hour_idx")
In [176]:
onehot_sentiment = OneHotEncoder(inputCol="sentiment_idx", 
                       outputCol="sentiment_vec")
onehot_hour = OneHotEncoder(inputCol="hour_idx", 
                       outputCol="hour_vec")
In [177]:
vectorAssembler_sentiment = VectorAssembler(
    inputCols=['sentiment_vec', 'controversiality','total_awards_received','comment_length','hour_vec'],
    outputCol= "features")

Linear Regression¶

In [179]:
# Create initial LinearRegression model
lr1 = LinearRegression(labelCol="score", featuresCol="features")
In [180]:
#Creating a pipeline
pipeline_lr1 = Pipeline(stages=[stringIndexer_sentiment, stringIndexer_hour, onehot_sentiment, onehot_hour, vectorAssembler_sentiment, lr1])
In [182]:
# Create ParamGrid for Cross Validation
lrparamGrid2 = ParamGridBuilder()\
             .addGrid(lr1.regParam, [1,2, 3])\
             .addGrid(lr1.elasticNetParam, [0.25, 0.5, 0.75])\
             .addGrid(lr1.maxIter, [5, 10, 20])\
             .build()
In [183]:
# Evaluate model
lrevaluator1 = RegressionEvaluator(predictionCol="prediction", labelCol="score", metricName="rmse").setLabelCol("score")
In [184]:
# Create 3-fold CrossValidator
lrcv1 = CrossValidator(estimator = pipeline_lr1,
                    estimatorParamMaps = lrparamGrid1,
                    evaluator = lrevaluator1,
                    numFolds = 3)
In [185]:
# Run cross validations
lrcvModel1 = lrcv1.fit(train_data)
22/04/29 23:11:47 WARN Instrumentation: [c9cb094b] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:11:50 WARN Instrumentation: [c9cb094b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:12:03 WARN Instrumentation: [485fa21e] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:12:05 WARN Instrumentation: [485fa21e] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:12:08 WARN Instrumentation: [190b2e7b] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:12:10 WARN Instrumentation: [190b2e7b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:12:14 WARN Instrumentation: [9dfe5eac] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:12:16 WARN Instrumentation: [9dfe5eac] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:12:19 WARN Instrumentation: [47deb984] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:12:21 WARN Instrumentation: [47deb984] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:12:24 WARN Instrumentation: [23802792] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:12:26 WARN Instrumentation: [23802792] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:12:30 WARN Instrumentation: [df64228b] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:12:32 WARN Instrumentation: [df64228b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:12:35 WARN Instrumentation: [e0a82e5f] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:12:37 WARN Instrumentation: [e0a82e5f] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:12:40 WARN Instrumentation: [717eb28c] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:12:42 WARN Instrumentation: [717eb28c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:12:46 WARN Instrumentation: [afe26e5a] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:12:48 WARN Instrumentation: [afe26e5a] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:12:51 WARN Instrumentation: [6b5f1c5b] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:12:53 WARN Instrumentation: [6b5f1c5b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:12:56 WARN Instrumentation: [4bcae450] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:12:58 WARN Instrumentation: [4bcae450] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:13:02 WARN Instrumentation: [af444977] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:13:04 WARN Instrumentation: [af444977] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:13:07 WARN Instrumentation: [15e97047] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:13:09 WARN Instrumentation: [15e97047] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:13:12 WARN Instrumentation: [ab63b5a6] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:13:14 WARN Instrumentation: [ab63b5a6] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:13:18 WARN Instrumentation: [6183ab5c] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:13:20 WARN Instrumentation: [6183ab5c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:13:23 WARN Instrumentation: [f54b3653] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:13:25 WARN Instrumentation: [f54b3653] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:13:29 WARN Instrumentation: [ff0e1202] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:13:31 WARN Instrumentation: [ff0e1202] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:13:34 WARN Instrumentation: [2e90c120] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:13:36 WARN Instrumentation: [2e90c120] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:13:40 WARN Instrumentation: [5d210179] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:13:42 WARN Instrumentation: [5d210179] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:13:45 WARN Instrumentation: [41bad21c] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:13:47 WARN Instrumentation: [41bad21c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:13:51 WARN Instrumentation: [2dc027c5] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:13:52 WARN Instrumentation: [2dc027c5] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:13:56 WARN Instrumentation: [6f282732] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:13:58 WARN Instrumentation: [6f282732] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:14:01 WARN Instrumentation: [cd9d0021] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:14:03 WARN Instrumentation: [cd9d0021] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:14:07 WARN Instrumentation: [ca159973] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:14:09 WARN Instrumentation: [ca159973] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:14:12 WARN Instrumentation: [b3207827] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:14:14 WARN Instrumentation: [b3207827] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:14:17 WARN Instrumentation: [932f90b6] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:14:19 WARN Instrumentation: [932f90b6] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:14:23 WARN Instrumentation: [2329af98] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:14:25 WARN Instrumentation: [2329af98] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:14:28 WARN Instrumentation: [7e3d70ea] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:14:30 WARN Instrumentation: [7e3d70ea] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:14:33 WARN Instrumentation: [7806aa88] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:14:35 WARN Instrumentation: [7806aa88] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:14:39 WARN Instrumentation: [a88c6793] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:14:41 WARN Instrumentation: [a88c6793] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:14:44 WARN Instrumentation: [ab3920a2] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:14:46 WARN Instrumentation: [ab3920a2] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:14:49 WARN Instrumentation: [3cca8421] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:14:51 WARN Instrumentation: [3cca8421] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:14:55 WARN Instrumentation: [2023382c] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:14:57 WARN Instrumentation: [2023382c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:15:00 WARN Instrumentation: [51b8e95c] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:15:02 WARN Instrumentation: [51b8e95c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:15:05 WARN Instrumentation: [c98744f3] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:15:07 WARN Instrumentation: [c98744f3] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:15:18 WARN Instrumentation: [878bd086] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:15:20 WARN Instrumentation: [878bd086] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:15:30 WARN Instrumentation: [ebd9f5ec] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:15:31 WARN Instrumentation: [ebd9f5ec] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:15:35 WARN Instrumentation: [949f9383] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:15:37 WARN Instrumentation: [949f9383] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:15:40 WARN Instrumentation: [ef97d85c] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:15:42 WARN Instrumentation: [ef97d85c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:15:45 WARN Instrumentation: [d7695173] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:15:47 WARN Instrumentation: [d7695173] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:15:50 WARN Instrumentation: [e2e3937d] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:15:52 WARN Instrumentation: [e2e3937d] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:15:56 WARN Instrumentation: [c3653b1b] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:15:57 WARN Instrumentation: [c3653b1b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:16:01 WARN Instrumentation: [95051f9b] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:16:03 WARN Instrumentation: [95051f9b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:16:06 WARN Instrumentation: [f21d1404] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:16:08 WARN Instrumentation: [f21d1404] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:16:12 WARN Instrumentation: [4754245f] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:16:13 WARN Instrumentation: [4754245f] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:16:17 WARN Instrumentation: [96660cc7] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:16:18 WARN Instrumentation: [96660cc7] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:16:22 WARN Instrumentation: [62c71923] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:16:24 WARN Instrumentation: [62c71923] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:16:27 WARN Instrumentation: [a05ecb53] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:16:29 WARN Instrumentation: [a05ecb53] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:16:32 WARN Instrumentation: [0ea16bd3] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:16:34 WARN Instrumentation: [0ea16bd3] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:16:37 WARN Instrumentation: [516b11c1] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:16:39 WARN Instrumentation: [516b11c1] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:16:43 WARN Instrumentation: [a8148271] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:16:44 WARN Instrumentation: [a8148271] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:16:48 WARN Instrumentation: [f7aa12ad] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:16:49 WARN Instrumentation: [f7aa12ad] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:16:53 WARN Instrumentation: [1a126cbe] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:16:55 WARN Instrumentation: [1a126cbe] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:16:59 WARN Instrumentation: [1da2a151] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:17:01 WARN Instrumentation: [1da2a151] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:17:04 WARN Instrumentation: [b311ce45] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:17:06 WARN Instrumentation: [b311ce45] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:17:10 WARN Instrumentation: [31854af7] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:17:11 WARN Instrumentation: [31854af7] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:17:15 WARN Instrumentation: [53b43b3e] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:17:16 WARN Instrumentation: [53b43b3e] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:17:20 WARN Instrumentation: [27b94718] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:17:21 WARN Instrumentation: [27b94718] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:17:25 WARN Instrumentation: [fb161313] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:17:27 WARN Instrumentation: [fb161313] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:17:30 WARN Instrumentation: [497f8d91] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:17:32 WARN Instrumentation: [497f8d91] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:17:35 WARN Instrumentation: [a17c5e68] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:17:37 WARN Instrumentation: [a17c5e68] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:17:40 WARN Instrumentation: [1a77fedd] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:17:43 WARN Instrumentation: [1a77fedd] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:17:46 WARN Instrumentation: [78d43ab5] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:17:48 WARN Instrumentation: [78d43ab5] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:17:51 WARN Instrumentation: [7b34c4e0] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:17:53 WARN Instrumentation: [7b34c4e0] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:17:56 WARN Instrumentation: [e32f9c75] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:17:58 WARN Instrumentation: [e32f9c75] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:18:01 WARN Instrumentation: [2c65257b] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:18:03 WARN Instrumentation: [2c65257b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:18:07 WARN Instrumentation: [944aad32] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:18:08 WARN Instrumentation: [944aad32] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:18:12 WARN Instrumentation: [91f630e5] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:18:14 WARN Instrumentation: [91f630e5] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:18:17 WARN Instrumentation: [79639e04] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:18:19 WARN Instrumentation: [79639e04] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:18:22 WARN Instrumentation: [8736cac6] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:18:24 WARN Instrumentation: [8736cac6] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:18:28 WARN Instrumentation: [04d023e0] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:18:30 WARN Instrumentation: [04d023e0] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:18:41 WARN Instrumentation: [5019488e] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:18:43 WARN Instrumentation: [5019488e] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:18:53 WARN Instrumentation: [7edd30f8] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:18:56 WARN Instrumentation: [7edd30f8] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:19:00 WARN Instrumentation: [7b1d0745] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:19:01 WARN Instrumentation: [7b1d0745] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:19:05 WARN Instrumentation: [826e7b6e] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:19:07 WARN Instrumentation: [826e7b6e] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:19:10 WARN Instrumentation: [ddeeec1c] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:19:12 WARN Instrumentation: [ddeeec1c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:19:15 WARN Instrumentation: [f8ad96fa] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:19:17 WARN Instrumentation: [f8ad96fa] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:19:21 WARN Instrumentation: [99e4e580] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:19:23 WARN Instrumentation: [99e4e580] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:19:26 WARN Instrumentation: [3d80c4f4] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:19:28 WARN Instrumentation: [3d80c4f4] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:19:32 WARN Instrumentation: [2d03d82e] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:19:34 WARN Instrumentation: [2d03d82e] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:19:37 WARN Instrumentation: [57ae0b76] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:19:39 WARN Instrumentation: [57ae0b76] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:19:43 WARN Instrumentation: [af08cda9] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:19:45 WARN Instrumentation: [af08cda9] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:19:48 WARN Instrumentation: [f97243e1] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:19:50 WARN Instrumentation: [f97243e1] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:19:53 WARN Instrumentation: [cbb15358] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:19:55 WARN Instrumentation: [cbb15358] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:19:59 WARN Instrumentation: [e7ce68c3] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:20:01 WARN Instrumentation: [e7ce68c3] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:20:04 WARN Instrumentation: [d8e1111a] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:20:06 WARN Instrumentation: [d8e1111a] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:20:10 WARN Instrumentation: [3fa52337] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:20:12 WARN Instrumentation: [3fa52337] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:20:15 WARN Instrumentation: [5a7f7f94] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:20:17 WARN Instrumentation: [5a7f7f94] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:20:21 WARN Instrumentation: [a78574b2] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:20:22 WARN Instrumentation: [a78574b2] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:20:26 WARN Instrumentation: [80a74b52] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:20:28 WARN Instrumentation: [80a74b52] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:20:31 WARN Instrumentation: [83c2faf2] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:20:33 WARN Instrumentation: [83c2faf2] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:20:37 WARN Instrumentation: [20f56297] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:20:39 WARN Instrumentation: [20f56297] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:20:42 WARN Instrumentation: [a209cf08] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:20:44 WARN Instrumentation: [a209cf08] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:20:47 WARN Instrumentation: [7f960cdc] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:20:49 WARN Instrumentation: [7f960cdc] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:20:53 WARN Instrumentation: [d0cb4e8b] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:20:55 WARN Instrumentation: [d0cb4e8b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:20:58 WARN Instrumentation: [a4e9949f] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:21:00 WARN Instrumentation: [a4e9949f] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:21:04 WARN Instrumentation: [59df2c95] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:21:06 WARN Instrumentation: [59df2c95] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:21:09 WARN Instrumentation: [e74e91de] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:21:11 WARN Instrumentation: [e74e91de] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:21:14 WARN Instrumentation: [e2c5ddd8] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:21:16 WARN Instrumentation: [e2c5ddd8] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:21:20 WARN Instrumentation: [77c54391] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:21:22 WARN Instrumentation: [77c54391] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:21:25 WARN Instrumentation: [882e6834] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:21:27 WARN Instrumentation: [882e6834] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:21:31 WARN Instrumentation: [6873f457] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:21:32 WARN Instrumentation: [6873f457] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:21:36 WARN Instrumentation: [34132160] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:21:38 WARN Instrumentation: [34132160] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:21:41 WARN Instrumentation: [fbd8b341] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:21:43 WARN Instrumentation: [fbd8b341] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:21:47 WARN Instrumentation: [7d70da9b] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:21:49 WARN Instrumentation: [7d70da9b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:21:52 WARN Instrumentation: [eaea3f7f] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:21:54 WARN Instrumentation: [eaea3f7f] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:21:57 WARN Instrumentation: [dcb2c73a] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:21:59 WARN Instrumentation: [dcb2c73a] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
22/04/29 23:22:09 WARN Instrumentation: [271f8d42] regParam is zero, which might cause numerical instability and overfitting.
22/04/29 23:22:17 WARN Instrumentation: [271f8d42] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
                                                                                
In [117]:
# Save the model pipeline into s3
lrcvModel1.write().save('s3://ssp88-labdata2/lrcvModel1.1')
In [186]:
# Use test set here so we can measure the accuracy of our model on new data
lrprediction1 = lrcvModel1.transform(test_data)
In [141]:
# cvModel uses the best model found from the Cross Validation
# Evaluate best model
print('MSE:',lrevaluator1.evaluate(lrprediction1,
{lrevaluator1.metricName: "mse"})
)
print('RMSE:', lrevaluator1.evaluate(lrprediction1,
{lrevaluator1.metricName: "rmse"})
)
print('MAE:',lrevaluator1.evaluate(lrprediction1,
{lrevaluator1.metricName: "mae"})
)
                                                                                
MSE: 32234.198698835175
                                                                                
RMSE: 179.53885011003933
[Stage 5484:==================================================>   (48 + 3) / 51]
MAE: 22.33287317693236
                                                                                
In [143]:
# Original and Predicted data
x_ax = range(0, lrprediction1.count())
y_pred = lrprediction1.select("prediction").collect()
y_orig = lrprediction1.select("score").collect()  
                                                                                
In [153]:
import matplotlib.pyplot as plt
plt.figure(figsize=(8,8))
plt.plot(x_ax, y_orig,'orange', label="original")
plt.plot(x_ax, y_pred,'cornflowerblue' ,label="predicted")
plt.title("Test and predicted data using Linear Regression")
plt.xlabel('Range of data')
plt.ylabel('Score')
plt.legend(loc='best',fancybox=True, shadow=True)
plt.ticklabel_format(style='plain',useOffset=False) 
plt.grid(True)
plt.show()  
 

Decision Tree Regression¶

In [121]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
In [122]:
dt = DecisionTreeRegressor(labelCol="score", featuresCol="features")
In [124]:
#Creating a pipeline
pipeline_dt = Pipeline(stages=[stringIndexer_sentiment, stringIndexer_hour, onehot_sentiment, onehot_hour, vectorAssembler_sentiment, dt])
In [125]:
# Create ParamGrid for Cross Validation
dtparamGrid = (ParamGridBuilder()\
             .addGrid(dt.maxDepth, [2, 5, 10])\
             .addGrid(dt.maxBins, [10, 20,40])\
             .build())
In [126]:
# Evaluate model
dtevaluator = RegressionEvaluator(predictionCol="prediction", labelCol="score", metricName="rmse")
In [127]:
# Create 3-fold CrossValidator
dtcv = CrossValidator(estimator = pipeline_dt,
                      estimatorParamMaps = dtparamGrid,
                      evaluator = dtevaluator,
                      numFolds = 3)
In [128]:
# Run cross validations
dtcvModel = dtcv.fit(train_data)
                                                                                
In [ ]:
# Save the model pipeline into s3
dtcvModel.write().save('s3://ssp88-labdata2/dtcvModel')
In [129]:
# Use test set here so we can measure the accuracy of our model on new data
dtpredictions = dtcvModel.transform(test_data)
In [140]:
# cvModel uses the best model found from the Cross Validation
# Evaluate best model
print('MSE:',dtevaluator.evaluate(dtpredictions,
{dtevaluator.metricName: "mse"})
)
print('RMSE:', dtevaluator.evaluate(dtpredictions,
{dtevaluator.metricName: "rmse"})
)
print('MAE:',dtevaluator.evaluate(dtpredictions,
{dtevaluator.metricName: "mae"})
)
                                                                                
MSE: 30039.488938120427
                                                                                
RMSE: 173.3190380140636
[Stage 5478:===================================================>  (49 + 2) / 51]
MAE: 21.004838989749768
                                                                                
In [155]:
# Original and Predicted data
x_ax = range(0, dtpredictions.count())
y_pred = dtpredictions.select("prediction").collect()
y_orig = dtpredictions.select("score").collect()  
                                                                                
In [156]:
import matplotlib.pyplot as plt
plt.figure(figsize=(8,8))
plt.plot(x_ax, y_orig,'orange', label="original")
plt.plot(x_ax, y_pred,'darkorchid' ,label="predicted")
plt.title("Test and predicted data using Decision Tree")
plt.xlabel('Range of data')
plt.ylabel('Score')
plt.legend(loc='best',fancybox=True, shadow=True)
plt.ticklabel_format(style='plain',useOffset=False) 
plt.grid(True)
plt.show()  
 

Question 2 : Predict Controversiality of the comment¶

ML Transformations¶

In [101]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import roc_curve, roc_auc_score
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml import Pipeline, Model
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
In [9]:
#Selecting necessary columns
df_con = df_full.select(['author_premium', 'controversiality', 'locked', 'no_follow', 'stickied','total_awards_received','comment_length','hour', 'score'] )
In [10]:
#Checking categorical columns
df_con.groupBy('author_premium').count().show()
22/04/29 15:35:17 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                
+--------------+--------+
|author_premium|   count|
+--------------+--------+
|          null| 2988543|
|          true|  763357|
|         false|14369052|
+--------------+--------+

In [11]:
#Keeping the rows with only True, False as author_premium
df_con = df_con.filter((df_con.author_premium == 'true') | (df_con.author_premium == 'false'))
In [12]:
#Checking the balance of target
df_con.groupBy('controversiality').count().show()
                                                                                
+----------------+--------+
|controversiality|   count|
+----------------+--------+
|               1|  779649|
|               0|14352760|
+----------------+--------+

In [13]:
#Downsamping as controversiality is imbalanced
major_df = df_con.filter(col("controversiality") == 0)
minor_df = df_con.filter(col("controversiality") == 1)
ratio = int(major_df.count()/minor_df.count())
                                                                                
In [14]:
sampled_majority_df = major_df.sample(False, 1/ratio)
downsampled_df = sampled_majority_df.unionAll(minor_df)
In [120]:
downsampled_df.groupBy('controversiality').count().show()
                                                                                
+----------------+------+
|controversiality| count|
+----------------+------+
|               1|779649|
|               0|799348|
+----------------+------+

In [61]:
#Splitting the data into train test and predict for our creating the model
train_data, test_data = downsampled_df.randomSplit([0.8, 0.2,], 24)
In [62]:
stringIndexer_hour = StringIndexer(inputCol="hour", outputCol="hour_idx")
In [63]:
onehot_hour = OneHotEncoder(inputCol="hour_idx", 
                       outputCol="hour_vec")
In [64]:
vectorAssembler_con = VectorAssembler(
    inputCols=['author_premium', 'locked', 'no_follow', 'stickied','total_awards_received','comment_length','hour_vec', 'score'],
    outputCol= "features")

Logistic Regression¶

In [83]:
# Create initial LogisticRegression model
lr2 = LogisticRegression(labelCol="controversiality", featuresCol="features")
In [84]:
#Creating a pipeline
pipeline_lr2 = Pipeline(stages=[stringIndexer_hour, onehot_hour, vectorAssembler_con, lr2])
In [93]:
# Create ParamGrid for Cross Validation
lrparamGrid2 = ParamGridBuilder()\
             .addGrid(lr2.regParam, [0.01, 0.1, 0.5])\
             .addGrid(lr2.elasticNetParam, [0.25, 0.5, 0.75])\
             .addGrid(lr2.maxIter, [5, 10, 20])\
             .build()
In [94]:
# Evaluate model
lrevaluator2 = BinaryClassificationEvaluator().setLabelCol("controversiality")
In [95]:
# Create 3-fold CrossValidator
lrcv2 = CrossValidator(estimator = pipeline_lr2,
                    estimatorParamMaps = lrparamGrid2,
                    evaluator = lrevaluator2,
                    numFolds = 3)
In [96]:
# Run cross validations
lrcvModel2 = lrcv2.fit(train_data)
                                                                                
In [183]:
# Save the model pipeline into s3
lrcvModel2.write().save('s3://ssp88-labdata2/lrcvModel2')
In [99]:
# Use test set here so we can measure the accuracy of our model on new data
lrprediction2 = lrcvModel2.transform(test_data)
In [127]:
# cvModel uses the best model found from the Cross Validation
# Evaluate best model
lr_acc2 = lrevaluator2.evaluate(lrprediction2)

#Displaying the results
print("Accuracy = %g" % lr_acc2)
print("Test Error = %g" % (1.0 - lr_acc2))
                                                                                
Accuracy = 0.626823
Test Error = 0.373177
In [130]:
#Confusion Matrix 
#y_pred has the predicted labels
y_pred_lr=lrprediction2.select("prediction").collect()
#y_orig are the actual labels 
y_orig_lr =lrprediction2.select("controversiality").collect()
                                                                                
In [131]:
#Creating the confusion matrix 
cm = confusion_matrix(y_orig_lr, y_pred_lr)
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[ 47209 111748]
 [ 28353 127511]]

Random Forest¶

In [111]:
# Create initial RandomForest model
rf2 = RandomForestClassifier(labelCol="controversiality", featuresCol="features")
In [112]:
#Creating a pipeline
pipeline_rf2 = Pipeline(stages=[stringIndexer_hour, onehot_hour, vectorAssembler_con, rf2])
In [114]:
# Create ParamGrid for Cross Validation
rfparamGrid2 = ParamGridBuilder()\
               .addGrid(rf2.maxDepth, [2, 5, 10])\
               .addGrid(rf2.maxBins, [5, 10, 20])\
               .addGrid(rf2.numTrees, [5, 20, 50])\
               .build()
In [119]:
# Evaluate model
rfevaluator2 = RegressionEvaluator().setLabelCol("controversiality")
In [122]:
# Create 3-fold CrossValidator
rfcv2 = CrossValidator(estimator = pipeline_rf2,
                      estimatorParamMaps = rfparamGrid2,
                      evaluator = rfevaluator2,
                      numFolds = 3)
In [123]:
# Run cross validations
rfcvModel2 = rfcv2.fit(train_data)
22/04/29 17:03:05 WARN CacheManager: Asked to cache already cached data.
22/04/29 17:03:05 WARN CacheManager: Asked to cache already cached data.
22/04/29 17:05:05 WARN DAGScheduler: Broadcasting large task binary with size 1335.8 KiB
22/04/29 17:05:07 WARN DAGScheduler: Broadcasting large task binary with size 1782.4 KiB
22/04/29 17:05:35 WARN DAGScheduler: Broadcasting large task binary with size 1002.0 KiB
22/04/29 17:05:37 WARN DAGScheduler: Broadcasting large task binary with size 1402.0 KiB
22/04/29 17:05:39 WARN DAGScheduler: Broadcasting large task binary with size 1904.9 KiB
22/04/29 17:06:06 WARN DAGScheduler: Broadcasting large task binary with size 1003.3 KiB
22/04/29 17:06:08 WARN DAGScheduler: Broadcasting large task binary with size 1413.0 KiB
22/04/29 17:06:10 WARN DAGScheduler: Broadcasting large task binary with size 1935.1 KiB
22/04/29 17:07:55 WARN DAGScheduler: Broadcasting large task binary with size 1000.3 KiB
22/04/29 17:07:57 WARN DAGScheduler: Broadcasting large task binary with size 1384.7 KiB
22/04/29 17:07:59 WARN DAGScheduler: Broadcasting large task binary with size 1849.5 KiB
22/04/29 17:08:26 WARN DAGScheduler: Broadcasting large task binary with size 1020.0 KiB
22/04/29 17:08:28 WARN DAGScheduler: Broadcasting large task binary with size 1440.1 KiB
22/04/29 17:08:30 WARN DAGScheduler: Broadcasting large task binary with size 1960.8 KiB
22/04/29 17:08:55 WARN DAGScheduler: Broadcasting large task binary with size 1030.6 KiB
22/04/29 17:08:57 WARN DAGScheduler: Broadcasting large task binary with size 1469.6 KiB
22/04/29 17:08:59 WARN DAGScheduler: Broadcasting large task binary with size 2024.9 KiB
22/04/29 17:10:49 WARN DAGScheduler: Broadcasting large task binary with size 1378.1 KiB
22/04/29 17:10:51 WARN DAGScheduler: Broadcasting large task binary with size 1852.2 KiB
22/04/29 17:11:17 WARN DAGScheduler: Broadcasting large task binary with size 1019.6 KiB
22/04/29 17:11:19 WARN DAGScheduler: Broadcasting large task binary with size 1428.2 KiB
22/04/29 17:11:21 WARN DAGScheduler: Broadcasting large task binary with size 1949.9 KiB
22/04/29 17:11:46 WARN DAGScheduler: Broadcasting large task binary with size 1024.5 KiB
22/04/29 17:11:47 WARN DAGScheduler: Broadcasting large task binary with size 1455.5 KiB
22/04/29 17:11:50 WARN DAGScheduler: Broadcasting large task binary with size 1999.9 KiB
                                                                                
In [184]:
# Save the model pipeline into s3
rfcvModel2.write().save('s3://ssp88-labdata2/rfcvModel2')
In [124]:
# Use test set here so we can measure the accuracy of our model on new data
rfprediction2 = rfcvModel2.transform(test_data)
In [126]:
# cvModel uses the best model found from the Cross Validation
# Evaluate best model
rf_acc2 = rfevaluator2.evaluate(rfprediction2)

#Displaying the results
print("Accuracy = %g" % rf_acc2)
print("Test Error = %g" % (1.0 - rf_acc2))
[Stage 18586:==================================================>(100 + 2) / 102]
Accuracy = 0.550374
Test Error = 0.449626
                                                                                
In [132]:
#Confusion Matrix
#y_pred has the predicted labels
y_pred_rf=rfprediction2.select("prediction").collect()
#y_orig are the actual labels 
y_orig_rf =rfprediction2.select("controversiality").collect()
                                                                                
In [133]:
#Creating the confusion matrix 
cm = confusion_matrix(y_orig_rf, y_pred_rf)
print("Confusion Matrix:")
print(cm)
Confusion Matrix:
[[137319  21638]
 [ 73725  82139]]
In [ ]:
 

ROC Curve¶

In [161]:
from sklearn.metrics import roc_curve
from pyspark.ml.functions import vector_to_array

y_score_lr = lrprediction2.select(vector_to_array("probability")[1]).rdd.keys().collect()
y_true_lr = lrprediction2.select("controversiality").rdd.keys().collect()
fprlr, tprlr, thresholdslr = roc_curve(y_true_lr, y_score_lr)
                                                                                
In [162]:
y_score_rf = rfprediction2.select(vector_to_array("probability")[1]).rdd.keys().collect()
y_true_rf = rfprediction2.select("controversiality").rdd.keys().collect()
fprrf, tprrf, thresholdsrf = roc_curve(y_true_rf, y_score_rf)
                                                                                
In [179]:
plt.figure(figsize=(8,8))
plt.plot([0, 1], [0, 1], 'r--')
lr_plot = plt.plot(fprlr,
         tprlr,'g', label='logistic regression')
rf_plot = plt.plot(fprrf,
         tprrf,'b', label='random forest')
plt.legend(loc="upper left")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("ROC Curve")
plt.show()
plt.savefig('roc_lr.png')
<Figure size 432x288 with 0 Axes>
In [ ]:
spark.stop()