import findspark
findspark.init()
#Importing libraries
import pyspark.sql.functions as f
from pyspark.sql.functions import col, lit
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("machinelearning").getOrCreate()
Setting default log level to "WARN". To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel). 22/04/29 20:16:22 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME. 22/04/29 20:16:33 WARN YarnSchedulerBackend$YarnSchedulerEndpoint: Attempted to request executors before the AM has registered!
#Checking the status of spark session
spark
SparkSession - in-memory
df_full = spark.read.parquet('s3://ssp88-labdata2/eda_df_full')
df_full.printSchema()
root |-- all_awardings: string (nullable = true) |-- author: string (nullable = true) |-- author_created_utc: double (nullable = true) |-- author_flair_richtext: string (nullable = true) |-- author_flair_type: string (nullable = true) |-- author_fullname: string (nullable = true) |-- author_patreon_flair: boolean (nullable = true) |-- author_premium: boolean (nullable = true) |-- awarders: string (nullable = true) |-- body: string (nullable = true) |-- can_gild: boolean (nullable = true) |-- can_mod_post: boolean (nullable = true) |-- collapsed: boolean (nullable = true) |-- controversiality: long (nullable = true) |-- created_utc: long (nullable = true) |-- gilded: long (nullable = true) |-- gildings: string (nullable = true) |-- id: string (nullable = true) |-- is_submitter: boolean (nullable = true) |-- link_id: string (nullable = true) |-- locked: boolean (nullable = true) |-- no_follow: boolean (nullable = true) |-- parent_id: string (nullable = true) |-- permalink: string (nullable = true) |-- quarantined: boolean (nullable = true) |-- retrieved_on: long (nullable = true) |-- score: long (nullable = true) |-- send_replies: boolean (nullable = true) |-- stickied: boolean (nullable = true) |-- subreddit_id: string (nullable = true) |-- subreddit_name_prefixed: string (nullable = true) |-- subreddit_type: string (nullable = true) |-- total_awards_received: long (nullable = true) |-- treatment_tags: string (nullable = true) |-- editable: boolean (nullable = true) |-- ym_partition: integer (nullable = true) |-- comment_date: string (nullable = true) |-- year: string (nullable = true) |-- month: string (nullable = true) |-- hour: string (nullable = true) |-- original_post: string (nullable = true)
#Checking the length of the comment
df_full = df_full.withColumn("comment_length", f.length(col('body')))
#Selecting necessary columns
df_score = df_sentiment.select(['sentiment', 'controversiality','total_awards_received','comment_length','hour', 'score'] )
df_score.printSchema()
root |-- sentiment: string (nullable = true) |-- controversiality: long (nullable = true) |-- total_awards_received: long (nullable = true) |-- comment_length: integer (nullable = true) |-- hour: string (nullable = true) |-- score: long (nullable = true)
df_score.groupBy('sentiment').count().show()
+---------+-------+ |sentiment| count| +---------+-------+ | positive|9880051| | null| 108800| | neutral| 968815| | negative|7163286| +---------+-------+
#Keeping the rows with only True, False as author_premium
df_score = df_score.filter((df_score.sentiment == 'positive') | (df_score.sentiment == 'neutral') | (df_score.sentiment == 'negative'))
import pandas as pd
import numpy as np
import json
from sklearn.metrics import roc_curve, roc_auc_score
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml import Pipeline, Model
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
#Splitting the data into train test and predict for our creating the model
train_data, test_data = df_score.randomSplit([0.8, 0.2], 24)
stringIndexer_sentiment = StringIndexer(inputCol="sentiment", outputCol="sentiment_idx")
stringIndexer_hour = StringIndexer(inputCol="hour", outputCol="hour_idx")
onehot_sentiment = OneHotEncoder(inputCol="sentiment_idx",
outputCol="sentiment_vec")
onehot_hour = OneHotEncoder(inputCol="hour_idx",
outputCol="hour_vec")
vectorAssembler_sentiment = VectorAssembler(
inputCols=['sentiment_vec', 'controversiality','total_awards_received','comment_length','hour_vec'],
outputCol= "features")
# Create initial LinearRegression model
lr1 = LinearRegression(labelCol="score", featuresCol="features")
#Creating a pipeline
pipeline_lr1 = Pipeline(stages=[stringIndexer_sentiment, stringIndexer_hour, onehot_sentiment, onehot_hour, vectorAssembler_sentiment, lr1])
# Create ParamGrid for Cross Validation
lrparamGrid2 = ParamGridBuilder()\
.addGrid(lr1.regParam, [1,2, 3])\
.addGrid(lr1.elasticNetParam, [0.25, 0.5, 0.75])\
.addGrid(lr1.maxIter, [5, 10, 20])\
.build()
# Evaluate model
lrevaluator1 = RegressionEvaluator(predictionCol="prediction", labelCol="score", metricName="rmse").setLabelCol("score")
# Create 3-fold CrossValidator
lrcv1 = CrossValidator(estimator = pipeline_lr1,
estimatorParamMaps = lrparamGrid1,
evaluator = lrevaluator1,
numFolds = 3)
# Run cross validations
lrcvModel1 = lrcv1.fit(train_data)
22/04/29 23:11:47 WARN Instrumentation: [c9cb094b] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:11:50 WARN Instrumentation: [c9cb094b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:12:03 WARN Instrumentation: [485fa21e] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:12:05 WARN Instrumentation: [485fa21e] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:12:08 WARN Instrumentation: [190b2e7b] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:12:10 WARN Instrumentation: [190b2e7b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:12:14 WARN Instrumentation: [9dfe5eac] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:12:16 WARN Instrumentation: [9dfe5eac] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:12:19 WARN Instrumentation: [47deb984] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:12:21 WARN Instrumentation: [47deb984] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:12:24 WARN Instrumentation: [23802792] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:12:26 WARN Instrumentation: [23802792] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:12:30 WARN Instrumentation: [df64228b] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:12:32 WARN Instrumentation: [df64228b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:12:35 WARN Instrumentation: [e0a82e5f] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:12:37 WARN Instrumentation: [e0a82e5f] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:12:40 WARN Instrumentation: [717eb28c] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:12:42 WARN Instrumentation: [717eb28c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:12:46 WARN Instrumentation: [afe26e5a] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:12:48 WARN Instrumentation: [afe26e5a] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:12:51 WARN Instrumentation: [6b5f1c5b] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:12:53 WARN Instrumentation: [6b5f1c5b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:12:56 WARN Instrumentation: [4bcae450] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:12:58 WARN Instrumentation: [4bcae450] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:13:02 WARN Instrumentation: [af444977] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:13:04 WARN Instrumentation: [af444977] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:13:07 WARN Instrumentation: [15e97047] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:13:09 WARN Instrumentation: [15e97047] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:13:12 WARN Instrumentation: [ab63b5a6] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:13:14 WARN Instrumentation: [ab63b5a6] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:13:18 WARN Instrumentation: [6183ab5c] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:13:20 WARN Instrumentation: [6183ab5c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:13:23 WARN Instrumentation: [f54b3653] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:13:25 WARN Instrumentation: [f54b3653] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:13:29 WARN Instrumentation: [ff0e1202] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:13:31 WARN Instrumentation: [ff0e1202] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:13:34 WARN Instrumentation: [2e90c120] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:13:36 WARN Instrumentation: [2e90c120] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:13:40 WARN Instrumentation: [5d210179] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:13:42 WARN Instrumentation: [5d210179] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:13:45 WARN Instrumentation: [41bad21c] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:13:47 WARN Instrumentation: [41bad21c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:13:51 WARN Instrumentation: [2dc027c5] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:13:52 WARN Instrumentation: [2dc027c5] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:13:56 WARN Instrumentation: [6f282732] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:13:58 WARN Instrumentation: [6f282732] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:14:01 WARN Instrumentation: [cd9d0021] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:14:03 WARN Instrumentation: [cd9d0021] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:14:07 WARN Instrumentation: [ca159973] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:14:09 WARN Instrumentation: [ca159973] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:14:12 WARN Instrumentation: [b3207827] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:14:14 WARN Instrumentation: [b3207827] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:14:17 WARN Instrumentation: [932f90b6] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:14:19 WARN Instrumentation: [932f90b6] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:14:23 WARN Instrumentation: [2329af98] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:14:25 WARN Instrumentation: [2329af98] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:14:28 WARN Instrumentation: [7e3d70ea] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:14:30 WARN Instrumentation: [7e3d70ea] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:14:33 WARN Instrumentation: [7806aa88] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:14:35 WARN Instrumentation: [7806aa88] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:14:39 WARN Instrumentation: [a88c6793] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:14:41 WARN Instrumentation: [a88c6793] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:14:44 WARN Instrumentation: [ab3920a2] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:14:46 WARN Instrumentation: [ab3920a2] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:14:49 WARN Instrumentation: [3cca8421] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:14:51 WARN Instrumentation: [3cca8421] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:14:55 WARN Instrumentation: [2023382c] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:14:57 WARN Instrumentation: [2023382c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:15:00 WARN Instrumentation: [51b8e95c] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:15:02 WARN Instrumentation: [51b8e95c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:15:05 WARN Instrumentation: [c98744f3] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:15:07 WARN Instrumentation: [c98744f3] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:15:18 WARN Instrumentation: [878bd086] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:15:20 WARN Instrumentation: [878bd086] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:15:30 WARN Instrumentation: [ebd9f5ec] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:15:31 WARN Instrumentation: [ebd9f5ec] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:15:35 WARN Instrumentation: [949f9383] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:15:37 WARN Instrumentation: [949f9383] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:15:40 WARN Instrumentation: [ef97d85c] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:15:42 WARN Instrumentation: [ef97d85c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:15:45 WARN Instrumentation: [d7695173] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:15:47 WARN Instrumentation: [d7695173] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:15:50 WARN Instrumentation: [e2e3937d] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:15:52 WARN Instrumentation: [e2e3937d] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:15:56 WARN Instrumentation: [c3653b1b] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:15:57 WARN Instrumentation: [c3653b1b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:16:01 WARN Instrumentation: [95051f9b] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:16:03 WARN Instrumentation: [95051f9b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:16:06 WARN Instrumentation: [f21d1404] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:16:08 WARN Instrumentation: [f21d1404] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:16:12 WARN Instrumentation: [4754245f] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:16:13 WARN Instrumentation: [4754245f] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:16:17 WARN Instrumentation: [96660cc7] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:16:18 WARN Instrumentation: [96660cc7] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:16:22 WARN Instrumentation: [62c71923] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:16:24 WARN Instrumentation: [62c71923] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:16:27 WARN Instrumentation: [a05ecb53] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:16:29 WARN Instrumentation: [a05ecb53] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:16:32 WARN Instrumentation: [0ea16bd3] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:16:34 WARN Instrumentation: [0ea16bd3] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:16:37 WARN Instrumentation: [516b11c1] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:16:39 WARN Instrumentation: [516b11c1] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:16:43 WARN Instrumentation: [a8148271] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:16:44 WARN Instrumentation: [a8148271] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:16:48 WARN Instrumentation: [f7aa12ad] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:16:49 WARN Instrumentation: [f7aa12ad] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:16:53 WARN Instrumentation: [1a126cbe] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:16:55 WARN Instrumentation: [1a126cbe] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:16:59 WARN Instrumentation: [1da2a151] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:17:01 WARN Instrumentation: [1da2a151] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:17:04 WARN Instrumentation: [b311ce45] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:17:06 WARN Instrumentation: [b311ce45] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:17:10 WARN Instrumentation: [31854af7] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:17:11 WARN Instrumentation: [31854af7] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:17:15 WARN Instrumentation: [53b43b3e] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:17:16 WARN Instrumentation: [53b43b3e] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:17:20 WARN Instrumentation: [27b94718] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:17:21 WARN Instrumentation: [27b94718] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:17:25 WARN Instrumentation: [fb161313] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:17:27 WARN Instrumentation: [fb161313] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:17:30 WARN Instrumentation: [497f8d91] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:17:32 WARN Instrumentation: [497f8d91] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:17:35 WARN Instrumentation: [a17c5e68] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:17:37 WARN Instrumentation: [a17c5e68] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:17:40 WARN Instrumentation: [1a77fedd] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:17:43 WARN Instrumentation: [1a77fedd] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:17:46 WARN Instrumentation: [78d43ab5] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:17:48 WARN Instrumentation: [78d43ab5] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:17:51 WARN Instrumentation: [7b34c4e0] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:17:53 WARN Instrumentation: [7b34c4e0] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:17:56 WARN Instrumentation: [e32f9c75] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:17:58 WARN Instrumentation: [e32f9c75] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:18:01 WARN Instrumentation: [2c65257b] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:18:03 WARN Instrumentation: [2c65257b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:18:07 WARN Instrumentation: [944aad32] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:18:08 WARN Instrumentation: [944aad32] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:18:12 WARN Instrumentation: [91f630e5] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:18:14 WARN Instrumentation: [91f630e5] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:18:17 WARN Instrumentation: [79639e04] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:18:19 WARN Instrumentation: [79639e04] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:18:22 WARN Instrumentation: [8736cac6] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:18:24 WARN Instrumentation: [8736cac6] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:18:28 WARN Instrumentation: [04d023e0] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:18:30 WARN Instrumentation: [04d023e0] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:18:41 WARN Instrumentation: [5019488e] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:18:43 WARN Instrumentation: [5019488e] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:18:53 WARN Instrumentation: [7edd30f8] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:18:56 WARN Instrumentation: [7edd30f8] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:19:00 WARN Instrumentation: [7b1d0745] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:19:01 WARN Instrumentation: [7b1d0745] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:19:05 WARN Instrumentation: [826e7b6e] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:19:07 WARN Instrumentation: [826e7b6e] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:19:10 WARN Instrumentation: [ddeeec1c] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:19:12 WARN Instrumentation: [ddeeec1c] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:19:15 WARN Instrumentation: [f8ad96fa] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:19:17 WARN Instrumentation: [f8ad96fa] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:19:21 WARN Instrumentation: [99e4e580] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:19:23 WARN Instrumentation: [99e4e580] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:19:26 WARN Instrumentation: [3d80c4f4] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:19:28 WARN Instrumentation: [3d80c4f4] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:19:32 WARN Instrumentation: [2d03d82e] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:19:34 WARN Instrumentation: [2d03d82e] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:19:37 WARN Instrumentation: [57ae0b76] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:19:39 WARN Instrumentation: [57ae0b76] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:19:43 WARN Instrumentation: [af08cda9] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:19:45 WARN Instrumentation: [af08cda9] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:19:48 WARN Instrumentation: [f97243e1] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:19:50 WARN Instrumentation: [f97243e1] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:19:53 WARN Instrumentation: [cbb15358] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:19:55 WARN Instrumentation: [cbb15358] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:19:59 WARN Instrumentation: [e7ce68c3] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:20:01 WARN Instrumentation: [e7ce68c3] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:20:04 WARN Instrumentation: [d8e1111a] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:20:06 WARN Instrumentation: [d8e1111a] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:20:10 WARN Instrumentation: [3fa52337] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:20:12 WARN Instrumentation: [3fa52337] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:20:15 WARN Instrumentation: [5a7f7f94] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:20:17 WARN Instrumentation: [5a7f7f94] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:20:21 WARN Instrumentation: [a78574b2] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:20:22 WARN Instrumentation: [a78574b2] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:20:26 WARN Instrumentation: [80a74b52] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:20:28 WARN Instrumentation: [80a74b52] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:20:31 WARN Instrumentation: [83c2faf2] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:20:33 WARN Instrumentation: [83c2faf2] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:20:37 WARN Instrumentation: [20f56297] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:20:39 WARN Instrumentation: [20f56297] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:20:42 WARN Instrumentation: [a209cf08] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:20:44 WARN Instrumentation: [a209cf08] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:20:47 WARN Instrumentation: [7f960cdc] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:20:49 WARN Instrumentation: [7f960cdc] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:20:53 WARN Instrumentation: [d0cb4e8b] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:20:55 WARN Instrumentation: [d0cb4e8b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:20:58 WARN Instrumentation: [a4e9949f] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:21:00 WARN Instrumentation: [a4e9949f] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:21:04 WARN Instrumentation: [59df2c95] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:21:06 WARN Instrumentation: [59df2c95] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:21:09 WARN Instrumentation: [e74e91de] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:21:11 WARN Instrumentation: [e74e91de] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:21:14 WARN Instrumentation: [e2c5ddd8] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:21:16 WARN Instrumentation: [e2c5ddd8] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:21:20 WARN Instrumentation: [77c54391] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:21:22 WARN Instrumentation: [77c54391] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:21:25 WARN Instrumentation: [882e6834] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:21:27 WARN Instrumentation: [882e6834] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:21:31 WARN Instrumentation: [6873f457] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:21:32 WARN Instrumentation: [6873f457] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:21:36 WARN Instrumentation: [34132160] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:21:38 WARN Instrumentation: [34132160] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:21:41 WARN Instrumentation: [fbd8b341] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:21:43 WARN Instrumentation: [fbd8b341] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:21:47 WARN Instrumentation: [7d70da9b] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:21:49 WARN Instrumentation: [7d70da9b] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:21:52 WARN Instrumentation: [eaea3f7f] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:21:54 WARN Instrumentation: [eaea3f7f] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:21:57 WARN Instrumentation: [dcb2c73a] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:21:59 WARN Instrumentation: [dcb2c73a] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver. 22/04/29 23:22:09 WARN Instrumentation: [271f8d42] regParam is zero, which might cause numerical instability and overfitting. 22/04/29 23:22:17 WARN Instrumentation: [271f8d42] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.
# Save the model pipeline into s3
lrcvModel1.write().save('s3://ssp88-labdata2/lrcvModel1.1')
# Use test set here so we can measure the accuracy of our model on new data
lrprediction1 = lrcvModel1.transform(test_data)
# cvModel uses the best model found from the Cross Validation
# Evaluate best model
print('MSE:',lrevaluator1.evaluate(lrprediction1,
{lrevaluator1.metricName: "mse"})
)
print('RMSE:', lrevaluator1.evaluate(lrprediction1,
{lrevaluator1.metricName: "rmse"})
)
print('MAE:',lrevaluator1.evaluate(lrprediction1,
{lrevaluator1.metricName: "mae"})
)
MSE: 32234.198698835175
RMSE: 179.53885011003933
[Stage 5484:==================================================> (48 + 3) / 51]
MAE: 22.33287317693236
# Original and Predicted data
x_ax = range(0, lrprediction1.count())
y_pred = lrprediction1.select("prediction").collect()
y_orig = lrprediction1.select("score").collect()
import matplotlib.pyplot as plt
plt.figure(figsize=(8,8))
plt.plot(x_ax, y_orig,'orange', label="original")
plt.plot(x_ax, y_pred,'cornflowerblue' ,label="predicted")
plt.title("Test and predicted data using Linear Regression")
plt.xlabel('Range of data')
plt.ylabel('Score')
plt.legend(loc='best',fancybox=True, shadow=True)
plt.ticklabel_format(style='plain',useOffset=False)
plt.grid(True)
plt.show()
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
dt = DecisionTreeRegressor(labelCol="score", featuresCol="features")
#Creating a pipeline
pipeline_dt = Pipeline(stages=[stringIndexer_sentiment, stringIndexer_hour, onehot_sentiment, onehot_hour, vectorAssembler_sentiment, dt])
# Create ParamGrid for Cross Validation
dtparamGrid = (ParamGridBuilder()\
.addGrid(dt.maxDepth, [2, 5, 10])\
.addGrid(dt.maxBins, [10, 20,40])\
.build())
# Evaluate model
dtevaluator = RegressionEvaluator(predictionCol="prediction", labelCol="score", metricName="rmse")
# Create 3-fold CrossValidator
dtcv = CrossValidator(estimator = pipeline_dt,
estimatorParamMaps = dtparamGrid,
evaluator = dtevaluator,
numFolds = 3)
# Run cross validations
dtcvModel = dtcv.fit(train_data)
# Save the model pipeline into s3
dtcvModel.write().save('s3://ssp88-labdata2/dtcvModel')
# Use test set here so we can measure the accuracy of our model on new data
dtpredictions = dtcvModel.transform(test_data)
# cvModel uses the best model found from the Cross Validation
# Evaluate best model
print('MSE:',dtevaluator.evaluate(dtpredictions,
{dtevaluator.metricName: "mse"})
)
print('RMSE:', dtevaluator.evaluate(dtpredictions,
{dtevaluator.metricName: "rmse"})
)
print('MAE:',dtevaluator.evaluate(dtpredictions,
{dtevaluator.metricName: "mae"})
)
MSE: 30039.488938120427
RMSE: 173.3190380140636
[Stage 5478:===================================================> (49 + 2) / 51]
MAE: 21.004838989749768
# Original and Predicted data
x_ax = range(0, dtpredictions.count())
y_pred = dtpredictions.select("prediction").collect()
y_orig = dtpredictions.select("score").collect()
import matplotlib.pyplot as plt
plt.figure(figsize=(8,8))
plt.plot(x_ax, y_orig,'orange', label="original")
plt.plot(x_ax, y_pred,'darkorchid' ,label="predicted")
plt.title("Test and predicted data using Decision Tree")
plt.xlabel('Range of data')
plt.ylabel('Score')
plt.legend(loc='best',fancybox=True, shadow=True)
plt.ticklabel_format(style='plain',useOffset=False)
plt.grid(True)
plt.show()
import pandas as pd
import numpy as np
import json
from sklearn.metrics import roc_curve, roc_auc_score
from pyspark.ml.feature import OneHotEncoder, StringIndexer, IndexToString, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, LogisticRegression, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml import Pipeline, Model
from pyspark.ml.pipeline import PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
#Selecting necessary columns
df_con = df_full.select(['author_premium', 'controversiality', 'locked', 'no_follow', 'stickied','total_awards_received','comment_length','hour', 'score'] )
#Checking categorical columns
df_con.groupBy('author_premium').count().show()
22/04/29 15:35:17 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
+--------------+--------+ |author_premium| count| +--------------+--------+ | null| 2988543| | true| 763357| | false|14369052| +--------------+--------+
#Keeping the rows with only True, False as author_premium
df_con = df_con.filter((df_con.author_premium == 'true') | (df_con.author_premium == 'false'))
#Checking the balance of target
df_con.groupBy('controversiality').count().show()
+----------------+--------+ |controversiality| count| +----------------+--------+ | 1| 779649| | 0|14352760| +----------------+--------+
#Downsamping as controversiality is imbalanced
major_df = df_con.filter(col("controversiality") == 0)
minor_df = df_con.filter(col("controversiality") == 1)
ratio = int(major_df.count()/minor_df.count())
sampled_majority_df = major_df.sample(False, 1/ratio)
downsampled_df = sampled_majority_df.unionAll(minor_df)
downsampled_df.groupBy('controversiality').count().show()
+----------------+------+ |controversiality| count| +----------------+------+ | 1|779649| | 0|799348| +----------------+------+
#Splitting the data into train test and predict for our creating the model
train_data, test_data = downsampled_df.randomSplit([0.8, 0.2,], 24)
stringIndexer_hour = StringIndexer(inputCol="hour", outputCol="hour_idx")
onehot_hour = OneHotEncoder(inputCol="hour_idx",
outputCol="hour_vec")
vectorAssembler_con = VectorAssembler(
inputCols=['author_premium', 'locked', 'no_follow', 'stickied','total_awards_received','comment_length','hour_vec', 'score'],
outputCol= "features")
# Create initial LogisticRegression model
lr2 = LogisticRegression(labelCol="controversiality", featuresCol="features")
#Creating a pipeline
pipeline_lr2 = Pipeline(stages=[stringIndexer_hour, onehot_hour, vectorAssembler_con, lr2])
# Create ParamGrid for Cross Validation
lrparamGrid2 = ParamGridBuilder()\
.addGrid(lr2.regParam, [0.01, 0.1, 0.5])\
.addGrid(lr2.elasticNetParam, [0.25, 0.5, 0.75])\
.addGrid(lr2.maxIter, [5, 10, 20])\
.build()
# Evaluate model
lrevaluator2 = BinaryClassificationEvaluator().setLabelCol("controversiality")
# Create 3-fold CrossValidator
lrcv2 = CrossValidator(estimator = pipeline_lr2,
estimatorParamMaps = lrparamGrid2,
evaluator = lrevaluator2,
numFolds = 3)
# Run cross validations
lrcvModel2 = lrcv2.fit(train_data)
# Save the model pipeline into s3
lrcvModel2.write().save('s3://ssp88-labdata2/lrcvModel2')
# Use test set here so we can measure the accuracy of our model on new data
lrprediction2 = lrcvModel2.transform(test_data)
# cvModel uses the best model found from the Cross Validation
# Evaluate best model
lr_acc2 = lrevaluator2.evaluate(lrprediction2)
#Displaying the results
print("Accuracy = %g" % lr_acc2)
print("Test Error = %g" % (1.0 - lr_acc2))
Accuracy = 0.626823 Test Error = 0.373177
#Confusion Matrix
#y_pred has the predicted labels
y_pred_lr=lrprediction2.select("prediction").collect()
#y_orig are the actual labels
y_orig_lr =lrprediction2.select("controversiality").collect()
#Creating the confusion matrix
cm = confusion_matrix(y_orig_lr, y_pred_lr)
print("Confusion Matrix:")
print(cm)
Confusion Matrix: [[ 47209 111748] [ 28353 127511]]
# Create initial RandomForest model
rf2 = RandomForestClassifier(labelCol="controversiality", featuresCol="features")
#Creating a pipeline
pipeline_rf2 = Pipeline(stages=[stringIndexer_hour, onehot_hour, vectorAssembler_con, rf2])
# Create ParamGrid for Cross Validation
rfparamGrid2 = ParamGridBuilder()\
.addGrid(rf2.maxDepth, [2, 5, 10])\
.addGrid(rf2.maxBins, [5, 10, 20])\
.addGrid(rf2.numTrees, [5, 20, 50])\
.build()
# Evaluate model
rfevaluator2 = RegressionEvaluator().setLabelCol("controversiality")
# Create 3-fold CrossValidator
rfcv2 = CrossValidator(estimator = pipeline_rf2,
estimatorParamMaps = rfparamGrid2,
evaluator = rfevaluator2,
numFolds = 3)
# Run cross validations
rfcvModel2 = rfcv2.fit(train_data)
22/04/29 17:03:05 WARN CacheManager: Asked to cache already cached data. 22/04/29 17:03:05 WARN CacheManager: Asked to cache already cached data. 22/04/29 17:05:05 WARN DAGScheduler: Broadcasting large task binary with size 1335.8 KiB 22/04/29 17:05:07 WARN DAGScheduler: Broadcasting large task binary with size 1782.4 KiB 22/04/29 17:05:35 WARN DAGScheduler: Broadcasting large task binary with size 1002.0 KiB 22/04/29 17:05:37 WARN DAGScheduler: Broadcasting large task binary with size 1402.0 KiB 22/04/29 17:05:39 WARN DAGScheduler: Broadcasting large task binary with size 1904.9 KiB 22/04/29 17:06:06 WARN DAGScheduler: Broadcasting large task binary with size 1003.3 KiB 22/04/29 17:06:08 WARN DAGScheduler: Broadcasting large task binary with size 1413.0 KiB 22/04/29 17:06:10 WARN DAGScheduler: Broadcasting large task binary with size 1935.1 KiB 22/04/29 17:07:55 WARN DAGScheduler: Broadcasting large task binary with size 1000.3 KiB 22/04/29 17:07:57 WARN DAGScheduler: Broadcasting large task binary with size 1384.7 KiB 22/04/29 17:07:59 WARN DAGScheduler: Broadcasting large task binary with size 1849.5 KiB 22/04/29 17:08:26 WARN DAGScheduler: Broadcasting large task binary with size 1020.0 KiB 22/04/29 17:08:28 WARN DAGScheduler: Broadcasting large task binary with size 1440.1 KiB 22/04/29 17:08:30 WARN DAGScheduler: Broadcasting large task binary with size 1960.8 KiB 22/04/29 17:08:55 WARN DAGScheduler: Broadcasting large task binary with size 1030.6 KiB 22/04/29 17:08:57 WARN DAGScheduler: Broadcasting large task binary with size 1469.6 KiB 22/04/29 17:08:59 WARN DAGScheduler: Broadcasting large task binary with size 2024.9 KiB 22/04/29 17:10:49 WARN DAGScheduler: Broadcasting large task binary with size 1378.1 KiB 22/04/29 17:10:51 WARN DAGScheduler: Broadcasting large task binary with size 1852.2 KiB 22/04/29 17:11:17 WARN DAGScheduler: Broadcasting large task binary with size 1019.6 KiB 22/04/29 17:11:19 WARN DAGScheduler: Broadcasting large task binary with size 1428.2 KiB 22/04/29 17:11:21 WARN DAGScheduler: Broadcasting large task binary with size 1949.9 KiB 22/04/29 17:11:46 WARN DAGScheduler: Broadcasting large task binary with size 1024.5 KiB 22/04/29 17:11:47 WARN DAGScheduler: Broadcasting large task binary with size 1455.5 KiB 22/04/29 17:11:50 WARN DAGScheduler: Broadcasting large task binary with size 1999.9 KiB
# Save the model pipeline into s3
rfcvModel2.write().save('s3://ssp88-labdata2/rfcvModel2')
# Use test set here so we can measure the accuracy of our model on new data
rfprediction2 = rfcvModel2.transform(test_data)
# cvModel uses the best model found from the Cross Validation
# Evaluate best model
rf_acc2 = rfevaluator2.evaluate(rfprediction2)
#Displaying the results
print("Accuracy = %g" % rf_acc2)
print("Test Error = %g" % (1.0 - rf_acc2))
[Stage 18586:==================================================>(100 + 2) / 102]
Accuracy = 0.550374 Test Error = 0.449626
#Confusion Matrix
#y_pred has the predicted labels
y_pred_rf=rfprediction2.select("prediction").collect()
#y_orig are the actual labels
y_orig_rf =rfprediction2.select("controversiality").collect()
#Creating the confusion matrix
cm = confusion_matrix(y_orig_rf, y_pred_rf)
print("Confusion Matrix:")
print(cm)
Confusion Matrix: [[137319 21638] [ 73725 82139]]
from sklearn.metrics import roc_curve
from pyspark.ml.functions import vector_to_array
y_score_lr = lrprediction2.select(vector_to_array("probability")[1]).rdd.keys().collect()
y_true_lr = lrprediction2.select("controversiality").rdd.keys().collect()
fprlr, tprlr, thresholdslr = roc_curve(y_true_lr, y_score_lr)
y_score_rf = rfprediction2.select(vector_to_array("probability")[1]).rdd.keys().collect()
y_true_rf = rfprediction2.select("controversiality").rdd.keys().collect()
fprrf, tprrf, thresholdsrf = roc_curve(y_true_rf, y_score_rf)
plt.figure(figsize=(8,8))
plt.plot([0, 1], [0, 1], 'r--')
lr_plot = plt.plot(fprlr,
tprlr,'g', label='logistic regression')
rf_plot = plt.plot(fprrf,
tprrf,'b', label='random forest')
plt.legend(loc="upper left")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title("ROC Curve")
plt.show()
plt.savefig('roc_lr.png')
<Figure size 432x288 with 0 Axes>
spark.stop()