from pyspark import SparkContext, SparkConf
from numpy import array
from math import sqrt

# https://spark.apache.org/docs/latest/mllib-clustering.html#k-means
from pyspark.mllib.clustering import KMeans, KMeansModel

sconf = SparkConf().setAppName("MovieLensApp").setMaster("local[4]")
sc = SparkContext(conf=sconf)

# Load and parse the data: each line holds space-separated floats
data = sc.textFile("file:///usr/local/spark/data/mllib/kmeans_data.txt")
parsedData = data.map(lambda line: array([float(x) for x in line.split(' ')]))

# if you want to examine the contents of the parsedData RDD,
# you need to collect the distributed elements into a local list
print(parsedData.collect())

# Build the model (cluster the data) --> model will contain the cluster centers
# train(rdd, k, maxIterations=100, initializationMode='k-means||', seed=None,
#       initializationSteps=2, epsilon=0.0001, initialModel=None)
model = KMeans.train(parsedData, 2, maxIterations=10, initializationMode="k-means||")

# print the centers of the 2 clusters after training
print(model.centers)  # OR print(model.clusterCenters)
# it will print: [array([ 0.1, 0.1, 0.1]), array([ 9.1, 9.1, 9.1])]

# Evaluate clustering by computing Within Set Sum of Squared Errors:
# each point contributes its Euclidean distance to its nearest cluster center
def error(point):
    center = model.centers[model.predict(point)]
    return sqrt(sum([x**2 for x in (point - center)]))

WSSSE = parsedData.map(lambda point: error(point)).reduce(lambda x, y: x + y)
print("Within Set Sum of Squared Error = " + str(WSSSE))

# Save and load model
#model.save(sc, "KMeansModel")
#sameModel = KMeansModel.load(sc, "KMeansModel")
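
# A minimal sketch, not part of the original example: since WSSSE drops as k
# grows, a common way to choose k is the "elbow method" -- retrain for a few
# candidate k values and look for the point where the cost stops improving
# sharply. The helper name wssse_for and the candidate range range(2, 5) are
# assumptions here; parsedData, KMeans, and sqrt are reused from above.
def wssse_for(k):
    # retrain with the same settings as the main example, varying only k
    m = KMeans.train(parsedData, k, maxIterations=10,
                     initializationMode="k-means||")
    # same distance-to-nearest-center cost as error() above
    return parsedData.map(
        lambda p: sqrt(sum([x**2 for x in (p - m.centers[m.predict(p)])]))
    ).reduce(lambda a, b: a + b)

for k in range(2, 5):
    print("k = %d, WSSSE = %f" % (k, wssse_for(k)))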