Demo entry 6671651

clustering_v3

Submitted by anonymous on Dec 01, 2017 at 02:02
Language: Python. Code size: 1.2 kB.

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans
import csv
import sys
import time

def main():
    sc = SparkContext()
    name = sys.argv[1]
    # Per-stage timings are kept in local variables and written as one CSV row.
    with open('elapsed_time_' + str(name), 'wt') as f:
        writer = csv.writer(f)
        writer.writerow(('Load', 'Map1', 'Map2', 'Train', 'Tot'))
        t_tot = time.time()

        # textFile() and map() are lazy transformations, so the three timings
        # below measure only job-graph construction; the actual I/O and
        # parsing run when KMeans.train() triggers execution.
        t_load = time.time()
        raw = sc.textFile(u"/user/user1/data/" + str(name))
        load_time = round((time.time() - t_load) * 1000, 4)

        t1_map = time.time()
        line = raw.map(lambda x: x.split(','))
        map1_time = round((time.time() - t1_map) * 1000, 4)

        t2_map = time.time()
        geo = line.map(lambda x: (float(x[1]), float(x[2])))
        map2_time = round((time.time() - t2_map) * 1000, 4)

        # Cluster the (x, y) pairs into a fixed k = 8 clusters.
        k = 8
        t_train = time.time()
        model = KMeans.train(geo, k)
        train_time = round((time.time() - t_train) * 1000, 4)

        total_time = round((time.time() - t_tot) * 1000, 4)
        writer.writerow((load_time, map1_time, map2_time, train_time, total_time))

if __name__ == "__main__":
    main()
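
Since the snippet only trains the model and records timings, here is a minimal follow-up sketch, assuming the geo RDD and model produced above: pyspark.mllib's KMeansModel exposes the fitted centroids via clusterCenters, the within-set sum of squared errors via computeCost(), and per-point cluster assignments via predict(). The coordinates passed to predict() are made-up example values, not taken from the original dataset.

# Sketch only -- assumes the `geo` RDD and `model` from the script above.

# Fitted centroids, one per cluster (k = 8 in the script).
for i, center in enumerate(model.clusterCenters):
    print("cluster %d center: %s" % (i, center))

# Within-set sum of squared errors; comparing this across several k values
# is a common sanity check on a hard-coded choice of k.
print("WSSSE: %f" % model.computeCost(geo))

# Nearest-cluster index for a single point (made-up example coordinates).
print(model.predict((40.7, -74.0)))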
