# Demo entry 6750812

km

Submitted by anonymous on Jun 20, 2018 at 13:02
Language: Python. Code size: 7.4 kB.

```import random as rand
import math as math

import matplotlib.pyplot as plt
import csv
import sys

class Point:
def __init__(self, latit_, longit_):
self.latit = latit_
self.longit = longit_

class Clustering:
def __init__(self, geo_locs_, k_):
self.geo_locations = geo_locs_
self.k = k_
self.clusters = []   # 该类的点数？
self.means = []      # 该类的均值
self.debug = False   # debug flag

# 返回下一个随机点
def next_random(self, index, points, clusters):
# 选取距离其它点最远的一个点
dist = {}
for point_1 in points:
if self.debug:
print('point_1: %f %f' % (point_1.latit, point_1.longit))
# 计算该点距离其它类点的距离
for cluster in list(clusters.values()):
point_2 = cluster[0]
if self.debug:
print('point_2: %f %f' % (point_2.latit, point_2.longit))
if point_1 not in dist:
dist[point_1] = math.sqrt(math.pow(point_1.latit - point_2.latit, 2.0)
+ math.pow(point_1.longit - point_2.longit, 2.0))
else:
dist[point_1] += math.sqrt(math.pow(point_1.latit - point_2.latit, 2.0)
+ math.sqrt(math.pow(point_1.longit - point_2.longit, 2.0)))
if self.debug:
for key, value in dist.items():
print("(%f, %f ==> %f" % (key.latit, key.longit, value))

# 返回至原有点距离最远的点
count_ = 0
max_ = 0
for key, value in list(dist.items()):
if count_ == 0:
max_ = value
max_point = key
count_ += 1
else:
if value > max_:
max_ = value
max_point = key
return max_point

# 计算初始时的均值
def initial_means(self, points):
# 随机选取一个点
point_ = rand.choice(points)
if self.debug:
print('point#0: %f %f' % (point_.latit, point_.longit))
clusters = dict()
clusters.setdefault(0, []).append(point_)
points.remove(point_)

# 选取其余 k-1 个点
for i in range(1, self.k):
point_ = self.next_random(i, points, clusters)
if self.debug:
print('point#%d: %f %f' % (i, point_.latit, point_.longit))
clusters.setdefault(i, []).append(point_)
points.remove(point_)
# 计算该类的均值
self.means = self.compute_mean(clusters)
if self.debug:
print("initial means:")
self.print_means(self.means)

def compute_mean(self, clusters):
means = []
for cluster in list(clusters.values()):
mean_point = Point(0.0, 0.0)
cnt = 0.0
for point in cluster:
mean_point.latit += point.latit
mean_point.longit += point.longit
cnt += 1.0
mean_point.latit = mean_point.latit / cnt
mean_point.longit = mean_point.longit / cnt
means.append(mean_point)
return means

# 根据最小的均值 将每个点归类
def assign_points(self, points):
if self.debug:
print("assign points")
clusters = dict()
for point in points:
dist = []
if self.debug:
print("point(%f,%f)" % (point.latit, point.longit))
# 为每个点选择最合适的类簇
for mean in self.means:
dist.append(math.sqrt(math.pow(point.latit - mean.latit, 2.0)
+ math.pow(point.longit - mean.longit, 2.0)))
# 选择最小的均值
if self.debug:
print(dist)
cnt_ = 0
index = 0
min_ = dist[0]
for d in dist:
if d < min_:
min_ = d
index = cnt_
cnt_ += 1
if self.debug:
print("index: %d" % index)
clusters.setdefault(index, []).append(point)
return clusters

def update_means(self, means, threshold):
# 检查目前是否已经满足迭代停止条件
for i in range(len(self.means)):
mean_1 = self.means[i]
mean_2 = means[i]
if self.debug:
print("mean_1(%f,%f)" % (mean_1.latit, mean_1.longit))
print("mean_2(%f,%f)" % (mean_2.latit, mean_2.longit))
if math.sqrt(math.pow(mean_1.latit - mean_2.latit, 2.0)
+ math.pow(mean_1.longit - mean_2.longit, 2.0)) > threshold:
return False
return True

# 输出类簇
def print_clusters(self, clusters):
cluster_cnt = 1
for cluster in list(clusters.values()):
print("nodes in cluster #%d" % cluster_cnt)
cluster_cnt += 1
for point in cluster:
print("point(%f,%f)" % (point.latit, point.longit))

# 输出均值
def print_means(self, means):
for point in means:
print("%f %f" % (point.latit, point.longit))

# k-mean 算法
def k_means(self, plot_flag):
if len(self.geo_locations) < self.k:
return -1  # error
points_ = [point for point in self.geo_locations]
# 计算初始点
self.initial_means(points_)
stop = False
while not stop:
# 根据点与类簇的距离将每个点归类
points_ = [point for point in self.geo_locations]
clusters = self.assign_points(points_)
if self.debug:
self.print_clusters(clusters)
means = self.compute_mean(clusters)
if self.debug:
print("means:")
print(self.print_means(means))
print("update mean:")
stop = self.update_means(means, 0.01)
if not stop:
self.means = []
self.means = means
self.clusters = clusters

# 输出类簇
if plot_flag:
fig = plt.figure()
markers = ['o', 'd', 'x', 'h', 'H', 7, 4, 5, 6, '8', 'p', ',', '+', '.', 's', '*', 3, 0, 1, 2]
colors = ['r', 'k', 'b', [0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0], [1, 0, 1], [1, 1, 0],
[1, 1, 1]]
cnt = 0
for cluster in list(clusters.values()):
latits = []
longits = []
for point in cluster:
latits.append(point.latit)
longits.append(point.longit)
ax.scatter(longits, latits, s=60, c=colors[cnt], marker=markers[cnt])
cnt += 1
plt.show()
return 0

geo_locs = []
f = open('drinkingFountains.csv', 'r')
loc_ = Point(float(line[0]), float(line[1]))
geo_locs.append(loc_)

try:
cluster = Clustering(geo_locs, int(sys.argv[1]))
except:
cluster = Clustering(geo_locs, 5)
flag = cluster.k_means(True)
if flag == -1:
print("Error in arguments!")
else:
# 聚类结果为若干列数据，每一列代表一个类簇
print("聚类结果为：:")
cluster.print_clusters(cluster.clusters)
```

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.