Demo entry 6750812

km

   

Submitted by anonymous on Jun 20, 2018 at 13:02
Language: Python. Code size: 7.4 kB.

import random as rand
import math as math

import matplotlib.pyplot as plt
import csv
import sys


class Point:
    def __init__(self, latit_, longit_):
        self.latit = latit_
        self.longit = longit_


class Clustering:
    def __init__(self, geo_locs_, k_):
        self.geo_locations = geo_locs_
        self.k = k_
        self.clusters = []   # 该类的点数?
        self.means = []      # 该类的均值
        self.debug = False   # debug flag

    # 返回下一个随机点
    def next_random(self, index, points, clusters):
        # 选取距离其它点最远的一个点
        dist = {}
        for point_1 in points:
            if self.debug:
                print('point_1: %f %f' % (point_1.latit, point_1.longit))
            # 计算该点距离其它类点的距离
            for cluster in list(clusters.values()):
                point_2 = cluster[0]
                if self.debug:
                    print('point_2: %f %f' % (point_2.latit, point_2.longit))
                if point_1 not in dist:
                    dist[point_1] = math.sqrt(math.pow(point_1.latit - point_2.latit, 2.0)
                                              + math.pow(point_1.longit - point_2.longit, 2.0))
                else:
                    dist[point_1] += math.sqrt(math.pow(point_1.latit - point_2.latit, 2.0)
                                               + math.sqrt(math.pow(point_1.longit - point_2.longit, 2.0)))
        if self.debug:
            for key, value in dist.items():
                print("(%f, %f ==> %f" % (key.latit, key.longit, value))

        # 返回至原有点距离最远的点
        count_ = 0
        max_ = 0
        for key, value in list(dist.items()):
            if count_ == 0:
                max_ = value
                max_point = key
                count_ += 1
            else:
                if value > max_:
                    max_ = value
                    max_point = key
        return max_point

    # 计算初始时的均值
    def initial_means(self, points):
        # 随机选取一个点
        point_ = rand.choice(points)
        if self.debug:
            print('point#0: %f %f' % (point_.latit, point_.longit))
        clusters = dict()
        clusters.setdefault(0, []).append(point_)
        points.remove(point_)

        # 选取其余 k-1 个点
        for i in range(1, self.k):
            point_ = self.next_random(i, points, clusters)
            if self.debug:
                print('point#%d: %f %f' % (i, point_.latit, point_.longit))
            clusters.setdefault(i, []).append(point_)
            points.remove(point_)
        # 计算该类的均值
        self.means = self.compute_mean(clusters)
        if self.debug:
            print("initial means:")
            self.print_means(self.means)

    def compute_mean(self, clusters):
        means = []
        for cluster in list(clusters.values()):
            mean_point = Point(0.0, 0.0)
            cnt = 0.0
            for point in cluster:
                mean_point.latit += point.latit
                mean_point.longit += point.longit
                cnt += 1.0
            mean_point.latit = mean_point.latit / cnt
            mean_point.longit = mean_point.longit / cnt
            means.append(mean_point)
        return means

    # 根据最小的均值 将每个点归类
    def assign_points(self, points):
        if self.debug:
            print("assign points")
        clusters = dict()
        for point in points:
            dist = []
            if self.debug:
                print("point(%f,%f)" % (point.latit, point.longit))
            # 为每个点选择最合适的类簇
            for mean in self.means:
                dist.append(math.sqrt(math.pow(point.latit - mean.latit, 2.0)
                                      + math.pow(point.longit - mean.longit, 2.0)))
            # 选择最小的均值
            if self.debug:
                print(dist)
            cnt_ = 0
            index = 0
            min_ = dist[0]
            for d in dist:
                if d < min_:
                    min_ = d
                    index = cnt_
                cnt_ += 1
            if self.debug:
                print("index: %d" % index)
            clusters.setdefault(index, []).append(point)
        return clusters

    def update_means(self, means, threshold):
        # 检查目前是否已经满足迭代停止条件
        for i in range(len(self.means)):
            mean_1 = self.means[i]
            mean_2 = means[i]
            if self.debug:
                print("mean_1(%f,%f)" % (mean_1.latit, mean_1.longit))
                print("mean_2(%f,%f)" % (mean_2.latit, mean_2.longit))
                if math.sqrt(math.pow(mean_1.latit - mean_2.latit, 2.0)
                             + math.pow(mean_1.longit - mean_2.longit, 2.0)) > threshold:
                    return False
            return True

    # 输出类簇
    def print_clusters(self, clusters):
        cluster_cnt = 1
        for cluster in list(clusters.values()):
            print("nodes in cluster #%d" % cluster_cnt)
            cluster_cnt += 1
            for point in cluster:
                print("point(%f,%f)" % (point.latit, point.longit))

    # 输出均值
    def print_means(self, means):
        for point in means:
            print("%f %f" % (point.latit, point.longit))

    # k-mean 算法
    def k_means(self, plot_flag):
        if len(self.geo_locations) < self.k:
            return -1  # error
        points_ = [point for point in self.geo_locations]
        # 计算初始点
        self.initial_means(points_)
        stop = False
        while not stop:
            # 根据点与类簇的距离将每个点归类
            points_ = [point for point in self.geo_locations]
            clusters = self.assign_points(points_)
            if self.debug:
                self.print_clusters(clusters)
            means = self.compute_mean(clusters)
            if self.debug:
                print("means:")
                print(self.print_means(means))
                print("update mean:")
            stop = self.update_means(means, 0.01)
            if not stop:
                self.means = []
                self.means = means
        self.clusters = clusters

        # 输出类簇
        if plot_flag:
            fig = plt.figure()
            ax = fig.add_subplot(111)
            markers = ['o', 'd', 'x', 'h', 'H', 7, 4, 5, 6, '8', 'p', ',', '+', '.', 's', '*', 3, 0, 1, 2]
            colors = ['r', 'k', 'b', [0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0], [1, 0, 1], [1, 1, 0],
                      [1, 1, 1]]
            cnt = 0
            for cluster in list(clusters.values()):
                latits = []
                longits = []
                for point in cluster:
                    latits.append(point.latit)
                    longits.append(point.longit)
                ax.scatter(longits, latits, s=60, c=colors[cnt], marker=markers[cnt])
                cnt += 1
            plt.show()
        return 0


geo_locs = []
f = open('drinkingFountains.csv', 'r')
reader = csv.reader(f, delimiter=",")
for line in reader:
    loc_ = Point(float(line[0]), float(line[1]))
    geo_locs.append(loc_)

try:
    cluster = Clustering(geo_locs, int(sys.argv[1]))
except:
    cluster = Clustering(geo_locs, 5)
flag = cluster.k_means(True)
if flag == -1:
    print("Error in arguments!")
else:
    # 聚类结果为若干列数据,每一列代表一个类簇
    print("聚类结果为::")
    cluster.print_clusters(cluster.clusters)

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).