关于数据分析:聚类算法对比实现

35次阅读

共计 5552 个字符,预计需要花费 14 分钟才能阅读完成。

一、前言

是指把类似的数据划分到一起,具体划分的时候并不关怀这一类的标签,指标就是把类似的数据聚合到一起,聚类是一种无监督学习 (Unsupervised Learning) 办法。

二、聚类的个别过程

  1. 数据筹备:特色标准化和降维
  2. 特征选择:从最后的特色中抉择最无效的特色,并将其存储在向量中
  3. 特征提取:通过对抉择的特色进行转换造成新的突出特色
  4. 聚类:基于某种间隔函数进行类似度度量,获取簇
  5. 聚类后果评估:剖析聚类后果,如间隔误差和 (SSE) 等

三、数据聚类办法

数据聚类办法次要能够分为划分式聚类办法 (Partition-based Methods)、基于密度的聚类办法(Density-based methods)、层次化聚类办法(Hierarchical Methods) 等。

四、掂量聚类算法的规范

不同聚类算法有不同的优劣和不同的实用条件。大抵上从跟数据的属性(是否序列输出、维度),算法模型的预设,模型的解决能力上看。具体如下:
1、算法的解决能力:解决大的数据集的能力(即算法复杂度);解决数据噪声的能力;解决任意形态,包含有间隙的嵌套的数据的能力;
2、算法是否须要预设条件:是否须要事后晓得聚类个数,是否须要用户给出畛域常识;
3、算法的数据输出属性:算法解决的后果与数据输出的程序是否相干,也就是说算法是否独立于数据输出程序;算法解决有很多属性数据的能力,也就是对数据维数是否敏感,对数据的类型有无要求。

五、算法实现

#k-means++
class KMeansClusterAlgorithm(object):
    '''
        this class is k-means cluster algorithm 
        Author:
            xxx
        Date:
            2022-02-10
    '''def __init__(self, dataset: list, k: int) -> None:'''
            initial Args
            Args:
                dataset:list. like [[x1,y1],[x2,y2)]
                k:int. number of cluster what to get


        '''
        self.dataset = dataset
        self.k = k

    def point_avg(self, points) -> list:
        '''
            Accepts a list of points, each with the same number of dimensions.
            NB. points can have more dimensions than 2
            Returns a new points which is the center of all the points
            Args:
                points:list. a list of points, like [[x,y],[x1,y1],[x2,y2]]
            Return:
                new_center: list
        '''
        dimensions = len(points[0])
        new_center = []
        for dimension in range(dimensions):
            dim_sum = 0
            for p in points:
                dim_sum += p[dimension]

        # average of each dimension
            new_center.append(dim_sum/float(len(points)))
        return new_center

    # def update_centers(self, date_set, assignments):
    def update_centers(self, assignments) -> list:
        '''
            Accepts a dataset and a list of assignments; the indexes of both lists correspond
            to each other.
            compute the center for each of the assigned groups.
            Reture 'k' centers where  is the number of unique assignments.
            Args:
                dataset:
                assignments:
            Return:
                centers:list  ex:[[1,2]]

        '''
        new_means = defaultdict(list)
        centers = []
        for assigment, point in zip(assignments, self.dataset):
            new_means[assigment].append(point)

        for points in new_means.values():
            centers.append(self.point_avg(points))

        return centers

    def distance(self, a: list, b: list) -> int:
        '''caculate two points' distance
            Args:
                a:list. point a,ex:[1,3]
                b:list. point b,ex:[1,3]
            Return:
                :int: the distance of two point
        '''
        dimensions = len(a)
        _sum = 0
        for dimension in range(dimensions):
            difference_seq = (a[dimension] - b[dimension]) ** 2
            _sum += difference_seq

        return sqrt(_sum)

    def _assign_points(self, centers) -> list:
        '''
            assign each point to an index that corresponds to the index
            of the center point on its proximity to that point.
            Return a an array of indexes of the centers that correspond to
            an index in the data set; that is, if there are N points in data set
            the list we return will have N elements. Also If there ara Y points in centers
            there will be Y unique possible values within the returned list.
            Args:
                data_points:list  ex:[[1,2],[3,4],[5,6]]
                centers:list   ex:[[3,4]]
            Return:
                assigments:list  

        '''

        assigments = []
        for point in self.dataset:
            shortest = float('Inf')
            shortest_index = 0
            for i in range(len(centers)):
                val = self.distance(point, centers[i])
                if val < shortest:
                    shortest = val
                    shortest_index = i
            assigments.append(shortest_index)

        return assigments

    # def generate_k(self, data_set: list, k: int, centers: list = []) -> list:

    def _generate_k(self, centers: list = []) -> list:
        '''
            Given data set, which is an list of lists,
            find the minimum and maximum for each coordinate,a range.
            Generate k random points between the ranges.
            Return a list of the random points within the ranges
            use self.dataset self.k
            Args:
                data_set:list.  ex:[[1,2],[3,4]]
                k:int. the number of clusters 
            Return:
                list ex:[[1,2]]
        '''
        # centers = []
        dimensions = len(self.dataset[0])
        min_max = defaultdict(int)

        for point in self.dataset:
            for i in range(dimensions):
                val = point[i]
                min_key = f'min_{i}'
                max_key = f'max_{i}'
                if min_key not in min_max or val < min_max[min_key]:
                    min_max[min_key] = val
                if max_key not in min_max or val > min_max[max_key]:
                    min_max[max_key] = val

        for _k in range(self.k):
            rand_point = []
            for i in range(dimensions):
                min_val = min_max[f'min_{i}']
                max_val = min_max[f'max_{i}']

                rand_point.append(uniform(min_val, max_val))

            centers.append(rand_point)
        return centers

    def _euler_distance(self, point1: list, point2: list) -> float:
        '''
            Calculate euler distance between two points, support multidimensional
            Args:
                point1:list
                point2:list
            Return:
                :float
        '''
        distance = 0.0
        for a, b in zip(point1, point2):
            distance += math.pow(a - b, 2)
        return math.sqrt(distance)

    def get_closest_dist(self, point, centroids) -> float:
        '''
            get closest dist between two point
            Args:
                point1:list
                centroids:list. the center of cluster
            Return:
                min_dist:float  
        '''
        min_dist = math.inf  # 初始设为无穷大
        for i, centroid in enumerate(centroids):
            dist = self._euler_distance(centroid, point)
            if dist < min_dist:
                min_dist = dist
        return min_dist

    def _kpp_centers(self) -> list:
        '''
            calculate cluster center
            use self.dataset and self.k
            Return:
                cluster_centers:list. self.k(the number of cluster center that user defined) cluster center

        '''
        cluster_centers = []
        cluster_centers.append(random.choice(self.dataset))
        d = [0 for _ in range(len(self.dataset))]
        for _ in range(1, self.k):
            total = 0.0
            for i, point in enumerate(self.dataset):
                # The distance from the nearest cluster center
                d[i] = self.get_closest_dist(point, cluster_centers)
                total += d[i]
            total *= random.random()
            # The next clustering center is selected by wheel method.
            for i, di in enumerate(d):
                total -= di
                if total > 0:
                    continue
                cluster_centers.append(self.dataset[i])
                break
        return cluster_centers

    # def k_means(self, dataset:list, k:int):

    def k_means_plusplus(self) -> tuple:
        '''
            the enter of k-means cluster algorithm 
            Args:
                data_set:list.  ex:[[1,2],[3,4]]
                k:int. the number of clusters 
            Return:
                (assignments, self.dataset):tuple (result list,origin datalist)
        '''

        # k_points = self._generate_k() #[[1,2],[3,4]]
        k_points = self._kpp_centers()
        assignments = self._assign_points(k_points)  # [1,2,1,1,0,0,4]
        old_assignments = None
        times = 0
        while assignments != old_assignments:
            new_centers = self.update_centers(assignments)  # [[11.2],[12.2]]
            old_assignments = assignments
            assignments = self._assign_points(new_centers)

        return (assignments, self.dataset)

参考资料
https://zhuanlan.zhihu.com/p/…
https://blog.csdn.net/abc2009…
https://blog.csdn.net/weixin_…
https://www.cnblogs.com/wang2…

正文完
 0