关于数据分析:聚类算法对比实现

28次阅读

共计 5552 个字符，预计需要花费 14 分钟才能阅读完成。

聚类（Clustering）是指把相似的数据划分到一起，具体划分的时候并不关心这一类的标签，目标就是把相似的数据聚合到一起。聚类是一种无监督学习 (Unsupervised Learning) 方法。

数据准备：特征标准化和降维
特征选择：从最初的特征中选择最有效的特征，并将其存储在向量中
特征提取：通过对选择的特征进行转换形成新的突出特征
聚类：基于某种距离函数进行相似度度量，获取簇
聚类结果评估：分析聚类结果，如误差平方和 (SSE) 等

数据聚类方法主要可以分为划分式聚类方法 (Partition-based Methods)、基于密度的聚类方法 (Density-based Methods)、层次化聚类方法 (Hierarchical Methods) 等。

不同聚类算法有不同的优劣和不同的适用条件。大致上可以从数据的属性（是否序列输入、维度）、算法模型的预设、模型的处理能力这几个方面来看。具体如下：
1、算法的处理能力：处理大的数据集的能力（即算法复杂度）；处理数据噪声的能力；处理任意形状，包括有间隙的嵌套的数据的能力；
2、算法是否需要预设条件：是否需要预先知道聚类个数，是否需要用户给出领域知识；
3、算法的数据输入属性：算法处理的结果与数据输入的顺序是否相关，也就是说算法是否独立于数据输入顺序；算法处理有很多属性数据的能力，也就是对数据维数是否敏感，对数据的类型有无要求。

#k-means++
import math
import random
from collections import defaultdict


class KMeansClusterAlgorithm(object):
    """K-means clustering with k-means++ seeding, in pure Python.

    Points are plain sequences of numbers (``[x, y]``, ``[x, y, z]``, ...);
    every point in the dataset is expected to have the same number of
    dimensions.

    Author:
        xxx
    Date:
        2022-02-10
    """

    def __init__(self, dataset: list, k: int) -> None:
        """Store the data to cluster and the requested number of clusters.

        Args:
            dataset: list of points, e.g. ``[[x1, y1], [x2, y2]]``.
            k: number of clusters to produce.
        """
        self.dataset = dataset
        self.k = k

    def point_avg(self, points) -> list:
        """Return the centroid (per-dimension mean) of ``points``.

        Points may have any number of dimensions; the dimensionality is
        taken from the first point.

        Args:
            points: non-empty list of points, e.g. ``[[x, y], [x1, y1]]``.
        Return:
            list of floats, one mean per dimension.
        Raises:
            IndexError: if ``points`` is empty.
        """
        count = float(len(points))
        return [
            sum(p[axis] for p in points) / count
            for axis in range(len(points[0]))
        ]

    def update_centers(self, assignments, previous_centers=None) -> list:
        """Recompute cluster centers from the current point assignments.

        ``assignments[i]`` is the cluster label of ``self.dataset[i]``.

        Args:
            assignments: list of cluster labels, parallel to the dataset.
            previous_centers: optional list of the centers the labels index
                into. When given, the result has the same length and
                ``result[label]`` is the new center for ``label``; a cluster
                that received no points keeps (a copy of) its previous
                center, so labels stay aligned with center indexes.
        Return:
            list of centers. Without ``previous_centers`` there is one center
            per distinct label, ordered by ascending label.
        """
        members = defaultdict(list)
        for label, point in zip(assignments, self.dataset):
            members[label].append(point)

        if previous_centers is None:
            # Sort by label so the order does not depend on which cluster
            # happens to appear first in the dataset.
            return [self.point_avg(members[label]) for label in sorted(members)]

        return [
            self.point_avg(members[label]) if label in members else list(old)
            for label, old in enumerate(previous_centers)
        ]

    def distance(self, a: list, b: list) -> float:
        """Return the Euclidean distance between points ``a`` and ``b``.

        Kept as a public alias of ``_euler_distance`` so both code paths
        compute the distance the same way.

        Args:
            a: first point, e.g. ``[1, 3]``.
            b: second point, e.g. ``[1, 3]``.
        Return:
            the distance as a float.
        """
        return self._euler_distance(a, b)

    def _assign_points(self, centers) -> list:
        """Assign every dataset point to the index of its nearest center.

        Ties go to the center with the lowest index.

        Args:
            centers: list of center points, e.g. ``[[3, 4], [7, 8]]``.
        Return:
            list with one center index per dataset point, in dataset order.
        """
        assignments = []
        for point in self.dataset:
            best_dist = math.inf
            best_index = 0
            for index, center in enumerate(centers):
                dist = self.distance(point, center)
                if dist < best_dist:
                    best_dist = dist
                    best_index = index
            assignments.append(best_index)
        return assignments

    def _generate_k(self, centers: list = None) -> list:
        """Generate ``self.k`` uniformly random points inside the data's bounding box.

        Args:
            centers: optional list to append the generated points to. A new
                list is created when omitted (a shared mutable default would
                leak points from one call into the next).
        Return:
            the list of centers, with ``self.k`` new random points appended.
        Raises:
            IndexError: if the dataset is empty.
        """
        centers = [] if centers is None else centers
        dimensions = len(self.dataset[0])
        bounds = [
            (
                min(point[axis] for point in self.dataset),
                max(point[axis] for point in self.dataset),
            )
            for axis in range(dimensions)
        ]
        for _ in range(self.k):
            centers.append([random.uniform(low, high) for low, high in bounds])
        return centers

    def _euler_distance(self, point1: list, point2: list) -> float:
        """Return the Euclidean distance between two points of any dimension.

        Args:
            point1: first point.
            point2: second point.
        Return:
            the distance as a float.
        """
        return math.sqrt(sum((a - b) ** 2 for a, b in zip(point1, point2)))

    def get_closest_dist(self, point, centroids) -> float:
        """Return the distance from ``point`` to the nearest centroid.

        Args:
            point: the point to measure from.
            centroids: list of cluster centers.
        Return:
            the smallest distance, or ``math.inf`` when ``centroids`` is empty.
        """
        return min(
            (self._euler_distance(centroid, point) for centroid in centroids),
            default=math.inf,
        )

    def _kpp_centers(self) -> list:
        """Pick initial centers with k-means++ seeding.

        The first center is a uniformly random dataset point. Each further
        center is drawn from the dataset with probability proportional to the
        squared distance to the nearest center chosen so far.

        Return:
            list of centers taken from ``self.dataset``; it always holds at
            least one center, and ``self.k`` centers when ``self.k >= 1``.
        Raises:
            IndexError: if the dataset is empty.
        """
        cluster_centers = [random.choice(self.dataset)]
        for _ in range(1, self.k):
            weights = [
                self.get_closest_dist(point, cluster_centers) ** 2
                for point in self.dataset
            ]
            total = sum(weights)
            if total <= 0:
                # Every point coincides with an existing center, so there is
                # nothing to weight by; fall back to a uniform pick.
                cluster_centers.append(random.choice(self.dataset))
                continue

            threshold = total * random.random()
            cumulative = 0.0
            chosen = self.dataset[-1]
            for point, weight in zip(self.dataset, weights):
                if weight <= 0:
                    # Points already used as centers must never be re-drawn.
                    continue
                # Remember the latest candidate so a center is still selected
                # if rounding keeps the running sum from passing the threshold.
                chosen = point
                cumulative += weight
                if cumulative > threshold:
                    break
            cluster_centers.append(chosen)
        return cluster_centers

    def k_means_plusplus(self, max_iterations: int = None) -> tuple:
        """Run k-means (Lloyd's iterations) from a k-means++ initialisation.

        Iterates until the assignments stop changing, or until
        ``max_iterations`` center updates have been performed.

        Args:
            max_iterations: optional upper bound on the number of update
                rounds; ``None`` (the default) means iterate to convergence.
        Return:
            ``(assignments, self.dataset)`` where ``assignments[i]`` is the
            cluster index of ``self.dataset[i]``.
        """
        centers = self._kpp_centers()
        assignments = self._assign_points(centers)
        old_assignments = None
        rounds = 0
        while assignments != old_assignments:
            if max_iterations is not None and rounds >= max_iterations:
                break
            # Passing the current centers keeps label i bound to centers[i]
            # and preserves clusters that temporarily lose all their points.
            centers = self.update_centers(assignments, centers)
            old_assignments = assignments
            assignments = self._assign_points(centers)
            rounds += 1

        return (assignments, self.dataset)

参考资料
https://zhuanlan.zhihu.com/p/…
https://blog.csdn.net/abc2009…
https://blog.csdn.net/weixin_…
https://www.cnblogs.com/wang2…

正文完

发表至：数据分析

2022-02-14

0

关于数据分析:极客星球-阿里云原生数据湖构建与分析最佳实践

关于数据分析:房地产行业如何做投资分析这篇文章告诉你

关于数据分析:马上赴约Smartbi-V105新品系列直播来了让我们一起赢在学习的起跑线

关于数据分析:Kyligence-连续三年入选-Gartner-增强数据分析推荐厂商

关于git:Windows-上-gitea-服务的搭建

关于阿里云:SAE-最佳实践范本助力视野数科进入云原生快车道