一、前言是指把类似的数据划分到一起,具体划分的时候并不关怀这一类的标签,指标就是把类似的数据聚合到一起,聚类是一种无监督学习(Unsupervised Learning)办法。
二、聚类的个别过程数据筹备:特色标准化和降维特征选择:从最后的特色中抉择最无效的特色,并将其存储在向量中特征提取:通过对抉择的特色进行转换造成新的突出特色聚类:基于某种间隔函数进行类似度度量,获取簇聚类后果评估:剖析聚类后果,如间隔误差和(SSE)等三、数据聚类办法数据聚类办法次要能够分为划分式聚类办法(Partition-based Methods)、基于密度的聚类办法(Density-based methods)、层次化聚类办法(Hierarchical Methods)等。
四、掂量聚类算法的规范不同聚类算法有不同的优劣和不同的实用条件。大抵上从跟数据的属性(是否序列输出、维度),算法模型的预设,模型的解决能力上看。具体如下:1、算法的解决能力:解决大的数据集的能力(即算法复杂度);解决数据噪声的能力;解决任意形态,包含有间隙的嵌套的数据的能力;2、算法是否须要预设条件:是否须要事后晓得聚类个数,是否须要用户给出畛域常识;3、算法的数据输出属性:算法解决的后果与数据输出的程序是否相干,也就是说算法是否独立于数据输出程序;算法解决有很多属性数据的能力,也就是对数据维数是否敏感,对数据的类型有无要求。
五、算法实现#k-means++class KMeansClusterAlgorithm(object): ''' this class is k-means cluster algorithm Author: xxx Date: 2022-02-10 ''' def __init__(self, dataset: list, k: int) -> None: ''' initial Args Args: dataset:list. like [[x1,y1],[x2,y2)] k:int. number of cluster what to get ''' self.dataset = dataset self.k = k def point_avg(self, points) -> list: ''' Accepts a list of points, each with the same number of dimensions. NB. points can have more dimensions than 2 Returns a new points which is the center of all the points Args: points:list. a list of points, like [[x,y],[x1,y1],[x2,y2]] Return: new_center: list ''' dimensions = len(points[0]) new_center = [] for dimension in range(dimensions): dim_sum = 0 for p in points: dim_sum += p[dimension] # average of each dimension new_center.append(dim_sum/float(len(points))) return new_center # def update_centers(self, date_set, assignments): def update_centers(self, assignments) -> list: ''' Accepts a dataset and a list of assignments; the indexes of both lists correspond to each other. compute the center for each of the assigned groups. Reture 'k' centers where is the number of unique assignments. Args: dataset: assignments: Return: centers:list ex:[[1,2]] ''' new_means = defaultdict(list) centers = [] for assigment, point in zip(assignments, self.dataset): new_means[assigment].append(point) for points in new_means.values(): centers.append(self.point_avg(points)) return centers def distance(self, a: list, b: list) -> int: ''' caculate two points' distance Args: a:list. point a,ex:[1,3] b:list. point b,ex:[1,3] Return: :int: the distance of two point ''' dimensions = len(a) _sum = 0 for dimension in range(dimensions): difference_seq = (a[dimension] - b[dimension]) ** 2 _sum += difference_seq return sqrt(_sum) def _assign_points(self, centers) -> list: ''' assign each point to an index that corresponds to the index of the center point on its proximity to that point. Return a an array of indexes of the centers that correspond to an index in the data set; that is, if there are N points in data set the list we return will have N elements. Also If there ara Y points in centers there will be Y unique possible values within the returned list. Args: data_points:list ex:[[1,2],[3,4],[5,6]] centers:list ex:[[3,4]] Return: assigments:list ''' assigments = [] for point in self.dataset: shortest = float('Inf') shortest_index = 0 for i in range(len(centers)): val = self.distance(point, centers[i]) if val < shortest: shortest = val shortest_index = i assigments.append(shortest_index) return assigments # def generate_k(self, data_set: list, k: int, centers: list = []) -> list: def _generate_k(self, centers: list = []) -> list: ''' Given data set, which is an list of lists, find the minimum and maximum for each coordinate,a range. Generate k random points between the ranges. Return a list of the random points within the ranges use self.dataset self.k Args: data_set:list. ex:[[1,2],[3,4]] k:int. the number of clusters Return: list ex:[[1,2]] ''' # centers = [] dimensions = len(self.dataset[0]) min_max = defaultdict(int) for point in self.dataset: for i in range(dimensions): val = point[i] min_key = f'min_{i}' max_key = f'max_{i}' if min_key not in min_max or val < min_max[min_key]: min_max[min_key] = val if max_key not in min_max or val > min_max[max_key]: min_max[max_key] = val for _k in range(self.k): rand_point = [] for i in range(dimensions): min_val = min_max[f'min_{i}'] max_val = min_max[f'max_{i}'] rand_point.append(uniform(min_val, max_val)) centers.append(rand_point) return centers def _euler_distance(self, point1: list, point2: list) -> float: ''' Calculate euler distance between two points, support multidimensional Args: point1:list point2:list Return: :float ''' distance = 0.0 for a, b in zip(point1, point2): distance += math.pow(a - b, 2) return math.sqrt(distance) def get_closest_dist(self, point, centroids) -> float: ''' get closest dist between two point Args: point1:list centroids:list. the center of cluster Return: min_dist:float ''' min_dist = math.inf # 初始设为无穷大 for i, centroid in enumerate(centroids): dist = self._euler_distance(centroid, point) if dist < min_dist: min_dist = dist return min_dist def _kpp_centers(self) -> list: ''' calculate cluster center use self.dataset and self.k Return: cluster_centers:list. self.k(the number of cluster center that user defined) cluster center ''' cluster_centers = [] cluster_centers.append(random.choice(self.dataset)) d = [0 for _ in range(len(self.dataset))] for _ in range(1, self.k): total = 0.0 for i, point in enumerate(self.dataset): # The distance from the nearest cluster center d[i] = self.get_closest_dist(point, cluster_centers) total += d[i] total *= random.random() # The next clustering center is selected by wheel method. for i, di in enumerate(d): total -= di if total > 0: continue cluster_centers.append(self.dataset[i]) break return cluster_centers # def k_means(self, dataset:list, k:int): def k_means_plusplus(self) -> tuple: ''' the enter of k-means cluster algorithm Args: data_set:list. ex:[[1,2],[3,4]] k:int. the number of clusters Return: (assignments, self.dataset):tuple (result list,origin datalist) ''' # k_points = self._generate_k() #[[1,2],[3,4]] k_points = self._kpp_centers() assignments = self._assign_points(k_points) # [1,2,1,1,0,0,4] old_assignments = None times = 0 while assignments != old_assignments: new_centers = self.update_centers(assignments) # [[11.2],[12.2]] old_assignments = assignments assignments = self._assign_points(new_centers) return (assignments, self.dataset)参考资料https://zhuanlan.zhihu.com/p/...https://blog.csdn.net/abc2009...https://blog.csdn.net/weixin_...https://www.cnblogs.com/wang2...
...