# 观察在不同的k值下Calinski-Harabasz分数,分数值𝑠越大则聚类效果越好 from sklearn import metrics s = metrics.calinski_harabaz_score(X, y_pred)
如何在 Python 中从头开始编写 K-means?
我们的 k-means 实现将分为五个辅助方法和一个运行算法的主循环。让我们一一介绍这些功能。
成对距离
初始化中心
更新分配
更新中心
计算损失
主循环
完整的实现
计算成对距离
1 2 3 4 5 6 7 8 9 10 11 12 13 14
defpairwise_dist(self, x, y): """ Args: x: N x D numpy array y: M x D numpy array Return: dist: N x M array, where dist2[i, j] is the euclidean distance between x[i, :] and y[j, :] """ xSumSquare = np.sum(np.square(x),axis=1); ySumSquare = np.sum(np.square(y),axis=1); mul = np.dot(x, y.T); dists = np.sqrt(abs(xSumSquare[:, np.newaxis] + ySumSquare-2*mul)) return dists
def_init_centers(self, points, K, **kwargs): """ Args: points: NxD numpy array, where N is # points and D is the dimensionality K: number of clusters kwargs: any additional arguments you want Return: centers: K x D numpy array, the centers. """ row, col = points.shape retArr = np.empty([K, col]) for number inrange(K): randIndex = np.random.randint(row) retArr[number] = points[randIndex]
return retArr
该函数接收点数组并随机选择其中的 K 个作为初始质心。该函数仅返回 K 个选定点。
更新分配
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
def_update_assignment(self, centers, points): """ Args: centers: KxD numpy array, where K is the number of clusters, and D is the dimension points: NxD numpy array, the observations Return: cluster_idx: numpy array of length N, the cluster assignment for each point
Hint: You could call pairwise_dist() function. """ row, col = points.shape cluster_idx = np.empty([row]) distances = self.pairwise_dist(points, centers) cluster_idx = np.argmin(distances, axis=1)
def_update_centers(self, old_centers, cluster_idx, points): """ Args: old_centers: old centers KxD numpy array, where K is the number of clusters, and D is the dimension cluster_idx: numpy array of length N, the cluster assignment for each point points: NxD numpy array, the observations Return: centers: new centers, K x D numpy array, where K is the number of clusters, and D is the dimension. """ K, D = old_centers.shape new_centers = np.empty(old_centers.shape) for i inrange(K): new_centers[i] = np.mean(points[cluster_idx == i], axis = 0) return new_centers
def_get_loss(self, centers, cluster_idx, points): """ Args: centers: KxD numpy array, where K is the number of clusters, and D is the dimension cluster_idx: numpy array of length N, the cluster assignment for each point points: NxD numpy array, the observations Return: loss: a single float number, which is the objective function of KMeans. """ dists = self.pairwise_dist(points, centers) loss = 0.0 N, D = points.shape for i inrange(N): loss = loss + np.square(dists[i][cluster_idx[i]])
def__call__(self, points, K, max_iters=100, abs_tol=1e-16, rel_tol=1e-16, verbose=False, **kwargs): """ Args: points: NxD numpy array, where N is # points and D is the dimensionality K: number of clusters max_iters: maximum number of iterations (Hint: You could change it when debugging) abs_tol: convergence criteria w.r.t absolute change of loss rel_tol: convergence criteria w.r.t relative change of loss verbose: boolean to set whether method should print loss (Hint: helpful for debugging) kwargs: any additional arguments you want Return: cluster assignments: Nx1 int numpy array cluster centers: K x D numpy array, the centers loss: final loss value of the objective function of KMeans """ centers = self._init_centers(points, K, **kwargs) for it inrange(max_iters): cluster_idx = self._update_assignment(centers, points) centers = self._update_centers(centers, cluster_idx, points) loss = self._get_loss(centers, cluster_idx, points) K = centers.shape[0] if it: diff = np.abs(prev_loss - loss) if diff < abs_tol and diff / prev_loss < rel_tol: break prev_loss = loss if verbose: print('iter %d, loss: %.4f' % (it, loss)) return cluster_idx, centers, loss
from __future__ import absolute_import from __future__ import print_function from __future__ import division
import sys import matplotlib import numpy as np import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import axes3d from tqdm import tqdm # Load image import imageio
# Set random seed so output is all same np.random.seed(1)
classKMeans(object):
def__init__(self): # No need to implement pass
defpairwise_dist(self, x, y): # [5 pts] """ Args: x: N x D numpy array y: M x D numpy array Return: dist: N x M array, where dist2[i, j] is the euclidean distance between x[i, :] and y[j, :] """ xSumSquare = np.sum(np.square(x),axis=1); ySumSquare = np.sum(np.square(y),axis=1); mul = np.dot(x, y.T); dists = np.sqrt(abs(xSumSquare[:, np.newaxis] + ySumSquare-2*mul)) return dists
def_init_centers(self, points, K, **kwargs): # [5 pts] """ Args: points: NxD numpy array, where N is # points and D is the dimensionality K: number of clusters kwargs: any additional arguments you want Return: centers: K x D numpy array, the centers. """ row, col = points.shape retArr = np.empty([K, col]) for number inrange(K): randIndex = np.random.randint(row) retArr[number] = points[randIndex]
return retArr
def_update_assignment(self, centers, points): # [10 pts] """ Args: centers: KxD numpy array, where K is the number of clusters, and D is the dimension points: NxD numpy array, the observations Return: cluster_idx: numpy array of length N, the cluster assignment for each point
Hint: You could call pairwise_dist() function. """ row, col = points.shape cluster_idx = np.empty([row]) distances = self.pairwise_dist(points, centers) cluster_idx = np.argmin(distances, axis=1)
return cluster_idx
def_update_centers(self, old_centers, cluster_idx, points): # [10 pts] """ Args: old_centers: old centers KxD numpy array, where K is the number of clusters, and D is the dimension cluster_idx: numpy array of length N, the cluster assignment for each point points: NxD numpy array, the observations Return: centers: new centers, K x D numpy array, where K is the number of clusters, and D is the dimension. """ K, D = old_centers.shape new_centers = np.empty(old_centers.shape) for i inrange(K): new_centers[i] = np.mean(points[cluster_idx == i], axis = 0) return new_centers
def_get_loss(self, centers, cluster_idx, points): # [5 pts] """ Args: centers: KxD numpy array, where K is the number of clusters, and D is the dimension cluster_idx: numpy array of length N, the cluster assignment for each point points: NxD numpy array, the observations Return: loss: a single float number, which is the objective function of KMeans. """ dists = self.pairwise_dist(points, centers) loss = 0.0 N, D = points.shape for i inrange(N): loss = loss + np.square(dists[i][cluster_idx[i]])
return loss
def__call__(self, points, K, max_iters=100, abs_tol=1e-16, rel_tol=1e-16, verbose=False, **kwargs): """ Args: points: NxD numpy array, where N is # points and D is the dimensionality K: number of clusters max_iters: maximum number of iterations (Hint: You could change it when debugging) abs_tol: convergence criteria w.r.t absolute change of loss rel_tol: convergence criteria w.r.t relative change of loss verbose: boolean to set whether method should print loss (Hint: helpful for debugging) kwargs: any additional arguments you want Return: cluster assignments: Nx1 int numpy array cluster centers: K x D numpy array, the centers loss: final loss value of the objective function of KMeans """ centers = self._init_centers(points, K, **kwargs) for it inrange(max_iters): cluster_idx = self._update_assignment(centers, points) centers = self._update_centers(centers, cluster_idx, points) loss = self._get_loss(centers, cluster_idx, points) K = centers.shape[0] if it: diff = np.abs(prev_loss - loss) if diff < abs_tol and diff / prev_loss < rel_tol: break prev_loss = loss if verbose: print('iter %d, loss: %.4f' % (it, loss)) return cluster_idx, centers, loss