源码解读----之_k-means++初始化质心的方法(被k_means调用)

时间:2021-10-19 19:50:03

 
本文是个人的理解,由于刚接触并且自身能力也有限,也许会存在误解,欢迎留言指正,本人一定虚心请教,谢谢
 
def _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials=None):
    """根据k-means++初始化质心

    @:parameter X : 输入数据,应该是双精度(dtype = np.float64)。
    @:parameter n_clusters : integer质心数
    @:parameter x_squared_norms : array, shape (n_samples,)每个数据点的欧几里得范数的平方
    @:parameter random_state : numpy.RandomState#随机数生成器,用于初始化中心
    @:parameter n_local_trials : integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.

    通过一种特别的方式对K-means聚类选择初始簇中心,从而加快收敛速度
    """
    n_samples, n_features = X.shape

    centers = np.empty((n_clusters, n_features), dtype=X.dtype)

    assert x_squared_norms is not None, 'x_squared_norms None in _k_init'

    # 如果没有设置seeding trials 则在此设置
    if n_local_trials is None:
        # This is what Arthur/Vassilvitskii tried, but did not report
        # specific results for other than mentioning in the conclusion
        # that it helped.
        n_local_trials = 2 + int(np.log(n_clusters))

    # 随机的选择第一个中心
    center_id = random_state.randint(n_samples)
    if sp.issparse(X):
        centers[0] = X[center_id].toarray()
    else:
        centers[0] = X[center_id]

    # 初始化最近距离的列表,并计算当前概率
    closest_dist_sq = euclidean_distances(
        centers[0, np.newaxis], X, Y_norm_squared=x_squared_norms,
        squared=True)#计算X与中心的距离的平方得到距离矩阵
    current_pot = closest_dist_sq.sum()#距离矩阵的和

    # 选择其余n_clusters-1点
    for c in range(1, n_clusters):
        # 通过概率的比例选择中心点候选点
        # 离已经存在的中心最近的距离的平方
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        #将rand_vals插入原有序数组距离矩阵的累积求和矩阵中,并返回插入元素的索引值
        candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq),
                                        rand_vals)

        # 计算离中心候选点的距离
        distance_to_candidates = euclidean_distances(
            X[candidate_ids], X, Y_norm_squared=x_squared_norms, squared=True)

        # 决定哪个中心候选点是最好
        best_candidate = None  best_pot = None  best_dist_sq = None  for trial in range(n_local_trials):
            # Compute potential when including center candidate
            #多个数组的对应位置上元素大小的比较:返回每个索引位置上的最小值
            new_dist_sq = np.minimum(closest_dist_sq,
                                     distance_to_candidates[trial])
            new_pot = new_dist_sq.sum()#求和

            # 如果是到目前为止最好的实验结果则存储该结果
            if (best_candidate is None) or (new_pot < best_pot):
                best_candidate = candidate_ids[trial]
                best_pot = new_pot
                best_dist_sq = new_dist_sq

        # Permanently add best center candidate found in local tries
        #把从试验中选出的最好的中心候选点添加到中心点集中
        if sp.issparse(X):
            centers[c] = X[best_candidate].toarray()
        else:
            centers[c] = X[best_candidate]
        current_pot = best_pot
        closest_dist_sq = best_dist_sq

    return centers

参考地址:https://github.com/scikit-learn/scikit-learn