# Silhouettes: an evaluation metric for clustering results
#
# Date: 2022-12-08 18:01:37
import numpy as np
from sklearn import datasets
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import LabelEncoder


def silhouette_samples(X, labels, metric='euclidean', **kwds):
    """Compute the silhouette coefficient of every sample.

    For each sample, ``a`` is the mean distance to the *other* members of
    its own cluster and ``b`` is the smallest mean distance to the members
    of any other cluster.  The coefficient is ``(b - a) / max(a, b)``.

    Parameters
    ----------
    X : array-like
        Forwarded unchanged to ``pairwise_distances`` together with
        ``metric`` and ``**kwds``; the resulting square distance matrix
        has one row per sample.
    labels : array-like
        Cluster label of each sample.  Any values accepted by
        ``LabelEncoder`` work (they need not be ``0..k-1``).
    metric : str or callable, default 'euclidean'
        Distance metric handed to ``pairwise_distances``.
    **kwds
        Extra keyword arguments handed to ``pairwise_distances``.

    Returns
    -------
    numpy.ndarray
        One silhouette coefficient per sample.  Samples that are alone in
        their cluster get 0, as do samples for which ``a == b == 0``.

    Raises
    ------
    ValueError
        If ``labels`` contains fewer than two distinct values; the
        silhouette is undefined without a second cluster to compare to.
    """
    le = LabelEncoder()
    # After encoding, clusters are identified by the integers 0..k-1, so all
    # masks below must be built from those codes, not from ``le.classes_``.
    labels = le.fit_transform(labels)
    n_labels = len(le.classes_)
    if n_labels < 2:
        raise ValueError(
            "Number of distinct labels is %d; the silhouette coefficient "
            "requires at least 2 clusters." % n_labels
        )

    distances = pairwise_distances(X, metric=metric, **kwds)
    n_samples = distances.shape[0]
    cluster_sizes = np.bincount(labels, minlength=n_labels)
    # One boolean membership mask per cluster, computed once up front.
    masks = [labels == code for code in range(n_labels)]

    # a(i): mean distance to the rest of the sample's own cluster.
    intra_clust_dists = np.zeros(n_samples, dtype=distances.dtype)
    # b(i): mean distance to the nearest other cluster (starts at +inf).
    inter_clust_dists = np.inf + intra_clust_dists

    for curr_code, mask in enumerate(masks):
        current_distances = distances[mask]
        # Exclude the sample itself (its self-distance is on the diagonal).
        n_others = cluster_sizes[curr_code] - 1
        if n_others > 0:
            intra_clust_dists[mask] = (
                np.sum(current_distances[:, mask], axis=1) / n_others
            )
        for other_code, other_mask in enumerate(masks):
            if other_code == curr_code:
                continue
            mean_to_other = np.mean(current_distances[:, other_mask], axis=1)
            inter_clust_dists[mask] = np.minimum(
                inter_clust_dists[mask], mean_to_other
            )

    sil_samples = inter_clust_dists - intra_clust_dists
    denominator = np.maximum(intra_clust_dists, inter_clust_dists)
    # Where a == b == 0 (e.g. duplicated points) the numerator is already 0;
    # skip the 0/0 division so those entries stay 0 instead of becoming NaN.
    np.divide(sil_samples, denominator, out=sil_samples,
              where=denominator != 0)
    # By convention a sample that is alone in its cluster scores 0.
    sil_samples[cluster_sizes[labels] == 1] = 0
    return sil_samples


def main():
    """Print the silhouette coefficient of every sample in the iris dataset.

    The dataset's own ``target`` array is used as the cluster assignment.
    """
    dataset = datasets.load_iris()
    data = dataset.data
    target = dataset.target
    result = silhouette_samples(data, target)
    print(result)


# Run the demo only when executed as a script, so importing this module to
# reuse ``silhouette_samples`` does not load a dataset or print anything.
if __name__ == "__main__":
    main()