我正在处理一个非常高维的数据集,并对其执行了 k 均值聚类。我正在尝试找到距离每个质心最近的 20 个点。数据集 (X_emb) 的尺寸为 10 x 2816。提供的是我用来查找距每个质心最近的单个点的代码。注释掉的代码是我发现的一个潜在的解决方案,但我无法使其准确工作。
import numpy as np
import pickle as pkl
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.neighbors import NearestNeighbors
from visualization.make_video_v2 import make_video_from_numpy
from scipy.spatial import cKDTree
n_s_train = 10000
df = pkl.load(open('cluster_data/mixed_finetuning_data.pkl', 'rb'))
N = len(df)
X = []
X_emb = []
for i in range(N):
play = df.iloc[i]
if df.iloc[i].label == 1:
X_emb.append(play['embedding'])
X.append(play['input'])
X_emb = np.array(X_emb)
kmeans = KMeans(n_clusters=10)
kmeans.fit(X_emb)
results = kmeans.cluster_centers_
closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, X)
# def find_k_closest(centroids, data, k=1, distance_norm=2):
# kdtree = cKDTree(data, leafsize=30)
# distances, indices = kdtree.query(centroids, k, p=distance_norm)
# if k > 1:
# indices = indices[:,-1]
# values = data[indices]
# return indices, values
# indices, values = find_k_closest(results, X_emb)
蝴蝶不菲
ABOUTYOU
相关分类