心有法竹
我看到的关键点是means初始化。按照sklearn Gaussian Mixture的默认实现,我切换到 KMeans,而不是随机初始化。import numpy as npimport seaborn as snsimport matplotlib.pyplot as pltplt.style.use('seaborn')eps=1e-8 def PDF(data, means, variances): return 1/(np.sqrt(2 * np.pi * variances) + eps) * np.exp(-1/2 * (np.square(data - means) / (variances + eps)))def EM_GMM(data, k=3, iterations=100, init_strategy='kmeans'): weights = np.ones((k, 1)) / k # shape=(k, 1) if init_strategy=='kmeans': from sklearn.cluster import KMeans km = KMeans(k).fit(data[:, None]) means = km.cluster_centers_ # shape=(k, 1) else: # init_strategy=='random' means = np.random.choice(data, k)[:, np.newaxis] # shape=(k, 1) variances = np.random.random_sample(size=k)[:, np.newaxis] # shape=(k, 1) data = np.repeat(data[np.newaxis, :], k, 0) # shape=(k, n) for step in range(iterations): # Expectation step likelihood = PDF(data, means, np.sqrt(variances)) # shape=(k, n) # Maximization step b = likelihood * weights # shape=(k, n) b /= np.sum(b, axis=1)[:, np.newaxis] + eps # updage means, variances, and weights means = np.sum(b * data, axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps) variances = np.sum(b * np.square(data - means), axis=1)[:, np.newaxis] / (np.sum(b, axis=1)[:, np.newaxis] + eps) weights = np.mean(b, axis=1)[:, np.newaxis] return means, variances这似乎更一致地产生所需的输出:s = np.array([25.31 , 24.31 , 24.12 , 43.46 , 41.48666667, 41.48666667, 37.54 , 41.175 , 44.81 , 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 39.71 , 26.69 , 34.15 , 24.94 , 24.75 , 24.56 , 24.38 , 35.25 , 44.62 , 44.94 , 44.815 , 44.69 , 42.31 , 40.81 , 44.38 , 44.56 , 44.44 , 44.25 , 43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667, 40.75 , 32.31 , 36.08 , 30.135 , 24.19 ])k=3n_iter=100means, variances = EM_GMM(s, k, n_iter)print(means,variances)[[44.42596231] [24.509301 ] [35.4137508 ]] [[0.07568723] [0.10583743] [0.52125856]]# Plotting the resultscolors = ['green', 'red', 'blue', 'yellow']bins = np.linspace(np.min(s)-2, np.max(s)+2, 100)plt.figure(figsize=(10,7))plt.xlabel('$x$')plt.ylabel('pdf')sns.scatterplot(s, [0.05] * len(s), color='navy', s=40, marker=2, label='Series data')for i, (m, v) in enumerate(zip(means, variances)): sns.lineplot(bins, PDF(bins, m, v), color=colors[i], label=f'Cluster {i+1}')plt.legend()plt.plot()最后我们可以看到纯随机初始化产生了不同的结果;让我们看看结果means:for _ in range(5): print(EM_GMM(s, k, n_iter, init_strategy='random')[0], '\n')[[44.42596231] [44.42596231] [44.42596231]][[44.42596231] [24.509301 ] [30.1349997 ]][[44.42596231] [35.4137508 ] [44.42596231]][[44.42596231] [30.1349997 ] [44.42596231]][[44.42596231] [44.42596231] [44.42596231]]可以看出这些结果有多么不同,在某些情况下,结果均值是恒定的,这意味着初始化选择了 3 个相似的值并且在迭代时没有太大变化。在 中添加一些打印语句EM_GMM将澄清这一点。
一只萌萌小番薯
# Expectation steplikelihood = PDF(data, means, np.sqrt(variances))我们为什么要sqrt过去variances?pdf 函数接受差异。所以这应该是PDF(data, means, variances)。另一个问题,# Maximization stepb = likelihood * weights # shape=(k, n)b /= np.sum(b, axis=1)[:, np.newaxis] + eps上面第二行应该是b /= np.sum(b, axis=0)[:, np.newaxis] + eps同样在 的初始化中variances,variances = np.random.random_sample(size=k)[:, np.newaxis] # shape=(k, 1)为什么我们要随机初始化方差?我们有data和means,为什么不像 中那样计算当前估计方差vars = np.expand_dims(np.mean(np.square(data - means), axis=1), -1)?通过这些更改,这是我的实现,import numpy as npimport seaborn as snsimport matplotlib.pyplot as pltplt.style.use('seaborn')eps=1e-8def pdf(data, means, vars): denom = np.sqrt(2 * np.pi * vars) + eps numer = np.exp(-0.5 * np.square(data - means) / (vars + eps)) return numer /denomdef em_gmm(data, k, n_iter, init_strategy='k_means'): weights = np.ones((k, 1), dtype=np.float32) / k if init_strategy == 'k_means': from sklearn.cluster import KMeans km = KMeans(k).fit(data[:, None]) means = km.cluster_centers_ else: means = np.random.choice(data, k)[:, np.newaxis] data = np.repeat(data[np.newaxis, :], k, 0) vars = np.expand_dims(np.mean(np.square(data - means), axis=1), -1) for step in range(n_iter): p = pdf(data, means, vars) b = p * weights denom = np.expand_dims(np.sum(b, axis=0), 0) + eps b = b / denom means_n = np.sum(b * data, axis=1) means_d = np.sum(b, axis=1) + eps means = np.expand_dims(means_n / means_d, -1) vars = np.sum(b * np.square(data - means), axis=1) / means_d vars = np.expand_dims(vars, -1) weights = np.expand_dims(np.mean(b, axis=1), -1) return means, varsdef main(): s = np.array([25.31, 24.31, 24.12, 43.46, 41.48666667, 41.48666667, 37.54, 41.175, 44.81, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 44.44571429, 39.71, 26.69, 34.15, 24.94, 24.75, 24.56, 24.38, 35.25, 44.62, 44.94, 44.815, 44.69, 42.31, 40.81, 44.38, 44.56, 44.44, 44.25, 43.66666667, 43.66666667, 43.66666667, 43.66666667, 43.66666667, 40.75, 32.31, 36.08, 30.135, 24.19]) k = 3 n_iter = 100 means, vars = em_gmm(s, k, n_iter) y = 0 colors = ['green', 'red', 'blue', 'yellow'] bins = np.linspace(np.min(s) - 2, np.max(s) + 2, 100) plt.figure(figsize=(10, 7)) plt.xlabel('$x$') plt.ylabel('pdf') sns.scatterplot(s, [0.0] * len(s), color='navy', s=40, marker=2, label='Series data') for i, (m, v) in enumerate(zip(means, vars)): sns.lineplot(bins, pdf(bins, m, v), color=colors[i], label=f'Cluster {i + 1}') plt.legend() plt.plot() plt.show() pass这是我的结果。