
Gaussian Mixtures | Notebook

Dataset


For this notebook, we will use a synthetic dataset of elongated clusters: we generate spherical blobs with make_blobs and then stretch them with a random linear transformation to make them non-spherical. We can generate and visualize the dataset below.


import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# create dataset and stretch to create non-spherical clusters
X, y = make_blobs(
    n_samples=300, centers=3,
    cluster_std=0.60, random_state=0
)
rng = np.random.RandomState(13)
X_stretched = np.dot(X, rng.randn(2, 2))

plt.scatter(
    X_stretched[:, 0], X_stretched[:, 1],
    c='white', marker='o',
    edgecolor='black', s=50
)
plt.axis('equal')
plt.show()


K-Means


As discussed in the previous notebook, K-Means works best for spherical clusters. When the dataset is stretched like the one above, K-Means fails to identify the clusters correctly. This is where other clustering methods, such as Gaussian Mixtures, help. Below, we can see that K-Means does not properly cluster the data.


from sklearn.cluster import KMeans

clusters = 3
km = KMeans(
    n_clusters=clusters, init='random',
    n_init=10, max_iter=300,
    tol=1e-04, random_state=0
)
y_km = km.fit_predict(X_stretched)

# plot the 3 clusters
markers = ['s', 'H', 'v', 'p', '^', 'o', 'X', 'd', 'P']

for i in range(clusters):
    plt.scatter(
        X_stretched[y_km == i, 0], X_stretched[y_km == i, 1],
        s=50,
        marker=markers[i], edgecolor='white',
        label='cluster ' + str(i)
    )

# plot the centroids
plt.scatter(
    km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
    s=250, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)
plt.legend(scatterpoints=1)
plt.grid()
plt.show()
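
One way to quantify this mismatch is to compare the K-Means labels with the true labels returned by make_blobs. The adjusted Rand index check below is a small addition to the notebook flow: a score of 1.0 means perfect agreement, while values near 0 mean the labeling is close to random.


from sklearn.metrics import adjusted_rand_score

# compare K-Means labels with the generating labels from make_blobs;
# 1.0 = perfect agreement, ~0 = essentially random labeling
print('K-Means ARI: %.3f' % adjusted_rand_score(y, y_km))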


Gaussian Mixture


When we apply a Gaussian Mixture with the full covariance type to the same dataset, the clusters it finds align with the clusters we can separate visually.


import scipy.stats
from sklearn.mixture import GaussianMixture

clusters_gm = 3
gmm = GaussianMixture(
    n_components=clusters_gm, covariance_type='full',
    random_state=2, init_params='random'
)
y_gmm = gmm.fit(X_stretched).predict(X_stretched)

# plot the clusters
markers = ['s', 'H', 'v', 'p', '^', 'o', 'X', 'd', 'P']

# use the point of highest density in each component as its "centroid"
centers = np.empty(shape=(gmm.n_components, X_stretched.shape[1]))
for i in range(gmm.n_components):
    density = scipy.stats.multivariate_normal(
        cov=gmm.covariances_[i], mean=gmm.means_[i]
    ).logpdf(X_stretched)
    centers[i, :] = X_stretched[np.argmax(density)]

for i in range(clusters_gm):
    plt.scatter(
        X_stretched[y_gmm == i, 0], X_stretched[y_gmm == i, 1],
        s=50,
        marker=markers[i], edgecolor='white',
        label='cluster ' + str(i)
    )

# plot the centroids
plt.scatter(
    centers[:, 0], centers[:, 1],
    s=250, marker='*',
    c='red', edgecolor='black',
    label='centroids'
)
plt.legend(scatterpoints=1)
plt.axis('equal')
plt.grid()
plt.show()
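
Unlike K-Means, a Gaussian Mixture produces soft assignments: each point receives a probability of belonging to each component. As a minimal sketch beyond the plot above, we can inspect these probabilities with predict_proba:


# soft assignments: one row per point, one column per component
probs = gmm.predict_proba(X_stretched)
print(probs.shape)         # (300, 3)
print(probs[:5].round(3))  # each row sums to 1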


Covariance

As discussed earlier, the covariance adjusts the directions and lengths of the axes of the ellipsoidal density contours. scikit-learn supports four values for the covariance_type parameter: 'spherical', 'diag', 'tied' and 'full'. They are visualized below.


import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.mixture import GaussianMixture

# Code adjusted from example by Andreas Mueller https://github.com/scikit-learn/scikit-learn/issues/10863

def make_ellipses(gmm, ax):
    for n in range(gmm.n_components):
        if gmm.covariance_type == 'full':
            covariances = gmm.covariances_[n]
        elif gmm.covariance_type == 'tied':
            covariances = gmm.covariances_
        elif gmm.covariance_type == 'diag':
            covariances = np.diag(gmm.covariances_[n])
        elif gmm.covariance_type == 'spherical':
            covariances = np.eye(gmm.means_.shape[1]) * gmm.covariances_[n]
        v, w = np.linalg.eigh(covariances)
        u = w[0] / np.linalg.norm(w[0])
        angle = np.arctan2(u[1], u[0])
        angle = 180 * angle / np.pi  # convert to degrees
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        # pass angle by keyword for compatibility with newer matplotlib
        ell = mpl.patches.Ellipse(gmm.means_[n], v[0], v[1],
                                  angle=180 + angle, color=plt.cm.tab10(n))
        ell.set_clip_box(ax.bbox)
        ell.set_alpha(0.5)
        ax.add_artist(ell)

# Try GMMs using different types of covariances.
estimators = [
    GaussianMixture(n_components=3, covariance_type=cov_type,
                    random_state=2, init_params='random')
    for cov_type in ['spherical', 'diag', 'tied', 'full']
]

n_estimators = len(estimators)

fig, axes = plt.subplots(1, 4, figsize=(15, 5))
titles = ("spherical\nn_components", "diag\nn_features * n_components",
"tied\n~n_features ** 2", "full\n~n_components * n_features ** 2")

for ax, title, estimator in zip(axes, titles, estimators):
    estimator.fit(X_stretched)
    make_ellipses(estimator, ax)

    pred = estimator.predict(X_stretched)
    ax.scatter(X_stretched[:, 0], X_stretched[:, 1],
               c=plt.cm.tab10(pred), edgecolor='white')
    ax.set_xticks(())
    ax.set_yticks(())
    ax.set_title(title)
    ax.set_aspect("equal")
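
The panel titles hint at the parameter counts: the more flexible covariance types need more parameters. As a rough way to weigh that trade-off on this dataset, the sketch below compares the four fitted models with the Bayesian information criterion, which penalizes extra parameters (lower is better). It reuses the estimators list fitted above.


# BIC penalizes the number of parameters; lower is better
for estimator in estimators:
    print('%9s  BIC: %.1f' % (estimator.covariance_type,
                              estimator.bic(X_stretched)))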



TASK: Choose two features from the wine dataset that are appropriate for clustering with a Gaussian Mixture, and apply it to the data using those two features.


Don't forget to normalize the input data with a pipeline, as we have done in previous notebooks. Check the SVM notebook for an example. A starter sketch follows below.
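
As one possible starting point, the sketch below shows the pipeline pattern with two placeholder features (alcohol and color_intensity); the choice of features is yours to make.


from sklearn.datasets import load_wine
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

wine = load_wine(as_frame=True)
# placeholder feature pair -- replace with the two features you choose
X_wine = wine.data[['alcohol', 'color_intensity']].to_numpy()

# scale first so both features contribute equally, then fit the mixture
pipe = make_pipeline(
    StandardScaler(),
    GaussianMixture(n_components=3, covariance_type='full', random_state=0)
)
labels = pipe.fit(X_wine).predict(X_wine)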

