import matplotlib.pyplot as plt # To make visualisations
from sklearn.cluster import KMeans #To perform K-Means clustering
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs #To generate data for kmeans
from sklearn.datasets import make_moons #To generate data for DBSCAN
# Generate 300 two-dimensional sample points grouped around 4 centres.
# random_state is fixed so the same points are produced on every run.
X, y = make_blobs(n_samples=300, centers=4, cluster_std=0.9, random_state=42)
print(X.shape)  # Collection of points in 2D space
# Recorded output:
# (300, 2)

X_coord = X[:, 0]  # X coordinates of the points
Y_coord = X[:, 1]  # Y coordinates of the points
print(X_coord[:5])
print(Y_coord[:5])
# Recorded output:
# [ -9.25175257 -9.61269979 -1.7689072 -7.07554027 -10.67263984]
# [ 6.55866298 6.97742293 7.91552684 -5.89121043 6.41624524]

# Plot the raw, unlabelled points.
plt.scatter(X_coord, Y_coord, s=50, c='blue')
plt.show()  # Finish this figure here so later scatter calls do not draw over it
# Fit k-means with 4 clusters to the blob data.
# n_init is the number of times the algorithm is run with different starting
# centroids; 'auto' lets scikit-learn pick that number.
kmeans = KMeans(n_clusters=4, n_init='auto')
kmeans.fit(X)
y_pred = kmeans.predict(X)  # Cluster index assigned to each point
print(y_pred)
# Notebook display of y_pred from one recorded run. No random_state is passed
# to KMeans here, so the cluster numbering may differ between runs.
# array([1, 1, 3, 2, 1, 2, 0, 2, 3, 0, 3, 0, 3, 3, 1, 3, 1, 0, 3, 3, 0, 3,
# 2, 1, 3, 1, 1, 2, 2, 0, 3, 0, 1, 0, 1, 3, 1, 2, 1, 2, 0, 3, 1, 2,
# 3, 3, 1, 0, 1, 0, 2, 1, 2, 3, 2, 0, 1, 0, 0, 3, 1, 0, 0, 1, 2, 2,
# 2, 2, 2, 3, 2, 2, 1, 0, 3, 1, 2, 2, 3, 2, 3, 3, 1, 3, 2, 1, 1, 0,
# 0, 0, 1, 3, 1, 3, 3, 1, 2, 3, 1, 1, 0, 0, 0, 3, 3, 3, 3, 3, 2, 1,
# 0, 3, 3, 3, 3, 0, 1, 2, 1, 2, 2, 2, 3, 1, 2, 1, 1, 3, 1, 2, 0, 3,
# 3, 3, 3, 0, 0, 1, 3, 2, 3, 0, 2, 3, 0, 0, 0, 0, 2, 3, 3, 1, 0, 2,
# 3, 0, 2, 1, 1, 0, 3, 1, 2, 1, 0, 1, 2, 3, 3, 3, 3, 3, 2, 0, 0, 2,
# 2, 0, 0, 2, 1, 3, 1, 0, 0, 1, 2, 3, 0, 0, 2, 2, 2, 1, 0, 2, 2, 0,
# 0, 1, 3, 3, 2, 0, 3, 2, 2, 1, 2, 3, 3, 2, 2, 0, 1, 2, 1, 1, 3, 1,
# 1, 2, 1, 2, 0, 0, 1, 1, 0, 0, 0, 1, 3, 2, 0, 2, 1, 0, 1, 1, 1, 2,
# 2, 0, 1, 2, 2, 2, 1, 2, 1, 2, 1, 0, 2, 1, 0, 3, 1, 3, 0, 3, 1, 3,
# 2, 0, 2, 0, 0, 3, 3, 2, 0, 0, 1, 1, 2, 3, 3, 0, 0, 0, 0, 2, 1, 0,
# 2, 0, 0, 2, 3, 2, 0, 3, 1, 3, 0, 3, 1, 1], dtype=int32)

# Colour each point by its predicted cluster and overlay the cluster centres.
plt.scatter(X_coord, Y_coord, s=50, c=y_pred, cmap='viridis')
centers = kmeans.cluster_centers_  # Gets the coordinates of the cluster centers
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=100)
plt.show()  # Finish this figure here so later scatter calls do not draw over it
# Interactive visualisation of k-means: https://www.naftaliharris.com/blog/visualizing-k-means-clustering/
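# Is k-means the right choice in every case? It assigns each point to the nearest
# cluster centre, so it struggles when the clusters are not compact, roughly round
# groups, such as the two interleaving half-moons generated next.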
# Generate 300 points from the two-moons dataset with a little noise.
# random_state is fixed so the same points are produced on every run.
X, y = make_moons(n_samples=300, noise=0.05, random_state=42)

# Run k-means with 2 clusters; fit_predict fits the model and returns the
# cluster index of every point in one call.
kmeans = KMeans(n_clusters=2, n_init='auto', random_state=42)
labels = kmeans.fit_predict(X)
print(labels.shape)
# Recorded output:
# (300,)

# Colour each point by its k-means label and overlay the two cluster centres.
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=100)
plt.show()  # Finish this figure here so later scatter calls do not draw over it
# To overcome this issue, we can use another clustering algorithm such as DBSCAN,
# which groups points by density instead of by distance to a cluster centre.
# Cluster the same points in X with DBSCAN.
# eps is the maximum distance at which two points count as neighbours;
# min_samples is the neighbourhood size needed for a point to be a core point.
dbscan = DBSCAN(eps=0.3, min_samples=2).fit(X)
labels = dbscan.labels_  # Cluster index per point; scikit-learn marks noise as -1

# Colour each point by its DBSCAN label.
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
plt.show()  # Display the figure when run as a script
# Interactive visualisation of DBSCAN: https://www.naftaliharris.com/blog/visualizing-dbscan-clustering/