Generating and Visualizing Multi-cluster Data with Gaussian KDE

This Python script generates synthetic 2-D data points for a specified number of clusters and visualizes them with Matplotlib using Gaussian kernel density estimation (KDE). The resulting figure overlays the generated points on an image of their estimated density, making it easy to see how the points are distributed within and across the clusters.

import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
 
def create_example_data(n, clusters=2):
    """Generate synthetic 2-D points grouped into ``clusters`` clusters.

    For cluster index ``c`` two normal samples of length ``n`` are drawn,
    both centred on ``10 * c``: ``m1`` with NumPy's default scale and ``m2``
    with scale 0.5.  Two mixing coefficients ``a`` and ``b`` are drawn
    uniformly from [-1, 1), and the cluster's coordinates are
    ``x = m1 + a * m2`` and ``y = m1 - b * m2``.

    Randomness comes from the global ``np.random`` state, so call
    ``np.random.seed`` beforehand for reproducible output.

    Args:
        n: Number of points per cluster.
        clusters: Number of clusters to generate.

    Returns:
        Array of shape ``(2, n * clusters)``.  Row 0 holds the x
        coordinates and row 1 the y coordinates; the points of cluster
        ``c`` occupy columns ``c * n`` to ``(c + 1) * n - 1``.  When
        ``clusters`` is 0 an empty ``(2, 0)`` array is returned.
    """
    blocks = []
    for cluster in range(clusters):
        centre = 10 * cluster
        m1 = np.random.normal(loc=centre, size=n)
        m2 = np.random.normal(loc=centre, scale=0.5, size=n)
        a = np.random.uniform(-1, 1)
        b = np.random.uniform(-1, 1)
        # One (2, n) block per cluster: x on the first row, y on the second.
        blocks.append(np.vstack([m1 + a * m2, m1 - b * m2]))

    if not blocks:
        return np.empty((2, 0))

    # Join the blocks column-wise so every column stays a real (x, y) pair.
    # Stacking them row-wise and reshaping to (2, n * clusters) would place
    # x and y values of one cluster on the same row and pair coordinates
    # that belong to different clusters.
    return np.hstack(blocks)
 
# Draw the sample; the first row is used as x and the second as y.
d = create_example_data(1000)
x, y = d

# Bounding box of the sample: it defines both the evaluation grid and the
# extent of the density image.
xmin, xmax = x.min(), x.max()
ymin, ymax = y.min(), y.max()

# A complex step (100j) makes mgrid produce 100 evenly spaced values per
# axis, with both end points included.
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack((X.ravel(), Y.ravel()))

# Fit a Gaussian KDE to the sample and evaluate it on every grid node.
values = np.vstack((x, y))
kernel = stats.gaussian_kde(values)
Z = kernel(positions).T.reshape(X.shape)

# Density image first, then the raw points drawn faintly on top of it.
fig, ax = plt.subplots(figsize=(10, 7))
ax.imshow(np.rot90(Z), extent=[xmin, xmax, ymin, ymax])
ax.plot(x, y, 'o', alpha=0.1, zorder=1)

plt.show()