Clustering

Clustering seeks to group data into clusters based on their properties, and then allows us to predict which cluster a new point belongs to.

import numpy as np
import matplotlib.pyplot as plt

We’ll use a dataset generator from scikit-learn called make_moons. It generates data that falls into 2 different sets, each shaped like a half-moon.

from sklearn import datasets
def generate_data():
    xvec, val = datasets.make_moons(200, noise=0.2)

    # repack the generator output into plain NumPy arrays
    x = []
    v = []
    for xv, vv in zip(xvec, val):
        x.append(np.array(xv))
        v.append(vv)

    return np.array(x), np.array(v)
x, v = generate_data()
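
The generator gives us one row per sample: x holds the 2-d coordinates and v holds the cluster label (0 or 1). A quick check of the shapes:

print(x.shape, v.shape)   # (200, 2) (200,) -- 200 samples, each with one label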

Let’s look at a point and its value

print(f"x = {x[0]}, value = {v[0]}")
x = [ 0.18616838 -0.04957798], value = 1

Now let’s plot the data

def plot_data(x, v):
    xpt = [q[0] for q in x]
    ypt = [q[1] for q in x]

    fig, ax = plt.subplots()
    ax.scatter(xpt, ypt, s=40, c=v, cmap="viridis")
    ax.set_aspect("equal")
    return fig
fig = plot_data(x, v)
[Figure: scatter plot of the two half-moon data sets, colored by value]

We want to partition this domain into 2 regions, such that when we come in with a new point, we know which group it belongs to.

First we set up and train our network

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Input
from keras.optimizers import RMSprop
model = Sequential()
model.add(Input(shape=(2,)))
model.add(Dense(50, activation="relu"))
model.add(Dense(20, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
rms = RMSprop()
model.compile(loss='binary_crossentropy',
              optimizer=rms, metrics=['accuracy'])
model.summary()
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ dense (Dense)                   │ (None, 50)             │           150 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 20)             │         1,020 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_2 (Dense)                 │ (None, 1)              │            21 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 1,191 (4.65 KB)
 Trainable params: 1,191 (4.65 KB)
 Non-trainable params: 0 (0.00 B)
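
As a sanity check, the parameter counts in the summary follow directly from the layer sizes: a Dense layer with n_in inputs and n_out outputs has n_in × n_out weights plus n_out biases. A short sketch of that arithmetic:

# weights (n_in * n_out) plus one bias per output for each Dense layer
total = 0
for n_in, n_out in [(2, 50), (50, 20), (20, 1)]:
    params = n_in * n_out + n_out
    total += params
    print(f"Dense {n_in} -> {n_out}: {params} parameters")
print(f"total = {total}")   # 150 + 1020 + 21 = 1191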

We seem to need a lot of epochs here to get a good result.

epochs = 100
results = model.fit(x, v, batch_size=50, epochs=epochs, verbose=2)
Epoch 1/100
4/4 - 0s - 114ms/step - accuracy: 0.6950 - loss: 0.6584
Epoch 2/100
4/4 - 0s - 6ms/step - accuracy: 0.7950 - loss: 0.6159
Epoch 3/100
4/4 - 0s - 6ms/step - accuracy: 0.8300 - loss: 0.5866
Epoch 4/100
4/4 - 0s - 6ms/step - accuracy: 0.8200 - loss: 0.5624
Epoch 5/100
4/4 - 0s - 6ms/step - accuracy: 0.8300 - loss: 0.5417
Epoch 6/100
4/4 - 0s - 6ms/step - accuracy: 0.8300 - loss: 0.5227
Epoch 7/100
4/4 - 0s - 6ms/step - accuracy: 0.8300 - loss: 0.5052
Epoch 8/100
4/4 - 0s - 6ms/step - accuracy: 0.8300 - loss: 0.4886
Epoch 9/100
4/4 - 0s - 6ms/step - accuracy: 0.8350 - loss: 0.4736
Epoch 10/100
4/4 - 0s - 6ms/step - accuracy: 0.8400 - loss: 0.4595
Epoch 11/100
4/4 - 0s - 6ms/step - accuracy: 0.8400 - loss: 0.4458
Epoch 12/100
4/4 - 0s - 6ms/step - accuracy: 0.8450 - loss: 0.4331
Epoch 13/100
4/4 - 0s - 6ms/step - accuracy: 0.8500 - loss: 0.4211
Epoch 14/100
4/4 - 0s - 6ms/step - accuracy: 0.8500 - loss: 0.4101
Epoch 15/100
4/4 - 0s - 6ms/step - accuracy: 0.8550 - loss: 0.3993
Epoch 16/100
4/4 - 0s - 6ms/step - accuracy: 0.8500 - loss: 0.3898
Epoch 17/100
4/4 - 0s - 6ms/step - accuracy: 0.8500 - loss: 0.3806
Epoch 18/100
4/4 - 0s - 6ms/step - accuracy: 0.8500 - loss: 0.3720
Epoch 19/100
4/4 - 0s - 6ms/step - accuracy: 0.8500 - loss: 0.3647
Epoch 20/100
4/4 - 0s - 6ms/step - accuracy: 0.8500 - loss: 0.3573
Epoch 21/100
4/4 - 0s - 6ms/step - accuracy: 0.8650 - loss: 0.3508
Epoch 22/100
4/4 - 0s - 6ms/step - accuracy: 0.8650 - loss: 0.3453
Epoch 23/100
4/4 - 0s - 6ms/step - accuracy: 0.8650 - loss: 0.3393
Epoch 24/100
4/4 - 0s - 6ms/step - accuracy: 0.8650 - loss: 0.3352
Epoch 25/100
4/4 - 0s - 7ms/step - accuracy: 0.8750 - loss: 0.3306
Epoch 26/100
4/4 - 0s - 7ms/step - accuracy: 0.8750 - loss: 0.3266
Epoch 27/100
4/4 - 0s - 7ms/step - accuracy: 0.8750 - loss: 0.3233
Epoch 28/100
4/4 - 0s - 7ms/step - accuracy: 0.8750 - loss: 0.3203
Epoch 29/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3180
Epoch 30/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3150
Epoch 31/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3138
Epoch 32/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3110
Epoch 33/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3093
Epoch 34/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3075
Epoch 35/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3060
Epoch 36/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3040
Epoch 37/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3039
Epoch 38/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3011
Epoch 39/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3001
Epoch 40/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3003
Epoch 41/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.2974
Epoch 42/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.2961
Epoch 43/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.2957
Epoch 44/100
4/4 - 0s - 6ms/step - accuracy: 0.8800 - loss: 0.2939
Epoch 45/100
4/4 - 0s - 6ms/step - accuracy: 0.8700 - loss: 0.2932
Epoch 46/100
4/4 - 0s - 6ms/step - accuracy: 0.8800 - loss: 0.2921
Epoch 47/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.2906
Epoch 48/100
4/4 - 0s - 6ms/step - accuracy: 0.8800 - loss: 0.2896
Epoch 49/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2901
Epoch 50/100
4/4 - 0s - 6ms/step - accuracy: 0.8800 - loss: 0.2885
Epoch 51/100
4/4 - 0s - 6ms/step - accuracy: 0.8800 - loss: 0.2872
Epoch 52/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2866
Epoch 53/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2847
Epoch 54/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2839
Epoch 55/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2827
Epoch 56/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2826
Epoch 57/100
4/4 - 0s - 7ms/step - accuracy: 0.8900 - loss: 0.2829
Epoch 58/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2810
Epoch 59/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2795
Epoch 60/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2789
Epoch 61/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2780
Epoch 62/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2779
Epoch 63/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2760
Epoch 64/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2771
Epoch 65/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2761
Epoch 66/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2743
Epoch 67/100
4/4 - 0s - 7ms/step - accuracy: 0.8900 - loss: 0.2729
Epoch 68/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2719
Epoch 69/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2723
Epoch 70/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2708
Epoch 71/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2691
Epoch 72/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2687
Epoch 73/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2690
Epoch 74/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2663
Epoch 75/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2657
Epoch 76/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2653
Epoch 77/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2642
Epoch 78/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2621
Epoch 79/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2612
Epoch 80/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2598
Epoch 81/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2591
Epoch 82/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2586
Epoch 83/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2563
Epoch 84/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2549
Epoch 85/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2552
Epoch 86/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2530
Epoch 87/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2518
Epoch 88/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2529
Epoch 89/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2500
Epoch 90/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2485
Epoch 91/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2467
Epoch 92/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2466
Epoch 93/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2456
Epoch 94/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2429
Epoch 95/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2419
Epoch 96/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2416
Epoch 97/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2392
Epoch 98/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2385
Epoch 99/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2366
Epoch 100/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2349
score = model.evaluate(x, v, verbose=0)
print(f"score = {score[0]}")
print(f"accuracy = {score[1]}")
score = 0.23307277262210846
accuracy = 0.8949999809265137
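
Since we needed a lot of epochs, it can be useful to look at how the loss fell during training. The History object returned by model.fit() stores the per-epoch loss and accuracy, so a quick plot (a minimal sketch using the results object from above) shows the convergence:

fig, ax = plt.subplots()
ax.plot(results.history["loss"], label="loss")
ax.plot(results.history["accuracy"], label="accuracy")
ax.set_xlabel("epoch")
ax.legend()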

Let’s look at a prediction. Even for a single point, we need to feed in an array of shape (N, 2), where N is the number of points (here N = 1).

res = model.predict(np.array([[-2, 2]]))
res
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 41ms/step
array([[1.2483164e-08]], dtype=float32)

We see that we get a floating-point number (the output of the final sigmoid layer). We will need to convert this to 0 or 1 by thresholding at 0.5.
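
For example, applying that threshold to the prediction above (a minimal sketch reusing the res array from the previous cell):

label = int(res[0, 0] > 0.5)
print(f"predicted cluster for (-2, 2): {label}")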

Let’s plot the partitioning

M = 128
N = 128

xmin = -1.75
xmax = 2.5
ymin = -1.25
ymax = 1.75

xpt = np.linspace(xmin, xmax, M)
ypt = np.linspace(ymin, ymax, N)

To make the prediction go faster, we want to feed in a single array containing every (x, y) pair on the grid, of the form:

[[xpt[0], ypt[0]],
 [xpt[0], ypt[1]],
 ...
]

We can use meshgrid() to pack the grid points into this form, and check the first entry

pairs = np.array(np.meshgrid(xpt, ypt)).T.reshape(-1, 2)
pairs[0]
array([-1.75, -1.25])
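
As a quick check on the packing order (a sketch using the pairs array we just built): there are M * N rows, and the first N rows all share the x coordinate xpt[0] while y runs through ypt; after that, x advances to xpt[1].

print(pairs.shape)          # (M * N, 2) = (16384, 2)
print(pairs[0], pairs[1])   # x stays at xpt[0] while y advances through ypt
print(pairs[N])             # after N rows, x advances to xpt[1]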

Now we do the prediction. We will get a vector out, which we reshape to match the original domain.

res = model.predict(pairs, verbose=0)
res.shape = (M, N)   # reshape in place: res[i, j] is the prediction at (xpt[i], ypt[j])

Finally, we threshold at 0.5 to get 0 or 1

domain = np.where(res > 0.5, 1, 0)

and we can plot the data

fig, ax = plt.subplots()
ax.imshow(domain.T, origin="lower",
          extent=[xmin, xmax, ymin, ymax], alpha=0.25)
xpt = [q[0] for q in x]
ypt = [q[1] for q in x]

ax.scatter(xpt, ypt, s=40, c=v, cmap="viridis")
<matplotlib.collections.PathCollection at 0x7f3361cefb10>
[Figure: the data scattered on top of the predicted partitioning of the domain]
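
With the trained model in hand, classifying new points is just a prediction followed by the same 0.5 threshold. A minimal sketch, using a couple of made-up points for illustration:

new_points = np.array([[0.0, 1.0], [1.0, -0.5]])   # illustrative points, not from the dataset
probs = model.predict(new_points, verbose=0)
labels = np.where(probs > 0.5, 1, 0).flatten()
print(labels)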