Clustering

Clustering#

Clustering seeks to group data into clusters based on their properties and then allows us to predict which cluster a new member belongs to.

import numpy as np
import matplotlib.pyplot as plt

We’ll use a dataset generator that is part of scikit-learn called make_moons. This generates data that falls into 2 different sets with a shape that looks like half-moons.

from sklearn import datasets
def generate_data():
    """Generate a random 2-class "half-moons" dataset.

    Returns
    -------
    x : ndarray of shape (200, 2)
        The 2-d coordinates of each sample point.
    v : ndarray of shape (200,)
        The class label (0 or 1) of each sample point.
    """
    # make_moons already returns ndarrays of exactly the shapes we need,
    # so we can return them directly -- the original element-by-element
    # copy loop (and its "encode the output" comment) did nothing.
    x, v = datasets.make_moons(200, noise=0.2)
    return x, v
x, v = generate_data()

Let’s look at a point and its value

print(f"x = {x[0]}, value = {v[0]}")
x = [-0.88808644  0.1509225 ], value = 0

Now let’s plot the data

def plot_data(x, v):
    """Scatter-plot the 2-d points in x, colored by their class labels v.

    Returns the matplotlib figure containing the plot.
    """
    fig, ax = plt.subplots()
    ax.scatter([pt[0] for pt in x], [pt[1] for pt in x],
               s=40, c=v, cmap="viridis")
    ax.set_aspect("equal")
    return fig
fig = plot_data(x, v)
../_images/ad22806c7f5e327d08514ce0491872ad5f4e5a6c292109e93b4a50b2b8a7432e.png

We want to partition this domain into 2 regions, such that when we come in with a new point, we know which group it belongs to.

First we set up and train our network

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Input
from keras.optimizers import RMSprop
2025-11-24 21:39:13.673383: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-24 21:39:13.719430: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-24 21:39:15.524678: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
# A small fully-connected network: 2 inputs -> 50 -> 20 -> 1 sigmoid output.
# The single sigmoid output gives the probability of belonging to class 1.
model = Sequential(
    [
        Input(shape=(2,)),
        Dense(50, activation="relu"),
        Dense(20, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)
2025-11-24 21:39:15.866398: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
# Two-class problem with a single sigmoid output, so binary cross-entropy
# is the appropriate loss; track accuracy during training.
rms = RMSprop()
model.compile(loss='binary_crossentropy',
              optimizer=rms, metrics=['accuracy'])
model.summary()
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                     Output Shape                  Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ dense (Dense)                   │ (None, 50)             │           150 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 20)             │         1,020 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_2 (Dense)                 │ (None, 1)              │            21 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 1,191 (4.65 KB)
 Trainable params: 1,191 (4.65 KB)
 Non-trainable params: 0 (0.00 B)

We seem to need a lot of epochs here to get a good result

# Train on the full dataset (200 points) in mini-batches of 50 for 100 epochs.
epochs = 100
results = model.fit(x, v, batch_size=50, epochs=epochs, verbose=2)
Epoch 1/100
4/4 - 0s - 123ms/step - accuracy: 0.6750 - loss: 0.6199
Epoch 2/100
4/4 - 0s - 7ms/step - accuracy: 0.7450 - loss: 0.5834
Epoch 3/100
4/4 - 0s - 7ms/step - accuracy: 0.7700 - loss: 0.5597
Epoch 4/100
4/4 - 0s - 7ms/step - accuracy: 0.7800 - loss: 0.5401
Epoch 5/100
4/4 - 0s - 7ms/step - accuracy: 0.7700 - loss: 0.5213
Epoch 6/100
4/4 - 0s - 7ms/step - accuracy: 0.7750 - loss: 0.5039
Epoch 7/100
4/4 - 0s - 7ms/step - accuracy: 0.7850 - loss: 0.4875
Epoch 8/100
4/4 - 0s - 7ms/step - accuracy: 0.8050 - loss: 0.4713
Epoch 9/100
4/4 - 0s - 7ms/step - accuracy: 0.7950 - loss: 0.4568
Epoch 10/100
4/4 - 0s - 7ms/step - accuracy: 0.8100 - loss: 0.4427
Epoch 11/100
4/4 - 0s - 7ms/step - accuracy: 0.8200 - loss: 0.4290
Epoch 12/100
4/4 - 0s - 7ms/step - accuracy: 0.8250 - loss: 0.4152
Epoch 13/100
4/4 - 0s - 7ms/step - accuracy: 0.8250 - loss: 0.4022
Epoch 14/100
4/4 - 0s - 7ms/step - accuracy: 0.8300 - loss: 0.3893
Epoch 15/100
4/4 - 0s - 7ms/step - accuracy: 0.8400 - loss: 0.3769
Epoch 16/100
4/4 - 0s - 7ms/step - accuracy: 0.8400 - loss: 0.3656
Epoch 17/100
4/4 - 0s - 7ms/step - accuracy: 0.8400 - loss: 0.3546
Epoch 18/100
4/4 - 0s - 7ms/step - accuracy: 0.8450 - loss: 0.3449
Epoch 19/100
4/4 - 0s - 7ms/step - accuracy: 0.8400 - loss: 0.3350
Epoch 20/100
4/4 - 0s - 7ms/step - accuracy: 0.8550 - loss: 0.3265
Epoch 21/100
4/4 - 0s - 7ms/step - accuracy: 0.8600 - loss: 0.3173
Epoch 22/100
4/4 - 0s - 7ms/step - accuracy: 0.8600 - loss: 0.3091
Epoch 23/100
4/4 - 0s - 7ms/step - accuracy: 0.8650 - loss: 0.3015
Epoch 24/100
4/4 - 0s - 7ms/step - accuracy: 0.8650 - loss: 0.2951
Epoch 25/100
4/4 - 0s - 7ms/step - accuracy: 0.8650 - loss: 0.2873
Epoch 26/100
4/4 - 0s - 7ms/step - accuracy: 0.8650 - loss: 0.2818
Epoch 27/100
4/4 - 0s - 7ms/step - accuracy: 0.8650 - loss: 0.2773
Epoch 28/100
4/4 - 0s - 7ms/step - accuracy: 0.8800 - loss: 0.2704
Epoch 29/100
4/4 - 0s - 7ms/step - accuracy: 0.8750 - loss: 0.2665
Epoch 30/100
4/4 - 0s - 7ms/step - accuracy: 0.8750 - loss: 0.2609
Epoch 31/100
4/4 - 0s - 7ms/step - accuracy: 0.8800 - loss: 0.2563
Epoch 32/100
4/4 - 0s - 7ms/step - accuracy: 0.8750 - loss: 0.2529
Epoch 33/100
4/4 - 0s - 7ms/step - accuracy: 0.8850 - loss: 0.2503
Epoch 34/100
4/4 - 0s - 7ms/step - accuracy: 0.8850 - loss: 0.2459
Epoch 35/100
4/4 - 0s - 7ms/step - accuracy: 0.8800 - loss: 0.2428
Epoch 36/100
4/4 - 0s - 7ms/step - accuracy: 0.8900 - loss: 0.2398
Epoch 37/100
4/4 - 0s - 7ms/step - accuracy: 0.8900 - loss: 0.2386
Epoch 38/100
4/4 - 0s - 7ms/step - accuracy: 0.8900 - loss: 0.2340
Epoch 39/100
4/4 - 0s - 7ms/step - accuracy: 0.8950 - loss: 0.2310
Epoch 40/100
4/4 - 0s - 7ms/step - accuracy: 0.9050 - loss: 0.2281
Epoch 41/100
4/4 - 0s - 7ms/step - accuracy: 0.9050 - loss: 0.2260
Epoch 42/100
4/4 - 0s - 7ms/step - accuracy: 0.9000 - loss: 0.2243
Epoch 43/100
4/4 - 0s - 7ms/step - accuracy: 0.9100 - loss: 0.2205
Epoch 44/100
4/4 - 0s - 7ms/step - accuracy: 0.9100 - loss: 0.2196
Epoch 45/100
4/4 - 0s - 7ms/step - accuracy: 0.9050 - loss: 0.2164
Epoch 46/100
4/4 - 0s - 7ms/step - accuracy: 0.9100 - loss: 0.2142
Epoch 47/100
4/4 - 0s - 7ms/step - accuracy: 0.9100 - loss: 0.2115
Epoch 48/100
4/4 - 0s - 7ms/step - accuracy: 0.9150 - loss: 0.2101
Epoch 49/100
4/4 - 0s - 7ms/step - accuracy: 0.9100 - loss: 0.2070
Epoch 50/100
4/4 - 0s - 7ms/step - accuracy: 0.9150 - loss: 0.2071
Epoch 51/100
4/4 - 0s - 7ms/step - accuracy: 0.9150 - loss: 0.2032
Epoch 52/100
4/4 - 0s - 7ms/step - accuracy: 0.9150 - loss: 0.2023
Epoch 53/100
4/4 - 0s - 7ms/step - accuracy: 0.9200 - loss: 0.1992
Epoch 54/100
4/4 - 0s - 7ms/step - accuracy: 0.9200 - loss: 0.1982
Epoch 55/100
4/4 - 0s - 7ms/step - accuracy: 0.9250 - loss: 0.1963
Epoch 56/100
4/4 - 0s - 7ms/step - accuracy: 0.9200 - loss: 0.1966
Epoch 57/100
4/4 - 0s - 7ms/step - accuracy: 0.9300 - loss: 0.1920
Epoch 58/100
4/4 - 0s - 7ms/step - accuracy: 0.9250 - loss: 0.1903
Epoch 59/100
4/4 - 0s - 7ms/step - accuracy: 0.9250 - loss: 0.1893
Epoch 60/100
4/4 - 0s - 7ms/step - accuracy: 0.9300 - loss: 0.1877
Epoch 61/100
4/4 - 0s - 7ms/step - accuracy: 0.9250 - loss: 0.1850
Epoch 62/100
4/4 - 0s - 7ms/step - accuracy: 0.9300 - loss: 0.1826
Epoch 63/100
4/4 - 0s - 7ms/step - accuracy: 0.9300 - loss: 0.1817
Epoch 64/100
4/4 - 0s - 7ms/step - accuracy: 0.9400 - loss: 0.1801
Epoch 65/100
4/4 - 0s - 7ms/step - accuracy: 0.9250 - loss: 0.1816
Epoch 66/100
4/4 - 0s - 7ms/step - accuracy: 0.9300 - loss: 0.1750
Epoch 67/100
4/4 - 0s - 8ms/step - accuracy: 0.9350 - loss: 0.1753
Epoch 68/100
4/4 - 0s - 7ms/step - accuracy: 0.9350 - loss: 0.1734
Epoch 69/100
4/4 - 0s - 7ms/step - accuracy: 0.9350 - loss: 0.1712
Epoch 70/100
4/4 - 0s - 7ms/step - accuracy: 0.9400 - loss: 0.1684
Epoch 71/100
4/4 - 0s - 7ms/step - accuracy: 0.9400 - loss: 0.1691
Epoch 72/100
4/4 - 0s - 7ms/step - accuracy: 0.9400 - loss: 0.1687
Epoch 73/100
4/4 - 0s - 7ms/step - accuracy: 0.9400 - loss: 0.1630
Epoch 74/100
4/4 - 0s - 7ms/step - accuracy: 0.9400 - loss: 0.1618
Epoch 75/100
4/4 - 0s - 7ms/step - accuracy: 0.9400 - loss: 0.1599
Epoch 76/100
4/4 - 0s - 7ms/step - accuracy: 0.9350 - loss: 0.1584
Epoch 77/100
4/4 - 0s - 7ms/step - accuracy: 0.9400 - loss: 0.1572
Epoch 78/100
4/4 - 0s - 7ms/step - accuracy: 0.9450 - loss: 0.1551
Epoch 79/100
4/4 - 0s - 7ms/step - accuracy: 0.9450 - loss: 0.1526
Epoch 80/100
4/4 - 0s - 7ms/step - accuracy: 0.9450 - loss: 0.1507
Epoch 81/100
4/4 - 0s - 7ms/step - accuracy: 0.9450 - loss: 0.1498
Epoch 82/100
4/4 - 0s - 7ms/step - accuracy: 0.9400 - loss: 0.1474
Epoch 83/100
4/4 - 0s - 7ms/step - accuracy: 0.9500 - loss: 0.1457
Epoch 84/100
4/4 - 0s - 7ms/step - accuracy: 0.9450 - loss: 0.1439
Epoch 85/100
4/4 - 0s - 7ms/step - accuracy: 0.9500 - loss: 0.1428
Epoch 86/100
4/4 - 0s - 7ms/step - accuracy: 0.9450 - loss: 0.1421
Epoch 87/100
4/4 - 0s - 7ms/step - accuracy: 0.9500 - loss: 0.1389
Epoch 88/100
4/4 - 0s - 7ms/step - accuracy: 0.9500 - loss: 0.1384
Epoch 89/100
4/4 - 0s - 7ms/step - accuracy: 0.9550 - loss: 0.1374
Epoch 90/100
4/4 - 0s - 7ms/step - accuracy: 0.9550 - loss: 0.1339
Epoch 91/100
4/4 - 0s - 7ms/step - accuracy: 0.9500 - loss: 0.1322
Epoch 92/100
4/4 - 0s - 7ms/step - accuracy: 0.9550 - loss: 0.1328
Epoch 93/100
4/4 - 0s - 7ms/step - accuracy: 0.9500 - loss: 0.1295
Epoch 94/100
4/4 - 0s - 6ms/step - accuracy: 0.9550 - loss: 0.1273
Epoch 95/100
4/4 - 0s - 7ms/step - accuracy: 0.9550 - loss: 0.1263
Epoch 96/100
4/4 - 0s - 7ms/step - accuracy: 0.9550 - loss: 0.1250
Epoch 97/100
4/4 - 0s - 7ms/step - accuracy: 0.9550 - loss: 0.1241
Epoch 98/100
4/4 - 0s - 7ms/step - accuracy: 0.9600 - loss: 0.1210
Epoch 99/100
4/4 - 0s - 7ms/step - accuracy: 0.9500 - loss: 0.1201
Epoch 100/100
4/4 - 0s - 7ms/step - accuracy: 0.9600 - loss: 0.1174
# Evaluate on the training data: score[0] is the loss, score[1] the accuracy.
# NOTE(review): this measures training-set performance only -- there is no
# held-out test set here.
score = model.evaluate(x, v, verbose=0)
print(f"score = {score[0]}")
print(f"accuracy = {score[1]}")
score = 0.11571294814348221
accuracy = 0.9649999737739563

Let’s look at a prediction. We need to feed in a single point as an array of shape (N, 2), where N is the number of points

# Predict the class of a single point; the input must be shaped (N, 2),
# so a lone point is wrapped as a 1x2 array.
res = model.predict(np.array([[-2, 2]]))
res
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 35ms/step

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 47ms/step
array([[7.820534e-08]], dtype=float32)

We see that we get a floating point number. We will need to convert this to 0 or 1 by rounding.

Let’s plot the partitioning

# Build an M x N grid of sample points covering the plotting domain,
# chosen to enclose the half-moons data.
M = 128
N = 128

xmin = -1.75
xmax = 2.5
ymin = -1.25
ymax = 1.75

xpt = np.linspace(xmin, xmax, M)
ypt = np.linspace(ymin, ymax, N)

To make the prediction go faster, we want to feed in a vector of these points, of the form:

[[xpt[0], ypt[0]],
 [xpt[1], ypt[1]],
 ...
]

We can see that this packs them into the vector

# Pack the grid into an array of (x, y) pairs with shape (M*N, 2) so the
# whole grid can be fed to model.predict() in one call.
pairs = np.array(np.meshgrid(xpt, ypt)).T.reshape(-1, 2)
pairs[0]
array([-1.75, -1.25])

Now we do the prediction. We will get a vector out, which we reshape to match the original domain.

# Predict on every grid point at once, then reshape the flat (M*N, 1)
# result in place back to the 2-d grid layout.
res = model.predict(pairs, verbose=0)
res.shape = (M, N)

Finally, round to 0 or 1

# Threshold the sigmoid output at 0.5 to assign each grid point to cluster 0 or 1.
domain = np.where(res > 0.5, 1, 0)

and we can plot the data

# Show the predicted partition as a semi-transparent background image,
# with the original training points scattered on top.
fig, ax = plt.subplots()
# domain is indexed [x, y]; transpose so imshow's row axis is y.
ax.imshow(domain.T, origin="lower",
          extent=[xmin, xmax, ymin, ymax], alpha=0.25)
xpt = [q[0] for q in x]
ypt = [q[1] for q in x]

ax.scatter(xpt, ypt, s=40, c=v, cmap="viridis")
<matplotlib.collections.PathCollection at 0x7f3dcc69f4d0>
../_images/7a4719ff7c5f20c5a16d9392057c7cc9efd7805a1e5df2a7148dae5a0103e0ac.png