Clustering

Clustering#

Clustering seeks to group data into clusters based on their properties and then allow us to predict to which cluster a new member belongs.

import numpy as np
import matplotlib.pyplot as plt

We’ll use a dataset generator that is part of scikit-learn called make_moons. This generates data that falls into 2 different sets with a shape that looks like half-moons.

from sklearn import datasets
def generate_data(n_samples=200, noise=0.2):
    """Generate the two-moons clustering dataset.

    Parameters
    ----------
    n_samples : int
        Number of points to generate (default 200, matching the original).
    noise : float
        Standard deviation of Gaussian noise added to the points.

    Returns
    -------
    x : ndarray of shape (n_samples, 2)
        The 2-D coordinates of each point.
    v : ndarray of shape (n_samples,)
        The 0/1 cluster label of each point.
    """
    # make_moons already returns NumPy arrays, so no element-by-element
    # copy is needed (the old loop, and its stale "encode the output to
    # be 2 elements" comment, were a no-op pass-through).
    xvec, val = datasets.make_moons(n_samples, noise=noise)
    return xvec, val
x, v = generate_data()

Let’s look at a point and its value

# Each sample is a 2-D coordinate paired with a 0/1 cluster label
print(f"x = {x[0]}, value = {v[0]}")
x = [0.00559981 1.22674127], value = 0

Now let’s plot the data

def plot_data(x, v):
    """Scatter-plot the 2-D points *x*, colored by their labels *v*.

    Returns the matplotlib Figure so the caller can save or show it.
    """
    # Unzip the (x, y) coordinate pairs into separate sequences
    xpt, ypt = zip(*x)

    fig, ax = plt.subplots()
    ax.scatter(xpt, ypt, s=40, c=v, cmap="viridis")
    ax.set_aspect("equal")
    return fig
fig = plot_data(x, v)
../_images/c2457ff20d60abdb48607f040bf64b17a10308dd38e15c00b9e10cd9b0945253.png

We want to partition this domain into 2 regions, such that when we come in with a new point, we know which group it belongs to.

First we setup and train our network

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Input
from keras.optimizers import RMSprop
/opt/hostedtoolcache/Python/3.14.2/x64/lib/python3.14/site-packages/keras/src/export/tf2onnx_lib.py:8: FutureWarning: In the future `np.object` will be defined as the corresponding NumPy scalar.
  if not hasattr(np, "object"):
# Build a small fully-connected binary classifier: 2 inputs -> 50 -> 20 -> 1.
# The final sigmoid squashes the output to (0, 1) so it can be read as the
# probability of belonging to cluster 1.
model = Sequential(
    [
        Input(shape=(2,)),
        Dense(50, activation="relu"),
        Dense(20, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)
rms = RMSprop()
# Binary cross-entropy matches the single sigmoid output
model.compile(loss='binary_crossentropy',
              optimizer=rms, metrics=['accuracy'])
model.summary()
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                     Output Shape                  Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ dense (Dense)                   │ (None, 50)             │           150 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 20)             │         1,020 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_2 (Dense)                 │ (None, 1)              │            21 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 1,191 (4.65 KB)
 Trainable params: 1,191 (4.65 KB)
 Non-trainable params: 0 (0.00 B)

We seem to need a lot of epochs here to get a good result

# Train on the full dataset: 200 points / batch_size 50 = 4 steps per epoch.
# verbose=2 prints one summary line per epoch.
epochs = 100
results = model.fit(x, v, batch_size=50, epochs=epochs, verbose=2)
Epoch 1/100
4/4 - 0s - 7ms/step - accuracy: 0.5850 - loss: 0.6613
Epoch 2/100
4/4 - 0s - 6ms/step - accuracy: 0.7200 - loss: 0.6263
Epoch 3/100
4/4 - 0s - 6ms/step - accuracy: 0.8100 - loss: 0.6028
Epoch 4/100
4/4 - 0s - 6ms/step - accuracy: 0.8200 - loss: 0.5843
Epoch 5/100
4/4 - 0s - 6ms/step - accuracy: 0.8300 - loss: 0.5661
Epoch 6/100
4/4 - 0s - 6ms/step - accuracy: 0.8350 - loss: 0.5485
Epoch 7/100
4/4 - 0s - 6ms/step - accuracy: 0.8300 - loss: 0.5314
Epoch 8/100
4/4 - 0s - 6ms/step - accuracy: 0.8400 - loss: 0.5150
Epoch 9/100
4/4 - 0s - 6ms/step - accuracy: 0.8500 - loss: 0.4991
Epoch 10/100
4/4 - 0s - 6ms/step - accuracy: 0.8550 - loss: 0.4842
Epoch 11/100
4/4 - 0s - 7ms/step - accuracy: 0.8650 - loss: 0.4699
Epoch 12/100
4/4 - 0s - 7ms/step - accuracy: 0.8650 - loss: 0.4560
Epoch 13/100
4/4 - 0s - 6ms/step - accuracy: 0.8600 - loss: 0.4430
Epoch 14/100
4/4 - 0s - 7ms/step - accuracy: 0.8650 - loss: 0.4301
Epoch 15/100
4/4 - 0s - 6ms/step - accuracy: 0.8650 - loss: 0.4173
Epoch 16/100
4/4 - 0s - 6ms/step - accuracy: 0.8550 - loss: 0.4053
Epoch 17/100
4/4 - 0s - 26ms/step - accuracy: 0.8550 - loss: 0.3934
Epoch 18/100
4/4 - 0s - 6ms/step - accuracy: 0.8550 - loss: 0.3820
Epoch 19/100
4/4 - 0s - 6ms/step - accuracy: 0.8700 - loss: 0.3722
Epoch 20/100
4/4 - 0s - 6ms/step - accuracy: 0.8650 - loss: 0.3623
Epoch 21/100
4/4 - 0s - 6ms/step - accuracy: 0.8650 - loss: 0.3530
Epoch 22/100
4/4 - 0s - 6ms/step - accuracy: 0.8700 - loss: 0.3441
Epoch 23/100
4/4 - 0s - 6ms/step - accuracy: 0.8650 - loss: 0.3380
Epoch 24/100
4/4 - 0s - 13ms/step - accuracy: 0.8750 - loss: 0.3293
Epoch 25/100
4/4 - 0s - 6ms/step - accuracy: 0.8750 - loss: 0.3228
Epoch 26/100
4/4 - 0s - 7ms/step - accuracy: 0.8800 - loss: 0.3164
Epoch 27/100
4/4 - 0s - 6ms/step - accuracy: 0.8800 - loss: 0.3113
Epoch 28/100
4/4 - 0s - 8ms/step - accuracy: 0.8850 - loss: 0.3057
Epoch 29/100
4/4 - 0s - 8ms/step - accuracy: 0.8800 - loss: 0.3006
Epoch 30/100
4/4 - 0s - 24ms/step - accuracy: 0.8850 - loss: 0.2969
Epoch 31/100
4/4 - 0s - 7ms/step - accuracy: 0.8850 - loss: 0.2921
Epoch 32/100
4/4 - 0s - 8ms/step - accuracy: 0.8850 - loss: 0.2881
Epoch 33/100
4/4 - 0s - 8ms/step - accuracy: 0.8950 - loss: 0.2846
Epoch 34/100
4/4 - 0s - 8ms/step - accuracy: 0.8900 - loss: 0.2815
Epoch 35/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2784
Epoch 36/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2763
Epoch 37/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2734
Epoch 38/100
4/4 - 0s - 6ms/step - accuracy: 0.8850 - loss: 0.2734
Epoch 39/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2702
Epoch 40/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2676
Epoch 41/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2660
Epoch 42/100
4/4 - 0s - 6ms/step - accuracy: 0.8850 - loss: 0.2646
Epoch 43/100
4/4 - 0s - 6ms/step - accuracy: 0.8850 - loss: 0.2636
Epoch 44/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2620
Epoch 45/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2601
Epoch 46/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2594
Epoch 47/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2579
Epoch 48/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2575
Epoch 49/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2557
Epoch 50/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2536
Epoch 51/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2523
Epoch 52/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2507
Epoch 53/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2506
Epoch 54/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2493
Epoch 55/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2501
Epoch 56/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2462
Epoch 57/100
4/4 - 0s - 47ms/step - accuracy: 0.9000 - loss: 0.2449
Epoch 58/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2442
Epoch 59/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2423
Epoch 60/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2414
Epoch 61/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2391
Epoch 62/100
4/4 - 0s - 6ms/step - accuracy: 0.9100 - loss: 0.2381
Epoch 63/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2369
Epoch 64/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2348
Epoch 65/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2333
Epoch 66/100
4/4 - 0s - 16ms/step - accuracy: 0.9050 - loss: 0.2322
Epoch 67/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2315
Epoch 68/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2292
Epoch 69/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2281
Epoch 70/100
4/4 - 0s - 9ms/step - accuracy: 0.9050 - loss: 0.2291
Epoch 71/100
4/4 - 0s - 8ms/step - accuracy: 0.9050 - loss: 0.2266
Epoch 72/100
4/4 - 0s - 8ms/step - accuracy: 0.9050 - loss: 0.2241
Epoch 73/100
4/4 - 0s - 24ms/step - accuracy: 0.9050 - loss: 0.2240
Epoch 74/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2215
Epoch 75/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2204
Epoch 76/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2195
Epoch 77/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2191
Epoch 78/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2165
Epoch 79/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2148
Epoch 80/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2152
Epoch 81/100
4/4 - 0s - 6ms/step - accuracy: 0.9150 - loss: 0.2130
Epoch 82/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2116
Epoch 83/100
4/4 - 0s - 6ms/step - accuracy: 0.9150 - loss: 0.2099
Epoch 84/100
4/4 - 0s - 6ms/step - accuracy: 0.9200 - loss: 0.2092
Epoch 85/100
4/4 - 0s - 6ms/step - accuracy: 0.9150 - loss: 0.2071
Epoch 86/100
4/4 - 0s - 6ms/step - accuracy: 0.9200 - loss: 0.2067
Epoch 87/100
4/4 - 0s - 6ms/step - accuracy: 0.9150 - loss: 0.2043
Epoch 88/100
4/4 - 0s - 6ms/step - accuracy: 0.9150 - loss: 0.2053
Epoch 89/100
4/4 - 0s - 6ms/step - accuracy: 0.9150 - loss: 0.2015
Epoch 90/100
4/4 - 0s - 6ms/step - accuracy: 0.9150 - loss: 0.2018
Epoch 91/100
4/4 - 0s - 6ms/step - accuracy: 0.9200 - loss: 0.2002
Epoch 92/100
4/4 - 0s - 6ms/step - accuracy: 0.9250 - loss: 0.1977
Epoch 93/100
4/4 - 0s - 7ms/step - accuracy: 0.9250 - loss: 0.1961
Epoch 94/100
4/4 - 0s - 6ms/step - accuracy: 0.9150 - loss: 0.1949
Epoch 95/100
4/4 - 0s - 6ms/step - accuracy: 0.9250 - loss: 0.1947
Epoch 96/100
4/4 - 0s - 6ms/step - accuracy: 0.9250 - loss: 0.1925
Epoch 97/100
4/4 - 0s - 6ms/step - accuracy: 0.9300 - loss: 0.1905
Epoch 98/100
4/4 - 0s - 6ms/step - accuracy: 0.9250 - loss: 0.1896
Epoch 99/100
4/4 - 0s - 6ms/step - accuracy: 0.9250 - loss: 0.1883
Epoch 100/100
4/4 - 0s - 6ms/step - accuracy: 0.9250 - loss: 0.1879
# Evaluate on the training data itself (this example uses no held-out
# test split); evaluate() returns [loss, accuracy] per the compile metrics
score = model.evaluate(x, v, verbose=0)
print(f"score = {score[0]}")
print(f"accuracy = {score[1]}")
score = 0.18454667925834656
accuracy = 0.9300000071525574

Let’s look at a prediction. We need to feed in a single point as an array of shape (N, 2), where N is the number of points

# predict() expects a batch, so the single point is wrapped in an outer
# array of shape (1, 2); the sigmoid output is a float in (0, 1), not a
# hard 0/1 label
res = model.predict(np.array([[-2, 2]]))
res
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
array([[3.2676808e-06]], dtype=float32)

We see that we get a floating point number. We will need to convert this to 0 or 1 by rounding.

Let’s plot the partitioning

# Resolution of the prediction grid (M points in x, N points in y)
M = 128
N = 128

# Domain bounds, chosen to bracket the moons data with a small margin
xmin = -1.75
xmax = 2.5
ymin = -1.25
ymax = 1.75

# Evenly spaced grid coordinates along each axis
xpt = np.linspace(xmin, xmax, M)
ypt = np.linspace(ymin, ymax, N)

To make the prediction go faster, we want to feed in a vector of these points, of the form:

[[xpt[0], ypt[0]],
 [xpt[0], ypt[1]],
 ...
]

i.e. every (x, y) combination of the grid coordinates, with the y coordinate varying fastest.

We can see that this packs them into the vector

# meshgrid builds the full cartesian product of xpt and ypt; the
# transpose + reshape flattens it to shape (M*N, 2), ordered with the
# y coordinate varying fastest: [xpt[0], ypt[0]], [xpt[0], ypt[1]], ...
pairs = np.array(np.meshgrid(xpt, ypt)).T.reshape(-1, 2)
pairs[0]
array([-1.75, -1.25])

Now we do the prediction. We will get a vector out, which we reshape to match the original domain.

# Predict every grid point in a single batch, then fold the flat
# (M*N, 1) output back onto the (M, N) grid via in-place shape assignment
res = model.predict(pairs, verbose=0)
res.shape = (M, N)

Finally, round to 0 or 1

# Threshold the sigmoid probabilities into hard 0/1 class labels
domain = (res > 0.5).astype(int)

and we can plot the data

# Draw the learned decision regions as a translucent background image.
# domain is indexed [x, y], so transpose it to match imshow's
# row/column (y, x) convention.
fig, ax = plt.subplots()
ax.imshow(domain.T, origin="lower",
          extent=[xmin, xmax, ymin, ymax], alpha=0.25)
# Overlay the training points, colored by their true labels
# (note: this rebinds xpt/ypt from grid coordinates to point coordinates)
xpt = [q[0] for q in x]
ypt = [q[1] for q in x]

ax.scatter(xpt, ypt, s=40, c=v, cmap="viridis")
<matplotlib.collections.PathCollection at 0x7f0589298190>
../_images/6a5561923e7fb455171a489e9ea3c8bf3adfb7dbfc41e7bbc061fb27eab340d7.png