Clustering
Clustering seeks to group data into clusters based on their properties, and then allows us to predict which cluster a new point belongs to.
import numpy as np
import matplotlib.pyplot as plt
We’ll use a dataset generator that is part of scikit-learn called make_moons. This generates data that falls into 2 different sets with a shape that looks like half-moons.
from sklearn import datasets
def generate_data():
    xvec, val = datasets.make_moons(200, noise=0.2)

    # pack the coordinates and the labels into numpy arrays
    x = []
    v = []
    for xv, vv in zip(xvec, val):
        x.append(np.array(xv))
        v.append(vv)

    return np.array(x), np.array(v)
x, v = generate_data()
Let’s look at a point and its value
print(f"x = {x[0]}, value = {v[0]}")
x = [ 1.11239295 -0.40600028], value = 1
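The labels take only the values 0 and 1, one for each half-moon. A quick check:

print(np.unique(v))  # -> [0 1]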
Now let’s plot the data
def plot_data(x, v):
    xpt = [q[0] for q in x]
    ypt = [q[1] for q in x]

    fig, ax = plt.subplots()
    ax.scatter(xpt, ypt, s=40, c=v, cmap="viridis")
    ax.set_aspect("equal")
    return fig
fig = plot_data(x, v)
We want to partition this domain into 2 regions, such that when we are given a new point, we know which group it belongs to.
First we set up and train our network.
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Input
from keras.optimizers import RMSprop
model = Sequential()
model.add(Input(shape=(2,)))
model.add(Dense(50, activation="relu"))
model.add(Dense(20, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
rms = RMSprop()
model.compile(loss="binary_crossentropy",
              optimizer=rms, metrics=["accuracy"])
model.summary()
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ Layer (type)              ┃ Output Shape      ┃   Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ dense (Dense)             │ (None, 50)        │       150 │
├───────────────────────────┼───────────────────┼───────────┤
│ dense_1 (Dense)           │ (None, 20)        │     1,020 │
├───────────────────────────┼───────────────────┼───────────┤
│ dense_2 (Dense)           │ (None, 1)         │        21 │
└───────────────────────────┴───────────────────┴───────────┘
Total params: 1,191 (4.65 KB)
Trainable params: 1,191 (4.65 KB)
Non-trainable params: 0 (0.00 B)
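The parameter counts in the summary follow directly from the layer sizes: a Dense layer with n_in inputs and n_out outputs has n_in × n_out weights plus one bias per output. A quick check of the totals:

def dense_params(n_in, n_out):
    # weight matrix plus one bias per output
    return n_in * n_out + n_out

print(dense_params(2, 50))   # 150
print(dense_params(50, 20))  # 1020
print(dense_params(20, 1))   # 21
print(dense_params(2, 50) + dense_params(50, 20) + dense_params(20, 1))  # 1191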
We seem to need a lot of epochs here to get a good result.
epochs = 100
results = model.fit(x, v, batch_size=50, epochs=epochs, verbose=2)
Epoch 1/100
4/4 - 0s - 6ms/step - accuracy: 0.4600 - loss: 0.7073
Epoch 2/100
4/4 - 0s - 6ms/step - accuracy: 0.6850 - loss: 0.6566
Epoch 3/100
4/4 - 0s - 6ms/step - accuracy: 0.7150 - loss: 0.6216
Epoch 4/100
4/4 - 0s - 6ms/step - accuracy: 0.7500 - loss: 0.5923
Epoch 5/100
4/4 - 0s - 6ms/step - accuracy: 0.7800 - loss: 0.5668
Epoch 6/100
4/4 - 0s - 6ms/step - accuracy: 0.7900 - loss: 0.5431
Epoch 7/100
4/4 - 0s - 5ms/step - accuracy: 0.8000 - loss: 0.5217
Epoch 8/100
4/4 - 0s - 6ms/step - accuracy: 0.8100 - loss: 0.5019
Epoch 9/100
4/4 - 0s - 6ms/step - accuracy: 0.8100 - loss: 0.4838
Epoch 10/100
4/4 - 0s - 6ms/step - accuracy: 0.8100 - loss: 0.4672
Epoch 11/100
4/4 - 0s - 6ms/step - accuracy: 0.8100 - loss: 0.4518
Epoch 12/100
4/4 - 0s - 6ms/step - accuracy: 0.8150 - loss: 0.4379
Epoch 13/100
4/4 - 0s - 6ms/step - accuracy: 0.8350 - loss: 0.4241
Epoch 14/100
4/4 - 0s - 6ms/step - accuracy: 0.8500 - loss: 0.4113
Epoch 15/100
4/4 - 0s - 6ms/step - accuracy: 0.8450 - loss: 0.4002
Epoch 16/100
4/4 - 0s - 6ms/step - accuracy: 0.8500 - loss: 0.3892
Epoch 17/100
4/4 - 0s - 23ms/step - accuracy: 0.8550 - loss: 0.3801
Epoch 18/100
4/4 - 0s - 6ms/step - accuracy: 0.8500 - loss: 0.3710
Epoch 19/100
4/4 - 0s - 6ms/step - accuracy: 0.8550 - loss: 0.3627
Epoch 20/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3551
Epoch 21/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3488
Epoch 22/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3437
Epoch 23/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3373
Epoch 24/100
4/4 - 0s - 9ms/step - accuracy: 0.8550 - loss: 0.3315
Epoch 25/100
4/4 - 0s - 6ms/step - accuracy: 0.8550 - loss: 0.3267
Epoch 26/100
4/4 - 0s - 6ms/step - accuracy: 0.8550 - loss: 0.3224
Epoch 27/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3174
Epoch 28/100
4/4 - 0s - 6ms/step - accuracy: 0.8550 - loss: 0.3132
Epoch 29/100
4/4 - 0s - 6ms/step - accuracy: 0.8550 - loss: 0.3106
Epoch 30/100
4/4 - 0s - 21ms/step - accuracy: 0.8550 - loss: 0.3057
Epoch 31/100
4/4 - 0s - 6ms/step - accuracy: 0.8550 - loss: 0.3030
Epoch 32/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.2995
Epoch 33/100
4/4 - 0s - 5ms/step - accuracy: 0.8600 - loss: 0.2969
Epoch 34/100
4/4 - 0s - 5ms/step - accuracy: 0.8600 - loss: 0.2954
Epoch 35/100
4/4 - 0s - 5ms/step - accuracy: 0.8600 - loss: 0.2912
Epoch 36/100
4/4 - 0s - 5ms/step - accuracy: 0.8650 - loss: 0.2894
Epoch 37/100
4/4 - 0s - 5ms/step - accuracy: 0.8650 - loss: 0.2873
Epoch 38/100
4/4 - 0s - 6ms/step - accuracy: 0.8650 - loss: 0.2853
Epoch 39/100
4/4 - 0s - 5ms/step - accuracy: 0.8700 - loss: 0.2833
Epoch 40/100
4/4 - 0s - 5ms/step - accuracy: 0.8700 - loss: 0.2819
Epoch 41/100
4/4 - 0s - 6ms/step - accuracy: 0.8700 - loss: 0.2776
Epoch 42/100
4/4 - 0s - 5ms/step - accuracy: 0.8700 - loss: 0.2759
Epoch 43/100
4/4 - 0s - 5ms/step - accuracy: 0.8700 - loss: 0.2736
Epoch 44/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2734
Epoch 45/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2709
Epoch 46/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2676
Epoch 47/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2658
Epoch 48/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2644
Epoch 49/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2621
Epoch 50/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2613
Epoch 51/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2607
Epoch 52/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2572
Epoch 53/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2560
Epoch 54/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2544
Epoch 55/100
4/4 - 0s - 5ms/step - accuracy: 0.9050 - loss: 0.2527
Epoch 56/100
4/4 - 0s - 5ms/step - accuracy: 0.9050 - loss: 0.2513
Epoch 57/100
4/4 - 0s - 30ms/step - accuracy: 0.9100 - loss: 0.2504
Epoch 58/100
4/4 - 0s - 5ms/step - accuracy: 0.9050 - loss: 0.2480
Epoch 59/100
4/4 - 0s - 5ms/step - accuracy: 0.9050 - loss: 0.2497
Epoch 60/100
4/4 - 0s - 5ms/step - accuracy: 0.9050 - loss: 0.2447
Epoch 61/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2438
Epoch 62/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2431
Epoch 63/100
4/4 - 0s - 5ms/step - accuracy: 0.9100 - loss: 0.2412
Epoch 64/100
4/4 - 0s - 5ms/step - accuracy: 0.9150 - loss: 0.2391
Epoch 65/100
4/4 - 0s - 5ms/step - accuracy: 0.9150 - loss: 0.2403
Epoch 66/100
4/4 - 0s - 14ms/step - accuracy: 0.9150 - loss: 0.2362
Epoch 67/100
4/4 - 0s - 5ms/step - accuracy: 0.9150 - loss: 0.2354
Epoch 68/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2349
Epoch 69/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2321
Epoch 70/100
4/4 - 0s - 7ms/step - accuracy: 0.9150 - loss: 0.2334
Epoch 71/100
4/4 - 0s - 7ms/step - accuracy: 0.9200 - loss: 0.2294
Epoch 72/100
4/4 - 0s - 6ms/step - accuracy: 0.9100 - loss: 0.2279
Epoch 73/100
4/4 - 0s - 21ms/step - accuracy: 0.9150 - loss: 0.2272
Epoch 74/100
4/4 - 0s - 5ms/step - accuracy: 0.9150 - loss: 0.2271
Epoch 75/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2246
Epoch 76/100
4/4 - 0s - 5ms/step - accuracy: 0.9100 - loss: 0.2232
Epoch 77/100
4/4 - 0s - 5ms/step - accuracy: 0.9150 - loss: 0.2220
Epoch 78/100
4/4 - 0s - 6ms/step - accuracy: 0.9150 - loss: 0.2201
Epoch 79/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.2225
Epoch 80/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2170
Epoch 81/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2163
Epoch 82/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2154
Epoch 83/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2130
Epoch 84/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2119
Epoch 85/100
4/4 - 0s - 5ms/step - accuracy: 0.9150 - loss: 0.2104
Epoch 86/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2111
Epoch 87/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.2070
Epoch 88/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2060
Epoch 89/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2050
Epoch 90/100
4/4 - 0s - 5ms/step - accuracy: 0.9150 - loss: 0.2039
Epoch 91/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.2012
Epoch 92/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.1999
Epoch 93/100
4/4 - 0s - 5ms/step - accuracy: 0.9150 - loss: 0.1984
Epoch 94/100
4/4 - 0s - 5ms/step - accuracy: 0.9300 - loss: 0.1976
Epoch 95/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.1955
Epoch 96/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.1954
Epoch 97/100
4/4 - 0s - 5ms/step - accuracy: 0.9300 - loss: 0.1923
Epoch 98/100
4/4 - 0s - 5ms/step - accuracy: 0.9300 - loss: 0.1919
Epoch 99/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.1917
Epoch 100/100
4/4 - 0s - 5ms/step - accuracy: 0.9350 - loss: 0.1885
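The History object returned by fit() records the loss and accuracy at each epoch in its history dictionary, so we can visualize how training progressed. A minimal sketch:

# plot the per-epoch loss and accuracy recorded during training
fig, ax = plt.subplots()
ax.plot(results.history["loss"], label="loss")
ax.plot(results.history["accuracy"], label="accuracy")
ax.set_xlabel("epoch")
ax.legend()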
score = model.evaluate(x, v, verbose=0)
print(f"score = {score[0]}")
print(f"accuracy = {score[1]}")
score = 0.18592830002307892
accuracy = 0.9300000071525574
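As a sanity check, we can reproduce this accuracy ourselves by thresholding the network’s outputs at 0.5 and comparing against the true labels; a sketch:

# threshold the sigmoid outputs and compare with the true labels
preds = (model.predict(x, verbose=0) > 0.5).astype(int).ravel()
print(f"manual accuracy = {(preds == v).mean()}")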
Let’s look at a prediction. The network expects its input as an array of shape (N, 2), where N is the number of points, so a single point must be passed in as a 1-row array.
res = model.predict(np.array([[-2, 2]]))
res
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
array([[5.3135196e-10]], dtype=float32)
We get back a floating point number between 0 and 1 (the output of the sigmoid). To turn this into a class label, we convert it to 0 or 1 by rounding.
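For this point, thresholding at 0.5 gives class 0:

# convert the sigmoid output into a 0/1 class label
label = int(res[0, 0] > 0.5)
print(label)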
Let’s plot the partitioning
M = 128
N = 128
xmin = -1.75
xmax = 2.5
ymin = -1.25
ymax = 1.75
xpt = np.linspace(xmin, xmax, M)
ypt = np.linspace(ymin, ymax, N)
To make the prediction go faster, we feed in all of the grid points at once, as an array of (x, y) pairs of the form:

[[x0, y0],
 [x1, y1],
 ...
]
Using meshgrid followed by a reshape packs all of the grid points into an array of exactly this form.
pairs = np.array(np.meshgrid(xpt, ypt)).T.reshape(-1, 2)
pairs[0]
array([-1.75, -1.25])
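The ordering is such that row i*N + j of pairs holds the point (xpt[i], ypt[j]); a quick sanity check:

# pairs has one row per grid point, in x-major order
print(pairs.shape)  # (M*N, 2)
i, j = 3, 7
assert np.allclose(pairs[i * N + j], [xpt[i], ypt[j]])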
Now we do the prediction. We will get a vector out, which we reshape to match the original domain.
res = model.predict(pairs, verbose=0)
res.shape = (M, N)
Finally, we round to 0 or 1.
domain = np.where(res > 0.5, 1, 0)
Now we can plot the data, with the predicted partitioning shown in the background.
fig, ax = plt.subplots()
ax.imshow(domain.T, origin="lower",
extent=[xmin, xmax, ymin, ymax], alpha=0.25)
xpt = [q[0] for q in x]
ypt = [q[1] for q in x]
ax.scatter(xpt, ypt, s=40, c=v, cmap="viridis")
<matplotlib.collections.PathCollection at 0x7f9b914902d0>
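As an alternative to shading the regions with imshow, we could draw the decision boundary itself as the contour where the network output crosses 0.5. A sketch (note that xpt and ypt were reused above for the scatter coordinates, so we rebuild the grid):

# overlay the p = 0.5 decision boundary as a contour line
xg, yg = np.meshgrid(np.linspace(xmin, xmax, M), np.linspace(ymin, ymax, N))
ax.contour(xg, yg, res.T, levels=[0.5], colors="red")
fig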