Clustering

Clustering seeks to group data into clusters based on their properties, and then allows us to predict which cluster a new point belongs to.

import numpy as np
import matplotlib.pyplot as plt

We’ll use a dataset generator that is part of scikit-learn called make_moons. This generates data that falls into 2 different sets with a shape that looks like half-moons.

from sklearn import datasets
def generate_data():
    xvec, val = datasets.make_moons(200, noise=0.2)

    # repack the samples and labels as numpy arrays
    x = []
    v = []
    for xv, vv in zip(xvec, val):
        x.append(np.array(xv))
        v.append(vv)

    return np.array(x), np.array(v)
x, v = generate_data()

Let’s look at a point and its value

print(f"x = {x[0]}, value = {v[0]}")
x = [ 0.58972538 -0.4428606 ], value = 1

Now let’s plot the data

def plot_data(x, v):
    xpt = [q[0] for q in x]
    ypt = [q[1] for q in x]

    fig, ax = plt.subplots()
    ax.scatter(xpt, ypt, s=40, c=v, cmap="viridis")
    ax.set_aspect("equal")
    return fig
fig = plot_data(x, v)
(figure: scatter plot of the two half-moon clusters, colored by value)

We want to partition this domain into 2 regions, such that when we come in with a new point, we know which group it belongs to.

First we set up and train our network

from keras.models import Sequential
from keras.layers import Dense, Input
from keras.optimizers import RMSprop
model = Sequential()
model.add(Input(shape=(2,)))
model.add(Dense(50, activation="relu"))
model.add(Dense(20, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
rms = RMSprop()
model.compile(loss='binary_crossentropy',
              optimizer=rms, metrics=['accuracy'])
model.summary()
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ dense (Dense)                   │ (None, 50)             │           150 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_1 (Dense)                 │ (None, 20)             │         1,020 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_2 (Dense)                 │ (None, 1)              │            21 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 1,191 (4.65 KB)
 Trainable params: 1,191 (4.65 KB)
 Non-trainable params: 0 (0.00 B)

We seem to need a lot of epochs here to get a good result

epochs = 100
results = model.fit(x, v, batch_size=50, epochs=epochs, verbose=2)
Epoch 1/100
4/4 - 0s - 6ms/step - accuracy: 0.5250 - loss: 0.6910
Epoch 2/100
4/4 - 0s - 5ms/step - accuracy: 0.6350 - loss: 0.6584
Epoch 3/100
4/4 - 0s - 5ms/step - accuracy: 0.7050 - loss: 0.6369
Epoch 4/100
4/4 - 0s - 5ms/step - accuracy: 0.7200 - loss: 0.6174
Epoch 5/100
4/4 - 0s - 5ms/step - accuracy: 0.7300 - loss: 0.5998
Epoch 6/100
4/4 - 0s - 5ms/step - accuracy: 0.7550 - loss: 0.5824
Epoch 7/100
4/4 - 0s - 5ms/step - accuracy: 0.7700 - loss: 0.5658
Epoch 8/100
4/4 - 0s - 5ms/step - accuracy: 0.7850 - loss: 0.5482
Epoch 9/100
4/4 - 0s - 5ms/step - accuracy: 0.8000 - loss: 0.5316
Epoch 10/100
4/4 - 0s - 5ms/step - accuracy: 0.8000 - loss: 0.5157
Epoch 11/100
4/4 - 0s - 5ms/step - accuracy: 0.8000 - loss: 0.4999
Epoch 12/100
4/4 - 0s - 6ms/step - accuracy: 0.8000 - loss: 0.4841
Epoch 13/100
4/4 - 0s - 6ms/step - accuracy: 0.8150 - loss: 0.4692
Epoch 14/100
4/4 - 0s - 21ms/step - accuracy: 0.8200 - loss: 0.4544
Epoch 15/100
4/4 - 0s - 5ms/step - accuracy: 0.8250 - loss: 0.4408
Epoch 16/100
4/4 - 0s - 5ms/step - accuracy: 0.8300 - loss: 0.4285
Epoch 17/100
4/4 - 0s - 5ms/step - accuracy: 0.8250 - loss: 0.4175
Epoch 18/100
4/4 - 0s - 5ms/step - accuracy: 0.8300 - loss: 0.4063
Epoch 19/100
4/4 - 0s - 5ms/step - accuracy: 0.8300 - loss: 0.3964
Epoch 20/100
4/4 - 0s - 5ms/step - accuracy: 0.8350 - loss: 0.3879
Epoch 21/100
4/4 - 0s - 9ms/step - accuracy: 0.8250 - loss: 0.3791
Epoch 22/100
4/4 - 0s - 5ms/step - accuracy: 0.8450 - loss: 0.3712
Epoch 23/100
4/4 - 0s - 6ms/step - accuracy: 0.8400 - loss: 0.3638
Epoch 24/100
4/4 - 0s - 6ms/step - accuracy: 0.8400 - loss: 0.3572
Epoch 25/100
4/4 - 0s - 7ms/step - accuracy: 0.8500 - loss: 0.3508
Epoch 26/100
4/4 - 0s - 7ms/step - accuracy: 0.8450 - loss: 0.3453
Epoch 27/100
4/4 - 0s - 21ms/step - accuracy: 0.8600 - loss: 0.3391
Epoch 28/100
4/4 - 0s - 5ms/step - accuracy: 0.8600 - loss: 0.3351
Epoch 29/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3294
Epoch 30/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3250
Epoch 31/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3216
Epoch 32/100
4/4 - 0s - 5ms/step - accuracy: 0.8650 - loss: 0.3176
Epoch 33/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3144
Epoch 34/100
4/4 - 0s - 5ms/step - accuracy: 0.8650 - loss: 0.3109
Epoch 35/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3088
Epoch 36/100
4/4 - 0s - 5ms/step - accuracy: 0.8600 - loss: 0.3051
Epoch 37/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3022
Epoch 38/100
4/4 - 0s - 5ms/step - accuracy: 0.8700 - loss: 0.2999
Epoch 39/100
4/4 - 0s - 5ms/step - accuracy: 0.8700 - loss: 0.2973
Epoch 40/100
4/4 - 0s - 6ms/step - accuracy: 0.8700 - loss: 0.2956
Epoch 41/100
4/4 - 0s - 6ms/step - accuracy: 0.8700 - loss: 0.2923
Epoch 42/100
4/4 - 0s - 5ms/step - accuracy: 0.8700 - loss: 0.2900
Epoch 43/100
4/4 - 0s - 5ms/step - accuracy: 0.8650 - loss: 0.2891
Epoch 44/100
4/4 - 0s - 5ms/step - accuracy: 0.8650 - loss: 0.2869
Epoch 45/100
4/4 - 0s - 6ms/step - accuracy: 0.8700 - loss: 0.2848
Epoch 46/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.2827
Epoch 47/100
4/4 - 0s - 6ms/step - accuracy: 0.8700 - loss: 0.2811
Epoch 48/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.2785
Epoch 49/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.2777
Epoch 50/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2758
Epoch 51/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2742
Epoch 52/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2732
Epoch 53/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2714
Epoch 54/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2690
Epoch 55/100
4/4 - 0s - 28ms/step - accuracy: 0.8850 - loss: 0.2698
Epoch 56/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2663
Epoch 57/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2659
Epoch 58/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2678
Epoch 59/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2633
Epoch 60/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2615
Epoch 61/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2620
Epoch 62/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2616
Epoch 63/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2597
Epoch 64/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2567
Epoch 65/100
4/4 - 0s - 10ms/step - accuracy: 0.8950 - loss: 0.2565
Epoch 66/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2551
Epoch 67/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2561
Epoch 68/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2532
Epoch 69/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2528
Epoch 70/100
4/4 - 0s - 7ms/step - accuracy: 0.8900 - loss: 0.2547
Epoch 71/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2524
Epoch 72/100
4/4 - 0s - 21ms/step - accuracy: 0.8900 - loss: 0.2490
Epoch 73/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2492
Epoch 74/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2476
Epoch 75/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2463
Epoch 76/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2468
Epoch 77/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2461
Epoch 78/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2438
Epoch 79/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2431
Epoch 80/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2422
Epoch 81/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2399
Epoch 82/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2406
Epoch 83/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2384
Epoch 84/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2398
Epoch 85/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2360
Epoch 86/100
4/4 - 0s - 6ms/step - accuracy: 0.8850 - loss: 0.2366
Epoch 87/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2343
Epoch 88/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2326
Epoch 89/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2317
Epoch 90/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2319
Epoch 91/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2310
Epoch 92/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2293
Epoch 93/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2281
Epoch 94/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2278
Epoch 95/100
4/4 - 0s - 5ms/step - accuracy: 0.9100 - loss: 0.2255
Epoch 96/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2252
Epoch 97/100
4/4 - 0s - 5ms/step - accuracy: 0.9100 - loss: 0.2229
Epoch 98/100
4/4 - 0s - 5ms/step - accuracy: 0.9100 - loss: 0.2226
Epoch 99/100
4/4 - 0s - 5ms/step - accuracy: 0.9100 - loss: 0.2218
Epoch 100/100
4/4 - 0s - 5ms/step - accuracy: 0.9100 - loss: 0.2214
score = model.evaluate(x, v, verbose=0)
print(f"score = {score[0]}")
print(f"accuracy = {score[1]}")
score = 0.2182127833366394
accuracy = 0.9049999713897705
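
Since we needed so many epochs, it is worth checking how the training progressed. model.fit returns a History object whose history dictionary records the loss and accuracy at each epoch; a small sketch of plotting them:

fig, ax = plt.subplots()
ax.plot(results.history["loss"], label="loss")
ax.plot(results.history["accuracy"], label="accuracy")
ax.set_xlabel("epoch")
ax.legend()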

Let’s look at a prediction. Even a single point needs to be fed in as an array of shape (N, 2), where N is the number of points (here N = 1)

res = model.predict(np.array([[-2, 2]]))
res
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step
array([[2.0691933e-09]], dtype=float32)

We see that we get a floating point number (the output of the sigmoid). We will need to convert this to 0 or 1 by rounding.
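
For example, one way to turn this prediction into a label (a quick sketch):

label = int(np.round(res[0, 0]))
print(f"predicted value: {label}")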

Let’s plot the partitioning

M = 128
N = 128

xmin = -1.75
xmax = 2.5
ymin = -1.25
ymax = 1.75

xpt = np.linspace(xmin, xmax, M)
ypt = np.linspace(ymin, ymax, N)

To make the prediction go faster, we want to feed in a vector of these points, of the form:

[[xpt[0], ypt[0]],
 [xpt[1], ypt[1]],
 ...
]

We can check that this packs the points into the desired form

pairs = np.array(np.meshgrid(xpt, ypt)).T.reshape(-1, 2)
pairs[0]
array([-1.75, -1.25])
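
The transpose and reshape order the points so that x varies slowest: pairs[i*N + j] is the point (xpt[i], ypt[j]). A quick sanity check (indices chosen arbitrarily):

i, j = 10, 3
assert np.allclose(pairs[i*N + j], [xpt[i], ypt[j]])

This ordering is why we reshape the predictions to (M, N) below and transpose before plotting.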

Now we do the prediction. We will get a vector out, which we reshape to match the original domain.

res = model.predict(pairs, verbose=0)
res.shape = (M, N)

Finally, round to 0 or 1

domain = np.where(res > 0.5, 1, 0)

and we can plot the data

fig, ax = plt.subplots()
ax.imshow(domain.T, origin="lower",
          extent=[xmin, xmax, ymin, ymax], alpha=0.25)
xpt = [q[0] for q in x]
ypt = [q[1] for q in x]

ax.scatter(xpt, ypt, s=40, c=v, cmap="viridis")
<matplotlib.collections.PathCollection at 0x7f5d43641310>
(figure: the network’s partitioning of the domain, shown as a shaded background with the data points overlaid)
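
Finally, a small sketch of using the trained model on a few new points (chosen arbitrarily) to see which group they are assigned to:

new_points = np.array([[0.0, 1.0], [1.0, -0.5]])
preds = model.predict(new_points, verbose=0)
for p, pred in zip(new_points, preds):
    print(f"point {p} -> group {int(pred[0] > 0.5)}")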