Clustering#
Clustering seeks to group data into clusters based on their properties and then allow us to predict which cluster a new member belongs to.
import numpy as np
import matplotlib.pyplot as plt
We’ll use a dataset generator that is part of scikit-learn called make_moons. This generates data that falls into 2 different sets with a shape that looks like half-moons.
from sklearn import datasets
def generate_data(n_samples=200, noise=0.2):
    """Generate the two-moons toy dataset.

    Parameters
    ----------
    n_samples : int
        Number of points to generate (default 200, matching the
        original hard-coded value, so existing no-argument calls
        behave identically).
    noise : float
        Standard deviation of the Gaussian noise added to each point.

    Returns
    -------
    x : ndarray of shape (n_samples, 2)
        The 2-D coordinates of the points.
    v : ndarray of shape (n_samples,)
        The cluster label (0 or 1) for each point.
    """
    # make_moons already returns NumPy arrays, so the previous
    # element-by-element copy into Python lists was unnecessary.
    x, v = datasets.make_moons(n_samples, noise=noise)
    return x, v
# Generate the training set: 200 two-dimensional points (x) and their
# 0/1 moon labels (v).
x, v = generate_data()
Let’s look at a point and its value
# Inspect one sample: its 2-D coordinates and its cluster label.
print(f"x = {x[0]}, value = {v[0]}")
x = [-0.61359357 0.91506266], value = 0
Now let’s plot the data
def plot_data(x, v):
    """Scatter-plot the 2-D points in x, colored by their labels v.

    Returns the matplotlib Figure so the caller can display or save it.
    """
    fig, ax = plt.subplots()
    xs = [pt[0] for pt in x]
    ys = [pt[1] for pt in x]
    ax.scatter(xs, ys, s=40, c=v, cmap="viridis")
    # Equal aspect so the moon shapes are not distorted.
    ax.set_aspect("equal")
    return fig
# Visualize the raw dataset before training.
fig = plot_data(x, v)
We want to partition this domain into 2 regions, such that when we come in with a new point, we know which group it belongs to.
First we setup and train our network
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Input
from keras.optimizers import RMSprop
# Build a small fully-connected binary classifier: 2 inputs -> 50 -> 20 -> 1.
# The final sigmoid squashes the output into [0, 1], matching the 0/1 labels.
model = Sequential([
    Input(shape=(2,)),
    Dense(50, activation="relu"),
    Dense(20, activation="relu"),
    Dense(1, activation="sigmoid"),
])

# RMSprop with default settings; binary cross-entropy is the natural
# loss for a single sigmoid output with 0/1 targets.
rms = RMSprop()
model.compile(loss='binary_crossentropy',
              optimizer=rms,
              metrics=['accuracy'])
model.summary()
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ dense (Dense) │ (None, 50) │ 150 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_1 (Dense) │ (None, 20) │ 1,020 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_2 (Dense) │ (None, 1) │ 21 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 1,191 (4.65 KB)
Trainable params: 1,191 (4.65 KB)
Non-trainable params: 0 (0.00 B)
We seem to need a lot of epochs here to get a good result
# The dataset is tiny (200 points), so 100 epochs are cheap; with
# batch_size=50 each epoch performs 4 gradient updates.
epochs = 100
results = model.fit(x, v, batch_size=50, epochs=epochs, verbose=2)
Epoch 1/100
4/4 - 0s - 5ms/step - accuracy: 0.5100 - loss: 0.6910
Epoch 2/100
4/4 - 0s - 5ms/step - accuracy: 0.5750 - loss: 0.6563
Epoch 3/100
4/4 - 0s - 5ms/step - accuracy: 0.7100 - loss: 0.6340
Epoch 4/100
4/4 - 0s - 5ms/step - accuracy: 0.7800 - loss: 0.6140
Epoch 5/100
4/4 - 0s - 5ms/step - accuracy: 0.8200 - loss: 0.5950
Epoch 6/100
4/4 - 0s - 5ms/step - accuracy: 0.8100 - loss: 0.5776
Epoch 7/100
4/4 - 0s - 5ms/step - accuracy: 0.8250 - loss: 0.5611
Epoch 8/100
4/4 - 0s - 5ms/step - accuracy: 0.8250 - loss: 0.5449
Epoch 9/100
4/4 - 0s - 5ms/step - accuracy: 0.8350 - loss: 0.5291
Epoch 10/100
4/4 - 0s - 5ms/step - accuracy: 0.8300 - loss: 0.5137
Epoch 11/100
4/4 - 0s - 5ms/step - accuracy: 0.8350 - loss: 0.4990
Epoch 12/100
4/4 - 0s - 5ms/step - accuracy: 0.8350 - loss: 0.4852
Epoch 13/100
4/4 - 0s - 5ms/step - accuracy: 0.8350 - loss: 0.4722
Epoch 14/100
4/4 - 0s - 5ms/step - accuracy: 0.8350 - loss: 0.4600
Epoch 15/100
4/4 - 0s - 19ms/step - accuracy: 0.8350 - loss: 0.4489
Epoch 16/100
4/4 - 0s - 5ms/step - accuracy: 0.8400 - loss: 0.4378
Epoch 17/100
4/4 - 0s - 5ms/step - accuracy: 0.8400 - loss: 0.4274
Epoch 18/100
4/4 - 0s - 5ms/step - accuracy: 0.8450 - loss: 0.4183
Epoch 19/100
4/4 - 0s - 5ms/step - accuracy: 0.8400 - loss: 0.4087
Epoch 20/100
4/4 - 0s - 5ms/step - accuracy: 0.8450 - loss: 0.4006
Epoch 21/100
4/4 - 0s - 10ms/step - accuracy: 0.8450 - loss: 0.3928
Epoch 22/100
4/4 - 0s - 5ms/step - accuracy: 0.8600 - loss: 0.3856
Epoch 23/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3786
Epoch 24/100
4/4 - 0s - 6ms/step - accuracy: 0.8600 - loss: 0.3728
Epoch 25/100
4/4 - 0s - 6ms/step - accuracy: 0.8600 - loss: 0.3674
Epoch 26/100
4/4 - 0s - 6ms/step - accuracy: 0.8600 - loss: 0.3615
Epoch 27/100
4/4 - 0s - 19ms/step - accuracy: 0.8650 - loss: 0.3572
Epoch 28/100
4/4 - 0s - 5ms/step - accuracy: 0.8700 - loss: 0.3524
Epoch 29/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.3474
Epoch 30/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3436
Epoch 31/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3395
Epoch 32/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3363
Epoch 33/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3326
Epoch 34/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3294
Epoch 35/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3266
Epoch 36/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3245
Epoch 37/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3223
Epoch 38/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3199
Epoch 39/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3189
Epoch 40/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3160
Epoch 41/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.3151
Epoch 42/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3141
Epoch 43/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.3111
Epoch 44/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.3102
Epoch 45/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3082
Epoch 46/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3065
Epoch 47/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.3057
Epoch 48/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.3041
Epoch 49/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.3020
Epoch 50/100
4/4 - 0s - 5ms/step - accuracy: 0.8700 - loss: 0.3028
Epoch 51/100
4/4 - 0s - 5ms/step - accuracy: 0.8700 - loss: 0.3001
Epoch 52/100
4/4 - 0s - 5ms/step - accuracy: 0.8700 - loss: 0.2989
Epoch 53/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.2980
Epoch 54/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.2956
Epoch 55/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.2956
Epoch 56/100
4/4 - 0s - 27ms/step - accuracy: 0.8700 - loss: 0.2933
Epoch 57/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.2922
Epoch 58/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.2925
Epoch 59/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.2900
Epoch 60/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.2893
Epoch 61/100
4/4 - 0s - 5ms/step - accuracy: 0.8750 - loss: 0.2875
Epoch 62/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2872
Epoch 63/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2849
Epoch 64/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2833
Epoch 65/100
4/4 - 0s - 12ms/step - accuracy: 0.8850 - loss: 0.2853
Epoch 66/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2823
Epoch 67/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2805
Epoch 68/100
4/4 - 0s - 7ms/step - accuracy: 0.8800 - loss: 0.2796
Epoch 69/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2779
Epoch 70/100
4/4 - 0s - 6ms/step - accuracy: 0.8850 - loss: 0.2784
Epoch 71/100
4/4 - 0s - 6ms/step - accuracy: 0.8850 - loss: 0.2758
Epoch 72/100
4/4 - 0s - 19ms/step - accuracy: 0.8800 - loss: 0.2740
Epoch 73/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2736
Epoch 74/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2722
Epoch 75/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2717
Epoch 76/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2700
Epoch 77/100
4/4 - 0s - 5ms/step - accuracy: 0.8850 - loss: 0.2677
Epoch 78/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2681
Epoch 79/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2657
Epoch 80/100
4/4 - 0s - 5ms/step - accuracy: 0.8900 - loss: 0.2652
Epoch 81/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2636
Epoch 82/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2626
Epoch 83/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2607
Epoch 84/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2595
Epoch 85/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2598
Epoch 86/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2581
Epoch 87/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2568
Epoch 88/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2548
Epoch 89/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2543
Epoch 90/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2523
Epoch 91/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2518
Epoch 92/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2524
Epoch 93/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2485
Epoch 94/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2486
Epoch 95/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2468
Epoch 96/100
4/4 - 0s - 5ms/step - accuracy: 0.8950 - loss: 0.2469
Epoch 97/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2450
Epoch 98/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2429
Epoch 99/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2418
Epoch 100/100
4/4 - 0s - 32ms/step - accuracy: 0.9000 - loss: 0.2407
# Evaluate on the training set; evaluate() returns [loss, accuracy]
# because the model was compiled with metrics=['accuracy'].
score = model.evaluate(x, v, verbose=0)
print("score =", score[0])
print("accuracy =", score[1])
score = 0.23850904405117035
accuracy = 0.8999999761581421
Let’s look at a prediction. We need to feed in a single point as an array of shape (N, 2), where N is the number of points
# Predict a single point; the network expects a batch of shape (N, 2),
# so the one point is wrapped in an outer list.
res = model.predict(np.array([[-2, 2]]))
# Notebook echo of the prediction: a (1, 1) float array in [0, 1].
res
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step
array([[5.108066e-08]], dtype=float32)
We see that we get a floating point number. We will need to convert this to 0 or 1 by rounding.
Let’s plot the partitioning
# Resolution and extent of the grid of points we will classify in order
# to visualize the network's decision regions.
M, N = 128, 128
xmin, xmax = -1.75, 2.5
ymin, ymax = -1.25, 1.75

xpt = np.linspace(xmin, xmax, M)
ypt = np.linspace(ymin, ymax, N)
To make the prediction go faster, we want to feed in a vector of these points, of the form:
[[xpt[0], ypt[0]],
[xpt[1], ypt[1]],
...
]
We can see that this packs them into the vector
# Pack the grid into an (M*N, 2) array of (x, y) pairs so the network
# can classify all grid points in one batched call.
pairs = np.array(np.meshgrid(xpt, ypt)).T.reshape(-1, 2)
# Notebook echo: the first pair is the lower-left corner of the domain.
pairs[0]
array([-1.75, -1.25])
Now we do the prediction. We will get a vector out, which we reshape to match the original domain.
# Classify every grid point in one batched call, then lay the flat
# result vector back out on the (M, N) grid.
res = model.predict(pairs, verbose=0)
res = res.reshape(M, N)

# Threshold the sigmoid outputs into hard 0/1 cluster assignments.
domain = np.where(res > 0.5, 1, 0)
and we can plot the data
# Overlay the predicted decision regions (faint imshow background) with
# the original labeled points.
fig, ax = plt.subplots()
ax.imshow(domain.T, origin="lower", alpha=0.25,
          extent=[xmin, xmax, ymin, ymax])
xpt = [pt[0] for pt in x]
ypt = [pt[1] for pt in x]
ax.scatter(xpt, ypt, s=40, c=v, cmap="viridis")
<matplotlib.collections.PathCollection at 0x7f3916647890>