# Clustering
Clustering seeks to group data into clusters based on their properties and then allow us to predict which cluster a new member belongs to.
import numpy as np
import matplotlib.pyplot as plt
We’ll use a dataset generator that is part of scikit-learn called make_moons. This generates data that falls into 2 different sets with a shape that looks like half-moons.
from sklearn import datasets
def generate_data(n_samples=200, noise=0.2):
    """Generate the two-moons toy dataset.

    Parameters
    ----------
    n_samples : int, optional
        Total number of points to generate (default 200, matching the
        original hard-coded value).
    noise : float, optional
        Standard deviation of the Gaussian noise added to the points.

    Returns
    -------
    x : ndarray of shape (n_samples, 2)
        The 2-D coordinates of each point.
    v : ndarray of shape (n_samples,)
        The 0/1 half-moon label of each point.
    """
    # make_moons already returns NumPy arrays, so no per-element
    # repacking into Python lists is needed.
    x, v = datasets.make_moons(n_samples, noise=noise)
    return x, v
# Generate the dataset: x holds the (200, 2) point coordinates and
# v the corresponding 0/1 cluster labels.
x, v = generate_data()
Let’s look at a point and its value
# Inspect the first sample and its label.
print(f"x = {x[0]}, value = {v[0]}")
x = [-0.20538457 1.11292842], value = 0
Now let’s plot the data
def plot_data(x, v):
    """Scatter-plot the 2-D points in *x*, colored by their labels *v*.

    Returns the matplotlib Figure so the caller can display or save it.
    """
    fig, ax = plt.subplots()
    xs = [pt[0] for pt in x]
    ys = [pt[1] for pt in x]
    ax.scatter(xs, ys, s=40, c=v, cmap="viridis")
    ax.set_aspect("equal")
    return fig
# Draw the raw data so we can see the two half-moon clusters.
fig = plot_data(x, v)
We want to partition this domain into 2 regions, such that when we come in with a new point, we know which group it belongs to.
First we set up and train our network
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Input
from keras.optimizers import RMSprop
# Build a small fully-connected binary classifier: 2 inputs -> 50 -> 20 -> 1.
# The final sigmoid squashes the output into (0, 1), read as the
# probability of belonging to cluster 1.
model = Sequential()
model.add(Input(shape=(2,)))
model.add(Dense(50, activation="relu"))
model.add(Dense(20, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
# RMSprop with default hyperparameters; binary cross-entropy is the
# standard loss for a single sigmoid output with 0/1 targets.
rms = RMSprop()
model.compile(loss='binary_crossentropy',
optimizer=rms, metrics=['accuracy'])
model.summary()
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ Layer (type) ┃ Output Shape ┃ Param # ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ dense (Dense) │ (None, 50) │ 150 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_1 (Dense) │ (None, 20) │ 1,020 │ ├─────────────────────────────────┼────────────────────────┼───────────────┤ │ dense_2 (Dense) │ (None, 1) │ 21 │ └─────────────────────────────────┴────────────────────────┴───────────────┘
Total params: 1,191 (4.65 KB)
Trainable params: 1,191 (4.65 KB)
Non-trainable params: 0 (0.00 B)
We seem to need a lot of epochs here to get a good result
# Train on the full dataset (no validation split here); with 200 points
# and batch_size=50 each epoch is 4 gradient steps, so many epochs are
# needed for a good fit.
epochs = 100
results = model.fit(x, v, batch_size=50, epochs=epochs, verbose=2)
Epoch 1/100
4/4 - 0s - 6ms/step - accuracy: 0.3650 - loss: 0.7016
Epoch 2/100
4/4 - 0s - 6ms/step - accuracy: 0.7000 - loss: 0.6670
Epoch 3/100
4/4 - 0s - 6ms/step - accuracy: 0.7300 - loss: 0.6440
Epoch 4/100
4/4 - 0s - 6ms/step - accuracy: 0.7450 - loss: 0.6256
Epoch 5/100
4/4 - 0s - 6ms/step - accuracy: 0.7550 - loss: 0.6094
Epoch 6/100
4/4 - 0s - 6ms/step - accuracy: 0.7550 - loss: 0.5939
Epoch 7/100
4/4 - 0s - 6ms/step - accuracy: 0.7550 - loss: 0.5783
Epoch 8/100
4/4 - 0s - 6ms/step - accuracy: 0.7600 - loss: 0.5624
Epoch 9/100
4/4 - 0s - 6ms/step - accuracy: 0.7850 - loss: 0.5459
Epoch 10/100
4/4 - 0s - 6ms/step - accuracy: 0.7950 - loss: 0.5284
Epoch 11/100
4/4 - 0s - 6ms/step - accuracy: 0.8050 - loss: 0.5115
Epoch 12/100
4/4 - 0s - 6ms/step - accuracy: 0.8000 - loss: 0.4954
Epoch 13/100
4/4 - 0s - 6ms/step - accuracy: 0.8050 - loss: 0.4806
Epoch 14/100
4/4 - 0s - 6ms/step - accuracy: 0.8200 - loss: 0.4660
Epoch 15/100
4/4 - 0s - 31ms/step - accuracy: 0.8200 - loss: 0.4515
Epoch 16/100
4/4 - 0s - 6ms/step - accuracy: 0.8300 - loss: 0.4384
Epoch 17/100
4/4 - 0s - 5ms/step - accuracy: 0.8400 - loss: 0.4260
Epoch 18/100
4/4 - 0s - 5ms/step - accuracy: 0.8400 - loss: 0.4134
Epoch 19/100
4/4 - 0s - 5ms/step - accuracy: 0.8500 - loss: 0.4014
Epoch 20/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3913
Epoch 21/100
4/4 - 0s - 11ms/step - accuracy: 0.8600 - loss: 0.3803
Epoch 22/100
4/4 - 0s - 5ms/step - accuracy: 0.8600 - loss: 0.3707
Epoch 23/100
4/4 - 0s - 5ms/step - accuracy: 0.8600 - loss: 0.3612
Epoch 24/100
4/4 - 0s - 7ms/step - accuracy: 0.8550 - loss: 0.3524
Epoch 25/100
4/4 - 0s - 7ms/step - accuracy: 0.8550 - loss: 0.3440
Epoch 26/100
4/4 - 0s - 7ms/step - accuracy: 0.8550 - loss: 0.3369
Epoch 27/100
4/4 - 0s - 23ms/step - accuracy: 0.8650 - loss: 0.3287
Epoch 28/100
4/4 - 0s - 5ms/step - accuracy: 0.8600 - loss: 0.3225
Epoch 29/100
4/4 - 0s - 5ms/step - accuracy: 0.8550 - loss: 0.3167
Epoch 30/100
4/4 - 0s - 5ms/step - accuracy: 0.8650 - loss: 0.3109
Epoch 31/100
4/4 - 0s - 6ms/step - accuracy: 0.8650 - loss: 0.3052
Epoch 32/100
4/4 - 0s - 5ms/step - accuracy: 0.8650 - loss: 0.3001
Epoch 33/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.2955
Epoch 34/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.2911
Epoch 35/100
4/4 - 0s - 5ms/step - accuracy: 0.8800 - loss: 0.2872
Epoch 36/100
4/4 - 0s - 6ms/step - accuracy: 0.8800 - loss: 0.2824
Epoch 37/100
4/4 - 0s - 6ms/step - accuracy: 0.8800 - loss: 0.2785
Epoch 38/100
4/4 - 0s - 6ms/step - accuracy: 0.8800 - loss: 0.2747
Epoch 39/100
4/4 - 0s - 6ms/step - accuracy: 0.8850 - loss: 0.2728
Epoch 40/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2689
Epoch 41/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2656
Epoch 42/100
4/4 - 0s - 6ms/step - accuracy: 0.8900 - loss: 0.2643
Epoch 43/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2600
Epoch 44/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2579
Epoch 45/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2564
Epoch 46/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2538
Epoch 47/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2508
Epoch 48/100
4/4 - 0s - 6ms/step - accuracy: 0.8950 - loss: 0.2503
Epoch 49/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2488
Epoch 50/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2452
Epoch 51/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2466
Epoch 52/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2430
Epoch 53/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2412
Epoch 54/100
4/4 - 0s - 6ms/step - accuracy: 0.9000 - loss: 0.2396
Epoch 55/100
4/4 - 0s - 48ms/step - accuracy: 0.9000 - loss: 0.2377
Epoch 56/100
4/4 - 0s - 6ms/step - accuracy: 0.9050 - loss: 0.2375
Epoch 57/100
4/4 - 0s - 5ms/step - accuracy: 0.9050 - loss: 0.2352
Epoch 58/100
4/4 - 0s - 5ms/step - accuracy: 0.9000 - loss: 0.2335
Epoch 59/100
4/4 - 0s - 5ms/step - accuracy: 0.9100 - loss: 0.2346
Epoch 60/100
4/4 - 0s - 5ms/step - accuracy: 0.9050 - loss: 0.2313
Epoch 61/100
4/4 - 0s - 5ms/step - accuracy: 0.9050 - loss: 0.2302
Epoch 62/100
4/4 - 0s - 5ms/step - accuracy: 0.9050 - loss: 0.2288
Epoch 63/100
4/4 - 0s - 5ms/step - accuracy: 0.9100 - loss: 0.2279
Epoch 64/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2265
Epoch 65/100
4/4 - 0s - 19ms/step - accuracy: 0.9100 - loss: 0.2247
Epoch 66/100
4/4 - 0s - 5ms/step - accuracy: 0.9050 - loss: 0.2256
Epoch 67/100
4/4 - 0s - 5ms/step - accuracy: 0.9100 - loss: 0.2233
Epoch 68/100
4/4 - 0s - 8ms/step - accuracy: 0.9100 - loss: 0.2221
Epoch 69/100
4/4 - 0s - 7ms/step - accuracy: 0.9150 - loss: 0.2205
Epoch 70/100
4/4 - 0s - 7ms/step - accuracy: 0.9150 - loss: 0.2203
Epoch 71/100
4/4 - 0s - 6ms/step - accuracy: 0.9250 - loss: 0.2197
Epoch 72/100
4/4 - 0s - 23ms/step - accuracy: 0.9150 - loss: 0.2190
Epoch 73/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2167
Epoch 74/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2154
Epoch 75/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.2169
Epoch 76/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2148
Epoch 77/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2133
Epoch 78/100
4/4 - 0s - 5ms/step - accuracy: 0.9100 - loss: 0.2146
Epoch 79/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.2110
Epoch 80/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.2104
Epoch 81/100
4/4 - 0s - 6ms/step - accuracy: 0.9200 - loss: 0.2102
Epoch 82/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.2088
Epoch 83/100
4/4 - 0s - 6ms/step - accuracy: 0.9250 - loss: 0.2072
Epoch 84/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.2084
Epoch 85/100
4/4 - 0s - 6ms/step - accuracy: 0.9250 - loss: 0.2063
Epoch 86/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.2055
Epoch 87/100
4/4 - 0s - 5ms/step - accuracy: 0.9300 - loss: 0.2038
Epoch 88/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.2030
Epoch 89/100
4/4 - 0s - 5ms/step - accuracy: 0.9300 - loss: 0.2022
Epoch 90/100
4/4 - 0s - 5ms/step - accuracy: 0.9300 - loss: 0.2011
Epoch 91/100
4/4 - 0s - 5ms/step - accuracy: 0.9200 - loss: 0.2020
Epoch 92/100
4/4 - 0s - 5ms/step - accuracy: 0.9300 - loss: 0.1990
Epoch 93/100
4/4 - 0s - 5ms/step - accuracy: 0.9300 - loss: 0.1990
Epoch 94/100
4/4 - 0s - 5ms/step - accuracy: 0.9350 - loss: 0.1977
Epoch 95/100
4/4 - 0s - 5ms/step - accuracy: 0.9350 - loss: 0.1969
Epoch 96/100
4/4 - 0s - 5ms/step - accuracy: 0.9350 - loss: 0.1965
Epoch 97/100
4/4 - 0s - 6ms/step - accuracy: 0.9350 - loss: 0.1950
Epoch 98/100
4/4 - 0s - 5ms/step - accuracy: 0.9250 - loss: 0.1938
Epoch 99/100
4/4 - 0s - 5ms/step - accuracy: 0.9350 - loss: 0.1939
Epoch 100/100
4/4 - 0s - 55ms/step - accuracy: 0.9350 - loss: 0.1924
# Evaluate on the same data we trained on (no held-out set in this demo).
# score[0] is the loss; score[1] is the accuracy metric requested at
# compile time.
score = model.evaluate(x, v, verbose=0)
print(f"score = {score[0]}")
print(f"accuracy = {score[1]}")
score = 0.190408855676651
accuracy = 0.9399999976158142
Let’s look at a prediction. We need to feed in a single point as an array of shape (N, 2), where N is the number of points
# Predict the class probability for a single point; predict() expects a
# batch of shape (N, 2), hence the extra list nesting around the point.
res = model.predict(np.array([[-2, 2]]))
res
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 5ms/step
array([[2.4969563e-07]], dtype=float32)
We see that we get a floating point number. We will need to convert this to 0 or 1 by rounding.
Let’s plot the partitioning
# Resolution of the prediction grid: M samples in x, N samples in y.
M = 128
N = 128
# Plot window chosen to enclose the moons data with some margin.
xmin = -1.75
xmax = 2.5
ymin = -1.25
ymax = 1.75
xpt = np.linspace(xmin, xmax, M)
ypt = np.linspace(ymin, ymax, N)
To make the prediction go faster, we want to feed in a vector of these points, of the form:
[[xpt[0], ypt[0]],
[xpt[1], ypt[1]],
...
]
We can see that this packs them into the vector
# meshgrid (default 'xy' indexing) gives two (N, M) coordinate arrays;
# stacking them makes shape (2, N, M), .T transposes to (M, N, 2), and
# reshape flattens that into (M*N, 2) rows of (x, y) pairs.
pairs = np.array(np.meshgrid(xpt, ypt)).T.reshape(-1, 2)
pairs[0]
array([-1.75, -1.25])
Now we do the prediction. We will get a vector out, which we reshape to match the original domain.
# Predict all grid points in a single batch, then reshape the flat
# (M*N, 1) output in place back to the (M, N) grid layout.
res = model.predict(pairs, verbose=0)
res.shape = (M, N)
Finally, round to 0 or 1
# Threshold the sigmoid output at 0.5 to get a hard 0/1 class per cell.
domain = np.where(res > 0.5, 1, 0)
and we can plot the data
fig, ax = plt.subplots()
# imshow expects rows = y, so transpose the (M, N) domain array; the low
# alpha keeps the data points visible on top of the decision regions.
ax.imshow(domain.T, origin="lower",
extent=[xmin, xmax, ymin, ymax], alpha=0.25)
xpt = [q[0] for q in x]
ypt = [q[1] for q in x]
# Overlay the original data, colored by true label, to judge the fit.
ax.scatter(xpt, ypt, s=40, c=v, cmap="viridis")
<matplotlib.collections.PathCollection at 0x7fe649bff610>