function [idxbest, Cbest, sumDbest, Dbest] = kmeans(X, k, varargin)
%KMEANS K-means clustering.
% IDX = KMEANS(X, K) partitions the points in the N-by-P data matrix X
% into K clusters. This partition minimizes the sum, over all clusters, of
% the within-cluster sums of point-to-cluster-centroid distances. Rows of X
% correspond to points, columns correspond to variables. Note: when X is a
% vector, KMEANS treats it as an N-by-1 data matrix, regardless of its
% orientation. KMEANS returns an N-by-1 vector IDX containing the cluster
% indices of each point. By default, KMEANS uses squared Euclidean
% distances.
%
% KMEANS treats NaNs as missing data, and ignores any rows of X that
% contain NaNs.
%
% [IDX, C] = KMEANS(X, K) returns the K cluster centroid locations in
% the K-by-P matrix C.
%
% [IDX, C, SUMD] = KMEANS(X, K) returns the within-cluster sums of
% point-to-centroid distances in the 1-by-K vector sumD.
%
% [IDX, C, SUMD, D] = KMEANS(X, K) returns distances from each point
% to every centroid in the N-by-K matrix D.
%
% [ ... ] = KMEANS(..., 'PARAM1',val1, 'PARAM2',val2, ...) specifies
% optional parameter name/value pairs to control the iterative algorithm
% used by KMEANS. Parameters are:
%
% 'Distance' - Distance measure, in P-dimensional space, that KMEANS
% should minimize with respect to. Choices are:
% 'sqeuclidean' - Squared Euclidean distance (the default)
% 'cityblock' - Sum of absolute differences, a.k.a. L1 distance
% 'cosine' - One minus the cosine of the included angle
% between points (treated as vectors)
% 'correlation' - One minus the sample correlation between points
% (treated as sequences of values)
% 'hamming' - Percentage of bits that differ (only suitable
% for binary data)
%
% 'Start' - Method used to choose initial cluster centroid positions,
% sometimes known as "seeds". Choices are:
% 'plus' - The default. Select K observations from X according
% to the k-means++ algorithm: the first cluster center
% is chosen uniformly at random from X, after which
% each subsequent cluster center is chosen randomly
% from the remaining data points with probability
% proportional to each point's distance from its
% closest existing cluster center.
% 'sample' - Select K observations from X at random.
% 'uniform' - Select K points uniformly at random from the range
% of X. Not valid for Hamming distance.
% 'cluster' - Perform a preliminary clustering phase on a random
% 10% subsample of X. This preliminary phase is itself
% initialized using 'sample'.
% matrix - A K-by-P matrix of starting locations. In this case,
% you can pass in [] for K, and KMEANS infers K from
% the first dimension of the matrix. You can also
% supply a 3D array, implying a value for 'Replicates'
% from the array's third dimension.
%
% 'Replicates' - Number of times to repeat the clustering, each with a
% new set of initial centroids. A positive integer, default is 1.
%
% 'EmptyAction' - Action to take if a cluster loses all of its member
% observations. Choices are:
% 'singleton' - Create a new cluster consisting of the one
% observation furthest from its centroid (the
% default).
% 'error' - Treat an empty cluster as an error.
% 'drop' - Remove any clusters that become empty, and set
% the corresponding values in C and D to NaN.
%
% 'Options' - Options for the iterative algorithm used to minimize the
% fitting criterion, as created by STATSET. Choices of STATSET
% parameters are:
%
% 'Display' - Level of display output. Choices are 'off' (the
% default), 'iter', and 'final'.
% 'MaxIter' - Maximum number of iterations allowed. Default is 100.
%
% 'UseParallel' - If true and if a parpool of the Parallel Computing
% Toolbox is open, compute in parallel. If the
% Parallel Computing Toolbox is not installed, or a
% parpool is not open, computation occurs in serial
% mode. Default is false, meaning serial
% computation.
% 'UseSubstreams' - Set to true to compute in parallel in a reproducible
% fashion. Default is false. To compute reproducibly,
% set Streams to a type allowing substreams:
% 'mlfg6331_64' or 'mrg32k3a'.
% 'Streams' - These fields specify whether to perform clustering
% from multiple 'Start' values in parallel, and how
% to use random numbers when generating the starting
% points. For information on these fields see
% PARALLELSTATS.
% NOTE: If 'UseParallel' is TRUE and 'UseSubstreams' is FALSE,
% then the length of 'Streams' must equal the number of workers
% used by KMEANS. If a parallel pool is already open, this
% will be the size of the parallel pool. If a parallel pool
% is not already open, then MATLAB may try to open a pool for
% you (depending on your installation and preferences).
% To ensure more predictable results, it is best to use
% the PARPOOL command and explicitly create a parallel pool
% prior to invoking KMEANS with 'UseParallel' set to TRUE.
%
% 'OnlinePhase' - Flag indicating whether KMEANS should perform an "on-line
% update" phase in addition to a "batch update" phase. The on-line phase
% can be time consuming for large data sets, but guarantees a solution
% that is a local minimum of the distance criterion, i.e., a partition of
% the data where moving any single point to a different cluster increases
% the total sum of distances. 'on' (the default) or 'off'.
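%
% For reference, with row vectors x (a point) and c (a centroid), the
% distance measures above are:
% 'sqeuclidean' : sum((x - c).^2)
% 'cityblock' : sum(abs(x - c))
% 'cosine' : 1 - (x*c')/(norm(x)*norm(c))
% 'correlation' : 1 - sample correlation between x and c
% 'hamming' : fraction of coordinates in which x and c differ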
%
% Example:
%
% X = [randn(20,2)+ones(20,2); randn(20,2)-ones(20,2)];
% opts = statset('Display','final');
% [cidx, ctrs] = kmeans(X, 2, 'Distance','city', ...
% 'Replicates',5, 'Options',opts);
% plot(X(cidx==1,1),X(cidx==1,2),'r.', ...
% X(cidx==2,1),X(cidx==2,2),'b.', ctrs(:,1),ctrs(:,2),'kx');
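%
% A further illustrative call (a sketch only; C0 is an arbitrary name):
% explicit starting centroids can be supplied via 'Start', in which
% case K may be passed as [] and is inferred from the matrix:
%
% C0 = [1 1; -1 -1]; % one row per desired centroid
% cidx2 = kmeans(X, [], 'Start', C0);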
%
% See also LINKAGE, CLUSTERDATA, SILHOUETTE.

% KMEANS uses a two-phase iterative algorithm to minimize the sum of
% point-to-centroid distances, summed over all K clusters. The first phase
% uses what the literature often describes as "batch" updates, where each
% iteration consists of reassigning points to their nearest cluster
% centroid, all at once, followed by recalculation of cluster centroids.
% This phase occasionally (especially for small data sets) does not converge
% to a solution that is a local minimum, i.e., a partition of the data where
% moving any single point to a different cluster increases the total sum of
% distances. Thus, the batch phase can be thought of as providing a fast but
% potentially only approximate solution as a starting point for the second
% phase. The second phase uses what the literature often describes as
% "on-line" updates, where points are individually reassigned if doing so
% will reduce the sum of distances, and cluster centroids are recomputed
% after each reassignment. Each iteration during this second phase consists
% of one pass through all the points. The on-line phase will converge to a
% local minimum, although there may be other local minima with lower total
% sum of distances. The problem of finding the global minimum can only be
% solved in general by an exhaustive (or clever, or lucky) choice of
% starting points, but using several replicates with random starting points
% typically results in a solution that is a global minimum.
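%
% In outline, one batch iteration amounts to the following (an
% illustrative sketch for squared Euclidean distance only, not the
% actual implementation below):
%
%    for j = 1:k
%        D(:,j) = sum(bsxfun(@minus, X, C(j,:)).^2, 2);  % point-to-centroid distances
%    end
%    [~, idx] = min(D, [], 2);           % reassign all points at once
%    for j = 1:k
%        C(j,:) = mean(X(idx==j,:), 1);  % then recompute each centroid
%    end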
%
% References:
%
% [1] Seber, G.A.F. (1984) Multivariate Observations, Wiley, New York.
% [2] Spath, H. (1985) Cluster Dissection and Analysis: Theory, FORTRAN
% Programs, Examples, translated by J. Goldschmidt, Halsted Press,
% New York.

% Copyright 1993-2014 The MathWorks, Inc.

if nargin < 2
error(message('stats:kmeans:TooFewInputs'));
end
wasnan = any(isnan(X),2);
hadNaNs = any(wasnan);
if hadNaNs
warning(message('stats:kmeans:MissingDataRemoved'));
X = X(~wasnan,:);
end
% n points in p dimensional space
[n, p] = size(X);
pnames = { 'distance' 'start' 'replicates' 'emptyaction' 'onlinephase' 'options' 'maxiter' 'display'};
dflts = {'sqeuclidean' 'plus' [] 'singleton' 'on' [] [] []};
[distance,start,reps,emptyact,online,options,maxit,display] ...
= internal.stats.parseArgs(pnames, dflts, varargin{:});
distNames = {'sqeuclidean','cityblock','cosine','correlation','hamming'};
distance = internal.stats.getParamVal(distance,distNames,'''Distance''');
switch distance
case 'cosine'
Xnorm = sqrt(sum(X.^2, 2));
if any(min(Xnorm) <= eps(max(Xnorm)))
error(message('stats:kmeans:ZeroDataForCos'));
end
X = X ./ Xnorm(:,ones(1,p));
case 'correlation'
X = bsxfun(@minus, X, mean(X,2));
Xnorm = sqrt(sum(X.^2, 2));
if any(min(Xnorm) <= eps(max(Xnorm)))
error(message('stats:kmeans:ConstantDataForCorr'));
end
X = X ./ Xnorm(:,ones(1,p));
case 'hamming'
if ~all(ismember(X(:),[0 1]))
error(message('stats:kmeans:NonbinaryDataForHamm'));
end
end
if ischar(start)
startNames = {'uniform','sample','cluster','plus','kmeans++'};
j = find(strncmpi(start,startNames,length(start)));
if length(j) > 1
error(message('stats:kmeans:AmbiguousStart', start));
elseif isempty(j)
error(message('stats:kmeans:UnknownStart', start));
elseif isempty(k)
error(message('stats:kmeans:MissingK'));
end
start = startNames{j};
if strcmp(start, 'uniform')
if strcmp(distance, 'hamming')
error(message('stats:kmeans:UniformStartForHamm'));
end
Xmins = min(X,[],1);
Xmaxs = max(X,[],1);
end
elseif isnumeric(start)
CC = start;
start = 'numeric';
if isempty(k)
k = size(CC,1);
elseif k ~= size(CC,1)
error(message('stats:kmeans:StartBadRowSize'));
elseif size(CC,2) ~= p
error(message('stats:kmeans:StartBadColumnSize'));
end
if isempty(reps)
reps = size(CC,3);
elseif reps ~= size(CC,3)
error(message('stats:kmeans:StartBadThirdDimSize'));
end
% Need to center explicit starting points for 'correlation'. (Re)normalization
% for 'cosine'/'correlation' is done at each iteration.
if isequal(distance, 'correlation')
CC = bsxfun(@minus, CC, mean(CC,2));
end
else
error(message('stats:kmeans:InvalidStart'));
end
emptyactNames = {'error','drop','singleton'};
emptyact = internal.stats.getParamVal(emptyact,emptyactNames,'''EmptyAction''');
[~,online] = internal.stats.getParamVal(online,{'on','off'},'''OnlinePhase''');
online = (online==1);
% 'maxiter' and 'display' are grandfathered as separate param name/value pairs
if ~isempty(display)
options = statset(options,'Display',display);
end
if ~isempty(maxit)
options = statset(options,'MaxIter',maxit);
end
options = statset(statset('kmeans'), options);
display = find(strncmpi(options.Display, {'off','notify','final','iter'},...
length(options.Display))) - 1;
maxit = options.MaxIter;
if ~(isscalar(k) && isnumeric(k) && isreal(k) && k > 0 && (round(k)==k))
error(message('stats:kmeans:InvalidK'));
% elseif k == 1
% this special case works automatically
elseif n < k
error(message('stats:kmeans:TooManyClusters'));
end
% Assume one replicate
if isempty(reps)
reps = 1;
elseif ~internal.stats.isScalarInt(reps,1)
error(message('stats:kmeans:BadReps'));
end
emptyErrCnt = 0;
[useParallel, RNGscheme, poolsz] = ...
internal.stats.parallel.processParallelAndStreamOptions(options,true);
usePool = useParallel && poolsz>0;
% Define the function that will perform one iteration of the
% loop inside smartForReduce
loopbody = @loopBody;
% Initialize nested variables so they will not appear to be functions here
totsumD = 0;
iter = 0;
% Prepare for in-progress display output
if display > 1 % 'iter' or 'final'
if usePool
% If we are running on a parallel pool, each worker will generate
% a separate periodic report. Before starting the loop, we
% seed the parallel pool so that each worker will have an
% identifying label (e.g., index) for its report.
internal.stats.parallel.distributeToPool( ...
'workerID', num2cell(1:poolsz) );
% Periodic reports behave differently in parallel than they do
% in serial computation (which is the baseline).
% We advise the user of the difference.
if display == 3 % 'iter' only
warning(message('stats:kmeans:displayParallel2'));
fprintf(' worker\t iter\t phase\t num\t sum\n' );
end
else
if useParallel
warning(message('stats:kmeans:displayParallel'));
end
if display == 3 % 'iter' only
fprintf(' iter\t phase\t num\t sum\n');
end
end
end
% Perform KMEANS replicates on separate workers.
ClusterBest = internal.stats.parallel.smartForReduce(...
reps, loopbody, useParallel, RNGscheme, 'argmin');
% Extract the best solution
idxbest = ClusterBest{5};
Cbest = ClusterBest{6};
sumDbest = ClusterBest{3};
totsumDbest = ClusterBest{1};
if nargout > 3
Dbest = ClusterBest{7};
end
if display > 1 % 'final' or 'iter'
fprintf('%s\n',getString(message('stats:kmeans:FinalSumOfDistances',sprintf('%g',totsumDbest))));
end
if hadNaNs
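% Re-insert NaNs so IDX has one entry per original row of X; rows that
% were removed as missing get a NaN cluster index.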
idxbest = statinsertnan(wasnan, idxbest);
end
function cellout = loopBody(rep,S)
if isempty(S)
S = RandStream.getGlobalStream;
end
if display > 1 % 'iter' or 'final'
if usePool
dispfmt = '%8d\t%6d\t%6d\t%8d\t%12g\n';
labindx = internal.stats.parallel.workerGetValue('workerID');
else
dispfmt = '%6d\t%6d\t%8d\t%12g\n';
end
end
cellout = cell(7,1); % cellout{1} = total sum of distances
% cellout{2} = replicate number
% cellout{3} = sum of distance for each cluster
% cellout{4} = iteration
% cellout{5} = idx;
% cellout{6} = Center
% cellout{7} = Distance
% Initialize the total sum of distances to Inf. This is used in the
% reduce operation if the update fails due to an empty cluster.
cellout{1} = Inf;
cellout{2} = rep;
switch start
case 'uniform'
C = Xmins(ones(k,1),:) + rand(S,[k,p]).*(Xmaxs(ones(k,1),:)-Xmins(ones(k,1),:));
% For 'cosine' and 'correlation', these are uniform inside a subset
% of the unit hypersphere. Still need to center them for
% 'correlation'. (Re)normalization for 'cosine'/'correlation' is
% done at each iteration.
if isequal(distance, 'correlation')
C = bsxfun(@minus, C, mean(C,2));
end
if isa(X,'single')
C = single(C);
end
case 'sample'
C = X(randsample(S,n,k),:);
case 'cluster'
Xsubset = X(randsample(S,n,floor(.1*n)),:);
% Turn display off for the initialization
optIndex = find(strcmpi('options',varargin));
if isempty(optIndex)
opts = statset('Display','off');
varargin = [varargin,'options',opts];
else
varargin{optIndex+1}.Display = 'off';
end
[~, C] = kmeans(Xsubset, k, varargin{:}, 'start','sample', 'replicates',1);
case 'numeric'
C = CC(:,:,rep);
case {'plus','kmeans++'}
% Select the first seed by sampling uniformly at random
index = zeros(k,1);
[C(1,:), index(1)] = datasample(S,X,1);
% Select the rest of the seeds by a probabilistic model
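% Each remaining point is weighted by its distance (under the chosen
% metric) to the nearest seed selected so far; with squared Euclidean
% distance this is the standard D^2 weighting of k-means++.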
for ii = 2:k
sampleProbability = min(distfun(X,C(1:ii-1,:),distance),[],2);
denominator = sum(sampleProbability);
if denominator==0 || denominator==Inf
C(ii:k,:) = datasample(S,X,k-ii+1,1,'Replace',false);
break;
end
sampleProbability = sampleProbability/denominator;
[C(ii,:), index(ii)] = datasample(S,X,1,1,'Replace',false,...
'Weights',sampleProbability);
end
end
if ~isfloat(C) % X may be logical
C = double(C);
end
% Compute the distance from every point to each cluster centroid and the
% initial assignment of points to clusters
D = distfun(X, C, distance, 0, rep, reps);
[d, idx] = min(D, [], 2);
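% m(j) counts the points currently assigned to cluster j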
m = accumarray(idx,1,[k,1]);
try % catch empty cluster errors and move on to next rep
% Begin phase one: batch reassignments
converged = batchUpdate();
% Begin phase two: single reassignments
if online
converged = onlineUpdate();
end
if display == 2 % 'final'
fprintf('%s\n',getString(message('stats:kmeans:IterationsSumOfDistances',rep,iter,sprintf('%g',totsumD) )));
end
if ~converged
if reps==1
warning(message('stats:kmeans:FailedToConverge', maxit));
else
warning(message('stats:kmeans:FailedToConvergeRep', maxit, rep));
end
end
% Calculate cluster-wise sums of distances
nonempties = find(m>0);
D(:,nonempties) = distfun(X, C(nonempties,:), distance, iter, rep, reps);
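% Linear indexing: (idx-1)*n + (1:n)' picks D(i,idx(i)), i.e., each
% point's distance to its own assigned centroid.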
d = D((idx-1)*n + (1:n)');
sumD = accumarray(idx,d,[k,1]);
totsumD = sum(sumD(nonempties));
% Save the best solution so far
cellout = {totsumD,rep,sumD,iter,idx,C,D}';
% If an empty cluster error occurred in one of multiple replicates, catch
% it, warn, and move on to the next replicate. Error only when all
% replicates fail. Rethrow any other kind of error.
catch ME
if reps == 1 || (~isequal(ME.identifier,'stats:kmeans:EmptyCluster') && ...
~isequal(ME.identifier,'stats:kmeans:EmptyClusterRep'))
rethrow(ME);
else
emptyErrCnt = emptyErrCnt + 1;
warning(message('stats:kmeans:EmptyClusterInBatchUpdate', rep, iter));
if emptyErrCnt == reps
error(message('stats:kmeans:EmptyClusterAllReps'));
end
end
end % catch
%------------------------------------------------------------------
function converged = batchUpdate()
% Every point moved, every cluster will need an update
moved = 1:n;
changed = 1:k;
previdx = zeros(n,1);
prevtotsumD = Inf;
%
% Begin phase one: batch reassignments
%
iter = 0;
converged = false;
while true
iter = iter + 1;
% Calculate the new cluster centroids and counts, and update the
% distance from every point to those new cluster centroids
[C(changed,:), m(changed)] = gcentroids(X, idx, changed, distance);
D(:,changed) = distfun(X, C(changed,:), distance, iter, rep, reps);
% Deal with clusters that have just lost all their members
empties = changed(m(changed) == 0);
if ~isempty(empties)
if strcmp(emptyact,'error')
if reps==1
error(message('stats:kmeans:EmptyCluster', iter));
else
error(message('stats:kmeans:EmptyClusterRep', iter, rep));
end
end
switch emptyact
case 'drop'
if reps==1
warning(message('stats:kmeans:EmptyCluster', iter));
else
warning(message('stats:kmeans:EmptyClusterRep', iter, rep));
end
% Remove the empty cluster from any further processing
D(:,empties) = NaN;
changed = changed(m(changed) > 0);
case 'singleton'
for i = empties
d = D((idx-1)*n + (1:n)'); % use newly updated distances
% Find the point furthest away from its current cluster.
% Take that point out of its cluster and use it to create
% a new singleton cluster to replace the empty one.
[~, lonely] = max(d);
from = idx(lonely); % taking from this cluster
if m(from) < 2
% In the very unusual event that the cluster had only
% one member, pick any other non-singleton point.
from = find(m>1,1,'first');
lonely = find(idx==from,1,'first');
end
C(i,:) = X(lonely,:);
m(i) = 1;
idx(lonely) = i;
D(:,i) = distfun(X, C(i,:), distance, iter, rep, reps);
% Update clusters from which points are taken
[C(from,:), m(from)] = gcentroids(X, idx, from, distance);
D(:,from) = distfun(X, C(from,:), distance, iter, rep, reps);
changed = unique([changed from]);
end
end
end
% Compute the total sum of distances for the current configuration.
totsumD = sum(D((idx-1)*n + (1:n)'));
% Test for a cycle: if objective is not decreased, back out
% the last step and move on to the single update phase
if prevtotsumD <= totsumD
idx = previdx;
[C(changed,:), m(changed)] = gcentroids(X, idx, changed, distance);
iter = iter - 1;
break;
end
if display > 2 % 'iter'
if usePool
fprintf(dispfmt,labindx,iter,1,length(moved),totsumD);
else
fprintf(dispfmt,iter,1,length(moved),totsumD);
end
end
if iter >= maxit
break;
end
% Determine closest cluster for each point and reassign points to clusters
previdx = idx;
prevtotsumD = totsumD;
[d, nidx] = min(D, [], 2);
% Determine which points moved
moved = find(nidx ~= previdx);
if ~isempty(moved)
% Resolve ties in favor of not moving
moved = moved(D((previdx(moved)-1)*n + moved) > d(moved));
end
if isempty(moved)
converged = true;
break;
end
idx(moved) = nidx(moved);
% Find clusters that gained or lost members
changed = unique([idx(moved); previdx(moved)])';
end % phase one
end % nested function
%------------------------------------------------------------------
function converged = onlineUpdate()
% Initialize some cluster information prior to phase two
switch distance
case 'cityblock'
Xmid = zeros([k,p,2]);
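% For each cluster and coordinate, Xmid stores the two sorted values
% bracketing the cluster median; the phase-two cityblock criterion
% uses them to evaluate candidate moves without re-sorting.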
for i = 1:k
if m(i) > 0
% Separate out sorted coords for points in i'th cluster,
% and save values above and below median, component-wise
Xsorted = sort(X(idx==i,:),1);
nn = floor(.5*m(i));
if mod(m(i),2) == 0
Xmid(i,:,1:2) = Xsorted([nn, nn+1],:)';
elseif m(i) > 1
Xmid(i,:,1:2) = Xsorted([nn, nn+2],:)';
else
Xmid(i,:,1:2) = Xsorted([1, 1],:)';
end
end
end
case 'hamming'
Xsum = zeros(k,p);
for i = 1:k
if m(i) > 0
% Sum coords for points in i'th cluster, component-wise
Xsum(i,:) = sum(X(idx==i,:), 1);
end
end
end
%
% Begin phase two: single reassignments
%
changed = find(m' > 0);
lastmoved = 0;
nummoved = 0;
iter1 = iter;
converged = false;
Del = NaN(n,k); % reassignment criterion
while iter < maxit
% Calculate distances to each cluster from each point, and the
% potential change in total sum of errors for adding or removing
% each point from each cluster. Clusters that have not changed
% membership need not be updated.
%
% Singleton clusters are a special case for the sum of dists
% calculation. Removing their only point is never best, so the
% reassignment criterion had better guarantee that a singleton
% point will stay in its own cluster. Happily, we get
% Del(i,idx(i)) == 0 automatically for them.
switch distance
case 'sqeuclidean'
for i = changed
mbrs = (idx == i);
sgn = 1 - 2*mbrs; % -1 for members, 1 for nonmembers
if m(i) == 1
sgn(mbrs) = 0; % prevent divide-by-zero for singleton mbrs
end
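% The factor m/(m+sgn) converts the squared distance to centroid i
% into the change in cluster i's within-cluster sum if this point
% joined (nonmembers, sgn = 1) or left (members, sgn = -1) the
% cluster, accounting for the resulting shift of the centroid.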
Del(:,i) = (m(i) ./ (m(i) + sgn)) .* sum((bsxfun(@minus, X, C(i,:))).^2, 2);
end
case 'cityblock'
for i = changed
if mod(m(i),2) == 0 % this will never catch singleton clusters
ldist = bsxfun(@minus, Xmid(i,:,1), X);
rdist = bsxfun(@minus, X, Xmid(i,:,2));
mbrs = (idx == i);
sgn = repmat(1-2*mbrs, 1, p); % -1 for members, 1 for nonmembers
Del(:,i) = sum(max(0, max(sgn.*rdist, sgn.*ldist)), 2);
else
Del(:,i) = sum(abs(bsxfun(@minus, X, C(i,:))), 2);
end
end
case {'cosine','correlation'}
% The points are normalized, centroids are not, so normalize them
normC = sqrt(sum(C.^2, 2));
if any(normC < eps(class(normC))) % small relative to unit-length data points
if reps==1
error(message('stats:kmeans:ZeroCentroid', iter));
else
error(message('stats:kmeans:ZeroCentroidRep', iter, rep));
end
end
% This can be done without a loop, but the loop saves memory allocations
for i = changed
XCi = X * C(i,:)';
mbrs = (idx == i);
sgn = 1 - 2*mbrs; % -1 for members, 1 for nonmembers
Del(:,i) = 1 + sgn .*...
(m(i).*normC(i) - sqrt((m(i).*normC(i)).^2 + 2.*sgn.*m(i).*XCi + 1));
end
case 'hamming'
for i = changed
if mod(m(i),2) == 0 % this will never catch singleton clusters
% coords with an unequal number of 0s and 1s have a
% different contribution than coords with an equal
% number
unequal01 = find(2*Xsum(i,:) ~= m(i));
numequal01 = p - length(unequal01);
mbrs = (idx == i);
Di = abs(bsxfun(@minus,X(:,unequal01), C(i,unequal01)));
Del(:,i) = (sum(Di, 2) + mbrs*numequal01) / p;
else
Del(:,i) = sum(abs(bsxfun(@minus,X,C(i,:))), 2) / p;
end
end
end
% Determine best possible move, if any, for each point. Next we
% will pick one from those that actually did move.
previdx = idx;
prevtotsumD = totsumD;
[minDel, nidx] = min(Del, [], 2);
moved = find(previdx ~= nidx);
if ~isempty(moved)
% Resolve ties in favor of not moving
moved = moved(Del((previdx(moved)-1)*n + moved) > minDel(moved));
end
if isempty(moved)
% Count an iteration if phase 2 did nothing at all, or if we're
% in the middle of a pass through all the points
if (iter == iter1) || nummoved > 0
iter = iter + 1;
if display > 2 % 'iter'
if usePool
fprintf(dispfmt,labindx,iter,2,length(moved),totsumD);
else
fprintf(dispfmt,iter,2,length(moved),totsumD);
end
end
end
converged = true;
break;
end
% Pick the next move in cyclic order
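% Among the candidate moves, the mod arithmetic selects the first
% point index strictly after lastmoved in cyclic (wrap-around) order.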
moved = mod(min(mod(moved - lastmoved - 1, n) + lastmoved), n) + 1;
% If we've gone once through all the points, that's an iteration
if moved <= lastmoved
iter = iter + 1;
if display > 2 % 'iter'
if usePool
fprintf(dispfmt,labindx,iter,2,length(moved),totsumD);
else
fprintf(dispfmt,iter,2,length(moved),totsumD);
end
end
if iter >= maxit, break; end
nummoved = 0;
end
nummoved = nummoved + 1;
lastmoved = moved;
oidx = idx(moved);
nidx = nidx(moved);
totsumD = totsumD + Del(moved,nidx) - Del(moved,oidx);
% Update the cluster index vector, and the old and new cluster
% counts and centroids
idx(moved) = nidx;
m(nidx) = m(nidx) + 1;
m(oidx) = m(oidx) - 1;
switch distance
case 'sqeuclidean'
C(nidx,:) = C(nidx,:) + (X(moved,:) - C(nidx,:)) / m(nidx);
C(oidx,:) = C(oidx,:) - (X(moved,:) - C(oidx,:)) / m(oidx);
case 'cityblock'
for i = [oidx nidx]
% Separate out sorted coords for points in each cluster.
% New centroid is the coord median, save values above and
% below median. All done component-wise.
Xsorted = sort(X(idx==i,:),1);
nn = floor(.5*m(i));
if mod(m(i),2) == 0
C(i,:) = .5 * (Xsorted(nn,:) + Xsorted(nn+1,:));
Xmid(i,:,1:2) = Xsorted([nn, nn+1],:)';
else
C(i,:) = Xsorted(nn+1,:);
if m(i) > 1
Xmid(i,:,1:2) = Xsorted([nn, nn+2],:)';
else
Xmid(i,:,1:2) = Xsorted([1, 1],:)';
end
end
end
case {'cosine','correlation'}
C(nidx,:) = C(nidx,:) + (X(moved,:) - C(nidx,:)) / m(nidx);
C(oidx,:) = C(oidx,:) - (X(moved,:) - C(oidx,:)) / m(oidx);
case 'hamming'
% Update summed coords for points in each cluster. New
% centroid is the coord median. All done component-wise.
Xsum(nidx,:) = Xsum(nidx,:) + X(moved,:);
Xsum(oidx,:) = Xsum(oidx,:) - X(moved,:);
C(nidx,:) = 2*Xsum(nidx,:) > m(nidx);
C(oidx,:) = 2*Xsum(oidx,:) > m(oidx);
end
changed = sort([oidx nidx]);
end % phase two
end % nested function
end
end % main function
%------------------------------------------------------------------
function D = distfun(X, C, dist, iter, rep, reps)
%DISTFUN Calculate point to cluster centroid distances.
[n,p] = size(X);
D = zeros(n,size(C,1));
nclusts = size(C,1);
switch dist
case 'sqeuclidean'
for i = 1:nclusts
D(:,i) = (X(:,1) - C(i,1)).^2;
for j = 2:p
D(:,i) = D(:,i) + (X(:,j) - C(i,j)).^2;
end
% D(:,i) = sum((X - C(repmat(i,n,1),:)).^2, 2);
end
case 'cityblock'
for i = 1:nclusts
D(:,i) = abs(X(:,1) - C(i,1));
for j = 2:p
D(:,i) = D(:,i) + abs(X(:,j) - C(i,j));
end
% D(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2);
end
case {'cosine','correlation'}
% The points are normalized, centroids are not, so normalize them
normC = sqrt(sum(C.^2, 2));
if any(normC < eps(class(normC))) % small relative to unit-length data points
if reps==1
error(message('stats:kmeans:ZeroCentroid', iter));
else
error(message('stats:kmeans:ZeroCentroidRep', iter, rep));
end
end
for i = 1:nclusts
D(:,i) = max(1 - X * (C(i,:)./normC(i))', 0);
end
case 'hamming'
for i = 1:nclusts
D(:,i) = abs(X(:,1) - C(i,1));
for j = 2:p
D(:,i) = D(:,i) + abs(X(:,j) - C(i,j));
end
D(:,i) = D(:,i) / p;
% D(:,i) = sum(abs(X - C(repmat(i,n,1),:)), 2) / p;
end
end
end % function
%------------------------------------------------------------------
function [centroids, counts] = gcentroids(X, index, clusts, dist)
%GCENTROIDS Centroids and counts stratified by group.
p = size(X,2);
num = length(clusts);
centroids = NaN(num,p);
counts = zeros(num,1);
for i = 1:num
members = (index == clusts(i));
if any(members)
counts(i) = sum(members);
switch dist
case 'sqeuclidean'
centroids(i,:) = sum(X(members,:),1) / counts(i);
case 'cityblock'
% Separate out sorted coords for points in i'th cluster,
% and use to compute a fast median, component-wise
Xsorted = sort(X(members,:),1);
nn = floor(.5*counts(i));
if mod(counts(i),2) == 0
centroids(i,:) = .5 * (Xsorted(nn,:) + Xsorted(nn+1,:));
else
centroids(i,:) = Xsorted(nn+1,:);
end
case {'cosine','correlation'}
centroids(i,:) = sum(X(members,:),1) / counts(i); % unnormalized
case 'hamming'
% Compute a fast median for binary data, component-wise
centroids(i,:) = 2*sum(X(members,:), 1) > counts(i);
end
end
end
end % function