forked from menwenjun/redis_source_annotation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcluster.c
6393 lines (5931 loc) · 280 KB
/
cluster.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* Redis Cluster implementation.
*
* Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "server.h"
#include "cluster.h"
#include "endianconv.h"
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <math.h>
/* A global reference to myself is handy to make code more clear.
 * Myself always points to server.cluster->myself, that is, the clusterNode
 * that represents this node. It stays NULL until the cluster state has been
 * loaded or created. */
clusterNode *myself = NULL;
/* Forward declarations of functions that are used in this file before
 * their definition appears. */
clusterNode *createClusterNode(char *nodename, int flags);
int clusterAddNode(clusterNode *node);
void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask);
void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask);
void clusterSendPing(clusterLink *link, int type);
void clusterSendFail(char *nodename);
void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request);
void clusterUpdateState(void);
int clusterNodeGetSlotBit(clusterNode *n, int slot);
sds clusterGenNodesDescription(int filter);
clusterNode *clusterLookupNode(char *name);
int clusterNodeAddSlave(clusterNode *master, clusterNode *slave);
int clusterAddSlot(clusterNode *n, int slot);
int clusterDelSlot(int slot);
int clusterDelNodeSlots(clusterNode *node);
int clusterNodeSetSlotBit(clusterNode *n, int slot);
void clusterSetMaster(clusterNode *n);
void clusterHandleSlaveFailover(void);
void clusterHandleSlaveMigration(int max_slaves);
int bitmapTestBit(unsigned char *bitmap, int pos);
void clusterDoBeforeSleep(int flags);
void clusterSendUpdate(clusterLink *link, clusterNode *node);
void resetManualFailover(void);
void clusterCloseAllSlots(void);
void clusterSetNodeAsMaster(clusterNode *n);
void clusterDelNode(clusterNode *delnode);
sds representClusterNodeFlags(sds ci, uint16_t flags);
uint64_t clusterGetMaxEpoch(void);
int clusterBumpConfigEpochWithoutConsensus(void);
/* -----------------------------------------------------------------------------
* Initialization
* -------------------------------------------------------------------------- */
/* Load the cluster config from 'filename'.
*
* If the file does not exist or is zero-length (this may happen because
* when we lock the nodes.conf file, we create a zero-length one for the
* sake of locking if it does not already exist), C_ERR is returned.
* If the configuration was loaded from the file, C_OK is returned. */
// 从filename载入集群的配置
// 如果文件不存在或者文件大小为0(文件被锁住)返回C_ERR,如果成功载入配置文件,则范返回C_OK
int clusterLoadConfig(char *filename) {
FILE *fp = fopen(filename,"r");
struct stat sb;
char *line;
int maxline, j;
// 判断文件是否存在
if (fp == NULL) {
if (errno == ENOENT) {
return C_ERR;
} else {
serverLog(LL_WARNING,
"Loading the cluster node config from %s: %s",
filename, strerror(errno));
exit(1);
}
}
/* Check if the file is zero-length: if so return C_ERR to signal
* we have to write the config. */
// 判断文件是否为空
if (fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
fclose(fp);
return C_ERR;
}
/* Parse the file. Note that single lines of the cluster config file can
* be really long as they include all the hash slots of the node.
* This means in the worst possible case, half of the Redis slots will be
* present in a single line, possibly in importing or migrating state, so
* together with the node ID of the sender/receiver.
*
* To simplify we allocate 1024+CLUSTER_SLOTS*128 bytes per line. */
// 解析文件。集群配置文件可能会非常长,因为他会在每一行记录该节点的哈希槽,在最坏情况下,一半的哈希槽将会被记录在一行中,并且附带有导入导出状态,所以为每行分配 1024+CLUSTER_SLOTS*128 字节空间
maxline = 1024+CLUSTER_SLOTS*128;
line = zmalloc(maxline);
// 每次从文件读一行
while(fgets(line,maxline,fp) != NULL) {
int argc;
sds *argv;
clusterNode *n, *master;
char *p, *s;
/* Skip blank lines, they can be created either by users manually
* editing nodes.conf or by the config writing process if stopped
* before the truncate() call. */
// 跳过空行
if (line[0] == '\n' || line[0] == '\0') continue;
/* Split the line into arguments for processing. */
// 将读入的一行,分隔开
argv = sdssplitargs(line,&argc);
if (argv == NULL) goto fmterr;
/* Handle the special "vars" line. Don't pretend it is the last
* line even if it actually is when generated by Redis. */
// 处理 vars 变量,例如:vars currentEpoch 5 lastVoteEpoch 0
if (strcasecmp(argv[0],"vars") == 0) {
for (j = 1; j < argc; j += 2) {
// currentEpoch选项
if (strcasecmp(argv[j],"currentEpoch") == 0) {
server.cluster->currentEpoch =
strtoull(argv[j+1],NULL,10);
// lastVoteEpoch选项
} else if (strcasecmp(argv[j],"lastVoteEpoch") == 0) {
server.cluster->lastVoteEpoch =
strtoull(argv[j+1],NULL,10);
} else {
serverLog(LL_WARNING,
"Skipping unknown cluster config variable '%s'",
argv[j]);
}
}
sdsfreesplitres(argv,argc);
continue;
}
/* Regular config lines have at least eight fields */
// 主节点:66478bda726ae6ba4e8fb55034d8e5e5804223ff 127.0.0.1:6381 master - 0 1496130037660 2 connected 10923-16383
// 从节点:6fb7dfdb6188a9fe53c48ea32d541724f36434e9 127.0.0.1:6383 slave 8f285670923d4f1c599ecc93367c95a30fb8bf34 0 1496130040668 4 connected
// 29978c0169ecc0a9054de7f4142155c1ab70258b 127.0.0.1:6379 myself,master - 0 0 1 connected 0-5461
// 参数最少8个
if (argc < 8) goto fmterr;
/* Create this node if it does not exist */
// 根据runid查找对应节点
n = clusterLookupNode(argv[0]);
// 如果不存在,则根据runid创建节点
if (!n) {
n = createClusterNode(argv[0],0);
// 加入到集群中
clusterAddNode(n);
}
/* Address and port */
// 解析 ip:port
if ((p = strrchr(argv[1],':')) == NULL) goto fmterr;
*p = '\0';
memcpy(n->ip,argv[1],strlen(argv[1])+1);
n->port = atoi(p+1);
/* Parse flags */
p = s = argv[2];
while(p) {
p = strchr(s,',');
if (p) *p = '\0';
// 如果是myself,则设置指向自己的指针
if (!strcasecmp(s,"myself")) {
serverAssert(server.cluster->myself == NULL);
myself = server.cluster->myself = n;
n->flags |= CLUSTER_NODE_MYSELF;
// 如果是主节点master
} else if (!strcasecmp(s,"master")) {
n->flags |= CLUSTER_NODE_MASTER;
// 如果是从节点slave
} else if (!strcasecmp(s,"slave")) {
n->flags |= CLUSTER_NODE_SLAVE;
// 可能是一个下线的节点
} else if (!strcasecmp(s,"fail?")) {
n->flags |= CLUSTER_NODE_PFAIL;
// 一个下线的节点
} else if (!strcasecmp(s,"fail")) {
n->flags |= CLUSTER_NODE_FAIL;
n->fail_time = mstime();
// 等待向节点发送PING
} else if (!strcasecmp(s,"handshake")) {
n->flags |= CLUSTER_NODE_HANDSHAKE;
// 没有获取该节点的地址
} else if (!strcasecmp(s,"noaddr")) {
n->flags |= CLUSTER_NODE_NOADDR;
// 无标识
} else if (!strcasecmp(s,"noflags")) {
/* nothing to do */
} else {
serverPanic("Unknown flag in redis cluster config file");
}
if (p) s = p+1;
}
/* Get master if any. Set the master and populate master's
* slave list. */
// 如果有主节点的话,那么设置主节点
if (argv[3][0] != '-') {
// 先查找,如果存在则直接设置该从节点从属的主节点
master = clusterLookupNode(argv[3]);
// 如果不存在则创建一个新的
if (!master) {
master = createClusterNode(argv[3],0);
clusterAddNode(master);
}
n->slaveof = master;
// 将n加入到主节点master的从节点表中
clusterNodeAddSlave(master,n);
}
/* Set ping sent / pong received timestamps */
// 设置发送PING 和 接收到PING回复的时间
if (atoi(argv[4])) n->ping_sent = mstime();
if (atoi(argv[5])) n->pong_received = mstime();
/* Set configEpoch for this node. */
// 设置配置纪元
n->configEpoch = strtoull(argv[6],NULL,10);
/* Populate hash slots served by this instance. */
// 设置从节点的槽
for (j = 8; j < argc; j++) {
int start, stop;
// 处理导出和导入 槽
if (argv[j][0] == '[') {
/* Here we handle migrating / importing slots */
int slot;
char direction;
clusterNode *cn;
p = strchr(argv[j],'-');
serverAssert(p != NULL);
*p = '\0';
// 判断是导出还是导入
direction = p[1]; /* Either '>' or '<' */
// 槽
slot = atoi(argv[j]+1);
p += 3;
// 查找目标节点
cn = clusterLookupNode(p);
// 目标节点不存在,则创建
if (!cn) {
cn = createClusterNode(p,0);
clusterAddNode(cn);
}
// 根据方向,设置要导入和导出槽的目标
if (direction == '>') {
server.cluster->migrating_slots_to[slot] = cn;
} else {
server.cluster->importing_slots_from[slot] = cn;
}
continue;
// 没有导出和导入,这是一个区间
} else if ((p = strchr(argv[j],'-')) != NULL) {
*p = '\0';
// 设置开始的槽下标和停止的槽下标
start = atoi(argv[j]);
stop = atoi(p+1);
// 没有导入或导出,这是一个单槽
} else {
start = stop = atoi(argv[j]);
}
// 将槽载入到节点中
while(start <= stop) clusterAddSlot(n, start++);
}
sdsfreesplitres(argv,argc);
}
/* Config sanity check */
if (server.cluster->myself == NULL) goto fmterr;
zfree(line);
fclose(fp);
serverLog(LL_NOTICE,"Node configuration loaded, I'm %.40s", myself->name);
/* Something that should never happen: currentEpoch smaller than
* the max epoch found in the nodes configuration. However we handle this
* as some form of protection against manual editing of critical files. */
// 一些事从不应该发生:在集群配置中currentEpoch比最大的纪元小。但是要处理这种情况
if (clusterGetMaxEpoch() > server.cluster->currentEpoch) {
server.cluster->currentEpoch = clusterGetMaxEpoch();
}
return C_OK;
fmterr:
serverLog(LL_WARNING,
"Unrecoverable error: corrupted cluster config file.");
zfree(line);
if (fp) fclose(fp);
exit(1);
}
/* Cluster node configuration is exactly the same as CLUSTER NODES output.
 *
 * This function writes the node config and returns 0, on error -1
 * is returned. When 'do_fsync' is non-zero the file is also fsync()ed.
 *
 * Note: we need to write the file in an atomic way from the point of view
 * of the POSIX filesystem semantics, so that if the server is stopped
 * or crashes during the write, we'll end with either the old file or the
 * new one. Since we have the full payload to write available we can use
 * a single write to write the whole file. If the pre-existing file was
 * bigger we pad our payload with newlines that are anyway ignored and truncate
 * the file afterward. */
int clusterSaveConfig(int do_fsync) {
    int retval = -1;
    int fd;
    struct stat st;
    sds payload;
    size_t payload_len, total_len;

    /* The pending "save config" request is being served right now. */
    server.cluster->todo_before_sleep &= ~CLUSTER_TODO_SAVE_CONFIG;

    /* Get the nodes description and concatenate our "vars" directive to
     * save currentEpoch and lastVoteEpoch. */
    payload = clusterGenNodesDescription(CLUSTER_NODE_HANDSHAKE);
    payload = sdscatprintf(payload,"vars currentEpoch %llu lastVoteEpoch %llu\n",
        (unsigned long long) server.cluster->currentEpoch,
        (unsigned long long) server.cluster->lastVoteEpoch);
    payload_len = sdslen(payload);

    fd = open(server.cluster_configfile,O_WRONLY|O_CREAT,0644);
    if (fd == -1) goto cleanup;

    /* Pad the new payload with newlines if the existing file is longer,
     * so that one write overwrites all of the old content. */
    if (fstat(fd,&st) != -1 && st.st_size > (off_t)payload_len) {
        payload = sdsgrowzero(payload,st.st_size);
        memset(payload+payload_len,'\n',st.st_size-payload_len);
    }
    total_len = sdslen(payload);

    if (write(fd,payload,total_len) != (ssize_t)total_len) goto cleanup;

    if (do_fsync) {
        server.cluster->todo_before_sleep &= ~CLUSTER_TODO_FSYNC_CONFIG;
        fsync(fd);
    }

    /* Truncate the file if needed to remove the final \n padding that
     * is just garbage. */
    if (total_len != payload_len) {
        if (ftruncate(fd,payload_len) == -1) {
            /* ftruncate() failing is not a critical error. */
        }
    }
    retval = 0;

cleanup:
    if (fd != -1) close(fd);
    sdsfree(payload);
    return retval;
}
/* Save the cluster configuration via clusterSaveConfig(); if that fails,
 * log a warning and terminate the process with exit status 1. */
void clusterSaveConfigOrDie(int do_fsync) {
    if (clusterSaveConfig(do_fsync) != -1) return;

    serverLog(LL_WARNING,"Fatal: can't update cluster config file.");
    exit(1);
}
/* Lock the cluster config using flock(), and leak the file descriptor used to
 * acquire the lock so that the file will be locked forever.
 *
 * This works because we always update nodes.conf with a new version
 * in-place, reopening the file, and writing to it in place (later adjusting
 * the length with ftruncate()).
 *
 * On success C_OK is returned, otherwise an error is logged and
 * the function returns C_ERR to signal a lock was not acquired.
 * When __sun is defined no locking is attempted and C_OK is always
 * returned. */
int clusterLockConfig(char *filename) {
/* flock() does not exist on Solaris
 * and a fcntl-based solution won't help, as we constantly re-open that file,
 * which will release _all_ locks anyway
 */
#if !defined(__sun)
    /* To lock it, we need to open the file in a way it is created if
     * it does not exist, otherwise there is a race condition with other
     * processes. */
    int fd = open(filename,O_WRONLY|O_CREAT,0644);
    if (fd == -1) {
        serverLog(LL_WARNING,
            "Can't open %s in order to acquire a lock: %s",
            filename, strerror(errno));
        return C_ERR;
    }

    /* LOCK_EX asks for an exclusive lock; LOCK_NB makes flock() fail with
     * EWOULDBLOCK instead of blocking when the lock is already held. */
    if (flock(fd,LOCK_EX|LOCK_NB) == -1) {
        if (errno == EWOULDBLOCK) {
            serverLog(LL_WARNING,
                 "Sorry, the cluster configuration file %s is already used "
                 "by a different Redis Cluster node. Please make sure that "
                 "different nodes use different cluster configuration "
                 "files.", filename);
        } else {
            serverLog(LL_WARNING,
                "Impossible to lock %s: %s", filename, strerror(errno));
        }
        close(fd);
        return C_ERR;
    }
    /* Lock acquired: leak the 'fd' by not closing it, so that we'll retain the
     * lock to the file as long as the process exists. */
#else
    /* Nothing to lock on this platform: mark the parameter as used so the
     * build does not warn about it. */
    UNUSED(filename);
#endif /* __sun */

    return C_OK;
}
// 初始化集群状态
void clusterInit(void) {
int saveconf = 0;
// 初始化配置
server.cluster = zmalloc(sizeof(clusterState));
server.cluster->myself = NULL;
server.cluster->currentEpoch = 0;
server.cluster->state = CLUSTER_FAIL;
server.cluster->size = 1;
server.cluster->todo_before_sleep = 0;
server.cluster->nodes = dictCreate(&clusterNodesDictType,NULL);
server.cluster->nodes_black_list =
dictCreate(&clusterNodesBlackListDictType,NULL);
server.cluster->failover_auth_time = 0;
server.cluster->failover_auth_count = 0;
server.cluster->failover_auth_rank = 0;
server.cluster->failover_auth_epoch = 0;
server.cluster->cant_failover_reason = CLUSTER_CANT_FAILOVER_NONE;
server.cluster->lastVoteEpoch = 0;
server.cluster->stats_bus_messages_sent = 0;
server.cluster->stats_bus_messages_received = 0;
memset(server.cluster->slots,0, sizeof(server.cluster->slots));
clusterCloseAllSlots();
/* Lock the cluster config file to make sure every node uses
* its own nodes.conf. */
// 配置文件上锁
if (clusterLockConfig(server.cluster_configfile) == C_ERR)
exit(1);
/* Load or create a new nodes configuration. */
// 载入或创建一个新的节点配置文件
if (clusterLoadConfig(server.cluster_configfile) == C_ERR) {
/* No configuration found. We will just use the random name provided
* by the createClusterNode() function. */
// 没找到配置文件,随机创建一个集群节点
myself = server.cluster->myself =
createClusterNode(NULL,CLUSTER_NODE_MYSELF|CLUSTER_NODE_MASTER);
serverLog(LL_NOTICE,"No cluster configuration found, I'm %.40s",
myself->name);
// 添加到当前集群节点的配置中
clusterAddNode(myself);
saveconf = 1;
}
// 写配置文件
if (saveconf) clusterSaveConfigOrDie(1);
/* We need a listening TCP port for our cluster messaging needs. */
server.cfd_count = 0;
/* Port sanity check II
* The other handshake port check is triggered too late to stop
* us from trying to use a too-high cluster port number. */
// 检查端口号是否合法
if (server.port > (65535-CLUSTER_PORT_INCR)) {
serverLog(LL_WARNING, "Redis port number too high. "
"Cluster communication port is 10,000 port "
"numbers higher than your Redis port. "
"Your Redis port number must be "
"lower than 55535.");
exit(1);
}
// 将该集群节点的端口和fd绑定
if (listenToPort(server.port+CLUSTER_PORT_INCR,
server.cfd,&server.cfd_count) == C_ERR)
{
exit(1);
} else {
int j;
// 为所有集群的fd设置可读事件的处理函数clusterAcceptHandler
for (j = 0; j < server.cfd_count; j++) {
if (aeCreateFileEvent(server.el, server.cfd[j], AE_READABLE,
clusterAcceptHandler, NULL) == AE_ERR)
serverPanic("Unrecoverable error creating Redis Cluster "
"file event.");
}
}
/* The slots -> keys map is a sorted set. Init it. */
// 创建槽映射到键的有序集合
server.cluster->slots_to_keys = zslCreate();
/* Set myself->port to my listening port, we'll just need to discover
* the IP address via MEET messages. */
// 设置集群端口
myself->port = server.port;
// 没有正在进行手动的故障转移
server.cluster->mf_end = 0;
// 重置与手动故障转移的状态
resetManualFailover();
}
/* Reset a node performing a soft or hard reset ('hard' non-zero selects the
 * hard variant):
 *
 * 1) If the node is a slave, it turns into a master and its whole data set
 *    is flushed away.
 * 2) All the open slots are closed and the manual failover state is reset.
 * 3) All the assigned slots are released.
 * 4) All other nodes are forgotten.
 * 5) Only for hard reset: currentEpoch, lastVoteEpoch and configEpoch are
 *    set to 0.
 * 6) Only for hard reset: a new Node ID is generated.
 * 7) A config save (with fsync) and a cluster state update are scheduled
 *    via clusterDoBeforeSleep(). */
void clusterReset(int hard) {
    dictIterator *iter;
    dictEntry *entry;
    int slot;

    /* Turn into master. */
    if (nodeIsSlave(myself)) {
        clusterSetNodeAsMaster(myself);
        replicationUnsetMaster();
        emptyDb(NULL);
    }

    /* Close slots, reset manual failover state. */
    clusterCloseAllSlots();
    resetManualFailover();

    /* Unassign all the slots. */
    for (slot = 0; slot < CLUSTER_SLOTS; slot++)
        clusterDelSlot(slot);

    /* Forget all the nodes, but myself. A safe iterator is used because
     * entries are removed while walking the table. */
    iter = dictGetSafeIterator(server.cluster->nodes);
    while ((entry = dictNext(iter)) != NULL) {
        clusterNode *other = dictGetVal(entry);

        if (other != myself) clusterDelNode(other);
    }
    dictReleaseIterator(iter);

    /* Hard reset only: set epochs to 0, change node ID. */
    if (hard) {
        sds prev_id;

        server.cluster->currentEpoch = 0;
        server.cluster->lastVoteEpoch = 0;
        myself->configEpoch = 0;
        serverLog(LL_WARNING, "configEpoch set to 0 via CLUSTER RESET HARD");

        /* To change the Node ID we need to remove the old name from the
         * nodes table, change the ID, and re-add back with new name. */
        prev_id = sdsnewlen(myself->name, CLUSTER_NAMELEN);
        dictDelete(server.cluster->nodes,prev_id);
        sdsfree(prev_id);
        getRandomHexChars(myself->name, CLUSTER_NAMELEN);
        clusterAddNode(myself);
        serverLog(LL_NOTICE,"Node hard reset, now I'm %.40s", myself->name);
    }

    /* Make sure to persist the new config and update the state. */
    clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                         CLUSTER_TODO_UPDATE_STATE|
                         CLUSTER_TODO_FSYNC_CONFIG);
}
/* -----------------------------------------------------------------------------
* CLUSTER communication link
* -------------------------------------------------------------------------- */
/* Allocate and return a new link object associated with 'node' (which may
 * be NULL). The link starts with empty send/receive buffers, its creation
 * time set to now, and no socket attached (fd == -1). */
clusterLink *createClusterLink(clusterNode *node) {
    clusterLink *conn = zmalloc(sizeof(*conn));

    conn->fd = -1;
    conn->node = node;
    conn->ctime = mstime();
    conn->sndbuf = sdsempty();
    conn->rcvbuf = sdsempty();
    return conn;
}
/* Free a cluster link, but does not free the associated node of course.
 * This function will just make sure that the original node associated
 * with this link will have the 'link' field set to NULL.
 *
 * If the link owns a socket (fd != -1) its file events are removed from the
 * event loop and the socket is closed; a link that never got a socket is
 * released without calling close() on an invalid descriptor. */
void freeClusterLink(clusterLink *link) {
    if (link->fd != -1) {
        /* Stop watching the socket before closing it. */
        aeDeleteFileEvent(server.el, link->fd, AE_WRITABLE);
        aeDeleteFileEvent(server.el, link->fd, AE_READABLE);
        close(link->fd);
    }
    /* Release the I/O buffers. */
    sdsfree(link->sndbuf);
    sdsfree(link->rcvbuf);
    /* Detach the node from the link that is going away. */
    if (link->node)
        link->node->link = NULL;
    zfree(link);
}
/* Upper bound on the connections accepted by a single handler invocation. */
#define MAX_CLUSTER_ACCEPTS_PER_CALL 1000

/* Readable-event handler for the cluster bus listening sockets.
 *
 * Accepts up to MAX_CLUSTER_ACCEPTS_PER_CALL pending connections on 'fd'.
 * Every accepted socket is made non-blocking, gets TCP_NODELAY enabled and
 * is wrapped in a link object that clusterReadHandler() will receive when
 * data arrives. 'el', 'privdata' and 'mask' are unused. */
void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
    int cport, cfd;
    int max = MAX_CLUSTER_ACCEPTS_PER_CALL;
    char cip[NET_IP_STR_LEN];
    clusterLink *link;
    UNUSED(el);
    UNUSED(mask);
    UNUSED(privdata);

    /* If the server is starting up, don't accept cluster connections:
     * UPDATE messages may interact with the database content. */
    if (server.masterhost == NULL && server.loading) return;

    while(max--) {
        cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport);
        if (cfd == ANET_ERR) {
            /* EWOULDBLOCK just means there is nothing left to accept. */
            if (errno != EWOULDBLOCK)
                serverLog(LL_VERBOSE,
                    "Error accepting cluster node: %s", server.neterr);
            return;
        }
        /* Use non-blocking I/O for cluster messages. */
        anetNonBlock(NULL,cfd);
        anetEnableTcpNoDelay(NULL,cfd);

        serverLog(LL_VERBOSE,"Accepted cluster node %s:%d", cip, cport);
        /* Create a link object we use to handle the connection.
         * It gets passed to the readable handler when data is available.
         * Initially the link->node pointer is set to NULL as we don't know
         * which node it is, but the right node is referenced once we know
         * the node identity. */
        link = createClusterLink(NULL);
        link->fd = cfd;
        if (aeCreateFileEvent(server.el,cfd,AE_READABLE,
                              clusterReadHandler,link) == AE_ERR)
        {
            /* Without a registered handler nothing would ever read from or
             * release this connection: drop it now instead of leaking the
             * link and the socket. */
            serverLog(LL_WARNING,
                "Error registering read handler for cluster node %s:%d",
                cip, cport);
            freeClusterLink(link);
            continue;
        }
    }
}
/* -----------------------------------------------------------------------------
* Key space handling
* -------------------------------------------------------------------------- */
/* We have 16384 hash slots. The hash slot of a given key is obtained
 * as the least significant 14 bits of the crc16 of the key.
 *
 * However if the key contains the {...} pattern, only the part between
 * the first '{' and the first '}' following it is hashed, provided that
 * part is not empty. This may be useful to force certain keys to be in the
 * same node (assuming no resharding is in progress).
 *
 * 'key' points to 'keylen' bytes; the return value is the slot number. */
unsigned int keyHashSlot(char *key, int keylen) {
    int lbrace = 0; /* index of the first '{', or keylen if none */
    int rbrace;     /* index of the first '}' after it, or keylen if none */

    while (lbrace < keylen && key[lbrace] != '{') lbrace++;

    /* No '{' ? Hash the whole key. This is the base case. */
    if (lbrace == keylen) return crc16(key,keylen) & 0x3FFF;

    /* '{' found? Check if we have the corresponding '}'. */
    rbrace = lbrace+1;
    while (rbrace < keylen && key[rbrace] != '}') rbrace++;

    /* No '}' or nothing between {} ? Hash the whole key. */
    if (rbrace == keylen || rbrace == lbrace+1)
        return crc16(key,keylen) & 0x3FFF;

    /* If we are here there is both a { and a } on its right. Hash
     * what is in the middle between { and }. */
    return crc16(key+lbrace+1,rbrace-lbrace-1) & 0x3FFF;
}
/* -----------------------------------------------------------------------------
* CLUSTER node API
* -------------------------------------------------------------------------- */
/* Create a new cluster node, with the specified flags.
 * If "nodename" is NULL this is considered a first handshake and a random
 * node name is assigned to this node (it will be fixed later when we'll
 * receive the first pong). Otherwise exactly CLUSTER_NAMELEN bytes are
 * copied from "nodename".
 *
 * The node is created and returned to the user, but it is not automatically
 * added to the nodes hash table. */
clusterNode *createClusterNode(char *nodename, int flags) {
    clusterNode *fresh = zmalloc(sizeof(*fresh));

    /* Identity: either the caller supplied name or a random one. */
    if (nodename == NULL) {
        getRandomHexChars(fresh->name, CLUSTER_NAMELEN);
    } else {
        memcpy(fresh->name, nodename, CLUSTER_NAMELEN);
    }
    fresh->ctime = mstime();
    fresh->flags = flags;
    fresh->configEpoch = 0;

    /* No slots served, no replication relationships yet. */
    memset(fresh->slots,0,sizeof(fresh->slots));
    fresh->numslots = 0;
    fresh->slaves = NULL;
    fresh->numslaves = 0;
    fresh->slaveof = NULL;

    /* No link, no known address, all timers at zero. */
    fresh->link = NULL;
    memset(fresh->ip,0,sizeof(fresh->ip));
    fresh->port = 0;
    fresh->ping_sent = 0;
    fresh->pong_received = 0;
    fresh->fail_time = 0;
    fresh->voted_time = 0;
    fresh->orphaned_time = 0;
    fresh->repl_offset_time = 0;
    fresh->repl_offset = 0;

    /* Failure reports are released with zfree() when removed from the
     * list. */
    fresh->fail_reports = listCreate();
    listSetFreeMethod(fresh->fail_reports,zfree);
    return fresh;
}
/* This function is called every time we get a failure report from a node.
 * The side effect is to populate the fail_reports list (or to update
 * the timestamp of an existing report).
 *
 * 'failing' is the node that is in failure state according to the
 * 'sender' node.
 *
 * The function returns 0 if it just updates a timestamp of an existing
 * failure report from the same sender. 1 is returned if a new failure
 * report is created. */
int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) {
    list *reports = failing->fail_reports;
    listIter iter;
    listNode *item;
    clusterNodeFailReport *report;

    /* If a failure report from the same sender already exists, just update
     * the timestamp. */
    listRewind(reports,&iter);
    for (item = listNext(&iter); item != NULL; item = listNext(&iter)) {
        report = item->value;
        if (report->node != sender) continue;
        report->time = mstime();
        return 0;
    }

    /* Otherwise create a new report and append it to the list. */
    report = zmalloc(sizeof(*report));
    report->node = sender;
    report->time = mstime();
    listAddNodeTail(reports,report);
    return 1;
}
/* Remove failure reports that are too old, where too old means reasonably
 * older than the global node timeout (server.cluster_node_timeout multiplied
 * by CLUSTER_FAIL_REPORT_VALIDITY_MULT). Note that anyway for a node to be
 * flagged as FAIL we need to have a local PFAIL state that is at least
 * older than the global node timeout, so we don't just trust the number
 * of failure reports from other nodes. */
void clusterNodeCleanupFailureReports(clusterNode *node) {
    list *reports = node->fail_reports;
    listIter iter;
    listNode *item;
    mstime_t ttl = server.cluster_node_timeout *
                   CLUSTER_FAIL_REPORT_VALIDITY_MULT;
    mstime_t now = mstime();

    listRewind(reports,&iter);
    while ((item = listNext(&iter)) != NULL) {
        clusterNodeFailReport *report = item->value;
        mstime_t age = now - report->time;

        /* Reports still within the validity window are kept. */
        if (age <= ttl) continue;
        listDelNode(reports,item);
    }
}
/* Remove the failing report for 'node' if it was previously considered
 * failing by 'sender'. This function is called when a node informs us via
 * gossip that a node is OK from its point of view (no FAIL or PFAIL flags).
 *
 * Note that this function is called relatively often as it gets called even
 * when there are no nodes failing, and is O(N), however when the cluster is
 * fine the failure reports list is empty so the function runs in constant
 * time.
 *
 * The function returns 1 if the failure report was found and removed
 * (expired reports are purged as well in that case). Otherwise 0 is
 * returned. */
int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) {
    list *reports = node->fail_reports;
    listIter iter;
    listNode *item;

    /* Search for a failure report from this sender. */
    listRewind(reports,&iter);
    while ((item = listNext(&iter)) != NULL) {
        clusterNodeFailReport *report = item->value;

        if (report->node == sender) {
            /* Remove the failure report, then drop the stale ones too. */
            listDelNode(reports,item);
            clusterNodeCleanupFailureReports(node);
            return 1;
        }
    }
    return 0; /* No failure report from this sender. */
}
/* Return the number of external nodes that believe 'node' is failing,
 * not including this node, that may have a PFAIL or FAIL state for this
 * node as well. Expired reports are removed before counting. */
int clusterNodeFailureReportsCount(clusterNode *node) {
    list *reports = node->fail_reports;

    clusterNodeCleanupFailureReports(node);
    return listLength(reports);
}
/* Remove 'slave' from the slaves array of 'master'.
 *
 * Returns C_OK if the slave was found and removed, C_ERR if it was not in
 * the array. When the last slave is removed the CLUSTER_NODE_MIGRATE_TO
 * flag of the master is cleared. */
int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) {
    int idx, tail;

    /* Locate the slave. */
    for (idx = 0; idx < master->numslaves; idx++)
        if (master->slaves[idx] == slave) break;
    if (idx == master->numslaves) return C_ERR;

    /* Close the gap by shifting down the entries that follow, if any. */
    tail = master->numslaves - idx - 1;
    if (tail > 0) {
        memmove(&master->slaves[idx],&master->slaves[idx+1],
                sizeof(*master->slaves) * tail);
    }
    master->numslaves--;
    if (master->numslaves == 0)
        master->flags &= ~CLUSTER_NODE_MIGRATE_TO;
    return C_OK;
}
/* Append 'slave' to the slaves array of 'master'.
 *
 * Returns C_ERR without changing anything if the slave is already listed,
 * otherwise grows the array by one, stores the slave at the end, sets the
 * CLUSTER_NODE_MIGRATE_TO flag on the master and returns C_OK. */
int clusterNodeAddSlave(clusterNode *master, clusterNode *slave) {
    int count = master->numslaves;
    int idx;

    /* If it's already a slave, don't add it again. */
    for (idx = 0; idx < count; idx++) {
        if (master->slaves[idx] == slave) return C_ERR;
    }

    master->slaves = zrealloc(master->slaves,
        sizeof(clusterNode*)*(count+1));
    master->slaves[count] = slave;
    master->numslaves = count+1;
    master->flags |= CLUSTER_NODE_MIGRATE_TO;
    return C_OK;
}
/* Return how many slaves of node 'n' are not flagged as failed, as reported
 * by nodeFailed(). */
int clusterCountNonFailingSlaves(clusterNode *n) {
    int healthy = 0;
    int idx = n->numslaves;

    while (idx-- > 0) {
        if (!nodeFailed(n->slaves[idx])) healthy++;
    }
    return healthy;
}
/* Low level cleanup of the node structure. Only called by clusterDelNode().
 *
 * Detaches the node from its slaves and from its master, removes it from
 * the server.cluster->nodes table (asserting that it was there), then
 * releases its link, failure report list, slaves array and the node
 * itself. */
void freeClusterNode(clusterNode *n) {
    sds nodename;
    int idx = 0;

    /* If the node has associated slaves, we have to set
     * all the slaves->slaveof fields to NULL (unknown). */
    while (idx < n->numslaves) {
        n->slaves[idx]->slaveof = NULL;
        idx++;
    }

    /* Remove this node from the list of slaves of its master. */
    if (nodeIsSlave(n) && n->slaveof) {
        clusterNodeRemoveSlave(n->slaveof,n);
    }

    /* Unlink from the set of nodes. */
    nodename = sdsnewlen(n->name, CLUSTER_NAMELEN);
    serverAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK);
    sdsfree(nodename);

    /* Release link and associated data structures. */
    if (n->link) {
        freeClusterLink(n->link);
    }
    listRelease(n->fail_reports);
    zfree(n->slaves);
    zfree(n);
}
/* Add a node to the nodes hash table */
// 添加一个node节点到集群的
int clusterAddNode(clusterNode *node) {