@@ -3,6 +3,8 @@ using Logging
3
3
4
4
export SlurmManager, PBSManager, set_worker_loggers
5
5
6
# Timeout (in seconds) used while waiting for a worker to finish its
# connection setup. Configurable through the `JULIA_WORKER_TIMEOUT`
# environment variable; defaults to 300 seconds.
function worker_timeout()
    raw = get(ENV, "JULIA_WORKER_TIMEOUT", "300.0")
    return parse(Float64, raw)
end
7
+
6
8
# Return the pool of available worker processes. When no workers beyond the
# master process exist (`workers() == [1]`), return an empty `WorkerPool`
# instead of the default pool (which would contain pid 1 itself).
function get_worker_pool()
    if workers() == [1]
        return WorkerPool()
    else
        return default_worker_pool()
    end
end
7
9
8
10
function run_worker_iteration (
@@ -21,7 +23,7 @@ function run_worker_iteration(
21
23
remotecall_wait (forward_model, w, iter, m)
22
24
end
23
25
end
24
-
26
+ isempty (all_known_workers . workers) && @info " No workers currently available "
25
27
@sync while ! isempty (work_to_do)
26
28
# Add new workers to worker_pool
27
29
all_workers = get_worker_pool ()
@@ -40,7 +42,7 @@ function run_worker_iteration(
40
42
push! (worker_pool, worker)
41
43
end
42
44
else
43
- println ( " no workers available" )
45
+ @debug " no workers available"
44
46
sleep (10 ) # Wait for workers to become available
45
47
end
46
48
end
@@ -100,7 +102,6 @@ function Distributed.manage(
100
102
)
101
103
if op == :register
102
104
set_worker_logger (id)
103
- evaluate_initial_expression (id, manager. expr)
104
105
end
105
106
end
106
107
@@ -478,3 +479,148 @@ function set_worker_loggers(workers = workers())
478
479
end
479
480
end
480
481
end
482
+
483
# Copied from Distributed.jl in order to evaluate the manager's expression on
# worker initialization. The deliberate deviation from the stdlib version is the
# `evaluate_initial_expression` call just before the `:register` callback below,
# which guarantees the manager's startup expression runs on the worker before
# any other remote code is scheduled on it.
function Distributed.create_worker(
    manager::Union{SlurmManager, PBSManager},
    wconfig,
)
    # only node 1 can add new nodes, since nobody else has the full list of address:port
    @assert Distributed.LPROC.id == 1
    timeout = worker_timeout()

    # initiate a connect. Does not wait for connection completion in case of TCP.
    w = Distributed.Worker()
    local r_s, w_s
    try
        (r_s, w_s) = Distributed.connect(manager, w.id, wconfig)
    catch ex
        try
            # Connection failed: unregister the half-created worker and let the
            # manager tear down the underlying job/process before propagating.
            Distributed.deregister_worker(w.id)
            kill(manager, w.id, wconfig)
        finally
            rethrow(ex)
        end
    end

    w = Distributed.Worker(w.id, r_s, w_s, manager; config = wconfig)
    # install a finalizer to perform cleanup if necessary
    finalizer(w) do w
        if myid() == 1
            Distributed.manage(w.manager, w.id, w.config, :finalize)
        end
    end

    # set when the new worker has finished connections with all other workers
    ntfy_oid = Distributed.RRID()
    rr_ntfy_join = Distributed.lookup_ref(ntfy_oid)
    rr_ntfy_join.waitingfor = myid()

    # Start a new task to handle inbound messages from connected worker in master.
    # Also calls `wait_connected` on TCP streams.
    Distributed.process_messages(w.r_stream, w.w_stream, false)

    # send address information of all workers to the new worker.
    # Cluster managers set the address of each worker in `WorkerConfig.connect_at`.
    # A new worker uses this to setup an all-to-all network if topology :all_to_all is specified.
    # Workers with higher pids connect to workers with lower pids. Except process 1 (master) which
    # initiates connections to all workers.

    # Connection Setup Protocol:
    # - Master sends 16-byte cookie followed by 16-byte version string and a JoinPGRP message to all workers
    # - On each worker
    #   - Worker responds with a 16-byte version followed by a JoinCompleteMsg
    #   - Connects to all workers less than its pid. Sends the cookie, version and an IdentifySocket message
    #   - Workers with incoming connection requests write back their Version and an IdentifySocketAckMsg message
    # - On master, receiving a JoinCompleteMsg triggers rr_ntfy_join (signifies that worker setup is complete)

    join_list = []
    if Distributed.PGRP.topology === :all_to_all
        # need to wait for lower worker pids to have completed connecting, since the numerical value
        # of pids is relevant to the connection process, i.e., higher pids connect to lower pids and they
        # require the value of config.connect_at which is set only upon connection completion
        for jw in Distributed.PGRP.workers
            if (jw.id != 1) && (jw.id < w.id)
                # wait for wl to join
                # We should access this atomically using (@atomic jw.state)
                # but this is only recently supported
                # NOTE(review): the :custom branch below DOES use `@atomic` on
                # the same field — confirm the minimum supported Julia version
                # and make the two branches consistent.
                if jw.state === Distributed.W_CREATED
                    lock(jw.c_state) do
                        wait(jw.c_state)
                    end
                end
                push!(join_list, jw)
            end
        end

    elseif Distributed.PGRP.topology === :custom
        # wait for requested workers to be up before connecting to them.
        # NOTE(review): `notnothing` is a non-exported Distributed internal;
        # confirm it is imported at the top of this file, otherwise this call
        # needs to be qualified as `Distributed.notnothing`.
        filterfunc(x) =
            (x.id != 1) &&
            isdefined(x, :config) &&
            (
                notnothing(x.config.ident) in
                something(wconfig.connect_idents, [])
            )

        wlist = filter(filterfunc, Distributed.PGRP.workers)
        # Poll once per second until all requested peer workers are up,
        # giving up after `timeout` seconds.
        waittime = 0
        while wconfig.connect_idents !== nothing &&
              length(wlist) < length(wconfig.connect_idents)
            if waittime >= timeout
                error("peer workers did not connect within $timeout seconds")
            end
            sleep(1.0)
            waittime += 1
            wlist = filter(filterfunc, Distributed.PGRP.workers)
        end

        for wl in wlist
            lock(wl.c_state) do
                if (@atomic wl.state) === Distributed.W_CREATED
                    # wait for wl to join
                    wait(wl.c_state)
                end
            end
            push!(join_list, wl)
        end
    end

    # Build the (connect_at, pid) address list sent to the new worker so it can
    # set up its own connections per the protocol described above.
    all_locs = Base.mapany(
        x ->
            isa(x, Distributed.Worker) ?
            (something(x.config.connect_at, ()), x.id) : ((), x.id, true),
        join_list,
    )
    Distributed.send_connection_hdr(w, true)
    enable_threaded_blas = something(wconfig.enable_threaded_blas, false)

    join_message = Distributed.JoinPGRPMsg(
        w.id,
        all_locs,
        Distributed.PGRP.topology,
        enable_threaded_blas,
        Distributed.isclusterlazy(),
    )
    Distributed.send_msg_now(
        w,
        Distributed.MsgHeader(Distributed.RRID(0, 0), ntfy_oid),
        join_message,
    )

    # Ensure the initial expression is evaluated before any other code
    # (this is the customization that motivated vendoring this function).
    @info "Evaluating initial expression on worker $(w.id)"
    evaluate_initial_expression(w.id, manager.expr)

    @async Distributed.manage(w.manager, w.id, w.config, :register)

    # wait for rr_ntfy_join with timeout; `rr_ntfy_join` becomes ready when the
    # master receives the worker's JoinCompleteMsg.
    if timedwait(() -> isready(rr_ntfy_join), timeout) === :timed_out
        error("worker did not connect within $timeout seconds")
    end
    lock(Distributed.client_refs) do
        delete!(Distributed.PGRP.refs, ntfy_oid)
    end

    return w.id
end
0 commit comments