@@ -25,11 +25,14 @@ limitations under the License.
2525#include " absl/base/const_init.h"
2626#include " absl/base/no_destructor.h"
2727#include " absl/base/thread_annotations.h"
28+ #include " absl/log/log.h"
2829#include " absl/strings/str_split.h"
2930#include " absl/synchronization/mutex.h"
31+ #include " grpc/grpc.h"
3032#include " grpcpp/channel.h"
3133#include " grpcpp/create_channel.h"
3234#include " grpcpp/security/credentials.h"
35+ #include " grpcpp/support/channel_arguments.h"
3336#include " plugin/xprof/protobuf/worker_service.grpc.pb.h"
3437
3538namespace xprof {
@@ -56,6 +59,59 @@ static absl::NoDestructor<
5659static std::atomic<size_t > gCurrentStubIndex = 0 ;
5760static std::atomic<bool > gStubsInitialized = false ;
5861
62+ // Creates a gRPC channel for a given worker address. This channel is
63+ // configured with a service config that enables a robust retry policy for
64+ // transient errors and sets the client-side load balancing policy to
65+ // round-robin.
66+ std::shared_ptr<::grpc::Channel> CreateWorkerChannelForAddress (
67+ const std::string& address) {
68+ grpc::ChannelArguments args;
69+ // Set a service config for the channel that enables retries.
70+ // This config will be applied to all methods of the service.
71+ // Service Config: 10-minute timeout + conservative retries + LB
72+ const char * kServiceConfigJson = R"pb(
73+ {
74+ "methodConfig":
75+ [ {
76+ "name":
77+ [ {}],
78+ "timeout": "600s",
79+ "retryPolicy": {
80+ "maxAttempts": 4,
81+ "initialBackoff": "2s",
82+ "maxBackoff": "120s",
83+ "backoffMultiplier": 2.0,
84+ "retryableStatusCodes": [
85+ "UNAVAILABLE",
86+ "RESOURCE_EXHAUSTED",
87+ "INTERNAL",
88+ "ABORTED",
89+ "NOT_FOUND"
90+ ]
91+ }
92+ }],
93+ "loadBalancingConfig":
94+ [ { "round_robin": {} }]
95+ })pb" ;
96+ args.SetServiceConfigJSON (kServiceConfigJson );
97+ args.SetLoadBalancingPolicyName (" round_robin" );
98+ args.SetInt (GRPC_ARG_DNS_MIN_TIME_BETWEEN_RESOLUTIONS_MS, 5000 );
99+ args.SetInt (GRPC_ARG_KEEPALIVE_TIME_MS, 20000 );
100+ args.SetInt (GRPC_ARG_KEEPALIVE_TIMEOUT_MS, 10000 );
101+ args.SetInt (GRPC_ARG_KEEPALIVE_PERMIT_WITHOUT_CALLS, 1 );
102+ args.SetInt (GRPC_ARG_ENABLE_RETRIES, 1 );
103+ args.SetInt (GRPC_ARG_MAX_RECEIVE_MESSAGE_LENGTH, -1 );
104+ args.SetInt (GRPC_ARG_MAX_SEND_MESSAGE_LENGTH, -1 );
105+
106+ // Create the channel with insecure credentials. This is acceptable because
107+ // the communication between the aggregator and workers happens within a
108+ // trusted, internal network environment.
109+ std::shared_ptr<::grpc::Channel> channel = ::grpc::CreateCustomChannel (
110+ address, ::grpc::InsecureChannelCredentials (), args); // NOLINT
111+ LOG (INFO) << " Created gRPC channel for address: " << address;
112+ return channel;
113+ }
114+
59115void InitializeStubs (const std::string& worker_service_addresses) {
60116 absl::MutexLock lock (&gStubsMutex );
61117 if (gStubsInitialized .load (std::memory_order_acquire)) {
@@ -66,8 +122,8 @@ void InitializeStubs(const std::string& worker_service_addresses) {
66122 absl::StrSplit (worker_service_addresses, kAddressDelimiter );
67123 for (const std::string& address : addresses) {
68124 if (address.empty ()) continue ;
69- std::shared_ptr<::grpc::Channel> channel = :: grpc::CreateChannel (
70- address, :: grpc::InsecureChannelCredentials ()); // NOLINT
125+ std::shared_ptr<::grpc::Channel> channel =
126+ CreateWorkerChannelForAddress (address);
71127 gStubs ->push_back (XprofAnalysisWorkerService::NewStub (channel));
72128 }
73129 gStubsInitialized .store (true , std::memory_order_release);
0 commit comments