Skip to content

Commit

Permalink
feat: wait until connect to master or timeout
Browse files Browse the repository at this point in the history
  • Loading branch information
konnase committed Feb 18, 2025
1 parent 7e19c28 commit 337f3c5
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 7 deletions.
25 changes: 20 additions & 5 deletions gloo/rendezvous/tcp_store.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <arpa/inet.h>
#include <iostream>
#include <stdexcept>
#include <signal.h>

#ifndef _WIN32
#include <unistd.h>
Expand Down Expand Up @@ -209,11 +210,26 @@ namespace gloo
GLOO_THROW(err);
}

// connect to server
if (connect(new_server_fd, (struct sockaddr *)&server_address, sizeof(server_address)) < 0)
const auto start = std::chrono::steady_clock::now();
auto timeout = std::chrono::seconds(timeout_);
while (true)
{
auto err = std::string("Connection to server failed: ") + strerror(errno);
GLOO_THROW(err);
// connect to server
if (connect(new_server_fd, (struct sockaddr *)&server_address, sizeof(server_address)) == 0)
{
break;
}

// check timeout
const auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::steady_clock::now() - start);
if (timeout != kNoTimeout && elapsed > timeout)
{
GLOO_THROW_IO_EXCEPTION(GLOO_ERROR_MSG(
"Connection to master timeout for " + std::to_string(timeout_) + " seconds"));
}
/* sleep override */
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}

return new_server_fd;
Expand Down Expand Up @@ -358,7 +374,6 @@ namespace gloo
}
/* sleep override */
std::this_thread::sleep_for(std::chrono::milliseconds(10));
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
}
}

Expand Down
6 changes: 4 additions & 2 deletions gloo/rendezvous/tcp_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
#include <netdb.h>
#include <arpa/inet.h>

#define SOCKET_INIT_TIMEOUT_SECONDS 30

namespace gloo
{
namespace rendezvous
Expand All @@ -24,7 +26,7 @@ namespace gloo
class TCPStore : public Store
{
public:
explicit TCPStore(const std::string &hostname, int port, int world_size, bool is_master, int timeout = 30);
explicit TCPStore(const std::string &hostname, int port, int world_size, bool is_master, int timeout = SOCKET_INIT_TIMEOUT_SECONDS);
virtual ~TCPStore();

virtual void set(const std::string &key, const std::vector<char> &data)
Expand Down Expand Up @@ -64,7 +66,7 @@ namespace gloo

std::mutex mtx;

int server_fd;
int server_fd = -1;
std::map<std::string, std::vector<char>> data_;
};

Expand Down

0 comments on commit 337f3c5

Please sign in to comment.