Skip to content

Commit b7b7011

Browse files
committed
Retry the connection when a connection-refused error (ECONNREFUSED) is encountered
1 parent b6d036a commit b7b7011

File tree

1 file changed

+19
-2
lines changed

1 file changed

+19
-2
lines changed

paddle/pserver/LightNetwork.cpp

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License. */
1818
#include <netinet/in.h>
1919
#include <netinet/tcp.h>
2020
#include <fcntl.h>
21+
#include <chrono>
2122

2223
#include <arpa/inet.h>
2324
#include <sys/ioctl.h>
@@ -49,6 +50,10 @@ P_DEFINE_int32(sock_recv_buf_size,
4950
1024 * 1024 * 40,
5051
"restrict sock recv buff size");
5152

53+
// Retry budget (default 10) for the connect loop in SocketClient::TcpClient.
// In the branch taken when the connect() result equals ECONNREFUSED, the
// loop compares its retry counter against this flag, logs FATAL once the
// counter has reached it, and otherwise sleeps one second before trying again.
// NOTE(review): POSIX connect() signals failure by returning -1 and setting
// errno, so confirm the `error == ECONNREFUSED` comparison in that loop can
// actually be true; if not, this flag has no effect.
P_DEFINE_int32(connrefused_retries_second,
54+
10,
55+
"retry connrefused_retries_second if ECONNREFUSED occurs");
56+
5257
namespace paddle {
5358

5459
/**
@@ -382,8 +387,20 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
382387
setOption(sockfd);
383388

384389
/// Now connect to the server
385-
PCHECK(connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
386-
<< "ERROR connecting to " << serverAddr;
390+
int retry_second = 0;
391+
int error = 0;
392+
do {
393+
error = connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr));
394+
if (error == ECONNREFUSED) {
395+
LOG(WARNING) << "connection refused by pserver, try again!";
396+
if (retry_second++ >= FLAGS_connrefused_retries_second) {
397+
LOG(FATAL) << "connection refused by pserver, maybe pserver failed!";
398+
}
399+
std::this_thread::sleep_for(std::chrono::seconds(1));
400+
} else {
401+
PCHECK(error >= 0) << "ERROR connecting to " << serverAddr;
402+
}
403+
} while (error == ECONNREFUSED);
387404

388405
channel_.reset(new SocketChannel(sockfd, serverAddr));
389406
tcpRdma_ = F_TCP;

0 commit comments

Comments
 (0)