Skip to content

Commit dae8b9b

Browse files
authored
Merge pull request #747 from backyes/bugfix_connection_retry
Try to connect again if a connection-refused error is encountered
2 parents 9eae4f5 + 5ae4339 commit dae8b9b

File tree

1 file changed

+15
-2
lines changed

1 file changed

+15
-2
lines changed

paddle/pserver/LightNetwork.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ limitations under the License. */
1818
#include <netinet/tcp.h>
1919
#include <sys/socket.h>
2020
#include <sys/types.h>
21+
#include <chrono>
2122

2223
#include <arpa/inet.h>
2324
#include <net/if.h>
@@ -382,8 +383,20 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
382383
setOption(sockfd);
383384

384385
/// Now connect to the server
385-
PCHECK(connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
386-
<< "ERROR connecting to " << serverAddr;
386+
int retry_second = 0;
387+
int error = 0;
388+
do {
389+
error = connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr));
390+
if (error == ECONNREFUSED) {
391+
LOG(WARNING) << "connection refused by pserver, try again!";
392+
if (retry_second++ >= 7) {
393+
LOG(FATAL) << "connection refused by pserver, maybe pserver failed!";
394+
}
395+
std::this_thread::sleep_for(std::chrono::seconds(1));
396+
} else {
397+
PCHECK(error >= 0) << "ERROR connecting to " << serverAddr;
398+
}
399+
} while (error == ECONNREFUSED);
387400

388401
channel_.reset(new SocketChannel(sockfd, serverAddr));
389402
tcpRdma_ = F_TCP;

0 commit comments

Comments
 (0)