diff --git a/adapter/syscall/Makefile b/adapter/syscall/Makefile index 95c03e4a6..d34814b4a 100644 --- a/adapter/syscall/Makefile +++ b/adapter/syscall/Makefile @@ -29,6 +29,12 @@ endif # like linux kernel. #FF_USE_THREAD_STRUCT_HANDLE=1 +# If FF_PRELOAD_SUPPORT_SELECT is enabled, we can use select in LD_PRELOAD mode. +# However, we need to set the value of FF_KERNEL_MAX_FD_SELECT based on our requirements to ensure file descriptors +# (FDs) for the user-space protocol stack can be covered by the select interface. +#FF_PRELOAD_SUPPORT_SELECT=1 +#FF_KERNEL_MAX_FD_SELECT=128 + PKGCONF ?= pkg-config ifndef DEBUG @@ -57,6 +63,10 @@ ifdef FF_USE_THREAD_STRUCT_HANDLE CFLAGS+= -DFF_USE_THREAD_STRUCT_HANDLE endif +ifdef FF_PRELOAD_SUPPORT_SELECT + CFLAGS+= -DFF_PRELOAD_SUPPORT_SELECT -DFF_KERNEL_MAX_FD_SELECT=$(FF_KERNEL_MAX_FD_SELECT) -DFF_USE_THREAD_STRUCT_HANDLE +endif + CFLAGS += -fPIC -Wall -Werror $(shell $(PKGCONF) --cflags libdpdk) INCLUDES= -I. -I${FF_PATH}/lib diff --git a/adapter/syscall/ff_declare_syscalls.h b/adapter/syscall/ff_declare_syscalls.h index 28d3b9808..8be24efa9 100644 --- a/adapter/syscall/ff_declare_syscalls.h +++ b/adapter/syscall/ff_declare_syscalls.h @@ -28,4 +28,5 @@ FF_SYSCALL_DECL(int, epoll_create, (int)); FF_SYSCALL_DECL(int, epoll_ctl, (int, int, int, struct epoll_event *)); FF_SYSCALL_DECL(int, epoll_wait, (int, struct epoll_event *, int, int)); FF_SYSCALL_DECL(pid_t, fork, (void)); +FF_SYSCALL_DECL(int, select, (int , fd_set *, fd_set *, fd_set *, struct timeval *)); #undef FF_SYSCALL_DECL diff --git a/adapter/syscall/ff_hook_syscall.c b/adapter/syscall/ff_hook_syscall.c index 165471a1d..6d63f806a 100644 --- a/adapter/syscall/ff_hook_syscall.c +++ b/adapter/syscall/ff_hook_syscall.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -86,6 +87,7 @@ static __thread struct ff_fcntl_args *fcntl_args = NULL; static __thread struct ff_epoll_ctl_args *epoll_ctl_args = NULL; static __thread struct ff_epoll_wait_args *epoll_wait_args = NULL; static __thread struct ff_kevent_args *kevent_args = NULL; +static __thread struct ff_select_args *select_args = NULL; #define IOV_MAX 16 #define IOV_LEN_MAX 2048 @@ -235,11 +237,17 @@ rte_spinlock_t worker_id_lock; static int nb_procs = NB_FSTACK_INSTANCE_DEFAULT; #define FF_KERNEL_MAX_FD_DEFAULT 1024 +#if defined(FF_PRELOAD_SUPPORT_SELECT) +static int ff_kernel_max_fd = FF_KERNEL_MAX_FD_SELECT; +#else static int ff_kernel_max_fd = FF_KERNEL_MAX_FD_DEFAULT; +#endif /* not support thread socket now */ static int need_alarm_sem = 0; +#define count_trailing_zeros(x) __builtin_ctzll(x) + void __chk_fail(void); static inline int convert_fstack_fd(int sockfd) { @@ -264,6 +272,47 @@ int is_fstack_fd(int sockfd) { return sockfd >= ff_kernel_max_fd; } +static inline uint64_t +ff_bitmap_first_set(fd_mask *ai, int set_words) +{ + fd_mask i = 0; + + for (; i < set_words; i++) { + fd_mask x = ai[i]; + + if (x != 0) { + return i * NFDBITS + count_trailing_zeros(x); + } + } + + return ~0; +} + +static inline fd_mask +ff_bitmap_next_set(fd_mask *ai, fd_mask i, int set_words) +{ + fd_mask i0 = i / NFDBITS; + fd_mask i1 = i % NFDBITS; + fd_mask t; + + if (i0 < set_words) { + t = (ai[i0] >> i1) << i1; + + if (t) { + return count_trailing_zeros(t) + i0 * NFDBITS; + } + + for (i0++; i0 < set_words; i0++) { + t = ai[i0]; + if (t) { + return count_trailing_zeros(t) + i0 * NFDBITS; + } + } + } + + return ~0; +} + int fstack_territory(int domain, int type, int protocol) { @@ -2558,6 +2607,315 @@ kevent(int kq, const struct kevent *changelist, int nchanges, RETURN_NOFREE(); } +int +ff_hook_select(int nfds, fd_set *restrict readfds, fd_set *restrict writefds, + fd_set *restrict exceptfds, struct timeval *restrict timeout) +{ + int set_words; + int set_bytes; + int fd; + int max_kernel_fd = 0; + int max_ff_fd = 0; + fd_mask *fds_bit; + struct timespec t_s, t_n; + time_t now_time_ms = 0; + time_t end_time_ms = 0; + int ff_set_words, ff_set_bytes; + int kernel_set_words, kernel_set_bytes; + int kernel_ret = 0; + + DEBUG_LOG("ff_hook_select nfds:%d\n", nfds); + + DEFINE_REQ_ARGS_STATIC(select); + static __thread fd_set *ff_readfds_share = NULL; + static __thread fd_set *ff_writefds_share = NULL; + static __thread fd_set *ff_exceptfds_share = NULL; + static __thread fd_set kernel_readfds; + static __thread fd_set kernel_writefds; + static __thread fd_set kernel_exceptfds; + static __thread fd_set ff_readfds; + static __thread fd_set ff_writefds; + static __thread fd_set ff_exceptfds; + + bool have_ff_readfd = false; + bool have_ff_writefd = false; + bool have_ff_exceptfd = false; + bool have_kernel_fd = false; + bool have_exec_kernel_select = false; + + if (nfds > FD_SETSIZE) { + nfds = FD_SETSIZE; + } + + if (ff_kernel_max_fd >= FD_SETSIZE) { + return ff_linux_select(nfds, readfds, writefds, exceptfds, timeout); + } + + set_words = (nfds + NFDBITS - 1) / NFDBITS; + set_bytes = set_words * sizeof(fd_mask); + + if (ff_readfds_share == NULL) { + ff_readfds_share = share_mem_alloc(sizeof(fd_set)); + if (ff_readfds_share == NULL) { + RETURN_ERROR_NOFREE(ENOMEM); + } + } + + if (ff_writefds_share == NULL) { + ff_writefds_share = share_mem_alloc(sizeof(fd_set)); + if (ff_writefds_share == NULL) { + RETURN_ERROR_NOFREE(ENOMEM); + } + } + + if (ff_exceptfds_share == NULL) { + ff_exceptfds_share = share_mem_alloc(sizeof(fd_set)); + if (ff_exceptfds_share == NULL) { + RETURN_ERROR_NOFREE(ENOMEM); + } + } + + /* We first separate kernel fd and userspace fd. */ + if (readfds != NULL) { + memset(&ff_readfds, 0, set_bytes); + memset(&kernel_readfds, 0, set_bytes); + + fds_bit = (fd_mask *)readfds; + + for (fd = ff_bitmap_first_set(fds_bit, set_words); (fd != ~0) && (fd < nfds); + fd = ff_bitmap_next_set(fds_bit, fd + 1, set_words)) { + if (is_fstack_fd(fd)) { + have_ff_readfd = true; + if (restore_fstack_fd(fd) > max_ff_fd) { + max_ff_fd = restore_fstack_fd(fd); + } + FD_SET(restore_fstack_fd(fd), &ff_readfds); + } else { + have_kernel_fd = true; + if (fd > max_kernel_fd) { + max_kernel_fd = fd; + } + FD_SET(fd, &kernel_readfds); + } + } + + memset(readfds, 0, set_bytes); + } + + if (writefds != NULL) { + memset(&ff_writefds, 0, set_bytes); + memset(&kernel_writefds, 0, set_bytes); + + fds_bit = (fd_mask *)writefds; + + for (fd = ff_bitmap_first_set(fds_bit, set_words); (fd != ~0) && (fd < nfds); + fd = ff_bitmap_next_set(fds_bit, fd + 1, set_words)) { + if (is_fstack_fd(fd)) { + have_ff_exceptfd = true; + if (restore_fstack_fd(fd) > max_ff_fd) { + max_ff_fd = restore_fstack_fd(fd); + } + FD_SET(restore_fstack_fd(fd), &ff_writefds); + } else { + have_kernel_fd = true; + if (fd > max_kernel_fd) { + max_kernel_fd = fd; + } + FD_SET(fd, &kernel_writefds); + } + } + + memset(writefds, 0, set_bytes); + } + + if (exceptfds != NULL) { + memset(&ff_exceptfds, 0, set_bytes); + memset(&kernel_exceptfds, 0, set_bytes); + + fds_bit = (fd_mask *)exceptfds; + + for (fd = ff_bitmap_first_set(fds_bit, set_words); (fd != ~0) && (fd < nfds); + fd = ff_bitmap_next_set(fds_bit, fd + 1, set_words)) { + if (is_fstack_fd(fd)) { + have_ff_writefd = true; + if (restore_fstack_fd(fd) > max_ff_fd) { + max_ff_fd = restore_fstack_fd(fd); + } + FD_SET(restore_fstack_fd(fd), &ff_exceptfds); + } else { + have_kernel_fd = true; + if (fd > max_kernel_fd) { + max_kernel_fd = fd; + } + FD_SET(fd, &kernel_exceptfds); + } + } + + memset(exceptfds, 0, set_bytes); + } + + if (timeout != NULL) { + if (clock_gettime(CLOCK_MONOTONIC_COARSE, &t_s) == -1) { + ret = -1; + goto select_exit; + } + end_time_ms = (t_s.tv_sec + timeout->tv_sec) * 1000 + t_s.tv_nsec / 1000000 + timeout->tv_usec / 1000; + } + + if (have_ff_readfd || have_ff_writefd || have_ff_exceptfd) { + args->nfds = max_ff_fd + 1; + ff_set_words = (max_ff_fd + 1 + NFDBITS - 1) / NFDBITS; + ff_set_bytes = ff_set_words * sizeof(fd_mask); + } + + if (have_kernel_fd) { + kernel_set_words = (max_kernel_fd + 1 + NFDBITS - 1) / NFDBITS; + kernel_set_bytes = kernel_set_words * sizeof(fd_mask); + } + + struct timeval no_block_time = {0, 0}; + +retry: + if (have_ff_readfd) { + memcpy(ff_readfds_share, &ff_readfds, ff_set_bytes); + args->readfds = ff_readfds_share; + } else { + args->readfds = NULL; + } + + if (have_ff_writefd) { + memcpy(ff_writefds_share, &ff_writefds, ff_set_bytes); + args->writefds = ff_writefds_share; + } else { + args->writefds = NULL; + } + + if (have_ff_exceptfd) { + memcpy(ff_exceptfds_share, &ff_exceptfds, ff_set_bytes); + args->exceptfds = ff_exceptfds_share; + } else { + args->exceptfds = NULL; + } + + ACQUIRE_ZONE_LOCK(FF_SC_IDLE); + sc->ops = FF_SO_SELECT; + sc->args = args; + + /* + * sc->result, sc->error must reset in select. + * Otherwise can access last sc call's result. + */ + sc->result = 0; + sc->error = 0; + errno = 0; + RELEASE_ZONE_LOCK(FF_SC_REQ); + + do { + /* + * we call freebsd select. + */ + ACQUIRE_ZONE_TRY_LOCK(FF_SC_REP); + ret = sc->result; + if (ret < 0) { + //occur error, return imme. + errno = sc->error; + RELEASE_ZONE_LOCK(FF_SC_IDLE); + return ret; + } + RELEASE_ZONE_LOCK(FF_SC_IDLE); + if (ret > 0) { + goto select_exit; + } + + if (have_kernel_fd) { + if (readfds) + memcpy(readfds, &kernel_readfds, kernel_set_bytes); + if (writefds) + memcpy(writefds, &kernel_writefds, kernel_set_bytes); + if (exceptfds) + memcpy(exceptfds, &kernel_exceptfds, kernel_set_bytes); + + kernel_ret = ff_linux_select(max_kernel_fd + 1, readfds, writefds, exceptfds, &no_block_time); + have_exec_kernel_select = true; + if (kernel_ret < 0) { + //occur error, return imme. + ret = kernel_ret; + goto select_exit; + } + } + + if (timeout) { + if (timeout->tv_sec == 0 && timeout->tv_usec == 0) { + goto select_exit; + } else { + clock_gettime(CLOCK_MONOTONIC_COARSE, &t_n); + now_time_ms = t_n.tv_sec * 1000 + t_n.tv_nsec / 1000000; + + if (now_time_ms >= end_time_ms) { + goto select_exit; + } + + goto retry; + } + } else { + goto retry; + } + } while(true); + +select_exit: + if (have_kernel_fd && have_exec_kernel_select == false) { + /* + * Not been called even once. + */ + if (readfds) + memcpy(readfds, &kernel_readfds, kernel_set_bytes); + if (writefds) + memcpy(writefds, &kernel_writefds, kernel_set_bytes); + if (exceptfds) + memcpy(exceptfds, &kernel_exceptfds, kernel_set_bytes); + + kernel_ret = ff_linux_select(max_kernel_fd + 1, readfds, writefds, exceptfds, &no_block_time); + if (kernel_ret < 0) { + //occur error, return imme. + return kernel_ret; + } + } + + /* if have ff_event, set to intput read/write/except fds */ + if (likely(ret > 0)) { + if (have_ff_readfd) { + fds_bit = (fd_mask *)ff_readfds_share; + + for (fd = ff_bitmap_first_set(fds_bit, set_words); fd != ~0; + fd = ff_bitmap_next_set(fds_bit, fd + 1, set_words)) { + FD_SET(convert_fstack_fd(fd), readfds); + } + } + + if (have_ff_writefd) { + fds_bit = (fd_mask *)ff_writefds_share; + + for (fd = ff_bitmap_first_set(fds_bit, set_words); fd != ~0; + fd = ff_bitmap_next_set(fds_bit, fd + 1, set_words)) { + FD_SET(convert_fstack_fd(fd), writefds); + } + } + + if (have_ff_exceptfd) { + fds_bit = (fd_mask *)ff_exceptfds_share; + + for (fd = ff_bitmap_first_set(fds_bit, set_words); fd != ~0; + fd = ff_bitmap_next_set(fds_bit, fd + 1, set_words)) { + FD_SET(convert_fstack_fd(fd), exceptfds); + } + } + } + + ret += kernel_ret; + + RETURN_NOFREE(); +} + static void thread_destructor(void *sc) { @@ -2710,7 +3068,9 @@ ff_adapter_init() ERR_LOG("getrlimit(RLIMIT_NOFILE) failed, use default ff_kernel_max_fd:%d\n", ff_kernel_max_fd); return -1; } else { +#if !defined(FF_PRELOAD_SUPPORT_SELECT) ff_kernel_max_fd = (int)rlmt.rlim_cur; +#endif } ERR_LOG("getrlimit(RLIMIT_NOFILE) successed, sed ff_kernel_max_fd:%d, and rlim_max is %ld\n", ff_kernel_max_fd, rlmt.rlim_max); diff --git a/adapter/syscall/ff_linux_syscall.c b/adapter/syscall/ff_linux_syscall.c index adfa2e60c..dbc3a78df 100644 --- a/adapter/syscall/ff_linux_syscall.c +++ b/adapter/syscall/ff_linux_syscall.c @@ -254,3 +254,10 @@ ff_linux_fork(void) SYSCALL(fork, ()); } +int +ff_linux_select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, + struct timeval *timeout) +{ + DEBUG_LOG("ff_linux_select\n"); + SYSCALL(select, (nfds, readfds, writefds, exceptfds, timeout)); +} diff --git a/adapter/syscall/ff_socket_ops.c b/adapter/syscall/ff_socket_ops.c index f73a31108..fcc7b2dcf 100644 --- a/adapter/syscall/ff_socket_ops.c +++ b/adapter/syscall/ff_socket_ops.c @@ -407,6 +407,18 @@ ff_sys_exit_thread(struct ff_exit_application_args *args) return 0; } +static int +ff_sys_select(struct ff_select_args *args) +{ + int ret; + struct timeval no_block_time = {0, 0}; + + DEBUG_LOG("to run ff_sys_select, nfds:%d\n", args->nfds); + ret = ff_select(args->nfds, args->readfds, args->writefds, args->exceptfds, &no_block_time); + + return ret; +} + static int ff_so_handler(int ops, void *args) { @@ -476,6 +488,8 @@ ff_so_handler(int ops, void *args) return ff_sys_register_thread((struct ff_register_application_args *)args); case FF_SO_EXIT_APPLICATION: return ff_sys_exit_thread((struct ff_exit_application_args *)args); + case FF_SO_SELECT: + return ff_sys_select((struct ff_select_args *)args); default: break; } diff --git a/adapter/syscall/ff_socket_ops.h b/adapter/syscall/ff_socket_ops.h index 0a3bc66bd..ff8d23a86 100644 --- a/adapter/syscall/ff_socket_ops.h +++ b/adapter/syscall/ff_socket_ops.h @@ -66,6 +66,7 @@ enum FF_SOCKET_OPS { FF_SO_FORK, // 29 FF_SO_REGISTER_APPLICATION, FF_SO_EXIT_APPLICATION, + FF_SO_SELECT, }; enum FF_SO_CONTEXT_STATUS { diff --git a/adapter/syscall/ff_sysproto.h b/adapter/syscall/ff_sysproto.h index df33639a3..9fc5d29d5 100644 --- a/adapter/syscall/ff_sysproto.h +++ b/adapter/syscall/ff_sysproto.h @@ -203,4 +203,11 @@ struct ff_exit_application_args { struct ff_so_context *sc; }; +struct ff_select_args { + int nfds; + fd_set *readfds; + fd_set *writefds; + fd_set *exceptfds; +}; + #endif