diff --git a/net/smc/Makefile b/net/smc/Makefile index 50f39ffec8c9..c0ad5883e888 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -1,2 +1,2 @@ obj-$(CONFIG_SMC) += smc.o -smc-y := af_smc.o smc_pnet.o smc_ib.o +smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 8b059b2fc34d..05c705a688e5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -6,6 +6,13 @@ * offers an alternative communication option for TCP-protocol sockets * applicable with RoCE-cards only * + * Initial restrictions: + * - non-blocking connect postponed + * - IPv6 support postponed + * - support for alternate links postponed + * - partial support for non-blocking sockets only + * - support for urgent data postponed + * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun @@ -17,12 +24,18 @@ #include #include +#include +#include #include +#include #include "smc.h" +#include "smc_clc.h" #include "smc_ib.h" #include "smc_pnet.h" +static void smc_tcp_listen_work(struct work_struct *); + static void smc_set_keepalive(struct sock *sk, int val) { struct smc_sock *smc = smc_sk(sk); @@ -88,9 +101,11 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock) sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = SMCPROTO_SMC; - sk_refcnt_debug_inc(sk); - smc = smc_sk(sk); + INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + INIT_LIST_HEAD(&smc->accept_q); + spin_lock_init(&smc->accept_q_lock); + sk_refcnt_debug_inc(sk); return sk; } @@ -184,6 +199,119 @@ static void smc_copy_sock_settings_to_smc(struct smc_sock *smc) smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC); } +/* determine subnet and mask of internal TCP socket */ +int smc_netinfo_by_tcpsk(struct socket *clcsock, + __be32 *subnet, u8 *prefix_len) +{ + struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct sockaddr_in addr; + int rc = -ENOENT; + int len; + + if (!dst) { + rc = -ENOTCONN; + goto out; + } + if (!dst->dev) { + rc = -ENODEV; + goto out_rel; + } + + /* get address to which the internal TCP socket is bound */ + kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len); + /* analyze IPv4 specific data of net_device belonging to TCP socket */ + for_ifa(dst->dev->ip_ptr) { + if (ifa->ifa_address != addr.sin_addr.s_addr) + continue; + *prefix_len = inet_mask_len(ifa->ifa_mask); + *subnet = ifa->ifa_address & ifa->ifa_mask; + rc = 0; + break; + } endfor_ifa(dst->dev->ip_ptr); + +out_rel: + dst_release(dst); +out: + return rc; +} + +/* setup for RDMA connection of client */ +static int smc_connect_rdma(struct smc_sock *smc) +{ + struct smc_clc_msg_accept_confirm aclc; + struct smc_ib_device *smcibdev; + int reason_code = 0; + int rc = 0; + u8 ibport; + + /* IPSec connections opt out of SMC-R optimizations */ + if (using_ipsec(smc)) { + reason_code = SMC_CLC_DECL_IPSEC; + goto decline_rdma; + } + + /* PNET table look up: search active ib_device and port + * within same PNETID that also contains the ethernet device + * used for the internal TCP socket + */ + smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport); + if (!smcibdev) { + reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ + goto decline_rdma; + } + + /* do inband token exchange */ + reason_code = smc_clc_send_proposal(smc, smcibdev, ibport); + if (reason_code < 0) { + rc = reason_code; + goto out_err; + } + if (reason_code > 0) /* configuration error */ + goto decline_rdma; + /* receive SMC Accept CLC message */ + reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc), + SMC_CLC_ACCEPT); + if (reason_code < 0) { + rc = reason_code; + goto out_err; + } + if (reason_code > 0) + goto decline_rdma; + + /* tbd in follow-on patch: more steps to setup RDMA communcication, + * create connection, link group, link + */ + + /* tbd in follow-on patch: more steps to setup RDMA communcication, + * create rmbs, map rmbs, rtoken_handling, modify_qp + */ + + rc = smc_clc_send_confirm(smc); + if (rc) + goto out_err; + + /* tbd in follow-on patch: llc_confirm */ + +out_connected: + smc_copy_sock_settings_to_clc(smc); + smc->sk.sk_state = SMC_ACTIVE; + + return rc; + +decline_rdma: + /* RDMA setup failed, switch back to TCP */ + smc->use_fallback = true; + if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { + rc = smc_clc_send_decline(smc, reason_code, 0); + if (rc < sizeof(struct smc_clc_msg_decline)) + goto out_err; + } + goto out_connected; + +out_err: + return rc; +} + static int smc_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { @@ -198,6 +326,7 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, goto out_err; if (addr->sa_family != AF_INET) goto out_err; + smc->addr = addr; /* needed for nonblocking connect */ lock_sock(sk); switch (sk->sk_state) { @@ -216,12 +345,12 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, if (rc) goto out; - sk->sk_state = SMC_ACTIVE; - - /* always use TCP fallback as transport mechanism for now; - * This will change once RDMA transport is implemented - */ - smc->use_fallback = true; + /* setup RDMA connection */ + rc = smc_connect_rdma(smc); + if (rc < 0) + goto out; + else + rc = 0; /* success cases including fallback */ out: release_sock(sk); @@ -236,17 +365,32 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) struct sock *new_sk; int rc; + release_sock(&lsmc->sk); new_sk = smc_sock_alloc(sock_net(sk), NULL); if (!new_sk) { rc = -ENOMEM; lsmc->sk.sk_err = ENOMEM; *new_smc = NULL; + lock_sock(&lsmc->sk); goto out; } *new_smc = smc_sk(new_sk); rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0); - if (rc) { + lock_sock(&lsmc->sk); + if (rc < 0) { + lsmc->sk.sk_err = -rc; + new_sk->sk_state = SMC_CLOSED; + sock_set_flag(new_sk, SOCK_DEAD); + sock_put(new_sk); + *new_smc = NULL; + goto out; + } + if (lsmc->sk.sk_state == SMC_CLOSED) { + if (new_clcsock) + sock_release(new_clcsock); + new_sk->sk_state = SMC_CLOSED; + sock_set_flag(new_sk, SOCK_DEAD); sock_put(new_sk); *new_smc = NULL; goto out; @@ -257,6 +401,216 @@ out: return rc; } +/* add a just created sock to the accept queue of the listen sock as + * candidate for a following socket accept call from user space + */ +static void smc_accept_enqueue(struct sock *parent, struct sock *sk) +{ + struct smc_sock *par = smc_sk(parent); + + sock_hold(sk); + spin_lock(&par->accept_q_lock); + list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q); + spin_unlock(&par->accept_q_lock); + sk_acceptq_added(parent); +} + +/* remove a socket from the accept queue of its parental listening socket */ +static void smc_accept_unlink(struct sock *sk) +{ + struct smc_sock *par = smc_sk(sk)->listen_smc; + + spin_lock(&par->accept_q_lock); + list_del_init(&smc_sk(sk)->accept_q); + spin_unlock(&par->accept_q_lock); + sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk); + sock_put(sk); +} + +/* remove a sock from the accept queue to bind it to a new socket created + * for a socket accept call from user space + */ +static struct sock *smc_accept_dequeue(struct sock *parent, + struct socket *new_sock) +{ + struct smc_sock *isk, *n; + struct sock *new_sk; + + list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) { + new_sk = (struct sock *)isk; + + smc_accept_unlink(new_sk); + if (new_sk->sk_state == SMC_CLOSED) { + /* tbd in follow-on patch: close this sock */ + continue; + } + if (new_sock) + sock_graft(new_sk, new_sock); + return new_sk; + } + return NULL; +} + +/* clean up for a created but never accepted sock */ +static void smc_close_non_accepted(struct sock *sk) +{ + struct smc_sock *smc = smc_sk(sk); + + sock_hold(sk); + if (smc->clcsock) { + struct socket *tcp; + + tcp = smc->clcsock; + smc->clcsock = NULL; + sock_release(tcp); + } + /* more closing stuff to be added with socket closing patch */ + sock_put(sk); +} + +/* setup for RDMA connection of server */ +static void smc_listen_work(struct work_struct *work) +{ + struct smc_sock *new_smc = container_of(work, struct smc_sock, + smc_listen_work); + struct socket *newclcsock = new_smc->clcsock; + struct smc_sock *lsmc = new_smc->listen_smc; + struct smc_clc_msg_accept_confirm cclc; + struct sock *newsmcsk = &new_smc->sk; + struct smc_clc_msg_proposal pclc; + struct smc_ib_device *smcibdev; + struct sockaddr_in peeraddr; + int reason_code = 0; + int rc = 0, len; + __be32 subnet; + u8 prefix_len; + u8 ibport; + + /* do inband token exchange - + *wait for and receive SMC Proposal CLC message + */ + reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc), + SMC_CLC_PROPOSAL); + if (reason_code < 0) + goto out_err; + if (reason_code > 0) + goto decline_rdma; + + /* IPSec connections opt out of SMC-R optimizations */ + if (using_ipsec(new_smc)) { + reason_code = SMC_CLC_DECL_IPSEC; + goto decline_rdma; + } + + /* PNET table look up: search active ib_device and port + * within same PNETID that also contains the ethernet device + * used for the internal TCP socket + */ + smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport); + if (!smcibdev) { + reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ + goto decline_rdma; + } + + /* determine subnet and mask from internal TCP socket */ + rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len); + if (rc) { + reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ + goto decline_rdma; + } + if ((pclc.outgoing_subnet != subnet) || + (pclc.prefix_len != prefix_len)) { + reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */ + goto decline_rdma; + } + + /* get address of the peer connected to the internal TCP socket */ + kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len); + + /* tbd in follow-on patch: more steps to setup RDMA communcication, + * create connection, link_group, link + */ + + /* tbd in follow-on patch: more steps to setup RDMA communcication, + * create rmbs, map rmbs + */ + + rc = smc_clc_send_accept(new_smc); + if (rc) + goto out_err; + + /* receive SMC Confirm CLC message */ + reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc), + SMC_CLC_CONFIRM); + if (reason_code < 0) + goto out_err; + if (reason_code > 0) + goto decline_rdma; + + /* tbd in follow-on patch: more steps to setup RDMA communcication, + * rtoken_handling, modify_qp + */ + +out_connected: + sk_refcnt_debug_inc(newsmcsk); + newsmcsk->sk_state = SMC_ACTIVE; +enqueue: + lock_sock(&lsmc->sk); + if (lsmc->sk.sk_state == SMC_LISTEN) { + smc_accept_enqueue(&lsmc->sk, newsmcsk); + } else { /* no longer listening */ + smc_close_non_accepted(newsmcsk); + } + release_sock(&lsmc->sk); + + /* Wake up accept */ + lsmc->sk.sk_data_ready(&lsmc->sk); + sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */ + return; + +decline_rdma: + /* RDMA setup failed, switch back to TCP */ + new_smc->use_fallback = true; + if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) { + rc = smc_clc_send_decline(new_smc, reason_code, 0); + if (rc < sizeof(struct smc_clc_msg_decline)) + goto out_err; + } + goto out_connected; + +out_err: + newsmcsk->sk_state = SMC_CLOSED; + goto enqueue; /* queue new sock with sk_err set */ +} + +static void smc_tcp_listen_work(struct work_struct *work) +{ + struct smc_sock *lsmc = container_of(work, struct smc_sock, + tcp_listen_work); + struct smc_sock *new_smc; + int rc = 0; + + lock_sock(&lsmc->sk); + while (lsmc->sk.sk_state == SMC_LISTEN) { + rc = smc_clcsock_accept(lsmc, &new_smc); + if (rc) + goto out; + if (!new_smc) + continue; + + new_smc->listen_smc = lsmc; + new_smc->use_fallback = false; /* assume rdma capability first*/ + sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */ + INIT_WORK(&new_smc->smc_listen_work, smc_listen_work); + smc_copy_sock_settings_to_smc(new_smc); + schedule_work(&new_smc->smc_listen_work); + } + +out: + release_sock(&lsmc->sk); + lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */ +} + static int smc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; @@ -286,6 +640,8 @@ static int smc_listen(struct socket *sock, int backlog) sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = SMC_LISTEN; + INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); + schedule_work(&smc->tcp_listen_work); out: release_sock(sk); @@ -295,10 +651,11 @@ out: static int smc_accept(struct socket *sock, struct socket *new_sock, int flags) { - struct smc_sock *new_smc; - struct sock *sk = sock->sk; + struct sock *sk = sock->sk, *nsk; + DECLARE_WAITQUEUE(wait, current); struct smc_sock *lsmc; - int rc; + long timeo; + int rc = 0; lsmc = smc_sk(sk); lock_sock(sk); @@ -308,18 +665,30 @@ static int smc_accept(struct socket *sock, struct socket *new_sock, goto out; } - rc = smc_clcsock_accept(lsmc, &new_smc); - if (rc) - goto out; - sock_graft(&new_smc->sk, new_sock); - new_smc->sk.sk_state = SMC_ACTIVE; + /* Wait for an incoming connection */ + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + add_wait_queue_exclusive(sk_sleep(sk), &wait); + while (!(nsk = smc_accept_dequeue(sk, new_sock))) { + set_current_state(TASK_INTERRUPTIBLE); + if (!timeo) { + rc = -EAGAIN; + break; + } + release_sock(sk); + timeo = schedule_timeout(timeo); + /* wakeup by sk_data_ready in smc_listen_work() */ + sched_annotate_sleep(); + lock_sock(sk); + if (signal_pending(current)) { + rc = sock_intr_errno(timeo); + break; + } + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); - smc_copy_sock_settings_to_smc(new_smc); - - /* always use TCP fallback as transport mechanism for now; - * This will change once RDMA transport is implemented - */ - new_smc->use_fallback = true; + if (!rc) + rc = sock_error(nsk); out: release_sock(sk); @@ -379,29 +748,61 @@ out: return rc; } +static unsigned int smc_accept_poll(struct sock *parent) +{ + struct smc_sock *isk; + struct sock *sk; + + lock_sock(parent); + list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) { + sk = (struct sock *)isk; + + if (sk->sk_state == SMC_ACTIVE) { + release_sock(parent); + return POLLIN | POLLRDNORM; + } + } + release_sock(parent); + + return 0; +} + static unsigned int smc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; unsigned int mask = 0; struct smc_sock *smc; + int rc; smc = smc_sk(sock->sk); - if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) || - smc->use_fallback) { + if ((sk->sk_state == SMC_INIT) || smc->use_fallback) { + /* delegate to CLC child sock */ mask = smc->clcsock->ops->poll(file, smc->clcsock, wait); /* if non-blocking connect finished ... */ lock_sock(sk); if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) { - sk->sk_state = SMC_ACTIVE; - /* always use TCP fallback as transport mechanism; - * This will change once RDMA transport is implemented - */ - smc->use_fallback = true; + sk->sk_err = smc->clcsock->sk->sk_err; + if (sk->sk_err) { + mask |= POLLERR; + } else { + rc = smc_connect_rdma(smc); + if (rc < 0) + mask |= POLLERR; + else + /* success cases including fallback */ + mask |= POLLOUT | POLLWRNORM; + } } release_sock(sk); } else { - mask = sock_no_poll(file, sock, wait); + sock_poll_wait(file, sk_sleep(sk), wait); + if (sk->sk_state == SMC_LISTEN) + /* woken up by sk_data_ready in smc_listen_work() */ + mask |= smc_accept_poll(sk); + if (sk->sk_err) + mask |= POLLERR; + /* for now - to be enhanced in follow-on patch */ } return mask; @@ -568,6 +969,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, /* create internal TCP socket for CLC handshake and fallback */ smc = smc_sk(sk); + smc->use_fallback = false; /* assume rdma capability first */ rc = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &smc->clcsock); if (rc) diff --git a/net/smc/smc.h b/net/smc/smc.h index e87d79867099..5946aaecdebf 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -28,6 +28,12 @@ enum smc_state { /* possible states of an SMC socket */ struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ + struct sockaddr *addr; /* inet connect address */ + struct smc_sock *listen_smc; /* listen parent */ + struct work_struct tcp_listen_work;/* handle tcp socket accepts */ + struct work_struct smc_listen_work;/* prepare new accept socket */ + struct list_head accept_q; /* sockets to be accepted */ + spinlock_t accept_q_lock; /* protects accept_q */ bool use_fallback; /* fallback to tcp */ }; @@ -40,4 +46,20 @@ static inline struct smc_sock *smc_sk(const struct sock *sk) extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ +#ifdef CONFIG_XFRM +static inline bool using_ipsec(struct smc_sock *smc) +{ + return (smc->clcsock->sk->sk_policy[0] || + smc->clcsock->sk->sk_policy[1]) ? 1 : 0; +} +#else +static inline bool using_ipsec(struct smc_sock *smc) +{ + return 0; +} +#endif + +int smc_netinfo_by_tcpsk(struct socket *clcsock, __be32 *subnet, + u8 *prefix_len); + #endif /* __SMC_H */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c new file mode 100644 index 000000000000..b613b866a2ef --- /dev/null +++ b/net/smc/smc_clc.c @@ -0,0 +1,252 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * CLC (connection layer control) handshake over initial TCP socket to + * prepare for RDMA traffic + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun + */ + +#include +#include +#include + +#include "smc.h" +#include "smc_clc.h" +#include "smc_ib.h" + +/* Wait for data on the tcp-socket, analyze received data + * Returns: + * 0 if success and it was not a decline that we received. + * SMC_CLC_DECL_REPLY if decline received for fallback w/o another decl send. + * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise. + */ +int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, + u8 expected_type) +{ + struct sock *clc_sk = smc->clcsock->sk; + struct smc_clc_msg_hdr *clcm = buf; + struct msghdr msg = {NULL, 0}; + int reason_code = 0; + struct kvec vec; + int len, datlen; + int krflags; + + /* peek the first few bytes to determine length of data to receive + * so we don't consume any subsequent CLC message or payload data + * in the TCP byte stream + */ + vec.iov_base = buf; + vec.iov_len = buflen; + krflags = MSG_PEEK | MSG_WAITALL; + smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; + len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, + sizeof(struct smc_clc_msg_hdr), krflags); + if (signal_pending(current)) { + reason_code = -EINTR; + clc_sk->sk_err = EINTR; + smc->sk.sk_err = EINTR; + goto out; + } + if (clc_sk->sk_err) { + reason_code = -clc_sk->sk_err; + smc->sk.sk_err = clc_sk->sk_err; + goto out; + } + if (!len) { /* peer has performed orderly shutdown */ + smc->sk.sk_err = ECONNRESET; + reason_code = -ECONNRESET; + goto out; + } + if (len < 0) { + smc->sk.sk_err = -len; + reason_code = len; + goto out; + } + datlen = ntohs(clcm->length); + if ((len < sizeof(struct smc_clc_msg_hdr)) || + (datlen < sizeof(struct smc_clc_msg_decline)) || + (datlen > sizeof(struct smc_clc_msg_accept_confirm)) || + memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)) || + ((clcm->type != SMC_CLC_DECLINE) && + (clcm->type != expected_type))) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } + + /* receive the complete CLC message */ + vec.iov_base = buf; + vec.iov_len = buflen; + memset(&msg, 0, sizeof(struct msghdr)); + krflags = MSG_WAITALL; + smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME; + len = kernel_recvmsg(smc->clcsock, &msg, &vec, 1, datlen, krflags); + if (len < datlen) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } + if (clcm->type == SMC_CLC_DECLINE) + reason_code = SMC_CLC_DECL_REPLY; +out: + return reason_code; +} + +/* send CLC DECLINE message across internal TCP socket */ +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, + u8 out_of_sync) +{ + struct smc_clc_msg_decline dclc; + struct msghdr msg; + struct kvec vec; + int len; + + memset(&dclc, 0, sizeof(dclc)); + memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + dclc.hdr.type = SMC_CLC_DECLINE; + dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline)); + dclc.hdr.version = SMC_CLC_V1; + dclc.hdr.flag = out_of_sync ? 1 : 0; + memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid)); + dclc.peer_diagnosis = htonl(peer_diag_info); + memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &dclc; + vec.iov_len = sizeof(struct smc_clc_msg_decline); + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, + sizeof(struct smc_clc_msg_decline)); + if (len < sizeof(struct smc_clc_msg_decline)) + smc->sk.sk_err = EPROTO; + if (len < 0) + smc->sk.sk_err = -len; + return len; +} + +/* send CLC PROPOSAL message across internal TCP socket */ +int smc_clc_send_proposal(struct smc_sock *smc, + struct smc_ib_device *smcibdev, + u8 ibport) +{ + struct smc_clc_msg_proposal pclc; + int reason_code = 0; + struct msghdr msg; + struct kvec vec; + int len, rc; + + /* send SMC Proposal CLC message */ + memset(&pclc, 0, sizeof(pclc)); + memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + pclc.hdr.type = SMC_CLC_PROPOSAL; + pclc.hdr.length = htons(sizeof(pclc)); + pclc.hdr.version = SMC_CLC_V1; /* SMC version */ + memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); + memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE); + memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], + sizeof(smcibdev->mac[ibport - 1])); + + /* determine subnet and mask from internal TCP socket */ + rc = smc_netinfo_by_tcpsk(smc->clcsock, &pclc.outgoing_subnet, + &pclc.prefix_len); + if (rc) + return SMC_CLC_DECL_CNFERR; /* configuration error */ + memcpy(pclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &pclc; + vec.iov_len = sizeof(pclc); + /* due to the few bytes needed for clc-handshake this cannot block */ + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(pclc)); + if (len < sizeof(pclc)) { + if (len >= 0) { + reason_code = -ENETUNREACH; + smc->sk.sk_err = -reason_code; + } else { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + reason_code = -smc->sk.sk_err; + } + } + + return reason_code; +} + +/* send CLC CONFIRM message across internal TCP socket */ +int smc_clc_send_confirm(struct smc_sock *smc) +{ + struct smc_clc_msg_accept_confirm cclc; + int reason_code = 0; + struct msghdr msg; + struct kvec vec; + int len; + + /* send SMC Confirm CLC msg */ + memset(&cclc, 0, sizeof(cclc)); + memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + cclc.hdr.type = SMC_CLC_CONFIRM; + cclc.hdr.length = htons(sizeof(cclc)); + cclc.hdr.version = SMC_CLC_V1; /* SMC version */ + memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); + + /* tbd in follow-on patch: fill in link-related values */ + + /* tbd in follow-on patch: fill in rmb-related values */ + + cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */ + + memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &cclc; + vec.iov_len = sizeof(cclc); + len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc)); + if (len < sizeof(cclc)) { + if (len >= 0) { + reason_code = -ENETUNREACH; + smc->sk.sk_err = -reason_code; + } else { + smc->sk.sk_err = smc->clcsock->sk->sk_err; + reason_code = -smc->sk.sk_err; + } + } + return reason_code; +} + +/* send CLC ACCEPT message across internal TCP socket */ +int smc_clc_send_accept(struct smc_sock *new_smc) +{ + struct smc_clc_msg_accept_confirm aclc; + struct msghdr msg; + struct kvec vec; + int rc = 0; + int len; + + memset(&aclc, 0, sizeof(aclc)); + memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + aclc.hdr.type = SMC_CLC_ACCEPT; + aclc.hdr.length = htons(sizeof(aclc)); + aclc.hdr.version = SMC_CLC_V1; /* SMC version */ + memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid)); + + /* tbd in follow-on patch: fill in link-related values */ + + /* tbd in follow-on patch: fill in rmb-related values */ + + aclc.conn_idx = 1; /* as long as 1 RMB = 1 RMBE */ + memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)); + + memset(&msg, 0, sizeof(msg)); + vec.iov_base = &aclc; + vec.iov_len = sizeof(aclc); + len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc)); + if (len < sizeof(aclc)) { + if (len >= 0) + new_smc->sk.sk_err = EPROTO; + else + new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err; + rc = sock_error(&new_smc->sk); + } + + return rc; +} diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h new file mode 100644 index 000000000000..60bfc8bd8d89 --- /dev/null +++ b/net/smc/smc_clc.h @@ -0,0 +1,114 @@ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * CLC (connection layer control) handshake over initial TCP socket to + * prepare for RDMA traffic + * + * Copyright IBM Corp. 2016 + * + * Author(s): Ursula Braun + */ + +#ifndef _SMC_CLC_H +#define _SMC_CLC_H + +#include + +#include "smc.h" + +#define SMC_CLC_PROPOSAL 0x01 +#define SMC_CLC_ACCEPT 0x02 +#define SMC_CLC_CONFIRM 0x03 +#define SMC_CLC_DECLINE 0x04 + +/* eye catcher "SMCR" EBCDIC for CLC messages */ +static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'}; + +#define SMC_CLC_V1 0x1 /* SMC version */ +#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */ +#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */ +#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */ +#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */ +#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */ +#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */ +#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */ +#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */ + +struct smc_clc_msg_hdr { /* header1 of clc messages */ + u8 eyecatcher[4]; /* eye catcher */ + u8 type; /* proposal / accept / confirm / decline */ + __be16 length; +#if defined(__BIG_ENDIAN_BITFIELD) + u8 version : 4, + flag : 1, + rsvd : 3; +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 rsvd : 3, + flag : 1, + version : 4; +#endif +} __packed; /* format defined in RFC7609 */ + +struct smc_clc_msg_trail { /* trailer of clc messages */ + u8 eyecatcher[4]; +}; + +struct smc_clc_msg_local { /* header2 of clc messages */ + u8 id_for_peer[SMC_SYSTEMID_LEN]; /* unique system id */ + u8 gid[16]; /* gid of ib_device port */ + u8 mac[6]; /* mac of ib_device port */ +}; + +struct smc_clc_msg_proposal { /* clc proposal message */ + struct smc_clc_msg_hdr hdr; + struct smc_clc_msg_local lcl; + __be16 iparea_offset; /* offset to IP address information area */ + __be32 outgoing_subnet; /* subnet mask */ + u8 prefix_len; /* number of significant bits in mask */ + u8 reserved[2]; + u8 ipv6_prefixes_cnt; /* number of IPv6 prefixes in prefix array */ + struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ +} __aligned(4); + +struct smc_clc_msg_accept_confirm { /* clc accept / confirm message */ + struct smc_clc_msg_hdr hdr; + struct smc_clc_msg_local lcl; + u8 qpn[3]; /* QP number */ + __be32 rmb_rkey; /* RMB rkey */ + u8 conn_idx; /* Connection index, which RMBE in RMB */ + __be32 rmbe_alert_token;/* unique connection id */ +#if defined(__BIG_ENDIAN_BITFIELD) + u8 rmbe_size : 4, /* RMBE buf size (compressed notation) */ + qp_mtu : 4; /* QP mtu */ +#elif defined(__LITTLE_ENDIAN_BITFIELD) + u8 qp_mtu : 4, + rmbe_size : 4; +#endif + u8 reserved; + __be64 rmb_dma_addr; /* RMB virtual address */ + u8 reserved2; + u8 psn[3]; /* initial packet sequence number */ + struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ +} __packed; /* format defined in RFC7609 */ + +struct smc_clc_msg_decline { /* clc decline message */ + struct smc_clc_msg_hdr hdr; + u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */ + __be32 peer_diagnosis; /* diagnosis information */ + u8 reserved2[4]; + struct smc_clc_msg_trail trl; /* eye catcher "SMCR" EBCDIC */ +} __aligned(4); + +struct smc_sock; +struct smc_ib_device; + +int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, + u8 expected_type); +int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, + u8 out_of_sync); +int smc_clc_send_proposal(struct smc_sock *smc, struct smc_ib_device *smcibdev, + u8 ibport); +int smc_clc_send_confirm(struct smc_sock *smc); +int smc_clc_send_accept(struct smc_sock *smc); + +#endif