blob: e0d0400c3306286373c8b9479e4466ab718dbe7f [file] [log] [blame]
/*
* Copyright (c) 2012-2014 CEA
* Dominique Martinet <dominique.martinet@cea.fr>
* contributeur : William Allen Simpson <bill@cohortfs.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of Sun Microsystems, Inc. nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/**
* \file rpc_rdma.h
* \brief rdma helper include file
*
* This was (very) loosely based on the Mooshika library, which in turn
* was a mix of diod, rping (librdmacm/examples), and Linux kernel's
* net/9p/trans_rdma.c (dual BSD/GPL license). No vestiges remain.
*/
#ifndef _TIRPC_RPC_RDMA_H
#define _TIRPC_RPC_RDMA_H
#include <infiniband/arch.h>
#include <rdma/rdma_cma.h>
#include <rpc/svc.h>
#include <rpc/xdr_ioq.h>
/**
 * Union overlaying the generic, IPv4, IPv6, and storage-sized socket
 * address structures, so one object can hold any of them.
 */
typedef union sockaddr_union {
struct sockaddr sa; /* generic view */
struct sockaddr_in sa_in; /* IPv4 view */
/* NOTE(review): "sa_int6" looks like a typo for "sa_in6"; left as is
 * because renaming the member would change the interface. */
struct sockaddr_in6 sa_int6; /* IPv6 view */
struct sockaddr_storage sa_stor; /* storage-sized view */
} sockaddr_union_t;
/**
 * Set of 64-bit counters kept in each struct rpc_rdma_xprt (member "stats").
 * NOTE(review): rx_ and tx_ presumably mean receive and transmit; the code
 * that updates these counters is not in this header.
 */
struct msk_stats {
uint64_t rx_bytes;
uint64_t rx_pkt;
uint64_t rx_err;
uint64_t tx_bytes;
uint64_t tx_pkt;
uint64_t tx_err;
/* times only set if debug has MSK_DEBUG_SPEED */
uint64_t nsec_callback; /* presumably nanoseconds -- TODO confirm */
uint64_t nsec_compevent; /* presumably nanoseconds -- TODO confirm */
};
/** Short alias for the RDMA transport instance, struct rpc_rdma_xprt. */
typedef struct rpc_rdma_xprt RDMAXPRT;
/* Forward declaration so the callback type can name the context struct. */
struct rpc_rdma_cbc;
/**
 * Callback signature: receives a callback context and a transport,
 * returns nothing.
 */
typedef void (*rpc_rdma_callback_t)(struct rpc_rdma_cbc *cbc, RDMAXPRT *xprt);
/**
 * \struct rpc_rdma_cbc
 * Context data we can use during recv/send callbacks.
 *
 * The structure ends with a trailing array of scatter/gather entries, so
 * an allocation must add room for the entries on top of
 * sizeof(struct rpc_rdma_cbc).
 */
struct rpc_rdma_cbc {
struct xdr_ioq workq;
struct xdr_ioq holdq;
struct xdr_ioq_uv *call_uv;
void *read_chunk; /* current in indexed list of arrays */
void *write_chunk; /* current in list of arrays */
void *reply_chunk; /* current in array */
void *call_data;
struct work_pool_entry wpe;
/* NOTE(review): presumably invoked on success and on failure
 * respectively -- confirm against the code that calls them. */
rpc_rdma_callback_t positive_cb;
rpc_rdma_callback_t negative_cb;
void *callback_arg;
/* Work request storage: either the receive or the send form. */
union {
struct ibv_recv_wr rwr;
struct ibv_send_wr wwr;
} wr;
enum ibv_wc_opcode opcode;
enum ibv_wc_status status;
/* NOTE(review): [0] is the GNU zero-length-array spelling; the C99
 * equivalent is a flexible array member, "sg_list[]". */
struct ibv_sge sg_list[0]; /**< this is actually an array.
note that when you malloc
you have to add its size */
};
/**
 * One entry in a list of protection domains: groups a verbs device
 * context with its protection domain and shared receive queue.
 */
struct rpc_rdma_pd {
LIST_ENTRY(rpc_rdma_pd) pdl; /* list linkage */
struct ibv_context *context;
struct ibv_pd *pd;
struct ibv_srq *srq;
struct poolq_head srqh; /**< shared read contexts */
uint32_t pd_used; /* presumably a use count -- TODO confirm */
};
/*
 * Special values of the "server" member of struct rpc_rdma_xprt:
 * 0 marks a client transport, -1 marks a server transport that has
 * accepted a connection.
 * The negative constant is parenthesized so that it always expands as a
 * single operand, whatever expression surrounds it.
 */
#define RDMAX_CLIENT 0
#define RDMAX_SERVER_CHILD (-1)
/**
 * \struct rpc_rdma_xprt
 * RDMA transport instance.
 *
 * Embeds the transport-independent handle as its first member, followed
 * by the RDMA connection-manager and verbs objects, buffer bookkeeping,
 * synchronization primitives, statistics, and the connection state.
 */
struct rpc_rdma_xprt {
struct rpc_svcxprt xprt; /**< Transport Independent handle */
struct rpc_rdma_attr *xa; /**< (shared) configured attributes */
struct rdma_event_channel *event_channel;
struct rdma_cm_id *cm_id; /**< RDMA CM ID */
struct rpc_rdma_pd *pd; /**< RDMA PD entry */
struct ibv_comp_channel *comp_channel;
struct ibv_cq *cq; /**< Completion Queue pointer */
struct ibv_qp *qp; /**< Queue Pair pointer */
struct ibv_srq *srq; /**< Shared Receive Queue pointer */
struct ibv_recv_wr *bad_recv_wr;
struct ibv_send_wr *bad_send_wr;
struct ibv_mr *mr;
u_int8_t *buffer_aligned;
size_t buffer_total;
/* waiting contexts posted */
struct xdr_ioq waitq;
struct xdr_ioq_uv_head inbufs; /* recvsize */
struct xdr_ioq_uv_head outbufs; /* sendsz */
struct poolq_head cbqh; /**< combined callback contexts */
mutex_t cm_lock; /**< lock for connection events */
cond_t cm_cond; /**< cond for connection events */
struct msk_stats stats;
int stats_sock;
u_int recvsize;
u_int sendsize;
u_int conn_type; /**< RDMA Port space (RDMA_PS_TCP) */
int server; /**< connection backlog on server,
 * 0 (RDMAX_CLIENT):
 * client,
 * -1 (RDMAX_SERVER_CHILD):
 * server has accepted connection
 */
enum rdma_transport_state {
RDMAXS_INITIAL, /* assumes zero, never set */
RDMAXS_LISTENING,
RDMAXS_ADDR_RESOLVED,
RDMAXS_ROUTE_RESOLVED,
RDMAXS_CONNECT_REQUEST,
RDMAXS_CONNECTED,
RDMAXS_CLOSING,
RDMAXS_CLOSED,
RDMAXS_ERROR, /* always last */
} state; /**< transport state machine */
/* FIXME why configurable??? */
bool destroy_on_disconnect; /**< should perform cleanup */
};
/**
 * Record-stream state tied to one RDMA transport: an outgoing half, an
 * incoming half, and shared sizing and synchronization members.
 */
typedef struct rec_rdma_strm {
RDMAXPRT *xprt;
/*
 * out-going bits
 */
int (*writeit)(void *, void *, int);
TAILQ_HEAD(out_buffers_head, xdr_ioq_uv) out_buffers;
char *out_base; /* output buffer (points to frag header) */
char *out_finger; /* next output position */
char *out_boundry; /* data cannot go up to this address */
u_int32_t *frag_header; /* beginning of current fragment */
bool frag_sent; /* true if buffer sent in middle of record */
/*
 * in-coming bits
 */
TAILQ_HEAD(in_buffers_head, xdr_ioq_uv) in_buffers;
u_long in_size; /* fixed size of the input buffer */
char *in_base;
char *in_finger; /* location of next byte to be had */
char *in_boundry; /* can read up to this location */
long fbtbc; /* fragment bytes to be consumed */
bool last_frag;
u_int sendsize;
u_int recvsize;
bool nonblock;
u_int32_t in_header;
char *in_hdrp;
int in_hdrlen;
int in_reclen;
int in_received;
int in_maxrec;
cond_t cond;
mutex_t lock;
uint8_t *rdmabuf;
struct ibv_mr *mr;
int credits;
} RECRDMA;
/**
 * Store a 64-bit value as two consecutive 32-bit words in network byte
 * order, most significant word first.
 *
 * \param iptr  destination; must have room for two 32-bit words
 * \param val   value to encode
 * \return pointer just past the two words written
 */
static inline void *xdr_encode_hyper(uint32_t *iptr, uint64_t val)
{
	const uint32_t upper = (uint32_t)(val >> 32);
	const uint32_t lower = (uint32_t)(val & 0xffffffff);

	iptr[0] = htonl(upper);
	iptr[1] = htonl(lower);
	return iptr + 2;
}
/**
 * Decode a 64-bit value from 8 bytes held in network (big-endian) byte
 * order.
 *
 * The bytes are read one at a time through an unsigned char pointer
 * instead of viewing the uint64_t as two uint32_t words. Character access
 * is exempt from the strict-aliasing rule and has no alignment
 * requirement, and assembling the value from most significant byte to
 * least needs no host byte-order conversion.
 *
 * \param iptr  location of the 8 encoded bytes
 * \return the decoded value in host byte order
 */
static inline uint64_t xdr_decode_hyper(uint64_t *iptr)
{
	const unsigned char *octet = (const unsigned char *)iptr;
	uint64_t val = 0;
	int i;

	for (i = 0; i < 8; i++) {
		val = (val << 8) | octet[i];
	}
	return val;
}
/*
 * Module-wide setup and teardown.
 * NOTE(review): the definitions are not in this header; confirm any
 * ordering requirements against the implementation.
 */
void rpc_rdma_internals_init(void);
void rpc_rdma_internals_fini(void);
/* server specific */
int rpc_rdma_accept_finalize(RDMAXPRT *xprt);
/* NOTE(review): msleep is presumably a timeout in milliseconds -- confirm. */
RDMAXPRT *rpc_rdma_accept_wait(RDMAXPRT *l_xprt, int msleep);
void rpc_rdma_destroy(SVCXPRT *s_xprt);
/* client */
int rpc_rdma_connect(RDMAXPRT *xprt);
int rpc_rdma_connect_finalize(RDMAXPRT *xprt);
/*
 * XDR functions.
 * The clnt_ variants take an XDR handle; the svc_ variants take a
 * callback context (struct rpc_rdma_cbc).
 * NOTE(review): the meaning of the u_int32_t argument is not visible
 * in this header.
 */
int xdr_rdma_create(XDR *, RDMAXPRT *, const u_int sendsize,
const u_int recvsize, const u_int flags);
void xdr_rdma_destroy(XDR *xdrs);
bool xdr_rdma_clnt_call(XDR *, u_int32_t);
bool xdr_rdma_clnt_reply(XDR *, u_int32_t);
bool xdr_rdma_clnt_flushout(XDR *);
bool xdr_rdma_svc_recv(struct rpc_rdma_cbc *, u_int32_t);
bool xdr_rdma_svc_reply(struct rpc_rdma_cbc *, u_int32_t);
bool xdr_rdma_svc_flushout(struct rpc_rdma_cbc *);
#endif /* !_TIRPC_RPC_RDMA_H */