diff options
Diffstat (limited to 'gpxe/src/net/infiniband.c')
-rw-r--r-- | gpxe/src/net/infiniband.c | 405 |
1 files changed, 378 insertions, 27 deletions
diff --git a/gpxe/src/net/infiniband.c b/gpxe/src/net/infiniband.c index d79bdc2c..d7813249 100644 --- a/gpxe/src/net/infiniband.c +++ b/gpxe/src/net/infiniband.c @@ -16,6 +16,8 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +FILE_LICENCE ( GPL2_OR_LATER ); + #include <stdint.h> #include <stdlib.h> #include <stdio.h> @@ -25,12 +27,15 @@ #include <errno.h> #include <assert.h> #include <gpxe/list.h> +#include <gpxe/errortab.h> #include <gpxe/if_arp.h> #include <gpxe/netdevice.h> #include <gpxe/iobuf.h> #include <gpxe/ipoib.h> #include <gpxe/process.h> #include <gpxe/infiniband.h> +#include <gpxe/ib_mi.h> +#include <gpxe/ib_sma.h> /** @file * @@ -41,6 +46,26 @@ /** List of Infiniband devices */ struct list_head ib_devices = LIST_HEAD_INIT ( ib_devices ); +/** List of open Infiniband devices, in reverse order of opening */ +static struct list_head open_ib_devices = LIST_HEAD_INIT ( open_ib_devices ); + +/* Disambiguate the various possible EINPROGRESSes */ +#define EINPROGRESS_INIT ( EINPROGRESS | EUNIQ_01 ) +#define EINPROGRESS_ARMED ( EINPROGRESS | EUNIQ_02 ) + +/** Human-readable message for the link statuses */ +struct errortab infiniband_errors[] __errortab = { + { EINPROGRESS_INIT, "Initialising" }, + { EINPROGRESS_ARMED, "Armed" }, +}; + +/*************************************************************************** + * + * Completion queues + * + *************************************************************************** + */ + /** * Create completion queue * @@ -61,6 +86,8 @@ ib_create_cq ( struct ib_device *ibdev, unsigned int num_cqes, cq = zalloc ( sizeof ( *cq ) ); if ( ! cq ) goto err_alloc_cq; + cq->ibdev = ibdev; + list_add ( &cq->list, &ibdev->cqs ); cq->num_cqes = num_cqes; INIT_LIST_HEAD ( &cq->work_queues ); cq->op = op; @@ -79,6 +106,7 @@ ib_create_cq ( struct ib_device *ibdev, unsigned int num_cqes, ibdev->op->destroy_cq ( ibdev, cq ); err_dev_create_cq: + list_del ( &cq->list ); free ( cq ); err_alloc_cq: return NULL; @@ -96,26 +124,57 @@ void ib_destroy_cq ( struct ib_device *ibdev, ibdev, cq->cqn ); assert ( list_empty ( &cq->work_queues ) ); ibdev->op->destroy_cq ( ibdev, cq ); + list_del ( &cq->list ); free ( cq ); } /** + * Poll completion queue + * + * @v ibdev Infiniband device + * @v cq Completion queue + */ +void ib_poll_cq ( struct ib_device *ibdev, + struct ib_completion_queue *cq ) { + struct ib_work_queue *wq; + + /* Poll completion queue */ + ibdev->op->poll_cq ( ibdev, cq ); + + /* Refill receive work queues */ + list_for_each_entry ( wq, &cq->work_queues, list ) { + if ( ! wq->is_send ) + ib_refill_recv ( ibdev, wq->qp ); + } +} + +/*************************************************************************** + * + * Work queues + * + *************************************************************************** + */ + +/** * Create queue pair * * @v ibdev Infiniband device + * @v type Queue pair type * @v num_send_wqes Number of send work queue entries * @v send_cq Send completion queue * @v num_recv_wqes Number of receive work queue entries * @v recv_cq Receive completion queue - * @v qkey Queue key * @ret qp Queue pair + * + * The queue pair will be left in the INIT state; you must call + * ib_modify_qp() before it is ready to use for sending and receiving. */ struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev, + enum ib_queue_pair_type type, unsigned int num_send_wqes, struct ib_completion_queue *send_cq, unsigned int num_recv_wqes, - struct ib_completion_queue *recv_cq, - unsigned long qkey ) { + struct ib_completion_queue *recv_cq ) { struct ib_queue_pair *qp; size_t total_size; int rc; @@ -131,16 +190,18 @@ struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev, goto err_alloc_qp; qp->ibdev = ibdev; list_add ( &qp->list, &ibdev->qps ); - qp->qkey = qkey; + qp->type = type; qp->send.qp = qp; qp->send.is_send = 1; qp->send.cq = send_cq; list_add ( &qp->send.list, &send_cq->work_queues ); + qp->send.psn = ( random() & 0xffffffUL ); qp->send.num_wqes = num_send_wqes; qp->send.iobufs = ( ( ( void * ) qp ) + sizeof ( *qp ) ); qp->recv.qp = qp; qp->recv.cq = recv_cq; list_add ( &qp->recv.list, &recv_cq->work_queues ); + qp->recv.psn = ( random() & 0xffffffUL ); qp->recv.num_wqes = num_recv_wqes; qp->recv.iobufs = ( ( ( void * ) qp ) + sizeof ( *qp ) + ( num_send_wqes * sizeof ( qp->send.iobufs[0] ) )); @@ -152,7 +213,6 @@ struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev, "%s\n", ibdev, strerror ( rc ) ); goto err_dev_create_qp; } - DBGC ( ibdev, "IBDEV %p created queue pair %p (%p) with QPN %#lx\n", ibdev, qp, ib_qp_get_drvdata ( qp ), qp->qpn ); DBGC ( ibdev, "IBDEV %p QPN %#lx has %d send entries at [%p,%p)\n", @@ -161,6 +221,24 @@ struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev, DBGC ( ibdev, "IBDEV %p QPN %#lx has %d receive entries at [%p,%p)\n", ibdev, qp->qpn, num_recv_wqes, qp->recv.iobufs, ( ( ( void * ) qp ) + total_size ) ); + + /* Calculate externally-visible QPN */ + switch ( type ) { + case IB_QPT_SMI: + qp->ext_qpn = IB_QPN_SMI; + break; + case IB_QPT_GSI: + qp->ext_qpn = IB_QPN_GSI; + break; + default: + qp->ext_qpn = qp->qpn; + break; + } + if ( qp->ext_qpn != qp->qpn ) { + DBGC ( ibdev, "IBDEV %p QPN %#lx has external QPN %#lx\n", + ibdev, qp->qpn, qp->ext_qpn ); + } + return qp; ibdev->op->destroy_qp ( ibdev, qp ); @@ -178,20 +256,15 @@ struct ib_queue_pair * ib_create_qp ( struct ib_device *ibdev, * * @v ibdev Infiniband device * @v qp Queue pair - * @v mod_list Modification list - * @v qkey New queue key, if applicable + * @v av New address vector, if applicable * @ret rc Return status code */ -int ib_modify_qp ( struct ib_device *ibdev, struct ib_queue_pair *qp, - unsigned long mod_list, unsigned long qkey ) { +int ib_modify_qp ( struct ib_device *ibdev, struct ib_queue_pair *qp ) { int rc; DBGC ( ibdev, "IBDEV %p modifying QPN %#lx\n", ibdev, qp->qpn ); - if ( mod_list & IB_MODIFY_QKEY ) - qp->qkey = qkey; - - if ( ( rc = ibdev->op->modify_qp ( ibdev, qp, mod_list ) ) != 0 ) { + if ( ( rc = ibdev->op->modify_qp ( ibdev, qp ) ) != 0 ) { DBGC ( ibdev, "IBDEV %p could not modify QPN %#lx: %s\n", ibdev, qp->qpn, strerror ( rc ) ); return rc; @@ -251,7 +324,7 @@ struct ib_queue_pair * ib_find_qp_qpn ( struct ib_device *ibdev, struct ib_queue_pair *qp; list_for_each_entry ( qp, &ibdev->qps, list ) { - if ( qp->qpn == qpn ) + if ( ( qpn == qp->qpn ) || ( qpn == qp->ext_qpn ) ) return qp; } return NULL; @@ -311,6 +384,7 @@ struct ib_work_queue * ib_find_wq ( struct ib_completion_queue *cq, int ib_post_send ( struct ib_device *ibdev, struct ib_queue_pair *qp, struct ib_address_vector *av, struct io_buffer *iobuf ) { + struct ib_address_vector av_copy; int rc; /* Check queue fill level */ @@ -320,6 +394,20 @@ int ib_post_send ( struct ib_device *ibdev, struct ib_queue_pair *qp, return -ENOBUFS; } + /* Use default address vector if none specified */ + if ( ! av ) + av = &qp->av; + + /* Make modifiable copy of address vector */ + memcpy ( &av_copy, av, sizeof ( av_copy ) ); + av = &av_copy; + + /* Fill in optional parameters in address vector */ + if ( ! av->qkey ) + av->qkey = qp->qkey; + if ( ! av->rate ) + av->rate = IB_RATE_2_5; + /* Post to hardware */ if ( ( rc = ibdev->op->post_send ( ibdev, qp, av, iobuf ) ) != 0 ) { DBGC ( ibdev, "IBDEV %p QPN %#lx could not post send WQE: " @@ -343,6 +431,13 @@ int ib_post_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp, struct io_buffer *iobuf ) { int rc; + /* Check packet length */ + if ( iob_tailroom ( iobuf ) < IB_MAX_PAYLOAD_SIZE ) { + DBGC ( ibdev, "IBDEV %p QPN %#lx wrong RX buffer size (%zd)\n", + ibdev, qp->qpn, iob_tailroom ( iobuf ) ); + return -EINVAL; + } + /* Check queue fill level */ if ( qp->recv.fill >= qp->recv.num_wqes ) { DBGC ( ibdev, "IBDEV %p QPN %#lx receive queue full\n", @@ -371,7 +466,12 @@ int ib_post_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp, */ void ib_complete_send ( struct ib_device *ibdev, struct ib_queue_pair *qp, struct io_buffer *iobuf, int rc ) { - qp->send.cq->op->complete_send ( ibdev, qp, iobuf, rc ); + + if ( qp->send.cq->op->complete_send ) { + qp->send.cq->op->complete_send ( ibdev, qp, iobuf, rc ); + } else { + free_iob ( iobuf ); + } qp->send.fill--; } @@ -387,11 +487,54 @@ void ib_complete_send ( struct ib_device *ibdev, struct ib_queue_pair *qp, void ib_complete_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp, struct ib_address_vector *av, struct io_buffer *iobuf, int rc ) { - qp->recv.cq->op->complete_recv ( ibdev, qp, av, iobuf, rc ); + + if ( qp->recv.cq->op->complete_recv ) { + qp->recv.cq->op->complete_recv ( ibdev, qp, av, iobuf, rc ); + } else { + free_iob ( iobuf ); + } qp->recv.fill--; } /** + * Refill receive work queue + * + * @v ibdev Infiniband device + * @v qp Queue pair + */ +void ib_refill_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp ) { + struct io_buffer *iobuf; + int rc; + + /* Keep filling while unfilled entries remain */ + while ( qp->recv.fill < qp->recv.num_wqes ) { + + /* Allocate I/O buffer */ + iobuf = alloc_iob ( IB_MAX_PAYLOAD_SIZE ); + if ( ! iobuf ) { + /* Non-fatal; we will refill on next attempt */ + return; + } + + /* Post I/O buffer */ + if ( ( rc = ib_post_recv ( ibdev, qp, iobuf ) ) != 0 ) { + DBGC ( ibdev, "IBDEV %p could not refill: %s\n", + ibdev, strerror ( rc ) ); + free_iob ( iobuf ); + /* Give up */ + return; + } + } +} + +/*************************************************************************** + * + * Link control + * + *************************************************************************** + */ + +/** * Open port * * @v ibdev Infiniband device @@ -400,16 +543,59 @@ void ib_complete_recv ( struct ib_device *ibdev, struct ib_queue_pair *qp, int ib_open ( struct ib_device *ibdev ) { int rc; - /* Open device if this is the first requested opening */ - if ( ibdev->open_count == 0 ) { - if ( ( rc = ibdev->op->open ( ibdev ) ) != 0 ) - return rc; + /* Increment device open request counter */ + if ( ibdev->open_count++ > 0 ) { + /* Device was already open; do nothing */ + return 0; } - /* Increment device open request counter */ - ibdev->open_count++; + /* Create subnet management interface */ + ibdev->smi = ib_create_mi ( ibdev, IB_QPT_SMI ); + if ( ! ibdev->smi ) { + DBGC ( ibdev, "IBDEV %p could not create SMI\n", ibdev ); + rc = -ENOMEM; + goto err_create_smi; + } + /* Create subnet management agent */ + if ( ( rc = ib_create_sma ( ibdev, ibdev->smi ) ) != 0 ) { + DBGC ( ibdev, "IBDEV %p could not create SMA: %s\n", + ibdev, strerror ( rc ) ); + goto err_create_sma; + } + + /* Create general services interface */ + ibdev->gsi = ib_create_mi ( ibdev, IB_QPT_GSI ); + if ( ! ibdev->gsi ) { + DBGC ( ibdev, "IBDEV %p could not create GSI\n", ibdev ); + rc = -ENOMEM; + goto err_create_gsi; + } + + /* Open device */ + if ( ( rc = ibdev->op->open ( ibdev ) ) != 0 ) { + DBGC ( ibdev, "IBDEV %p could not open: %s\n", + ibdev, strerror ( rc ) ); + goto err_open; + } + + /* Add to head of open devices list */ + list_add ( &ibdev->open_list, &open_ib_devices ); + + assert ( ibdev->open_count == 1 ); return 0; + + ibdev->op->close ( ibdev ); + err_open: + ib_destroy_mi ( ibdev, ibdev->gsi ); + err_create_gsi: + ib_destroy_sma ( ibdev, ibdev->smi ); + err_create_sma: + ib_destroy_mi ( ibdev, ibdev->smi ); + err_create_smi: + assert ( ibdev->open_count == 1 ); + ibdev->open_count = 0; + return rc; } /** @@ -423,10 +609,38 @@ void ib_close ( struct ib_device *ibdev ) { ibdev->open_count--; /* Close device if this was the last remaining requested opening */ - if ( ibdev->open_count == 0 ) + if ( ibdev->open_count == 0 ) { + list_del ( &ibdev->open_list ); + ib_destroy_mi ( ibdev, ibdev->gsi ); + ib_destroy_sma ( ibdev, ibdev->smi ); + ib_destroy_mi ( ibdev, ibdev->smi ); ibdev->op->close ( ibdev ); + } +} + +/** + * Get link state + * + * @v ibdev Infiniband device + * @ret rc Link status code + */ +int ib_link_rc ( struct ib_device *ibdev ) { + switch ( ibdev->port_state ) { + case IB_PORT_STATE_DOWN: return -ENOTCONN; + case IB_PORT_STATE_INIT: return -EINPROGRESS_INIT; + case IB_PORT_STATE_ARMED: return -EINPROGRESS_ARMED; + case IB_PORT_STATE_ACTIVE: return 0; + default: return -EINVAL; + } } +/*************************************************************************** + * + * Multicast + * + *************************************************************************** + */ + /** * Attach to multicast group * @@ -434,6 +648,10 @@ void ib_close ( struct ib_device *ibdev ) { * @v qp Queue pair * @v gid Multicast GID * @ret rc Return status code + * + * Note that this function handles only the local device's attachment + * to the multicast GID; it does not issue the relevant MADs to join + * the multicast group on the subnet. */ int ib_mcast_attach ( struct ib_device *ibdev, struct ib_queue_pair *qp, struct ib_gid *gid ) { @@ -486,6 +704,89 @@ void ib_mcast_detach ( struct ib_device *ibdev, struct ib_queue_pair *qp, } } +/*************************************************************************** + * + * Miscellaneous + * + *************************************************************************** + */ + +/** + * Get Infiniband HCA information + * + * @v ibdev Infiniband device + * @ret hca_guid HCA GUID + * @ret num_ports Number of ports + */ +int ib_get_hca_info ( struct ib_device *ibdev, + struct ib_gid_half *hca_guid ) { + struct ib_device *tmp; + int num_ports = 0; + + /* Search for IB devices with the same physical device to + * identify port count and a suitable Node GUID. + */ + for_each_ibdev ( tmp ) { + if ( tmp->dev != ibdev->dev ) + continue; + if ( num_ports == 0 ) { + memcpy ( hca_guid, &tmp->gid.u.half[1], + sizeof ( *hca_guid ) ); + } + num_ports++; + } + return num_ports; +} + +/** + * Set port information + * + * @v ibdev Infiniband device + * @v mad Set port information MAD + */ +int ib_set_port_info ( struct ib_device *ibdev, union ib_mad *mad ) { + int rc; + + /* Adapters with embedded SMAs do not need to support this method */ + if ( ! ibdev->op->set_port_info ) { + DBGC ( ibdev, "IBDEV %p does not support setting port " + "information\n", ibdev ); + return -ENOTSUP; + } + + if ( ( rc = ibdev->op->set_port_info ( ibdev, mad ) ) != 0 ) { + DBGC ( ibdev, "IBDEV %p could not set port information: %s\n", + ibdev, strerror ( rc ) ); + return rc; + } + + return 0; +}; + +/** + * Set partition key table + * + * @v ibdev Infiniband device + * @v mad Set partition key table MAD + */ +int ib_set_pkey_table ( struct ib_device *ibdev, union ib_mad *mad ) { + int rc; + + /* Adapters with embedded SMAs do not need to support this method */ + if ( ! ibdev->op->set_pkey_table ) { + DBGC ( ibdev, "IBDEV %p does not support setting partition " + "key table\n", ibdev ); + return -ENOTSUP; + } + + if ( ( rc = ibdev->op->set_pkey_table ( ibdev, mad ) ) != 0 ) { + DBGC ( ibdev, "IBDEV %p could not set partition key table: " + "%s\n", ibdev, strerror ( rc ) ); + return rc; + } + + return 0; +}; /*************************************************************************** * @@ -506,6 +807,22 @@ void ib_link_state_changed ( struct ib_device *ibdev ) { } /** + * Poll event queue + * + * @v ibdev Infiniband device + */ +void ib_poll_eq ( struct ib_device *ibdev ) { + struct ib_completion_queue *cq; + + /* Poll device's event queue */ + ibdev->op->poll_eq ( ibdev ); + + /* Poll all completion queues */ + list_for_each_entry ( cq, &ibdev->cqs, list ) + ib_poll_cq ( ibdev, cq ); +} + +/** * Single-step the Infiniband event queue * * @v process Infiniband event queue process @@ -513,13 +830,13 @@ void ib_link_state_changed ( struct ib_device *ibdev ) { static void ib_step ( struct process *process __unused ) { struct ib_device *ibdev; - list_for_each_entry ( ibdev, &ib_devices, list ) { - ibdev->op->poll_eq ( ibdev ); - } + for_each_ibdev ( ibdev ) + ib_poll_eq ( ibdev ); } /** Infiniband event queue process */ struct process ib_process __permanent_process = { + .list = LIST_HEAD_INIT ( ib_process.list ), .step = ib_step, }; @@ -546,9 +863,11 @@ struct ib_device * alloc_ibdev ( size_t priv_size ) { if ( ibdev ) { drv_priv = ( ( ( void * ) ibdev ) + sizeof ( *ibdev ) ); ib_set_drvdata ( ibdev, drv_priv ); + INIT_LIST_HEAD ( &ibdev->cqs ); INIT_LIST_HEAD ( &ibdev->qps ); + ibdev->port_state = IB_PORT_STATE_DOWN; ibdev->lid = IB_LID_NONE; - ibdev->pkey = IB_PKEY_NONE; + ibdev->pkey = IB_PKEY_DEFAULT; } return ibdev; } @@ -598,3 +917,35 @@ void unregister_ibdev ( struct ib_device *ibdev ) { ibdev_put ( ibdev ); DBGC ( ibdev, "IBDEV %p unregistered\n", ibdev ); } + +/** + * Find Infiniband device by GID + * + * @v gid GID + * @ret ibdev Infiniband device, or NULL + */ +struct ib_device * find_ibdev ( struct ib_gid *gid ) { + struct ib_device *ibdev; + + for_each_ibdev ( ibdev ) { + if ( memcmp ( gid, &ibdev->gid, sizeof ( *gid ) ) == 0 ) + return ibdev; + } + return NULL; +} + +/** + * Get most recently opened Infiniband device + * + * @ret ibdev Most recently opened Infiniband device, or NULL + */ +struct ib_device * last_opened_ibdev ( void ) { + struct ib_device *ibdev; + + list_for_each_entry ( ibdev, &open_ib_devices, open_list ) { + assert ( ibdev->open_count != 0 ); + return ibdev; + } + + return NULL; +} |