summaryrefslogtreecommitdiff
path: root/lib/dpif.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/dpif.c')
-rw-r--r--lib/dpif.c1060
1 files changed, 1060 insertions, 0 deletions
diff --git a/lib/dpif.c b/lib/dpif.c
new file mode 100644
index 000000000..73ad5df00
--- /dev/null
+++ b/lib/dpif.c
@@ -0,0 +1,1060 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "dpif.h"
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <net/if.h>
+#include <linux/rtnetlink.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+
+#include "coverage.h"
+#include "dynamic-string.h"
+#include "flow.h"
+#include "netlink.h"
+#include "odp-util.h"
+#include "ofp-print.h"
+#include "ofpbuf.h"
+#include "packets.h"
+#include "poll-loop.h"
+#include "util.h"
+#include "valgrind.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_dpif
+
+/* Rate limit for individual messages going to or from the datapath, output at
+ * DBG level. This is very high because, if these are enabled, it is because
+ * we really need to see them. */
+static struct vlog_rate_limit dpmsg_rl = VLOG_RATE_LIMIT_INIT(600, 600);
+
+/* Not really much point in logging many dpif errors. */
+static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(9999, 5);
+
+static int get_minor_from_name(const char *name, unsigned int *minor);
+static int name_to_minor(const char *name, unsigned int *minor);
+static int lookup_minor(const char *name, unsigned int *minor);
+static int open_by_minor(unsigned int minor, struct dpif *);
+static int make_openvswitch_device(unsigned int minor, char **fnp);
+static void check_rw_odp_flow(struct odp_flow *);
+
+int
+dpif_open(const char *name, struct dpif *dpif)
+{
+ int listen_mask;
+ int error;
+
+ dpif->fd = -1;
+
+ error = name_to_minor(name, &dpif->minor);
+ if (error) {
+ return error;
+ }
+
+ error = open_by_minor(dpif->minor, dpif);
+ if (error) {
+ return error;
+ }
+
+ /* We can open the device, but that doesn't mean that it's been created.
+ * If it hasn't been, then any command other than ODP_DP_CREATE will
+ * return ENODEV. Try something innocuous. */
+ listen_mask = 0; /* Make Valgrind happy. */
+ if (ioctl(dpif->fd, ODP_GET_LISTEN_MASK, &listen_mask)) {
+ error = errno;
+ if (error != ENODEV) {
+ VLOG_WARN("dp%u: probe returned unexpected error: %s",
+ dpif->minor, strerror(error));
+ }
+ dpif_close(dpif);
+ return error;
+ }
+ return 0;
+}
+
+void
+dpif_close(struct dpif *dpif)
+{
+ if (dpif) {
+ close(dpif->fd);
+ dpif->fd = -1;
+ }
+}
+
+static int
+do_ioctl(const struct dpif *dpif, int cmd, const char *cmd_name,
+ const void *arg)
+{
+ int error = ioctl(dpif->fd, cmd, arg) ? errno : 0;
+ if (cmd_name) {
+ if (error) {
+ VLOG_WARN_RL(&error_rl, "dp%u: ioctl(%s) failed (%s)",
+ dpif->minor, cmd_name, strerror(error));
+ } else {
+ VLOG_DBG_RL(&dpmsg_rl, "dp%u: ioctl(%s): success",
+ dpif->minor, cmd_name);
+ }
+ }
+ return error;
+}
+
+int
+dpif_create(const char *name, struct dpif *dpif)
+{
+ unsigned int minor;
+ int error;
+
+ if (!get_minor_from_name(name, &minor)) {
+ /* Minor was specified in 'name', go ahead and create it. */
+ error = open_by_minor(minor, dpif);
+ if (error) {
+ return error;
+ }
+
+ if (!strncmp(name, "nl:", 3)) {
+ char devname[128];
+ sprintf(devname, "of%u", minor);
+ error = ioctl(dpif->fd, ODP_DP_CREATE, devname) < 0 ? errno : 0;
+ } else {
+ error = ioctl(dpif->fd, ODP_DP_CREATE, name) < 0 ? errno : 0;
+ }
+ if (error) {
+ dpif_close(dpif);
+ }
+ return error;
+ } else {
+ for (minor = 0; minor < ODP_MAX; minor++) {
+ error = open_by_minor(minor, dpif);
+ if (error) {
+ return error;
+ }
+
+ error = ioctl(dpif->fd, ODP_DP_CREATE, name) < 0 ? errno : 0;
+ if (!error) {
+ return 0;
+ }
+ dpif_close(dpif);
+ if (error != EBUSY) {
+ return error;
+ }
+ }
+ return ENOBUFS;
+ }
+}
+
+int
+dpif_get_name(struct dpif *dpif, char *name, size_t name_size)
+{
+ struct odp_port port;
+ int error;
+
+ assert(name_size > 0);
+ *name = '\0';
+
+ error = dpif_port_query_by_number(dpif, ODPP_LOCAL, &port);
+ if (!error) {
+ ovs_strlcpy(name, port.devname, name_size);
+ }
+ return error;
+}
+
+int
+dpif_delete(struct dpif *dpif)
+{
+ COVERAGE_INC(dpif_destroy);
+ return do_ioctl(dpif, ODP_DP_DESTROY, "ODP_DP_DESTROY", NULL);
+}
+
+int
+dpif_get_dp_stats(const struct dpif *dpif, struct odp_stats *stats)
+{
+ memset(stats, 0, sizeof *stats);
+ return do_ioctl(dpif, ODP_DP_STATS, "ODP_DP_STATS", stats);
+}
+
+int
+dpif_get_drop_frags(const struct dpif *dpif, bool *drop_frags)
+{
+ int tmp;
+ int error = do_ioctl(dpif, ODP_GET_DROP_FRAGS, "ODP_GET_DROP_FRAGS", &tmp);
+ *drop_frags = error ? tmp & 1 : false;
+ return error;
+}
+
+int
+dpif_set_drop_frags(struct dpif *dpif, bool drop_frags)
+{
+ int tmp = drop_frags;
+ return do_ioctl(dpif, ODP_SET_DROP_FRAGS, "ODP_SET_DROP_FRAGS", &tmp);
+}
+
+int
+dpif_get_listen_mask(const struct dpif *dpif, int *listen_mask)
+{
+ int error = do_ioctl(dpif, ODP_GET_LISTEN_MASK, "ODP_GET_LISTEN_MASK",
+ listen_mask);
+ if (error) {
+ *listen_mask = 0;
+ }
+ return error;
+}
+
+int
+dpif_set_listen_mask(struct dpif *dpif, int listen_mask)
+{
+ return do_ioctl(dpif, ODP_SET_LISTEN_MASK, "ODP_SET_LISTEN_MASK",
+ &listen_mask);
+}
+
+int
+dpif_purge(struct dpif *dpif)
+{
+ struct odp_stats stats;
+ unsigned int i;
+ int error;
+
+ COVERAGE_INC(dpif_purge);
+
+ error = dpif_get_dp_stats(dpif, &stats);
+ if (error) {
+ return error;
+ }
+
+ for (i = 0; i < stats.max_miss_queue + stats.max_action_queue; i++) {
+ struct ofpbuf *buf;
+ error = dpif_recv(dpif, &buf);
+ if (error) {
+ return error == EAGAIN ? 0 : error;
+ }
+ ofpbuf_delete(buf);
+ }
+ return 0;
+}
+
+int
+dpif_port_add(struct dpif *dpif, const char *devname, uint16_t port_no,
+ uint16_t flags)
+{
+ struct odp_port port;
+
+ COVERAGE_INC(dpif_port_add);
+ memset(&port, 0, sizeof port);
+ strncpy(port.devname, devname, sizeof port.devname);
+ port.port = port_no;
+ port.flags = flags;
+ if (!ioctl(dpif->fd, ODP_PORT_ADD, &port)) {
+ VLOG_DBG_RL(&dpmsg_rl, "dp%u: added %s as port %"PRIu16,
+ dpif->minor, devname, port_no);
+ return 0;
+ } else {
+ VLOG_WARN_RL(&error_rl, "dp%u: failed to add %s as port "
+ "%"PRIu16": %s", dpif->minor, devname, port_no,
+ strerror(errno));
+ return errno;
+ }
+}
+
+int
+dpif_port_del(struct dpif *dpif, uint16_t port_no)
+{
+ int tmp = port_no;
+ COVERAGE_INC(dpif_port_del);
+ return do_ioctl(dpif, ODP_PORT_DEL, "ODP_PORT_DEL", &tmp);
+}
+
+int
+dpif_port_query_by_number(const struct dpif *dpif, uint16_t port_no,
+ struct odp_port *port)
+{
+ memset(port, 0, sizeof *port);
+ port->port = port_no;
+ if (!ioctl(dpif->fd, ODP_PORT_QUERY, port)) {
+ VLOG_DBG_RL(&dpmsg_rl, "dp%u: port %"PRIu16" is device %s",
+ dpif->minor, port_no, port->devname);
+ return 0;
+ } else {
+ VLOG_WARN_RL(&error_rl, "dp%u: failed to query port %"PRIu16": %s",
+ dpif->minor, port_no, strerror(errno));
+ return errno;
+ }
+}
+
+int
+dpif_port_query_by_name(const struct dpif *dpif, const char *devname,
+ struct odp_port *port)
+{
+ memset(port, 0, sizeof *port);
+ strncpy(port->devname, devname, sizeof port->devname);
+ if (!ioctl(dpif->fd, ODP_PORT_QUERY, port)) {
+ VLOG_DBG_RL(&dpmsg_rl, "dp%u: device %s is on port %"PRIu16,
+ dpif->minor, devname, port->port);
+ return 0;
+ } else {
+ VLOG_WARN_RL(&error_rl, "dp%u: failed to query port %s: %s",
+ dpif->minor, devname, strerror(errno));
+ return errno;
+ }
+}
+
+int
+dpif_port_list(const struct dpif *dpif,
+ struct odp_port **ports, size_t *n_ports)
+{
+ struct odp_portvec pv;
+ struct odp_stats stats;
+ int error;
+
+ do {
+ error = dpif_get_dp_stats(dpif, &stats);
+ if (error) {
+ goto error;
+ }
+
+ *ports = xcalloc(1, stats.n_ports * sizeof **ports);
+ pv.ports = *ports;
+ pv.n_ports = stats.n_ports;
+ error = do_ioctl(dpif, ODP_PORT_LIST, "ODP_PORT_LIST", &pv);
+ if (error) {
+ free(*ports);
+ goto error;
+ }
+ } while (pv.n_ports != stats.n_ports);
+ *n_ports = pv.n_ports;
+ return 0;
+
+error:
+ *ports = NULL;
+ *n_ports = 0;
+ return error;
+}
+
+int
+dpif_port_group_set(struct dpif *dpif, uint16_t group,
+ const uint16_t ports[], size_t n_ports)
+{
+ struct odp_port_group pg;
+
+ COVERAGE_INC(dpif_port_group_set);
+ assert(n_ports <= UINT16_MAX);
+ pg.group = group;
+ pg.ports = (uint16_t *) ports;
+ pg.n_ports = n_ports;
+ return do_ioctl(dpif, ODP_PORT_GROUP_SET, "ODP_PORT_GROUP_SET", &pg);
+}
+
+/* Careful: '*n_out' can be greater than 'n_ports' on return, if 'n_ports' is
+ * less than the number of ports in 'group'. */
+int
+dpif_port_group_get(const struct dpif *dpif, uint16_t group,
+ uint16_t ports[], size_t n_ports, size_t *n_out)
+{
+ struct odp_port_group pg;
+ int error;
+
+ assert(n_ports <= UINT16_MAX);
+ pg.group = group;
+ pg.ports = ports;
+ pg.n_ports = n_ports;
+ error = do_ioctl(dpif, ODP_PORT_GROUP_GET, "ODP_PORT_GROUP_GET", &pg);
+ *n_out = error ? 0 : pg.n_ports;
+ return error;
+}
+
+int
+dpif_flow_flush(struct dpif *dpif)
+{
+ COVERAGE_INC(dpif_flow_flush);
+ return do_ioctl(dpif, ODP_FLOW_FLUSH, "ODP_FLOW_FLUSH", NULL);
+}
+
+static enum vlog_level
+flow_message_log_level(int error)
+{
+ return error ? VLL_WARN : VLL_DBG;
+}
+
+static bool
+should_log_flow_message(int error)
+{
+ return !vlog_should_drop(THIS_MODULE, flow_message_log_level(error),
+ error ? &error_rl : &dpmsg_rl);
+}
+
+static void
+log_flow_message(const struct dpif *dpif, int error,
+ const char *operation,
+ const flow_t *flow, const struct odp_flow_stats *stats,
+ const union odp_action *actions, size_t n_actions)
+{
+ struct ds ds = DS_EMPTY_INITIALIZER;
+ ds_put_format(&ds, "dp%u: ", dpif->minor);
+ if (error) {
+ ds_put_cstr(&ds, "failed to ");
+ }
+ ds_put_format(&ds, "%s ", operation);
+ if (error) {
+ ds_put_format(&ds, "(%s) ", strerror(error));
+ }
+ flow_format(&ds, flow);
+ if (stats) {
+ ds_put_cstr(&ds, ", ");
+ format_odp_flow_stats(&ds, stats);
+ }
+ if (actions || n_actions) {
+ ds_put_cstr(&ds, ", actions:");
+ format_odp_actions(&ds, actions, n_actions);
+ }
+ vlog(THIS_MODULE, flow_message_log_level(error), "%s", ds_cstr(&ds));
+ ds_destroy(&ds);
+}
+
+static int
+do_flow_ioctl(const struct dpif *dpif, int cmd, struct odp_flow *flow,
+ const char *operation, bool show_stats)
+{
+ int error = do_ioctl(dpif, cmd, NULL, flow);
+ if (error && show_stats) {
+ flow->n_actions = 0;
+ }
+ if (should_log_flow_message(error)) {
+ log_flow_message(dpif, error, operation, &flow->key,
+ show_stats && !error ? &flow->stats : NULL,
+ flow->actions, flow->n_actions);
+ }
+ return error;
+}
+
+int
+dpif_flow_put(struct dpif *dpif, struct odp_flow_put *put)
+{
+ int error = do_ioctl(dpif, ODP_FLOW_PUT, NULL, put);
+ COVERAGE_INC(dpif_flow_put);
+ if (should_log_flow_message(error)) {
+ struct ds operation = DS_EMPTY_INITIALIZER;
+ ds_put_cstr(&operation, "put");
+ if (put->flags & ODPPF_CREATE) {
+ ds_put_cstr(&operation, "[create]");
+ }
+ if (put->flags & ODPPF_MODIFY) {
+ ds_put_cstr(&operation, "[modify]");
+ }
+ if (put->flags & ODPPF_ZERO_STATS) {
+ ds_put_cstr(&operation, "[zero]");
+ }
+#define ODPPF_ALL (ODPPF_CREATE | ODPPF_MODIFY | ODPPF_ZERO_STATS)
+ if (put->flags & ~ODPPF_ALL) {
+ ds_put_format(&operation, "[%x]", put->flags & ~ODPPF_ALL);
+ }
+ log_flow_message(dpif, error, ds_cstr(&operation), &put->flow.key,
+ !error ? &put->flow.stats : NULL,
+ put->flow.actions, put->flow.n_actions);
+ ds_destroy(&operation);
+ }
+ return error;
+}
+
+int
+dpif_flow_del(struct dpif *dpif, struct odp_flow *flow)
+{
+ COVERAGE_INC(dpif_flow_del);
+ check_rw_odp_flow(flow);
+ memset(&flow->stats, 0, sizeof flow->stats);
+ return do_flow_ioctl(dpif, ODP_FLOW_DEL, flow, "delete flow", true);
+}
+
+int
+dpif_flow_get(const struct dpif *dpif, struct odp_flow *flow)
+{
+ COVERAGE_INC(dpif_flow_query);
+ check_rw_odp_flow(flow);
+ memset(&flow->stats, 0, sizeof flow->stats);
+ return do_flow_ioctl(dpif, ODP_FLOW_GET, flow, "get flow", true);
+}
+
+int
+dpif_flow_get_multiple(const struct dpif *dpif,
+ struct odp_flow flows[], size_t n)
+{
+ struct odp_flowvec fv;
+ size_t i;
+
+ COVERAGE_ADD(dpif_flow_query_multiple, n);
+ fv.flows = flows;
+ fv.n_flows = n;
+ for (i = 0; i < n; i++) {
+ check_rw_odp_flow(&flows[i]);
+ }
+ return do_ioctl(dpif, ODP_FLOW_GET_MULTIPLE, "ODP_FLOW_GET_MULTIPLE",
+ &fv);
+}
+
+int
+dpif_flow_list(const struct dpif *dpif, struct odp_flow flows[], size_t n,
+ size_t *n_out)
+{
+ struct odp_flowvec fv;
+ uint32_t i;
+ int error;
+
+ COVERAGE_INC(dpif_flow_query_list);
+ fv.flows = flows;
+ fv.n_flows = n;
+ if (RUNNING_ON_VALGRIND) {
+ memset(flows, 0, n * sizeof *flows);
+ } else {
+ for (i = 0; i < n; i++) {
+ flows[i].actions = NULL;
+ flows[i].n_actions = 0;
+ }
+ }
+ error = do_ioctl(dpif, ODP_FLOW_LIST, NULL, &fv);
+ if (error) {
+ *n_out = 0;
+ VLOG_WARN_RL(&error_rl, "dp%u: flow list failed (%s)",
+ dpif->minor, strerror(error));
+ } else {
+ COVERAGE_ADD(dpif_flow_query_list_n, fv.n_flows);
+ *n_out = fv.n_flows;
+ VLOG_DBG_RL(&dpmsg_rl, "dp%u: listed %zu flows", dpif->minor, *n_out);
+ }
+ return error;
+}
+
+int
+dpif_flow_list_all(const struct dpif *dpif,
+ struct odp_flow **flowsp, size_t *np)
+{
+ struct odp_stats stats;
+ struct odp_flow *flows;
+ size_t n_flows;
+ int error;
+
+ *flowsp = NULL;
+ *np = 0;
+
+ error = dpif_get_dp_stats(dpif, &stats);
+ if (error) {
+ return error;
+ }
+
+ flows = xmalloc(sizeof *flows * stats.n_flows);
+ error = dpif_flow_list(dpif, flows, stats.n_flows, &n_flows);
+ if (error) {
+ free(flows);
+ return error;
+ }
+
+ if (stats.n_flows != n_flows) {
+ VLOG_WARN_RL(&error_rl, "dp%u: datapath stats reported %"PRIu32" "
+ "flows but flow listing reported %zu",
+ dpif->minor, stats.n_flows, n_flows);
+ }
+ *flowsp = flows;
+ *np = n_flows;
+ return 0;
+}
+
+int
+dpif_execute(struct dpif *dpif, uint16_t in_port,
+ const union odp_action actions[], size_t n_actions,
+ const struct ofpbuf *buf)
+{
+ int error;
+
+ COVERAGE_INC(dpif_execute);
+ if (n_actions > 0) {
+ struct odp_execute execute;
+ memset(&execute, 0, sizeof execute);
+ execute.in_port = in_port;
+ execute.actions = (union odp_action *) actions;
+ execute.n_actions = n_actions;
+ execute.data = buf->data;
+ execute.length = buf->size;
+ error = do_ioctl(dpif, ODP_EXECUTE, NULL, &execute);
+ } else {
+ error = 0;
+ }
+
+ if (!(error ? VLOG_DROP_WARN(&error_rl) : VLOG_DROP_DBG(&dpmsg_rl))) {
+ struct ds ds = DS_EMPTY_INITIALIZER;
+ char *packet = ofp_packet_to_string(buf->data, buf->size, buf->size);
+ ds_put_format(&ds, "dp%u: execute ", dpif->minor);
+ format_odp_actions(&ds, actions, n_actions);
+ if (error) {
+ ds_put_format(&ds, " failed (%s)", strerror(error));
+ }
+ ds_put_format(&ds, " on packet %s", packet);
+ vlog(THIS_MODULE, error ? VLL_WARN : VLL_DBG, "%s", ds_cstr(&ds));
+ ds_destroy(&ds);
+ free(packet);
+ }
+ return error;
+}
+
+int
+dpif_recv(struct dpif *dpif, struct ofpbuf **bufp)
+{
+ struct ofpbuf *buf;
+ int retval;
+ int error;
+
+ buf = ofpbuf_new(65536);
+ retval = read(dpif->fd, ofpbuf_tail(buf), ofpbuf_tailroom(buf));
+ if (retval < 0) {
+ error = errno;
+ if (error != EAGAIN) {
+ VLOG_WARN_RL(&error_rl, "dp%u: read failed: %s",
+ dpif->minor, strerror(error));
+ }
+ } else if (retval >= sizeof(struct odp_msg)) {
+ struct odp_msg *msg = buf->data;
+ if (msg->length <= retval) {
+ buf->size += retval;
+ if (VLOG_IS_DBG_ENABLED()) {
+ void *payload = msg + 1;
+ size_t length = buf->size - sizeof *msg;
+ char *s = ofp_packet_to_string(payload, length, length);
+ VLOG_DBG_RL(&dpmsg_rl, "dp%u: received %s message of length "
+ "%zu on port %"PRIu16": %s", dpif->minor,
+ (msg->type == _ODPL_MISS_NR ? "miss"
+ : msg->type == _ODPL_ACTION_NR ? "action"
+ : "<unknown>"),
+ msg->length - sizeof(struct odp_msg),
+ msg->port, s);
+ free(s);
+ }
+ *bufp = buf;
+ COVERAGE_INC(dpif_recv);
+ return 0;
+ } else {
+ VLOG_WARN_RL(&error_rl, "dp%u: discarding message truncated "
+ "from %zu bytes to %d",
+ dpif->minor, msg->length, retval);
+ error = ERANGE;
+ }
+ } else if (!retval) {
+ VLOG_WARN_RL(&error_rl, "dp%u: unexpected end of file", dpif->minor);
+ error = EPROTO;
+ } else {
+ VLOG_WARN_RL(&error_rl,
+ "dp%u: discarding too-short message (%d bytes)",
+ dpif->minor, retval);
+ error = ERANGE;
+ }
+
+ *bufp = NULL;
+ ofpbuf_delete(buf);
+ return error;
+}
+
+void
+dpif_recv_wait(struct dpif *dpif)
+{
+ poll_fd_wait(dpif->fd, POLLIN);
+}
+
+struct dpifmon {
+ struct dpif dpif;
+ struct nl_sock *sock;
+ int local_ifindex;
+};
+
+int
+dpifmon_create(const char *datapath_name, struct dpifmon **monp)
+{
+ struct dpifmon *mon;
+ char local_name[IFNAMSIZ];
+ int error;
+
+ mon = *monp = xmalloc(sizeof *mon);
+
+ error = dpif_open(datapath_name, &mon->dpif);
+ if (error) {
+ goto error;
+ }
+ error = dpif_get_name(&mon->dpif, local_name, sizeof local_name);
+ if (error) {
+ goto error_close_dpif;
+ }
+
+ mon->local_ifindex = if_nametoindex(local_name);
+ if (!mon->local_ifindex) {
+ error = errno;
+ VLOG_WARN("could not get ifindex of %s device: %s",
+ local_name, strerror(errno));
+ goto error_close_dpif;
+ }
+
+ error = nl_sock_create(NETLINK_ROUTE, RTNLGRP_LINK, 0, 0, &mon->sock);
+ if (error) {
+ VLOG_WARN("could not create rtnetlink socket: %s", strerror(error));
+ goto error_close_dpif;
+ }
+
+ return 0;
+
+error_close_dpif:
+ dpif_close(&mon->dpif);
+error:
+ free(mon);
+ *monp = NULL;
+ return error;
+}
+
+void
+dpifmon_destroy(struct dpifmon *mon)
+{
+ if (mon) {
+ dpif_close(&mon->dpif);
+ nl_sock_destroy(mon->sock);
+ }
+}
+
+int
+dpifmon_poll(struct dpifmon *mon, char **devnamep)
+{
+ static struct vlog_rate_limit slow_rl = VLOG_RATE_LIMIT_INIT(1, 5);
+ static const struct nl_policy rtnlgrp_link_policy[] = {
+ [IFLA_IFNAME] = { .type = NL_A_STRING },
+ [IFLA_MASTER] = { .type = NL_A_U32, .optional = true },
+ };
+ struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
+ struct ofpbuf *buf;
+ int error;
+
+ *devnamep = NULL;
+again:
+ error = nl_sock_recv(mon->sock, &buf, false);
+ switch (error) {
+ case 0:
+ if (!nl_policy_parse(buf, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
+ rtnlgrp_link_policy,
+ attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
+ VLOG_WARN_RL(&slow_rl, "received bad rtnl message");
+ error = ENOBUFS;
+ } else {
+ const char *devname = nl_attr_get_string(attrs[IFLA_IFNAME]);
+ bool for_us;
+
+ if (attrs[IFLA_MASTER]) {
+ uint32_t master_ifindex = nl_attr_get_u32(attrs[IFLA_MASTER]);
+ for_us = master_ifindex == mon->local_ifindex;
+ } else {
+ /* It's for us if that device is one of our ports. This is
+ * open-coded instead of using dpif_port_query_by_name() to
+ * avoid logging a warning on failure. */
+ struct odp_port port;
+ memset(&port, 0, sizeof port);
+ strncpy(port.devname, devname, sizeof port.devname);
+ for_us = !ioctl(mon->dpif.fd, ODP_PORT_QUERY, &port);
+ }
+
+ if (!for_us) {
+ /* Not for us, try again. */
+ ofpbuf_delete(buf);
+ COVERAGE_INC(dpifmon_poll_false_wakeup);
+ goto again;
+ }
+ COVERAGE_INC(dpifmon_poll_changed);
+ *devnamep = xstrdup(devname);
+ }
+ ofpbuf_delete(buf);
+ break;
+
+ case EAGAIN:
+ /* Nothing to do. */
+ break;
+
+ case ENOBUFS:
+ VLOG_WARN_RL(&slow_rl, "dpifmon socket overflowed");
+ break;
+
+ default:
+ VLOG_WARN_RL(&slow_rl, "error on dpifmon socket: %s", strerror(error));
+ break;
+ }
+ return error;
+}
+
+void
+dpifmon_run(struct dpifmon *mon UNUSED)
+{
+ /* Nothing to do in this implementation. */
+}
+
+void
+dpifmon_wait(struct dpifmon *mon)
+{
+ nl_sock_wait(mon->sock, POLLIN);
+}
+
+static int get_openvswitch_major(void);
+static int get_major(const char *target, int default_major);
+
+static int
+lookup_minor(const char *name, unsigned int *minor)
+{
+ struct ethtool_drvinfo drvinfo;
+ struct ifreq ifr;
+ int error;
+ int sock;
+
+ *minor = -1;
+ sock = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock < 0) {
+ VLOG_WARN("socket(AF_INET) failed: %s", strerror(errno));
+ error = errno;
+ goto error;
+ }
+
+ memset(&ifr, 0, sizeof ifr);
+ strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
+ ifr.ifr_data = (caddr_t) &drvinfo;
+
+ memset(&drvinfo, 0, sizeof drvinfo);
+ drvinfo.cmd = ETHTOOL_GDRVINFO;
+ if (ioctl(sock, SIOCETHTOOL, &ifr)) {
+ VLOG_WARN("ioctl(SIOCETHTOOL) failed: %s", strerror(errno));
+ error = errno;
+ goto error_close_sock;
+ }
+
+ if (strcmp(drvinfo.driver, "openvswitch")) {
+ VLOG_WARN("%s is not an openvswitch device", name);
+ error = EOPNOTSUPP;
+ goto error_close_sock;
+ }
+
+ if (!isdigit(drvinfo.bus_info[0])) {
+ VLOG_WARN("%s ethtool info does not contain an openvswitch minor",
+ name);
+ error = EPROTOTYPE;
+ goto error_close_sock;
+ }
+
+ *minor = atoi(drvinfo.bus_info);
+ close(sock);
+ return 0;
+
+error_close_sock:
+ close(sock);
+error:
+ return error;
+}
+
+static int
+make_openvswitch_device(unsigned int minor, char **fnp)
+{
+ dev_t dev = makedev(get_openvswitch_major(), minor);
+ const char dirname[] = "/dev/net";
+ struct stat s;
+ char fn[128];
+
+ *fnp = NULL;
+ sprintf(fn, "%s/dp%d", dirname, minor);
+ if (!stat(fn, &s)) {
+ if (!S_ISCHR(s.st_mode)) {
+ VLOG_WARN_RL(&error_rl, "%s is not a character device, fixing",
+ fn);
+ } else if (s.st_rdev != dev) {
+ VLOG_WARN_RL(&error_rl,
+ "%s is device %u:%u instead of %u:%u, fixing",
+ fn, major(s.st_rdev), minor(s.st_rdev),
+ major(dev), minor(dev));
+ } else {
+ goto success;
+ }
+ if (unlink(fn)) {
+ VLOG_WARN_RL(&error_rl, "%s: unlink failed (%s)",
+ fn, strerror(errno));
+ return errno;
+ }
+ } else if (errno == ENOENT) {
+ if (stat(dirname, &s)) {
+ if (errno == ENOENT) {
+ if (mkdir(dirname, 0755)) {
+ VLOG_WARN_RL(&error_rl, "%s: mkdir failed (%s)",
+ dirname, strerror(errno));
+ return errno;
+ }
+ } else {
+ VLOG_WARN_RL(&error_rl, "%s: stat failed (%s)",
+ dirname, strerror(errno));
+ return errno;
+ }
+ }
+ } else {
+ VLOG_WARN_RL(&error_rl, "%s: stat failed (%s)", fn, strerror(errno));
+ return errno;
+ }
+
+ /* The device needs to be created. */
+ if (mknod(fn, S_IFCHR | 0700, dev)) {
+ VLOG_WARN_RL(&error_rl,
+ "%s: creating character device %u:%u failed (%s)",
+ fn, major(dev), minor(dev), strerror(errno));
+ return errno;
+ }
+
+success:
+ *fnp = xstrdup(fn);
+ return 0;
+}
+
+
+static int
+get_openvswitch_major(void)
+{
+ static unsigned int openvswitch_major;
+ if (!openvswitch_major) {
+ enum { DEFAULT_MAJOR = 248 };
+ openvswitch_major = get_major("openvswitch", DEFAULT_MAJOR);
+ }
+ return openvswitch_major;
+}
+
+static int
+get_major(const char *target, int default_major)
+{
+ const char fn[] = "/proc/devices";
+ char line[128];
+ FILE *file;
+ int ln;
+
+ file = fopen(fn, "r");
+ if (!file) {
+ VLOG_ERR("opening %s failed (%s)", fn, strerror(errno));
+ goto error;
+ }
+
+ for (ln = 1; fgets(line, sizeof line, file); ln++) {
+ char name[64];
+ int major;
+
+ if (!strncmp(line, "Character", 9) || line[0] == '\0') {
+ /* Nothing to do. */
+ } else if (!strncmp(line, "Block", 5)) {
+ /* We only want character devices, so skip the rest of the file. */
+ break;
+ } else if (sscanf(line, "%d %63s", &major, name)) {
+ if (!strcmp(name, target)) {
+ fclose(file);
+ return major;
+ }
+ } else {
+ static bool warned;
+ if (!warned) {
+ VLOG_WARN("%s:%d: syntax error", fn, ln);
+ }
+ warned = true;
+ }
+ }
+
+ VLOG_ERR("%s: %s major not found (is the module loaded?), using "
+ "default major %d", fn, target, default_major);
+error:
+ VLOG_INFO("using default major %d for %s", default_major, target);
+ return default_major;
+}
+
+static int
+name_to_minor(const char *name, unsigned int *minor)
+{
+ if (!get_minor_from_name(name, minor)) {
+ return 0;
+ }
+ return lookup_minor(name, minor);
+}
+
+static int
+get_minor_from_name(const char *name, unsigned int *minor)
+{
+ if (!strncmp(name, "dp", 2) && isdigit(name[2])) {
+ *minor = atoi(name + 2);
+ return 0;
+ } else if (!strncmp(name, "nl:", 3) && isdigit(name[3])) {
+ /* This is for compatibility only and will be dropped. */
+ *minor = atoi(name + 3);
+ return 0;
+ } else {
+ return EINVAL;
+ }
+}
+
+static int
+open_by_minor(unsigned int minor, struct dpif *dpif)
+{
+ int error;
+ char *fn;
+ int fd;
+
+ dpif->minor = -1;
+ dpif->fd = -1;
+ error = make_openvswitch_device(minor, &fn);
+ if (error) {
+ return error;
+ }
+
+ fd = open(fn, O_RDONLY | O_NONBLOCK);
+ if (fd < 0) {
+ error = errno;
+ VLOG_WARN("%s: open failed (%s)", fn, strerror(error));
+ free(fn);
+ return error;
+ }
+
+ free(fn);
+ dpif->minor = minor;
+ dpif->fd = fd;
+ return 0;
+}
+
+/* There is a tendency to construct odp_flow objects on the stack and to
+ * forget to properly initialize their "actions" and "n_actions" members.
+ * When this happens, we get memory corruption because the kernel
+ * writes through the random pointer that is in the "actions" member.
+ *
+ * This function attempts to combat the problem by:
+ *
+ * - Forcing a segfault if "actions" points to an invalid region (instead
+ * of just getting back EFAULT, which can be easily missed in the log).
+ *
+ * - Storing a distinctive value that is likely to cause an
+ * easy-to-identify error later if it is dereferenced, etc.
+ *
+ * - Triggering a warning on uninitialized memory from Valgrind if
+ * "actions" or "n_actions" was not initialized.
+ */
+static void
+check_rw_odp_flow(struct odp_flow *flow)
+{
+ if (flow->n_actions) {
+ memset(&flow->actions[0], 0xcc, sizeof flow->actions[0]);
+ }
+}