summaryrefslogtreecommitdiff
path: root/lib/netlink-socket.h
blob: 7852ad052a6cdbf8547eed87b753d1cb54a5375d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
/*
 * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef NETLINK_SOCKET_H
#define NETLINK_SOCKET_H 1

/* Netlink socket definitions.
 *
 * This header file defines functions for working with Netlink sockets.  Only
 * Linux natively supports Netlink sockets, but Netlink is well suited as a
 * basis for extensible low-level protocols, so it can make sense to implement
 * a Netlink layer on other systems.  This doesn't have to be done in exactly
 * the same way as on Linux, as long as the implementation can support the
 * semantics that are important to Open vSwitch.  See "Usage concepts" below
 * for more information.
 *
 * For Netlink protocol definitions, see netlink-protocol.h.  For helper
 * functions for working with Netlink messages, see netlink.h.
 *
 *
 * Usage concepts
 * ==============
 *
 * Netlink is a datagram-based network protocol primarily for communication
 * between user processes and the kernel.  Netlink is specified in RFC 3549,
 * "Linux Netlink as an IP Services Protocol".
 *
 * Netlink is not suitable for use in physical networks of heterogeneous
 * machines because host byte order is used throughout.
 *
 * The AF_NETLINK socket namespace is subdivided into statically numbered
 * protocols, e.g. NETLINK_ROUTE, NETLINK_NETFILTER, provided as the third
 * argument to the socket() function.  Maintaining the assigned numbers became
 * a bit of a problem, so the "Generic Netlink" NETLINK_GENERIC protocol was
 * introduced to map between human-readable names and dynamically assigned
 * numbers.  All recently introduced Netlink protocol messages in Linux
 * (including all of the Open vSwitch specific messages) fall under
 * NETLINK_GENERIC.  The Netlink library provides the nl_lookup_genl_family()
 * function for translating a Generic Netlink name to a number.  On Linux, this
 * queries the kernel Generic Netlink implementation, but on other systems it
 * might be easier to statically assign each of the names used by Open vSwitch
 * and then implement this function entirely in userspace.
 *
 * Each Netlink socket is distinguished by its Netlink PID, a 32-bit integer
 * that is analogous to a TCP or UDP port number.  The kernel has PID 0.
 *
 * Most Netlink messages manage a kernel table of some kind, e.g. the kernel
 * routing table, ARP table, etc.  Open vSwitch specific messages manage tables
 * of datapaths, ports within datapaths ("vports"), and flows within
 * datapaths.  Open vSwitch also has messages related to network packets
 * received on vports, which aren't really a table.
 *
 * Datagram protocols over a physical network are typically unreliable: in UDP,
 * for example, messages can be dropped, delivered more than once, or delivered
 * out of order.  In Linux, Netlink does not deliver messages out of order or
 * multiple times.  In some cases it can drop messages, but the kernel
 * indicates when a message has been dropped.  The description below of each
 * way Open vSwitch uses Netlink also explains how to work around dropped
 * messages.
 *
 * Open vSwitch uses Netlink in four characteristic ways:
 *
 *    1. Transactions.  A transaction is analogous to a system call, an ioctl,
 *       or an RPC: userspace sends a request to the kernel, which processes
 *       the request synchronously and returns a reply to userspace.
 *       (Sometimes there is no explicit reply, but even in that case userspace
 *       will receive an immediate reply if there is an error.)
 *
 *       nl_transact() is the primary interface for transactions over Netlink.
 *       This function doesn't take a socket as a parameter because sockets do
 *       not have any state related to transactions.
 *
 *       Netlink uses 16-bit "length" fields extensively, which effectively
 *       limits requests and replies to 64 kB.  "Dumps" (see below) are one way
 *       to work around this limit for replies.
 *
 *       In the Linux implementation of Netlink transactions, replies can
 *       sometimes be lost.  When this happens, nl_transact() automatically
 *       executes the transaction again.  This means that it is important that
 *       transactions be idempotent, or that the client be prepared to tolerate
 *       that a transaction might actually execute more than once.
 *
 *       The Linux implementation can execute several transactions at the same
 *       time more efficiently than individually.  nl_transact_multiple()
 *       allows for this.  The semantics are no different from executing each
 *       of the transactions individually with nl_transact().
 *
 *    2. Dumps.  A dump asks the kernel to provide all of the information in a
 *       table.  It consists of a request and a reply, where the reply consists
 *       of an arbitrary number of messages.  Each message in the reply is
 *       limited to 64 kB, as is the request, but the total size of the reply
 *       can be many times larger.
 *
 *       The reply to a dump is usually generated piece by piece, not
 *       atomically.  The reply can represent an inconsistent snapshot of the
 *       table.  This is especially likely if entries in the table were being
 *       added or deleted or changing during the dump.
 *
 *       nl_dump_start() begins a dump based on the caller-provided request and
 *       initializes a "struct nl_dump" to identify the dump.  Subsequent calls
 *       to nl_dump_next() then obtain the reply, one message at a time.
 *       Usually, each message gives information about some entry in a table,
 *       e.g. one flow in the Open vSwitch flow table, or one route in a
 *       routing table.  nl_dump_done() ends the dump.
 *
 *       Linux implements dumps so that messages in a reply do not get lost.
 *
 *    3. Multicast subscriptions.  Most kernel Netlink implementations allow a
 *       process to monitor changes to its table, by subscribing to a Netlink
 *       multicast group dedicated to that table.  Whenever the table's content
 *       changes (e.g. an entry is added or deleted or modified), the Netlink
 *       implementation sends a message to all sockets that subscribe to its
 *       multicast group notifying it of details of the change.  (This doesn't
 *       require much extra work by the Netlink implementer because the message
 *       is generally identical to the one sent as a reply to the request that
 *       changed the table.)
 *
 *       nl_sock_join_mcgroup() subscribes a socket to a multicast group, and
 *       nl_sock_recv() reads notifications.
 *
 *       If userspace doesn't read messages from a socket subscribed to a
 *       multicast group quickly enough, then notification messages can pile up
 *       in the socket's receive buffer.  If this continues long enough, the
 *       receive buffer will fill up and notifications will be lost.  In that
 *       case, nl_sock_recv() will return ENOBUFS.  The client can then use a
 *       dump to resynchronize with the table state.  (A simple implementation
 *       of multicast groups might take advantage of this by simply returning
 *       ENOBUFS whenever a table changes, without implementing actual
 *       notifications.  This would cause lots of extra dumps, so it may not be
 *       suitable as a production implementation.)
 *
 *    4. Unicast subscriptions (Open vSwitch specific).  Userspace can assign
 *       one or more Netlink PIDs to a vport as "upcall PIDs".  When a packet
 *       received on the vport does not match any flow in its datapath's flow
 *       table, the kernel hashes some of the packet's headers, uses the hash
 *       to select one of the PIDs, and sends the packet (encapsulated in an
 *       Open vSwitch Netlink message) to the socket with the selected PID.
 *
 *       nl_sock_recv() reads notifications sent this way.
 *
 *       Specifically on Windows platform, the datapath needs to allocate a
 *       queue for packets, and it does so only when userspace "subscribe"'s to
 *       packets on that netlink socket.  Before closing the netlink socket,
 *       userspace needs to "unsubscribe" packets on that netlink socket.
 *
 *       nl_sock_subscribe_packets() and nl_sock_unsubscribe_packets() are
 *       Windows specific.
 *
 *       Messages received this way can overflow, just like multicast
 *       subscription messages, and they are reported the same way.  Because
 *       packet notification messages do not report the state of a table, there
 *       is no way to recover the dropped packets; they are simply lost.
 *
 *       The main reason to support multiple PIDs per vport is to increase
 *       fairness, that is, to make it harder for a single high-flow-rate
 *       sender to drown out lower rate sources.  Multiple PIDs per vport might
 *       also improve packet handling latency or flow setup rate, but that is
 *       not the main goal.
 *
 *       Old versions of the Linux kernel module supported only one PID per
 *       vport, and userspace still copes with this, so a simple or early
 *       implementation might only support one PID per vport too.
 *
 *
 * Thread-safety
 * =============
 *
 * Most of the netlink functions are not fully thread-safe: Only a single
 * thread may use a given nl_sock or nl_dump at one time. The exceptions are:
 *
 *    - nl_sock_recv() is conditionally thread-safe: it may be called from
 *      different threads with the same nl_sock, but each caller must provide
 *      an independent receive buffer.
 *
 *    - nl_dump_next() is conditionally thread-safe: it may be called from
 *      different threads with the same nl_dump, but each caller must provide
 *      independent buffers.
 */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include "openvswitch/ofpbuf.h"
#include "ovs-atomic.h"
#include "ovs-thread.h"

struct nl_sock;

#ifndef HAVE_NETLINK
#ifndef _WIN32
#error "netlink-socket.h is only for hosts that support Netlink sockets"
#endif
#endif

/* Netlink sockets. */
int nl_sock_create(int protocol, struct nl_sock **);
int nl_sock_clone(const struct nl_sock *, struct nl_sock **);
void nl_sock_destroy(struct nl_sock *);

int nl_sock_join_mcgroup(struct nl_sock *, unsigned int multicast_group);
int nl_sock_leave_mcgroup(struct nl_sock *, unsigned int multicast_group);

int nl_sock_listen_all_nsid(struct nl_sock *, bool enable);

#ifdef _WIN32
int nl_sock_subscribe_packets(struct nl_sock *sock);
int nl_sock_unsubscribe_packets(struct nl_sock *sock);
#endif

int nl_sock_send(struct nl_sock *, const struct ofpbuf *, bool wait);
int nl_sock_send_seq(struct nl_sock *, const struct ofpbuf *,
                     uint32_t nlmsg_seq, bool wait);
int nl_sock_recv(struct nl_sock *, struct ofpbuf *, int *nsid, bool wait);

int nl_sock_drain(struct nl_sock *);

void nl_sock_wait(const struct nl_sock *, short int events);
#ifndef _WIN32
int nl_sock_fd(const struct nl_sock *);
#endif

uint32_t nl_sock_pid(const struct nl_sock *);

/* Batching transactions. */
struct nl_transaction {
    /* Filled in by client. */
    struct ofpbuf *request;     /* Request to send. */

    /* The client must initialize 'reply' to one of:
     *
     *   - NULL, if it does not care to examine the reply.
     *
     *   - Otherwise, to an ofpbuf with a memory allocation of at least
     *     NLMSG_HDRLEN bytes.
     */
    struct ofpbuf *reply;       /* Reply (empty if reply was an error code). */
    int error;                  /* Positive errno value, 0 if no error. */
};

/* Transactions without an allocated socket. */
int nl_transact(int protocol, const struct ofpbuf *request,
                struct ofpbuf **replyp);
void nl_transact_multiple(int protocol, struct nl_transaction **, size_t n);

/* Table dumping. */
#define NL_DUMP_BUFSIZE         4096

struct nl_dump {
    /* These members are immutable during the lifetime of the nl_dump. */
    struct nl_sock *sock;       /* Socket being dumped. */
    uint32_t nl_seq;            /* Expected nlmsg_seq for replies. */
    int status OVS_GUARDED;     /* 0: dump in progress,
                                 * positive errno: dump completed with error,
                                 * EOF: dump completed successfully. */

    /* 'mutex' protects 'status' and serializes access to 'sock'. */
    struct ovs_mutex mutex;     /* Protects 'status', synchronizes recv(). */
};

void nl_dump_start(struct nl_dump *, int protocol,
                   const struct ofpbuf *request);
bool nl_dump_next(struct nl_dump *, struct ofpbuf *reply, struct ofpbuf *buf);
int nl_dump_done(struct nl_dump *);

/* Miscellaneous */
int nl_lookup_genl_family(const char *name, int *number);
int nl_lookup_genl_mcgroup(const char *family_name, const char *group_name,
                           unsigned int *multicast_group);

#endif /* netlink-socket.h */