summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBen Pfaff <blp@nicira.com>2009-07-08 13:19:16 -0700
committerBen Pfaff <blp@nicira.com>2009-07-08 13:19:16 -0700
commit064af42167bf4fc9aaea2702d80ce08074b889c0 (patch)
treeefd15a6dc2402eeec273bb34db3b2445687589e5
downloadopenvswitch-0.90.0.tar.gz
Import from old repository commit 61ef2b42a9c4ba8e1600f15bb0236765edc2ad45.v0.90.0
-rw-r--r--.gitignore43
-rw-r--r--COPYING29
-rw-r--r--CodingStyle504
-rw-r--r--INSTALL514
-rw-r--r--Makefile.am74
-rw-r--r--README74
-rw-r--r--acinclude.m4195
-rwxr-xr-xboot.sh2
-rw-r--r--configure.ac89
-rw-r--r--datapath/.gitignore7
-rw-r--r--datapath/Makefile.am12
-rw-r--r--datapath/Modules.mk32
-rw-r--r--datapath/actions.c421
-rw-r--r--datapath/actions.h18
-rw-r--r--datapath/brc_procfs.c185
-rw-r--r--datapath/brc_procfs.h11
-rw-r--r--datapath/brc_sysfs.h25
-rw-r--r--datapath/brc_sysfs_dp.c532
-rw-r--r--datapath/brc_sysfs_if.c334
-rw-r--r--datapath/brcompat.c519
-rw-r--r--datapath/compat.h17
-rw-r--r--datapath/datapath.c1611
-rw-r--r--datapath/datapath.h139
-rw-r--r--datapath/dp_dev.c210
-rw-r--r--datapath/dp_dev.h27
-rw-r--r--datapath/dp_notify.c29
-rw-r--r--datapath/flow.c301
-rw-r--r--datapath/flow.h49
-rw-r--r--datapath/linux-2.6/.gitignore20
-rw-r--r--datapath/linux-2.6/Kbuild.in34
-rw-r--r--datapath/linux-2.6/Makefile.in9
-rw-r--r--datapath/linux-2.6/Makefile.main.in82
-rw-r--r--datapath/linux-2.6/Modules.mk50
-rw-r--r--datapath/linux-2.6/compat-2.6/compat26.h37
-rw-r--r--datapath/linux-2.6/compat-2.6/genetlink-brcompat.c20
-rw-r--r--datapath/linux-2.6/compat-2.6/genetlink-openvswitch.c22
-rw-r--r--datapath/linux-2.6/compat-2.6/include/asm-generic/bug.h19
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/cpumask.h11
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/dmi.h114
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/err.h21
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/icmp.h13
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/if_arp.h15
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/ip.h18
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/ipv6.h13
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/jiffies.h26
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/kernel.h9
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/lockdep.h450
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/log2.h17
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/mutex.h59
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/netdevice.h35
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/netfilter_bridge.h24
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/netfilter_ipv4.h19
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/netlink.h24
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/percpu.h10
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/random.h17
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/rculist.h12
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/rtnetlink.h29
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/skbuff.h170
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/tcp.h18
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/timer.h96
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/types.h14
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/udp.h13
-rw-r--r--datapath/linux-2.6/compat-2.6/include/linux/workqueue.h42
-rw-r--r--datapath/linux-2.6/compat-2.6/include/net/checksum.h16
-rw-r--r--datapath/linux-2.6/compat-2.6/include/net/genetlink.h123
-rw-r--r--datapath/linux-2.6/compat-2.6/include/net/netlink.h22
-rw-r--r--datapath/linux-2.6/compat-2.6/random32.c144
-rw-r--r--datapath/linux-2.6/compat-2.6/veth.c537
-rw-r--r--datapath/linux-2.6/config/config-linux-2.6.23-rc9-kvm1408
-rw-r--r--datapath/table.c240
-rw-r--r--debian/.gitignore19
-rw-r--r--debian/automake.mk50
-rw-r--r--debian/changelog5
-rwxr-xr-xdebian/commands/reconfigure128
-rwxr-xr-xdebian/commands/update4
-rw-r--r--debian/compat1
-rw-r--r--debian/control143
-rw-r--r--debian/control.modules.in20
-rw-r--r--debian/copyright21
-rwxr-xr-xdebian/corekeeper.cron.daily5
-rwxr-xr-xdebian/corekeeper.init63
-rw-r--r--debian/dirs2
-rw-r--r--debian/openvswitch-common.dirs1
-rw-r--r--debian/openvswitch-common.install3
-rw-r--r--debian/openvswitch-common.manpages2
-rw-r--r--debian/openvswitch-controller.README.Debian12
-rw-r--r--debian/openvswitch-controller.default29
-rw-r--r--debian/openvswitch-controller.dirs1
-rwxr-xr-xdebian/openvswitch-controller.init269
-rw-r--r--debian/openvswitch-controller.install1
-rw-r--r--debian/openvswitch-controller.manpages1
-rwxr-xr-xdebian/openvswitch-controller.postinst52
-rwxr-xr-xdebian/openvswitch-datapath-module-_KVERS_.postinst.modules.in25
-rw-r--r--debian/openvswitch-datapath-source.README.Debian31
-rw-r--r--debian/openvswitch-datapath-source.copyright15
-rw-r--r--debian/openvswitch-datapath-source.dirs1
-rw-r--r--debian/openvswitch-datapath-source.install6
-rw-r--r--debian/openvswitch-monitor.default27
-rw-r--r--debian/openvswitch-monitor.dirs1
-rwxr-xr-xdebian/openvswitch-monitor.init174
-rw-r--r--debian/openvswitch-monitor.install1
-rw-r--r--debian/openvswitch-pki-server.apache21
-rw-r--r--debian/openvswitch-pki-server.dirs1
-rw-r--r--debian/openvswitch-pki-server.install1
-rwxr-xr-xdebian/openvswitch-pki-server.postinst44
-rwxr-xr-xdebian/openvswitch-pki.postinst41
-rw-r--r--debian/openvswitch-switch-config.dirs1
-rw-r--r--debian/openvswitch-switch-config.install1
-rw-r--r--debian/openvswitch-switch-config.manpages1
-rw-r--r--debian/openvswitch-switch-config.overrides1
-rw-r--r--debian/openvswitch-switch-config.templates228
-rw-r--r--debian/openvswitch-switch.README.Debian18
-rw-r--r--debian/openvswitch-switch.dirs2
-rwxr-xr-xdebian/openvswitch-switch.init428
-rw-r--r--debian/openvswitch-switch.install7
-rw-r--r--debian/openvswitch-switch.logrotate11
-rw-r--r--debian/openvswitch-switch.manpages5
-rwxr-xr-xdebian/openvswitch-switch.postinst51
-rwxr-xr-xdebian/openvswitch-switch.postrm43
-rw-r--r--debian/openvswitch-switch.template165
-rw-r--r--debian/openvswitch-switchui.copyright33
-rw-r--r--debian/openvswitch-switchui.default35
-rw-r--r--debian/openvswitch-switchui.dirs3
-rwxr-xr-xdebian/openvswitch-switchui.init210
-rw-r--r--debian/openvswitch-switchui.install2
-rw-r--r--debian/openvswitch-wdt.default24
-rw-r--r--debian/openvswitch-wdt.dirs2
-rwxr-xr-xdebian/openvswitch-wdt.init176
-rw-r--r--debian/openvswitch-wdt.install1
-rwxr-xr-xdebian/ovs-switch-setup615
-rw-r--r--debian/ovs-switch-setup.841
-rw-r--r--debian/po/POTFILES.in1
-rw-r--r--debian/po/templates.pot522
-rwxr-xr-xdebian/rules145
-rw-r--r--extras/ezio/automake.mk49
-rw-r--r--extras/ezio/byteq.c216
-rw-r--r--extras/ezio/byteq.h57
-rw-r--r--extras/ezio/ezio-term.c1060
-rw-r--r--extras/ezio/ezio.c243
-rw-r--r--extras/ezio/ezio.h96
-rw-r--r--extras/ezio/ezio3.ti21
-rw-r--r--extras/ezio/ovs-switchui.c3026
-rw-r--r--extras/ezio/terminal.c833
-rw-r--r--extras/ezio/terminal.h41
-rw-r--r--extras/ezio/tty.c404
-rw-r--r--extras/ezio/tty.h39
-rw-r--r--extras/ezio/vt-dummy.c40
-rw-r--r--extras/ezio/vt-linux.c139
-rw-r--r--extras/ezio/vt.h33
-rw-r--r--include/.gitignore2
-rw-r--r--include/automake.mk2
-rw-r--r--include/openflow/automake.mk4
-rw-r--r--include/openflow/nicira-ext.h109
-rw-r--r--include/openflow/openflow-mgmt.h194
-rw-r--r--include/openflow/openflow.h796
-rw-r--r--include/openvswitch/automake.mk4
-rw-r--r--include/openvswitch/brcompat-netlink.h56
-rw-r--r--include/openvswitch/datapath-protocol.h287
-rw-r--r--lib/.gitignore4
-rw-r--r--lib/automake.mk184
-rw-r--r--lib/backtrace.c106
-rw-r--r--lib/backtrace.h31
-rw-r--r--lib/bitmap.c55
-rw-r--r--lib/bitmap.h82
-rw-r--r--lib/cfg.c1182
-rw-r--r--lib/cfg.h98
-rw-r--r--lib/classifier.c832
-rw-r--r--lib/classifier.h149
-rw-r--r--lib/command-line.c49
-rw-r--r--lib/command-line.h25
-rw-r--r--lib/common.man7
-rw-r--r--lib/compiler.h30
-rw-r--r--lib/coverage-counters.h25
-rwxr-xr-xlib/coverage-scan.pl47
-rw-r--r--lib/coverage.c163
-rw-r--r--lib/coverage.h58
-rw-r--r--lib/csum.c98
-rw-r--r--lib/csum.h31
-rw-r--r--lib/daemon.c294
-rw-r--r--lib/daemon.h51
-rw-r--r--lib/daemon.man21
-rw-r--r--lib/dh1024.pem10
-rw-r--r--lib/dh2048.pem12
-rw-r--r--lib/dh4096.pem18
-rw-r--r--lib/dhcp-client.c1073
-rw-r--r--lib/dhcp-client.h56
-rw-r--r--lib/dhcp.c825
-rw-r--r--lib/dhcp.h262
-rw-r--r--lib/dhparams.h10
-rw-r--r--lib/dirs.h25
-rw-r--r--lib/dpif.c1060
-rw-r--r--lib/dpif.h102
-rw-r--r--lib/dpif.man16
-rw-r--r--lib/dynamic-string.c261
-rw-r--r--lib/dynamic-string.h62
-rw-r--r--lib/fatal-signal.c253
-rw-r--r--lib/fatal-signal.h39
-rw-r--r--lib/fault.c73
-rw-r--r--lib/fault.h23
-rw-r--r--lib/flow.c280
-rw-r--r--lib/flow.h101
-rw-r--r--lib/hash.c85
-rw-r--r--lib/hash.h71
-rw-r--r--lib/hmap.c145
-rw-r--r--lib/hmap.h254
-rw-r--r--lib/leak-checker.c244
-rw-r--r--lib/leak-checker.h41
-rw-r--r--lib/leak-checker.man7
-rw-r--r--lib/learning-switch.c644
-rw-r--r--lib/learning-switch.h33
-rw-r--r--lib/list.c158
-rw-r--r--lib/list.h71
-rw-r--r--lib/mac-learning.c285
-rw-r--r--lib/mac-learning.h37
-rw-r--r--lib/netdev.c1556
-rw-r--r--lib/netdev.h116
-rw-r--r--lib/netlink-protocol.h141
-rw-r--r--lib/netlink.c1077
-rw-r--r--lib/netlink.h127
-rw-r--r--lib/odp-util.c133
-rw-r--r--lib/odp-util.h82
-rw-r--r--lib/ofp-print.c1473
-rw-r--r--lib/ofp-print.h44
-rw-r--r--lib/ofpbuf.c288
-rw-r--r--lib/ofpbuf.h73
-rw-r--r--lib/packets.h274
-rw-r--r--lib/pcap.c163
-rw-r--r--lib/pcap.h30
-rw-r--r--lib/poll-loop.c267
-rw-r--r--lib/poll-loop.h55
-rw-r--r--lib/port-array.c183
-rw-r--r--lib/port-array.h95
-rw-r--r--lib/process.c417
-rw-r--r--lib/process.h48
-rw-r--r--lib/queue.c120
-rw-r--r--lib/queue.h34
-rw-r--r--lib/random.c90
-rw-r--r--lib/random.h30
-rw-r--r--lib/rconn.c959
-rw-r--r--lib/rconn.h104
-rw-r--r--lib/sat-math.h46
-rw-r--r--lib/sha1.c394
-rw-r--r--lib/sha1.h74
-rw-r--r--lib/shash.c93
-rw-r--r--lib/shash.h42
-rw-r--r--lib/signals.c128
-rw-r--r--lib/signals.h27
-rw-r--r--lib/socket-util.c343
-rw-r--r--lib/socket-util.h39
-rw-r--r--lib/stp.c1226
-rw-r--r--lib/stp.h103
-rw-r--r--lib/svec.c381
-rw-r--r--lib/svec.h60
-rw-r--r--lib/tag.c82
-rw-r--r--lib/tag.h134
-rw-r--r--lib/timeval.c305
-rw-r--r--lib/timeval.h51
-rw-r--r--lib/type-props.h32
-rw-r--r--lib/unixctl.c592
-rw-r--r--lib/unixctl.h44
-rw-r--r--lib/util.c296
-rw-r--r--lib/util.h117
-rw-r--r--lib/valgrind.h26
-rw-r--r--lib/vconn-provider.h170
-rw-r--r--lib/vconn-ssl.c1197
-rw-r--r--lib/vconn-ssl.h54
-rw-r--r--lib/vconn-stream.c346
-rw-r--r--lib/vconn-stream.h35
-rw-r--r--lib/vconn-tcp.c186
-rw-r--r--lib/vconn-unix.c118
-rw-r--r--lib/vconn.c1405
-rw-r--r--lib/vconn.h128
-rw-r--r--lib/vlog-modules.def65
-rw-r--r--lib/vlog.c711
-rw-r--r--lib/vlog.h192
-rw-r--r--lib/vlog.man44
-rw-r--r--lib/xtoxll.h34
-rw-r--r--m4/nx-build.m453
-rw-r--r--m4/openvswitch.m4210
-rw-r--r--secchan/.gitignore4
-rw-r--r--secchan/automake.mk42
-rw-r--r--secchan/commands/automake.mk3
-rwxr-xr-xsecchan/commands/reboot3
-rw-r--r--secchan/discovery.c270
-rw-r--r--secchan/discovery.h38
-rw-r--r--secchan/executer.c519
-rw-r--r--secchan/executer.h33
-rw-r--r--secchan/fail-open.c140
-rw-r--r--secchan/fail-open.h38
-rw-r--r--secchan/in-band.c358
-rw-r--r--secchan/in-band.h37
-rw-r--r--secchan/main.c565
-rw-r--r--secchan/netflow.c328
-rw-r--r--secchan/netflow.h33
-rw-r--r--secchan/ofproto.c3305
-rw-r--r--secchan/ofproto.h109
-rw-r--r--secchan/pinsched.c284
-rw-r--r--secchan/pinsched.h35
-rw-r--r--secchan/pktbuf.c150
-rw-r--r--secchan/pktbuf.h34
-rw-r--r--secchan/secchan.8.in463
-rw-r--r--secchan/status.c229
-rw-r--r--secchan/status.h45
-rwxr-xr-xsoexpand.pl26
-rw-r--r--tests/.gitignore10
-rw-r--r--tests/automake.mk56
-rwxr-xr-xtests/flowgen.pl224
-rw-r--r--tests/test-classifier.c977
-rw-r--r--tests/test-dhcp-client.c189
-rw-r--r--tests/test-flows.c76
-rwxr-xr-xtests/test-flows.sh9
-rw-r--r--tests/test-hash.c155
-rw-r--r--tests/test-hmap.c281
-rw-r--r--tests/test-list.c159
-rw-r--r--tests/test-stp-ieee802.1d-199812
-rw-r--r--tests/test-stp-ieee802.1d-2004-fig17.431
-rw-r--r--tests/test-stp-ieee802.1d-2004-fig17.614
-rw-r--r--tests/test-stp-ieee802.1d-2004-fig17.717
-rw-r--r--tests/test-stp-iol-io-1.125
-rw-r--r--tests/test-stp-iol-io-1.214
-rw-r--r--tests/test-stp-iol-io-1.413
-rw-r--r--tests/test-stp-iol-io-1.540
-rw-r--r--tests/test-stp-iol-op-1.17
-rw-r--r--tests/test-stp-iol-op-1.48
-rw-r--r--tests/test-stp-iol-op-3.111
-rw-r--r--tests/test-stp-iol-op-3.311
-rw-r--r--tests/test-stp-iol-op-3.411
-rw-r--r--tests/test-stp.c648
-rwxr-xr-xtests/test-stp.sh7
-rw-r--r--tests/test-type-props.c41
-rw-r--r--third-party/.gitignore2
-rw-r--r--third-party/README35
-rw-r--r--third-party/automake.mk3
-rw-r--r--third-party/ofp-tcpdump.patch109
-rw-r--r--utilities/.gitignore22
-rw-r--r--utilities/automake.mk74
-rw-r--r--utilities/nlmon.c90
-rw-r--r--utilities/ovs-appctl.8.in166
-rw-r--r--utilities/ovs-appctl.c221
-rw-r--r--utilities/ovs-cfg-mod.8.in101
-rw-r--r--utilities/ovs-cfg-mod.c239
-rw-r--r--utilities/ovs-controller.8.in132
-rw-r--r--utilities/ovs-controller.c323
-rw-r--r--utilities/ovs-discover.8.in118
-rw-r--r--utilities/ovs-discover.c405
-rw-r--r--utilities/ovs-dpctl.8.in166
-rw-r--r--utilities/ovs-dpctl.c552
-rw-r--r--utilities/ovs-kill.8.in60
-rw-r--r--utilities/ovs-kill.c210
-rwxr-xr-xutilities/ovs-monitor128
-rw-r--r--utilities/ovs-ofctl.8.in489
-rw-r--r--utilities/ovs-ofctl.c1278
-rwxr-xr-xutilities/ovs-parse-leaks.in285
-rwxr-xr-xutilities/ovs-pki-cgi.in41
-rw-r--r--utilities/ovs-pki.8.in323
-rwxr-xr-xutilities/ovs-pki.in582
-rw-r--r--utilities/ovs-wdt.c263
-rw-r--r--vswitchd/.gitignore7
-rw-r--r--vswitchd/automake.mk40
-rw-r--r--vswitchd/bridge.c3058
-rw-r--r--vswitchd/bridge.h43
-rw-r--r--vswitchd/mgmt.c679
-rw-r--r--vswitchd/mgmt.h36
-rw-r--r--vswitchd/ovs-brcompatd.8.in49
-rw-r--r--vswitchd/ovs-brcompatd.c766
-rw-r--r--vswitchd/ovs-vswitchd.8.in87
-rw-r--r--vswitchd/ovs-vswitchd.c255
-rw-r--r--vswitchd/ovs-vswitchd.conf.5.in642
-rw-r--r--vswitchd/ovs-vswitchd.h32
-rw-r--r--vswitchd/port.c68
-rw-r--r--vswitchd/port.h33
-rw-r--r--vswitchd/proc-net-compat.c344
-rw-r--r--vswitchd/proc-net-compat.h51
-rw-r--r--vswitchd/xenserver.c90
-rw-r--r--vswitchd/xenserver.h32
-rw-r--r--xenserver/README78
-rw-r--r--xenserver/automake.mk19
-rwxr-xr-xxenserver/etc_init.d_vswitch302
-rwxr-xr-xxenserver/etc_init.d_vswitch-xapi-update71
-rw-r--r--xenserver/etc_logrotate.d_vswitch14
-rw-r--r--xenserver/etc_profile.d_vswitch.sh56
-rw-r--r--xenserver/etc_sysconfig_vswitch.example79
-rwxr-xr-xxenserver/etc_xapi.d_plugins_vswitch-cfg-update123
-rwxr-xr-xxenserver/etc_xensource_scripts_vif130
-rwxr-xr-xxenserver/opt_xensource_libexec_interface-reconfigure1572
-rw-r--r--xenserver/usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py296
-rw-r--r--xenserver/vswitch-xen.spec310
387 files changed, 75535 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..480615ac9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,43 @@
+#*#
+*.a
+*.d
+*.ko
+*.la
+*.lo
+*.loT
+*.mod.c
+*.o
+*.o
+*.pyc
+*.so
+*~
+.#*
+.*.cmd
+.*.swp
+.deps
+.libs
+.tmp_versions
+/Makefile
+/Makefile.in
+/aclocal.m4
+/autom4te.cache
+/build-arch-stamp
+/build-aux
+/build-indep-stamp
+/compile
+/config.guess
+/config.h
+/config.h.in
+/config.log
+/config.status
+/config.sub
+/configure
+/configure-stamp
+/depcomp
+/install-sh
+/missing
+/stamp-h1
+Module.symvers
+TAGS
+cscope.*
+tags
diff --git a/COPYING b/COPYING
new file mode 100644
index 000000000..c4a9fb2f2
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,29 @@
+This file is a summary of the licensing of files in this distribution.
+Some files may be marked specifically with a different license, in
+which case that license applies to the file in question.
+
+Files under the debian, doc, include, lib, m4, secchan, tests,
+third-party, and utilities directories are licensed under the ISC
+license:
+
+ Permission to use, copy, modify, and/or distribute this software for any
+ purpose with or without fee is hereby granted, provided that the above
+ copyright notice and this permission notice appear in all copies.
+
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Files under the datapath directory are licensed under the GNU General
+Public License, version 2.
+
+Files under the extras and vswitchd directories are licensed under the
+GNU General Public License, version 3 or later.
+
+Files under the xenserver directory are licensed on a file-by-file
+basis. Some files are under an uncertain license that may not be
+DFSG-compliant or GPL-compatible. Refer to each file for details.
diff --git a/CodingStyle b/CodingStyle
new file mode 100644
index 000000000..126b45a89
--- /dev/null
+++ b/CodingStyle
@@ -0,0 +1,504 @@
+ Open vSwitch Coding Style
+ =========================
+
+This file describes the coding style used in most C files in the Open
+vSwitch distribution. However, Linux kernel code in the datapath directory
+follows the Linux kernel's established coding conventions.
+
+BASICS
+
+ Limit lines to 79 characters.
+
+ Use form feeds (control+L) to divide long source files into logical
+pieces. A form feed should appear as the only character on a line.
+
+ Do not use tabs for indentation.
+
+ Avoid trailing spaces on lines.
+
+
+NAMING
+
+ Use names that explain the purpose of a function or object.
+
+ Use underscores to separate words in an identifier: multi_word_name.
+
+ Use lowercase for most names. Use uppercase for macros, macro
+parameters, and members of enumerations.
+
+ Give arrays names that are plural.
+
+ Pick a unique name prefix (ending with an underscore) for each
+module, and apply that prefix to all of that module's externally
+visible names. Names of macro parameters, struct and union members,
+and parameters in function prototypes are not considered externally
+visible for this purpose.
+
+ Do not use names that begin with _. If you need a name for
+"internal use only", use __ as a suffix instead of a prefix.
+
+ Avoid negative names: "found" is a better name than "not_found".
+
+ In names, a "size" is a count of bytes, a "length" is a count of
+characters. A buffer has size, but a string has length. The length
+of a string does not include the null terminator, but the size of the
+buffer that contains the string does.
+
+
+COMMENTS
+
+ Comments should be written as full sentences that start with a
+capital letter and end with a period. Put two spaces between
+sentences.
+
+ Write block comments as shown below. You may put the /* and */ on
+the same line as comment text if you prefer.
+
+ /*
+ * We redirect stderr to /dev/null because we often want to remove all
+ * traffic control configuration on a port so it's in a known state. If
+ * this is done when there is no such configuration, tc complains, so we just
+ * always ignore it.
+ */
+
+ Each function and each variable declared outside a function, and
+each struct, union, and typedef declaration should be preceded by a
+comment. See FUNCTION DEFINITIONS below for function comment
+guidelines.
+
+ Each struct and union member should have an inline comment that
+explains its meaning. structs and unions with many members should be
+additionally divided into logical groups of members by block comments,
+e.g.:
+
+ /* An event that will wake the following call to poll_block(). */
+ struct poll_waiter {
+ /* Set when the waiter is created. */
+ struct list node; /* Element in global waiters list. */
+ int fd; /* File descriptor. */
+ short int events; /* Events to wait for (POLLIN, POLLOUT). */
+ poll_fd_func *function; /* Callback function, if any, or null. */
+ void *aux; /* Argument to callback function. */
+ struct backtrace *backtrace; /* Event that created waiter, or null. */
+
+ /* Set only when poll_block() is called. */
+ struct pollfd *pollfd; /* Pointer to element of the pollfds array
+ (null if added from a callback). */
+ };
+
+ Use XXX or FIXME comments to mark code that needs work.
+
+ Don't use // comments.
+
+ Don't comment out or #if 0 out code. Just remove it. The code that
+was there will still be in version control history.
+
+
+FUNCTIONS
+
+ Put the return type, function name, and the braces that surround the
+function's code on separate lines, all starting in column 0.
+
+ Before each function definition, write a comment that describes the
+function's purpose, including each parameter, the return value, and
+side effects. References to argument names should be given in
+single-quotes, e.g. 'arg'. The comment should not include the
+function name, nor need it follow any formal structure. The comment
+does not need to describe how a function does its work, unless this
+information is needed to use the function correctly (this is often
+better done with comments *inside* the function).
+
+ Simple static functions do not need a comment.
+
+ Within a file, non-static functions should come first, in the order
+that they are declared in the header file, followed by static
+functions. Static functions should be in one or more separate pages
+(separated by form feed characters) in logical groups. A commonly
+useful way to divide groups is by "level", with high-level functions
+first, followed by groups of progressively lower-level functions.
+This makes it easy for the program's reader to see the top-down
+structure by reading from top to bottom.
+
+ All function declarations and definitions should include a
+prototype. Empty parentheses, e.g. "int foo();", do not include a
+prototype (they state that the function's parameters are unknown);
+write "void" in parentheses instead, e.g. "int foo(void);".
+
+ Prototypes for static functions should either all go at the top of
+the file, separated into groups by blank lines, or they should appear
+at the top of each page of functions. Don't comment individual
+prototypes, but a comment on each group of prototypes is often
+appropriate.
+
+ In the absence of good reasons for another order, the following
+parameter order is preferred. One notable exception is that data
+parameters and their corresponding size parameters should be paired.
+
+ 1. The primary object being manipulated, if any (equivalent to the
+ "this" pointer in C++).
+ 2. Input-only parameters.
+ 3. Input/output parameters.
+ 4. Output-only parameters.
+ 5. Status parameter.
+
+ Example:
+
+ /* Stores the features supported by 'netdev' into each of '*current',
+ * '*advertised', '*supported', and '*peer' that are non-null. Each value
+ * is a bitmap of "enum ofp_port_features" bits, in host byte order.
+ * Returns 0 if successful, otherwise a positive errno value. On failure,
+ * all of the passed-in values are set to 0. */
+ int
+ netdev_get_features(struct netdev *netdev,
+ uint32_t *current, uint32_t *advertised,
+ uint32_t *supported, uint32_t *peer)
+ {
+ ...
+ }
+
+
+FUNCTION PROTOTYPES
+
+ Put the return type and function name on the same line in a function
+prototype:
+
+ static const struct option_class *get_option_class(int code);
+
+
+ Omit parameter names from function prototypes when the names do not
+give useful information, e.g.:
+
+ int netdev_get_mtu(const struct netdev *);
+
+
+STATEMENTS
+
+ Indent each level of code with 4 spaces. Use BSD-style brace
+placement:
+
+ if (a()) {
+ b();
+ d();
+ }
+
+ Put a space between "if", "while", "for", etc. and the expressions
+that follow them.
+
+ Enclose single statements in braces:
+
+ if (a > b) {
+ return a;
+ } else {
+ return b;
+ }
+
+ Use comments and blank lines to divide long functions into logical
+groups of statements.
+
+ Avoid assignments inside "if" and "while" conditions.
+
+ Do not put gratuitous parentheses around the expression in a return
+statement, that is, write "return 0;" and not "return(0);".
+
+ Write only one statement per line.
+
+ Indent "switch" statements like this:
+
+ switch (conn->state) {
+ case S_RECV:
+ error = run_connection_input(conn);
+ break;
+
+ case S_PROCESS:
+ error = 0;
+ break;
+
+ case S_SEND:
+ error = run_connection_output(conn);
+ break;
+
+ default:
+ NOT_REACHED();
+ }
+
+ "switch" statements with very short, uniform cases may use an
+abbreviated style:
+
+ switch (code) {
+ case 200: return "OK";
+ case 201: return "Created";
+ case 202: return "Accepted";
+ case 204: return "No Content";
+ default: return "Unknown";
+ }
+
+ Use "for (;;)" to write an infinite loop.
+
+ In an if/else construct where one branch is the "normal" or "common"
+case and the other branch is the "uncommon" or "error" case, put the
+common case after the "if", not the "else". This is a form of
+documentation. It also places the most important code in sequential
+order without forcing the reader to visually skip past less important
+details. (Some compilers also assume that the "if" branch is the more
+common case, so this can be a real form of optimization as well.)
+
+
+MACROS
+
+ Don't define an object-like macro if an enum can be used instead.
+
+ Don't define a function-like macro if a "static inline" function
+can be used instead.
+
+ If a macro's definition contains multiple statements, enclose them
+with "do { ... } while (0)" to allow them to work properly in all
+syntactic circumstances.
+
+ Do use macros to eliminate the need to update different parts of a
+single file in parallel, e.g. a list of enums and an array that gives
+the name of each enum. For example:
+
+ /* Logging importance levels. */
+ #define VLOG_LEVELS \
+ VLOG_LEVEL(EMER, LOG_ALERT) \
+ VLOG_LEVEL(ERR, LOG_ERR) \
+ VLOG_LEVEL(WARN, LOG_WARNING) \
+ VLOG_LEVEL(INFO, LOG_NOTICE) \
+ VLOG_LEVEL(DBG, LOG_DEBUG)
+ enum vlog_level {
+ #define VLOG_LEVEL(NAME, SYSLOG_LEVEL) VLL_##NAME,
+ VLOG_LEVELS
+ #undef VLOG_LEVEL
+ VLL_N_LEVELS
+ };
+
+ /* Name for each logging level. */
+ static const char *level_names[VLL_N_LEVELS] = {
+ #define VLOG_LEVEL(NAME, SYSLOG_LEVEL) #NAME,
+ VLOG_LEVELS
+ #undef VLOG_LEVEL
+ };
+
+
+SOURCE FILES
+
+ Each source file should state its license in a comment at the very
+top, followed by a comment explaining the purpose of the code that is
+in that file. The comment should explain how the code in the file
+relates to code in other files. The goal is to allow a programmer to
+quickly figure out where a given module fits into the larger system.
+
+ The first non-comment line in a .c source file should be:
+
+ #include <config.h>
+
+#include directives should appear in the following order:
+
+ 1. #include <config.h>
+
+ 2. The module's own headers, if any. Including this before any
+ other header (besides <config.h>) ensures that the module's
+ header file is self-contained (see HEADER FILES below).
+
+ 3. Standard C library headers and other system headers, preferably
+ in alphabetical order. (Occasionally one encounters a set of
+ system headers that must be included in a particular order, in
+ which case that order must take precedence.)
+
+ 4. Open vSwitch headers, in alphabetical order. Use "", not <>,
+ to specify Open vSwitch header names.
+
+
+HEADER FILES
+
+ Each header file should start with its license, as described under
+SOURCE FILES above, followed by a "header guard" to make the header
+file idempotent, like so:
+
+ #ifndef NETDEV_H
+ #define NETDEV_H 1
+
+ ...
+
+ #endif /* netdev.h */
+
+ Header files should be self-contained; that is, they should #include
+whatever additional headers are required, without requiring the client
+to #include them for it.
+
+ Don't define the members of a struct or union in a header file,
+unless client code is actually intended to access them directly or if
+the definition is otherwise actually needed (e.g. inline functions
+defined in the header need them).
+
+ Similarly, don't #include a header file just for the declaration of
+a struct or union tag (e.g. just for "struct <name>;"). Just declare
+the tag yourself. This reduces the number of header file
+dependencies.
+
+
+TYPES
+
+ Use typedefs sparingly. Code is clearer if the actual type is
+visible at the point of declaration. Do not, in general, declare a
+typedef for a struct, union, or enum. Do not declare a typedef for a
+pointer type, because this can be very confusing to the reader.
+
+ A function type is a good use for a typedef because it can clarify
+code. The type should be a function type, not a pointer-to-function
+type. That way, the typedef name can be used to declare function
+prototypes. (It cannot be used for function definitions, because that
+is explicitly prohibited by C89 and C99.)
+
+ You may assume that "char" is exactly 8 bits and that "int" and
+"long" are at least 32 bits.
+
+ Don't assume that "long" is big enough to hold a pointer. If you
+need to cast a pointer to an integer, use "intptr_t" or "uintptr_t"
+from <stdint.h>.
+
+ Use the int<N>_t and uint<N>_t types from <stdint.h> for exact-width
+integer types. Use the PRId<N>, PRIu<N>, and PRIx<N> macros from
+<inttypes.h> for formatting them with printf() and related functions.
+
+ Use %zu to format size_t with printf().
+
+ Use bit-fields sparingly. Do not use bit-fields for layout of
+network protocol fields or in other circumstances where the exact
+format is important.
+
+ Declare bit-fields to be type "unsigned int" or "signed int". Do
+*not* declare bit-fields of type "int": C89 allows these to be either
+signed or unsigned according to the compiler's whim. (A 1-bit
+bit-field of type "int" may have a range of -1...0!) Do not declare
+bit-fields of type _Bool or enum or any other type, because these are
+not portable.
+
+ Try to order structure members such that they pack well on a system
+with 2-byte "short", 4-byte "int", and 4- or 8-byte "long" and pointer
+types. Prefer clear organization over size optimization unless you
+are convinced there is a size or speed benefit.
+
+ Pointer declarators bind to the variable name, not the type name.
+Write "int *x", not "int* x" and definitely not "int * x".
+
+
+EXPRESSIONS
+
+ Put one space on each side of infix binary and ternary operators:
+
+ * / %
+ + -
+ << >>
+ < <= > >=
+ == !=
+ &
+ ^
+ |
+ &&
+ ||
+ ?:
+ = += -= *= /= %= &= ^= |= <<= >>=
+
+ Avoid comma operators.
+
+ Do not put any white space around postfix, prefix, or grouping
+operators:
+
+ () [] -> .
+ ! ~ ++ -- + - * &
+
+Exception 1: Put a space after (but not before) the "sizeof" keyword.
+Exception 2: Put a space between the () used in a cast and the
+expression whose type is cast: (void *) 0.
+
+ Break long lines before binary operators and the ternary operators ?
+and :, rather than after them, e.g.
+
+ if (first_long_condition() || second_long_condition()
+ || third_long_condition())
+
+and
+
+ return (out_port != VIGP_CONTROL_PATH
+ ? alpheus_output_port(dp, skb, out_port)
+ : alpheus_output_control(dp, skb, fwd_save_skb(skb),
+ VIGR_ACTION));
+
+
+ Do not parenthesize the operands of && and || unless operator
+precedence makes it necessary, or unless the operands are themselves
+expressions that use && and ||. Thus:
+
+ if (!isdigit(s[0]) || !isdigit(s[1]) || !isdigit(s[2])) {
+ printf("string %s does not start with 3-digit code\n", s);
+ }
+
+but
+
+ if (rule && (!best || rule->priority > best->priority)) {
+ best = rule;
+ }
+
+ Do parenthesize a subexpression that must be split across more than
+one line, e.g.:
+
+ *idxp = ((l1_idx << PORT_ARRAY_L1_SHIFT)
+ | (l2_idx << PORT_ARRAY_L2_SHIFT)
+ | (l3_idx << PORT_ARRAY_L3_SHIFT));
+
+ Try to avoid casts. Don't cast the return value of malloc().
+
+ The "sizeof" operator is unique among C operators in that it accepts
+two very different kinds of operands: an expression or a type. In
+general, prefer to specify an expression, e.g. "int *x =
+xmalloc(sizeof *x);". When the operand of sizeof is an expression,
+there is no need to parenthesize that operand, and please don't.
+
+ Use the ARRAY_SIZE macro from lib/util.h to calculate the number of
+elements in an array.
+
+ When using a relational operator like "<" or "==", put an expression
+or variable argument on the left and a constant argument on the
+right, e.g. "x == 0", *not* "0 == x".
+
+
+BLANK LINES
+
+ Put one blank line between top-level definitions of functions and
+global variables.
+
+
+C DIALECT
+
+ Try to avoid using GCC extensions where possible.
+
+ Some C99 extensions are OK:
+
+ * Flexible array members (e.g. struct { int foo[]; }).
+
+ * "static inline" functions (but no other forms of "inline", for
+ which GCC and C99 have differing interpretations).
+
+ * "long long"
+
+ * <stdint.h> and <inttypes.h>.
+
+ * bool and <stdbool.h>, but don't assume that bool or _Bool can
+ only take on the values 0 or 1, because this behavior can't be
+ simulated on C89 compilers.
+
+ Don't use other C99 extensions, and especially:
+
+ * Don't use // comments.
+
+ * Don't use designated initializers (e.g. don't write "struct foo
+ foo = {.a = 1};" or "int a[] = {[2] = 5};").
+
+ * Don't mix declarations and code within a block.
+
+ * Don't use declarations in iteration statements (e.g. don't write
+ "for (int i = 0; i < 10; i++)").
+
+ * Don't put a trailing comma in an enum declaration (e.g. don't
+ write "enum { x = 1, };").
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 000000000..994e8d320
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,514 @@
+ Open vSwitch Installation Instructions
+
+This document describes how to build, install, and execute
+Open vSwitch.
+
+Open vSwitch implements an Ethernet switch with MAC learning that may
+be configured with any of the following features:
+
+ * NIC bonding with automatic fail-over and source MAC-based TX
+ load balancing ("SLB").
+
+ * 802.1Q VLAN support.
+
+ * Port mirroring, with optional VLAN tagging.
+
+ * NetFlow v5 flow logging.
+
+ * Connectivity to an external OpenFlow controller, such as
+ NOX.
+
+The current version of this distribution requires a kernel module to
+be built and loaded. An (optional) entirely userspace switch is on
+the roadmap for future versions.
+
+The distribution also contains a number of related utilities.
+
+Build Methods
+=============
+
+There are two principal ways to build and install this distribution:
+
+ - Using "configure" and "make" in the ordinary way. See
+ Building Conventionally below for detailed instructions.
+
+ - As a set of Debian packages. Refer to Building Debian
+ Packages, below, for instructions.
+
+Base Prerequisites
+------------------
+
+Regardless of how it is built, Open vSwitch has a common set of
+prerequisites. To compile the userspace programs in the Open vSwitch
+distribution, you will need the following software:
+
+ - A make program, e.g. GNU make
+ (http://www.gnu.org/software/make/). BSD make should also work.
+
+ - The GNU C compiler (http://gcc.gnu.org/). We generally test
+ with version 4.2 or 4.3.
+
+ - libssl, from OpenSSL (http://www.openssl.org/), is optional but
+ recommended if you plan to connect the Open vSwitch to an
+ OpenFlow controller. libssl is required to establish
+ confidentiality and authenticity in the connections from an
+ Open vSwitch to an OpenFlow controller. To enable, configure
+ with --enable-ssl=yes.
+
+To compile the kernel module, you must also install the following:
+
+ - A supported Linux kernel version. Please refer to README for a
+ list of supported versions.
+
+ The OpenFlow datapath requires bridging support (CONFIG_BRIDGE)
+ to be built as a kernel module. (This is common in kernels
+ provided by Linux distributions.) The bridge module must not be
+ loaded or in use. If the bridge module is running (check with
+ "lsmod | grep bridge"), you must remove it ("rmmod bridge")
+ before starting the datapath.
+
+ - To build a kernel module, you need the same version of GCC that
+ was used to build that kernel (usually version 4.0 or later).
+
+ - A kernel build directory corresponding to the Linux kernel image
+ the module is to run on. Under Debian and Ubuntu, for example,
+ each linux-image package containing a kernel binary has a
+ corresponding linux-headers package with the required build
+ infrastructure.
+
+If you are working from a Git tree or snapshot (instead of from a
+distribution tarball), or if you modify the Open vSwitch build system,
+you will also need the following software:
+
+ - Autoconf version 2.60 or later (http://www.gnu.org/software/autoconf).
+
+ - Automake version 1.10 or later (http://www.gnu.org/software/automake).
+
+ - pkg-config (http://pkg-config.freedesktop.org/wiki/). We test
+ with version 0.22.
+
+Debian Prerequisites
+--------------------
+
+To build Debian packages from the Open vSwitch distribution, you will
+need to install a number of Debian packages in addition to the base
+prerequisites listed above. These additional prerequisites may be
+found listed as "Build-Depends" in debian/control in the source tree.
+To check that they are installed, first install the dpkg-dev package,
+then run dpkg-checkbuilddeps from the top level of the OpenFlow source
+tree.
+
+To build Debian packages without being root, also install the
+"fakeroot" package.
+
+Building Conventionally
+=======================
+
+This section explains how to build and install the Open vSwitch
+distribution in the ordinary way using "configure" and "make".
+
+0. Check that you have installed all the prerequisites listed above in
+ the Base Prerequisites section.
+
+1. In the top source directory, configure the package by running the
+ configure script. You can usually invoke configure without any
+ arguments:
+
+ % ./configure
+
+ To use a specific C compiler for compiling OpenFlow user programs,
+ also specify it on the configure command line, like so:
+
+ % ./configure CC=gcc-4.2
+
+ To build the Linux kernel module, so that you can run the
+ kernel-based switch, pass the location of the kernel build
+ directory on --with-l26. For example, to build for a running
+ instance of Linux 2.6:
+
+ % ./configure --with-l26=/lib/modules/`uname -r`/build
+
+ If you wish to build the kernel module for an architecture other
+ than the architecture of the machine used for the build, you may
+ specify the kernel architecture string using the KARCH variable
+ when invoking the configure script. For example, to build for MIPS
+ with Linux 2.6:
+
+ % ./configure --with-l26=/path/to/linux-2.6 KARCH=mips
+
+ The configure script accepts a number of other options and honors
+ additional environment variables. For a full list, invoke
+ configure with the --help option.
+
+2. Run make in the top source directory:
+
+ % make
+
+ The following main binaries will be built:
+
+ - Virtual switch daemon: vswitchd/ovs-vswitchd
+
+ - Bridge compatibility daemon: vswitchd/ovs-brcompatd
+
+ - Datapath administration utility: utilities/ovs-dpctl.
+
+ Some less important binaries will be built also:
+
+ - Runtime configuration utility: utilities/ovs-appctl.
+
+ - Simple OpenFlow controller: utilities/ovs-controller.
+
+ - Secure channel executable: secchan/secchan.
+
+ - Miscellaneous utilities: utilities/ovs-discover,
+ utilities/ovs-kill.
+
+ - ANSI terminal support for EZIO 16x2 LCD panel:
+ extras/ezio/ezio-term (only if the proper libraries are
+ installed).
+
+ - Switch monitoring UI for small text displays:
+ extras/ezio/ovs-switchui (only if the proper libraries are
+ installed).
+
+ - Tests: various binaries in tests/.
+
+ If you passed --with-l26 to configure, "make" will also build the
+ following kernel modules:
+
+ - datapath/linux-2.6/brcompat_mod.ko
+
+ - datapath/linux-2.6/openvswitch_mod.ko
+
+3. Run "make install" to install the executables and manpages into the
+ running system, by default under /usr/local.
+
+4. If you built kernel modules, you may load them with "insmod", e.g.:
+
+ % insmod datapath/linux-2.6/openvswitch_mod.ko
+
+ The insmod program must be run as root. You may need to specify a
+ full path to insmod, e.g. /sbin/insmod. To verify that the modules
+ have been loaded, run "/sbin/lsmod" and check that openvswitch_mod is
+ listed.
+
+5. Test the virtual switch, as described under Testing the Virtual
+Switch below.
+
+Building Debian Packages
+========================
+
+Follow these instructions to build Debian packages for OpenFlow.
+
+0. Check that you have installed all the prerequisites listed above in
+ the Base Prerequisites and Debian Prerequisites sections above.
+
+1. In the top source directory, run the following command, as root:
+
+ % dpkg-buildpackage
+
+ Alternatively, if you installed the "fakeroot" package, you may run
+ dpkg-buildpackage as an ordinary user with the following syntax:
+
+ % dpkg-buildpackage -rfakeroot
+
+ The following packages will be built in the directory above the
+ source tree:
+
+ - openflow-controller: The OpenFlow controller. Depends on
+ openflow-pki (see below).
+
+ - openflow-switch: Install this package on a machine that acts
+ as an OpenFlow kernel switch.
+
+ - openflow-datapath-source: Source code for OpenFlow's Linux
+ kernel module.
+
+ - openflow-pki: Public-key infrastructure for OpenFlow. Install
+ this package on a machine that acts as an OpenFlow PKI server
+ (see "Establishing a Public Key Infrastructure" below).
+
+ - openflow-common: Files and utilities required by more than one
+ of the above packages.
+
+2. To set up an OpenFlow controller, install the openflow-controller
+ package and its dependencies. You may configure it by editing
+ /etc/default/openflow-controller, e.g. to enable non-SSL
+ connections, which are disabled by default. If you change the
+ default settings, you will need to restart the controller by
+ running:
+
+ % /etc/init.d/openflow-controller restart
+
+3. To set up an OpenFlow switch, install the openflow-switch package
+ and its dependencies. If it is to be a kernel-based switch, also
+ install openflow-datapath-source, then follow the instructions in
+ /usr/share/doc/openflow-datapath-source/README.Debian to build and
+ install the kernel module.
+
+ You may configure the switch one of the following ways:
+
+ - Completely by hand, as described under the Testing section
+ below.
+
+ For the userspace datapath-based switch, this is the only
+ supported form of configuration.
+
+ - By editing /etc/default/openflow-switch. You must at least
+ configure some network devices, by uncommenting NETDEVS and
+ adding the appropriate devices to the list, e.g. NETDEVS="eth0
+ eth1".
+
+ After you edit this file, you will need to restart the switch by
+ running:
+
+ % /etc/init.d/openflow-switch restart
+
+ This form of configuration is not supported for the userspace
+ datapath-based switch.
+
+ - By running the ovs-switch-setup program. This interactive
+ program will walk you through all the steps of configuring an
+ OpenFlow switch, including configuration of SSL certificates.
+ Run it without arguments, as root:
+
+ % ovs-switch-setup
+
+ This form of configuration is not supported for the userspace
+ datapath-based switch.
+
+Installation
+============
+
+This section explains how to install Open vSwitch in a network with one
+controller and one or more switches, each of which runs on a separate
+machine. Before you begin, you must decide on one of two ways for
+each switch to reach the controller over the network:
+
+ - Use a "control network" that is completely separate from the
+ "data network" to be controlled ("out-of-band control"). The
+ location of the controller must be configured manually in this
+ case.
+
+ - Use the same network for control and for data ("in-band
+ control"). When in-band control is used, the location of the
+ controller may be configured manually or discovered
+ automatically. We will assume manual configuration here;
+ please refer to secchan(8) for instructions on setting up
+ controller discovery.
+
+Controller Setup
+----------------
+
+On the machine that is to be the OpenFlow controller, start the
+"ovs-controller" program listening for connections from switches on
+TCP port 6633 (the default), as shown below.
+
+ # ovs-controller -v ptcp:
+
+(See ovs-controller(8) for more details.)
+
+Make sure the machine hosting the controller is reachable by the
+switch.
+
+Testing the Virtual Switch
+--------------------------
+
+The Open vSwitch kernel module must be loaded, as described under
+"Building Conventionally", before it may be used.
+
+0. The commands below must run as root, so log in as root, or use a
+ program such as "su" to become root temporarily.
+
+1. Create a datapath instance. The command below creates a datapath
+ identified as dp0 (see ovs-dpctl(8) for more detailed usage
+ information).
+
+ # ovs-dpctl add-dp dp0
+
+ (dp0 is the first datapath within a host. openvswitch_mod supports
+ multiple datapaths within the same host, which would be identified
+ as dp1, dp2, etc.)
+
+ Creating datapath dp0 creates a new network device, also named dp0.
+ This network device, called the datapath's "local port", will be
+ bridged to the physical switch ports by the secchan, for use in
+ in-band control.
+
+2. Use ovs-dpctl to attach the datapath to physical interfaces on the
+ machine. Say, for example, you want to create a trivial 2-port
+ switch using interfaces eth1 and eth2, you would issue the
+ following commands:
+
+ # ovs-dpctl add-if dp0 eth1
+ # ovs-dpctl add-if dp0 eth2
+
+ You can verify that the interfaces were successfully added by asking
+ ovs-dpctl to print the current status of datapath dp0:
+
+ # ovs-dpctl show dp0
+
+3. Arrange so that the switch can reach the controller over the
+ network.
+
+ - If you are using out-of-band control, at this point make sure
+ that the switch machine can reach the controller over the
+ network.
+
+ - If you are using in-band control, then at this point you must
+ configure the dp0 network device created in step 1. This
+ device is not yet bridged to any physical network (because
+ secchan does that, and it is not yet running), so the next
+ step depends on whether connectivity is required to configure
+ the device's IP address:
+
+ * If the switch has a static IP address, you may configure
+ its IP address now, e.g.:
+
+ # ifconfig dp0 192.168.1.1
+
+ * If the switch does not have a static IP address, e.g. its
+ IP address is obtained dynamically via DHCP, then proceed
+ to step 4. The DHCP client will not be able to contact
+ the DHCP server until the secure channel has started up.
+
+ - If you are using in-band control with controller discovery, no
+ configuration is required at this point. You may proceed to
+ step 4.
+
+4. Run secchan to start the secure channel connecting the datapath to
+ a remote controller. If the controller is running on host
+ 192.168.1.2 port 6633 (the default port), the secchan invocation
+ would look like this:
+
+ # secchan dp0 tcp:192.168.1.2
+
+ - If you are using in-band control with controller discovery, omit
+ the second argument to the secchan command.
+
+ - If you are using out-of-band control, add --out-of-band to the
+ command line.
+
+5. If you are using in-band control with manual configuration, and the
+ switch obtains its IP address dynamically, then you may now obtain
+ the switch's IP address, e.g. by invoking a DHCP client. The
+ secure channel will only be able to connect to the controller after
+ an IP address has been obtained.
+
+6. The secure channel should connect to the controller within a few
+ seconds. It may take a little longer if controller discovery is in
+ use, because the switch must then also obtain its own IP address
+ and the controller's location via DHCP.
+
+Configuration
+=============
+
+Secure operation over SSL
+-------------------------
+
+The instructions above set up Open vSwitch for operation over a
+plaintext TCP connection. Production use of Open vSwitch should use
+SSL[*] to ensure confidentiality and authenticity of traffic among
+switches and controllers. The source must be configured with
+--enable-ssl=yes to build with SSL support.
+
+To use SSL with Open vSwitch, you must set up a public-key infrastructure
+(PKI) including a pair of certificate authorities (CAs), one for
+controllers and one for switches. If you have an established PKI,
+Open vSwitch can use it directly. Otherwise, refer to "Establishing a
+Public Key Infrastructure" below.
+
+To configure the controller to listen for SSL connections on port 6633
+(the default), invoke it as follows:
+
+ # ovs-controller -v pssl: --private-key=PRIVKEY --certificate=CERT \
+ --ca-cert=CACERT
+
+where PRIVKEY is a file containing the controller's private key, CERT
+is a file containing the controller CA's certificate for the
+controller's public key, and CACERT is a file containing the root
+certificate for the switch CA. If, for example, your PKI was created
+with the instructions below, then the invocation would look like:
+
+ # ovs-controller -v pssl: --private-key=ctl-privkey.pem \
+ --certificate=ctl-cert.pem --ca-cert=pki/switchca/cacert.pem
+
+To configure a switch to connect to a controller running on port 6633
+(the default) on host 192.168.1.2 over SSL, invoke secchan as follows:
+
+ # secchan -v DATAPATH ssl:192.168.1.2 --private-key=PRIVKEY \
+ --certificate=CERT --ca-cert=CACERT
+
+where DATAPATH is the datapath to connect to (e.g. dp0 or
+unix:/var/run/dp0.sock), PRIVKEY is a file containing the switch's
+private key, CERT is a file containing the switch CA's certificate for
+the switch's public key, and CACERT is a file containing the root
+certificate for the controller CA. If, for example, your PKI was
+created with the instructions below, then the invocation would look
+like:
+
+ # secchan -v DATAPATH ssl:192.168.1.2 --private-key=sc-privkey.pem \
+ --certificate=sc-cert.pem --ca-cert=pki/controllerca/cacert.pem
+
+[*] To be specific, Open vSwitch uses TLS version 1.0 or later (TLSv1), as
+ specified by RFC 2246, which is very similar to SSL version 3.0.
+ TLSv1 was released in January 1999, so all current software and
+ hardware should implement it.
+
+Establishing a Public Key Infrastructure
+----------------------------------------
+
+If you do not have a PKI, the ovs-pki script included with Open vSwitch
+can help. To create an initial PKI structure, invoke it as:
+ % ovs-pki init
+which will create and populate a new PKI directory. The default
+location for the PKI directory depends on how the Open vSwitch tree was
+configured (to see the configured default, look for the --dir option
+description in the output of "ovs-pki --help").
+
+The pki directory contains two important subdirectories. The
+controllerca subdirectory contains controller certificate authority
+related files, including the following:
+
+ - cacert.pem: Root certificate for the controller certificate
+ authority. This file must be provided to secchan with the
+ --ca-cert option to enable it to authenticate valid controllers.
+
+ - private/cakey.pem: Private signing key for the controller
+ certificate authority. This file must be kept secret. There is
+ no need for switches or controllers to have a copy of it.
+
+The switchca subdirectory contains switch certificate authority
+related files, analogous to those in the controllerca subdirectory:
+
+ - cacert.pem: Root certificate for the switch certificate
+ authority. This file must be provided to the controller program
+ with the --ca-cert option to enable it to authenticate valid
+ switches.
+
+ - private/cakey.pem: Private signing key for the switch
+ certificate authority. This file must be kept secret. There is
+ no need for switches or controllers to have a copy of it.
+
+After you create the initial structure, you can create keys and
+certificates for switches and controllers with ovs-pki. To create a
+controller private key and certificate in files named ctl-privkey.pem
+and ctl-cert.pem, for example, you could run:
+ % ovs-pki req+sign ctl controller
+ctl-privkey.pem and ctl-cert.pem would need to be copied to the
+controller for its use at runtime (they could then be deleted from
+their original locations). The --private-key and --certificate
+options of ovs-controller, respectively, would point to these files.
+
+Analogously, to create a switch private key and certificate in files
+named sc-privkey.pem and sc-cert.pem, for example, you could run:
+ % ovs-pki req+sign sc switch
+sc-privkey.pem and sc-cert.pem would need to be copied to the switch
+for its use at runtime (they could then be deleted from their original
+locations). The --private-key and --certificate options,
+respectively, of secchan would point to these files.
+
+Bug Reporting
+-------------
+
+Please report problems to ovs-bugs@openvswitch.org.
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 000000000..18108cf02
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1,74 @@
+# Top-level Automake input for the Open vSwitch distribution.
+#
+# "foreign" relaxes Automake's GNU-standards file checks, and
+# "subdir-objects" places each object file next to its source file.
+AUTOMAKE_OPTIONS = foreign subdir-objects
+ACLOCAL_AMFLAGS = -I m4
+SUBDIRS = datapath
+
+# Everything from here to the matching endif at the end of the file applies
+# only when the ENABLE_USERSPACE conditional is true.
+if ENABLE_USERSPACE
+# When dpkg-buildpackage was found by configure, "make distcheck" also
+# builds the Debian packages from the source directory and then cleans up
+# after the package build.  Otherwise the hook is defined but does nothing.
+if HAVE_DPKG_BUILDPACKAGE
+distcheck-hook:
+ cd $(srcdir) && dpkg-buildpackage -rfakeroot -us -uc
+ cd $(srcdir) && fakeroot ./debian/rules clean
+else
+distcheck-hook:
+endif
+
+# Preprocessor flags shared by every userspace target: flags for the
+# optional libraries plus the project's own include and lib directories.
+AM_CPPFLAGS = $(SSL_CFLAGS)
+AM_CPPFLAGS += $(NCURSES_CFLAGS)
+AM_CPPFLAGS += $(PCRE_CFLAGS)
+AM_CPPFLAGS += -I $(top_srcdir)/include
+AM_CPPFLAGS += -I $(top_srcdir)/lib
+
+AM_CFLAGS = -Wstrict-prototypes
+
+# NDEBUG builds define NDEBUG and omit the frame pointer; other builds link
+# with -export-dynamic instead.
+if NDEBUG
+AM_CPPFLAGS += -DNDEBUG
+AM_CFLAGS += -fomit-frame-pointer
+else
+AM_LDFLAGS = -export-dynamic
+endif
+
+# Start each list variable out empty.  Presumably this lets the automake.mk
+# fragments included below append to them with "+=" (as EXTRA_DIST is
+# appended to just below) -- confirm against those fragments.
+CLEANFILES =
+DISTCLEANFILES =
+EXTRA_DIST =
+TESTS =
+TESTS_ENVIRONMENT =
+bin_PROGRAMS =
+sbin_PROGRAMS =
+bin_SCRIPTS =
+dist_commands_DATA =
+dist_man_MANS =
+dist_pkgdata_SCRIPTS =
+dist_sbin_SCRIPTS =
+man_MANS =
+noinst_HEADERS =
+noinst_LIBRARIES =
+noinst_PROGRAMS =
+noinst_SCRIPTS =
+
+EXTRA_DIST += soexpand.pl
+
+# Shell command that prints an Emacs file-variables comment marking a C
+# buffer as read-only.  It is not used in this file; presumably the included
+# fragments use it as the first line of generated C sources.
+ro_c = echo '/* -*- mode: c; buffer-read-only: t -*- */'
+
+# Suffix rule that produces FILE from FILE.in: the input is piped through
+# soexpand.pl and then sed replaces the @LOGDIR@, @PKIDIR@, @RUNDIR@,
+# @pkgdatadir@ and @PERL@ placeholders with the corresponding make
+# variables.  Each "@" in a pattern is written as "[@]", which still matches
+# a literal "@" but keeps the pattern itself from looking like an @NAME@
+# substitution.
+SUFFIXES = .in
+.in:
+ $(PERL) $(srcdir)/soexpand.pl -I$(srcdir) < $< | \
+ sed -e 's,[@]LOGDIR[@],$(LOGDIR),g' \
+ -e 's,[@]PKIDIR[@],$(PKIDIR),g' \
+ -e 's,[@]RUNDIR[@],$(RUNDIR),g' \
+ -e 's,[@]pkgdatadir[@],$(pkgdatadir),g' \
+ -e 's,[@]PERL[@],$(PERL),g' > $@
+
+# Per-directory build fragments.  The extras/ezio fragment is included only
+# when both the HAVE_CURSES and HAVE_PCRE conditionals are true.
+include lib/automake.mk
+include secchan/automake.mk
+include utilities/automake.mk
+include tests/automake.mk
+include include/automake.mk
+include third-party/automake.mk
+include debian/automake.mk
+include vswitchd/automake.mk
+include xenserver/automake.mk
+if HAVE_CURSES
+if HAVE_PCRE
+include extras/ezio/automake.mk
+endif
+endif
+endif # ENABLE_USERSPACE
diff --git a/README b/README
new file mode 100644
index 000000000..8991e4c81
--- /dev/null
+++ b/README
@@ -0,0 +1,74 @@
+ Open vSwitch <http://openvswitch.org>
+
+What is Open vSwitch?
+---------------------
+
+Open vSwitch is an Ethernet switch for virtual servers with the
+following features:
+
+ * NIC bonding with automatic fail-over and source MAC-based TX
+ load balancing ("SLB").
+
+ * 802.1Q VLAN support.
+
+ * Port mirroring, with optional VLAN tagging.
+
+ * NetFlow v5 flow logging.
+
+ * Connectivity to an external OpenFlow controller, such as
+ NOX.
+
+What's here?
+------------
+
+The most important components of this distribution are:
+
+ - A Linux kernel module for flow-based switching, in the
+ datapath directory.
+
+ - ovs-vswitchd, a daemon that implements the virtual switch.
+
+ - ovs-dpctl, a tool for configuring the kernel module and
+ controlling OpenFlow switches.
+
+This distribution includes some additional software as well:
+
+ - secchan, a program that implements a simple OpenFlow switch
+ (without the special features provided by ovs-vswitchd) using
+ the same kernel module as ovs-vswitchd.
+
+ - ovs-controller, a simple OpenFlow controller.
+
+ - ovs-ofctl, a utility for querying and controlling OpenFlow
+ switches and controllers.
+
+ - ovs-appctl, a utility that can control Open vSwitch daemons,
+ adjusting their logging levels among other uses.
+
+ - ovs-pki, a utility for creating and managing the public-key
+ infrastructure for OpenFlow switches.
+
+ - A patch to tcpdump that enables it to parse OpenFlow
+ messages.
+
+For installation instructions, read INSTALL. Each userspace program
+is also accompanied by a manpage.
+
+Platform support
+----------------
+
+Our primary test environment is Debian GNU/Linux. Ports to other
+platforms are welcome. Please contact us with portability-related bug
+reports or patches.
+
+The testing of the kernel module has focused on version 2.6.18 from
+Xen and version 2.6.26 from kernel.org. Linux 2.6 releases from
+2.6.15 onward should also work.
+
+GCC is the expected compiler.
+
+Contact
+-------
+
+ovs-bugs@openvswitch.org
+http://openvswitch.org/
diff --git a/acinclude.m4 b/acinclude.m4
new file mode 100644
index 000000000..498196b30
--- /dev/null
+++ b/acinclude.m4
@@ -0,0 +1,195 @@
+# -*- autoconf -*-
+
+# Copyright (c) 2008, 2009 Nicira Networks.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+dnl OVS_CHECK_USERSPACE
+dnl
+dnl Checks for --disable-userspace.
+dnl
+dnl Sets the shell variable build_userspace to "true" or "false" and defines
+dnl the Automake conditional ENABLE_USERSPACE to match.  Userspace components
+dnl are built unless --disable-userspace (or --enable-userspace=no) is given;
+dnl any value other than "yes" or "no" aborts configure with an error.
+AC_DEFUN([OVS_CHECK_USERSPACE],
+  [AC_ARG_ENABLE(
+     [userspace],
+     [AS_HELP_STRING([--disable-userspace],
+                     [Disable building userspace components.])],
+     [case "${enableval}" in
+        (yes) build_userspace=true ;;
+        (no)  build_userspace=false ;;
+        (*) AC_MSG_ERROR([bad value ${enableval} for --enable-userspace]) ;;
+      esac],
+     [build_userspace=true])
+   AM_CONDITIONAL([ENABLE_USERSPACE], [$build_userspace])])
+
+dnl OVS_CHECK_LINUX(OPTION, VERSION, VARIABLE, CONDITIONAL)
+dnl
+dnl Configure linux kernel source tree.
+dnl
+dnl Adds a --with-OPTION=DIR configure option naming a Linux VERSION kernel
+dnl build directory.  When the option is given, DIR must exist, its Makefile
+dnl must identify it as kernel VERSION, and the tree must already be
+dnl configured; otherwise configure stops with an error.  On success DIR is
+dnl stored in, and substituted as, VARIABLE.  The Automake conditional
+dnl CONDITIONAL is true exactly when the option was given.
+dnl
+dnl Also leaves the shell variables "patchlevel" and "sublevel" set to the
+dnl values read from the kernel Makefile.
+AC_DEFUN([OVS_CHECK_LINUX], [
+  AC_ARG_WITH([$1],
+              [AS_HELP_STRING([--with-$1=/path/to/linux-$2],
+                              [Specify the linux $2 kernel sources])],
+              [path="$withval"], [path=])dnl
+  if test -n "$path"; then
+    dnl Re-evaluate the value so that shell constructs in it, such as a
+    dnl leading ~, are expanded.
+    path=`eval echo "$path"`
+
+    AC_MSG_CHECKING([for $path directory])
+    if test -d "$path"; then
+      AC_MSG_RESULT([yes])
+      $3=$path
+      AC_SUBST([$3])
+    else
+      AC_MSG_RESULT([no])
+      AC_MSG_ERROR([source dir $path doesn't exist])
+    fi
+
+    AC_MSG_CHECKING([for $path kernel version])
+    patchlevel=`sed -n 's/^PATCHLEVEL = //p' "$path/Makefile"`
+    sublevel=`sed -n 's/^SUBLEVEL = //p' "$path/Makefile"`
+    AC_MSG_RESULT([2.$patchlevel.$sublevel])
+    if test "2.$patchlevel" != '$2'; then
+      AC_MSG_ERROR([Linux kernel source in $path is not version $2])
+    fi
+    dnl Both headers are required here; their absence is reported as an
+    dnl unconfigured kernel tree.
+    if test ! -e "$path"/include/linux/version.h || \
+       test ! -e "$path"/include/linux/autoconf.h; then
+      AC_MSG_ERROR([Linux kernel source in $path is not configured])
+    fi
+    m4_if([$2], [2.6], [OVS_CHECK_LINUX26_COMPAT])
+  fi
+  AM_CONDITIONAL([$4], [test -n "$path"])
+])
+
+dnl OVS_GREP_IFELSE(FILE, REGEX, IF-MATCH, IF-NO-MATCH)
+dnl
+dnl Greps FILE for REGEX.  If it matches, runs IF-MATCH, otherwise IF-NO-MATCH.
+dnl Either action may be omitted.  If grep exits with a status other than
+dnl 0 or 1, configure stops with an error that reports the status.
+dnl
+dnl REGEX is placed inside single quotes in the generated shell code, so it
+dnl must not itself contain a single quote.  FILE is placed inside double
+dnl quotes, so it may contain shell variable references and spaces.
+AC_DEFUN([OVS_GREP_IFELSE], [
+  AC_MSG_CHECKING([whether $2 matches in $1])
+  grep '$2' "$1" >/dev/null 2>&1
+  status=$?
+  case $status in
+    0)
+      AC_MSG_RESULT([yes])
+      $3
+      ;;
+    1)
+      AC_MSG_RESULT([no])
+      $4
+      ;;
+    *)
+      AC_MSG_ERROR([grep exited with status $status])
+      ;;
+  esac
+])
+
+dnl OVS_DEFINE(NAME)
+dnl
+dnl Defines NAME to 1 in kcompat.h.
+dnl
+dnl The "#define" line is appended to datapath/linux-2.6/kcompat.h.new, the
+dnl scratch file that OVS_CHECK_LINUX26_COMPAT creates and later moves into
+dnl place as kcompat.h.
+AC_DEFUN([OVS_DEFINE], [
+ echo '#define $1 1' >> datapath/linux-2.6/kcompat.h.new
+])
+
+dnl OVS_CHECK_VETH
+dnl
+dnl Decides whether to build the veth module.  BUILD_VETH is substituted as 1
+dnl only when the shell variable "sublevel" (set by OVS_CHECK_LINUX from the
+dnl kernel Makefile's SUBLEVEL) is 18.
+AC_DEFUN([OVS_CHECK_VETH], [
+ AC_MSG_CHECKING([whether to build veth module])
+ if test "$sublevel" = 18; then
+ AC_MSG_RESULT([yes])
+ AC_SUBST([BUILD_VETH], 1)
+ else
+ AC_MSG_RESULT([no])
+ fi
+])
+
+dnl OVS_CHECK_LOG2_H
+dnl
+dnl Checks whether the kernel tree in $KSRC26 provides include/linux/log2.h
+dnl and, if so, defines HAVE_LOG2_H in kcompat.h.
+AC_DEFUN([OVS_CHECK_LOG2_H], [
+  AC_MSG_CHECKING([for $KSRC26/include/linux/log2.h])
+  dnl The path is quoted so that a kernel directory whose name contains
+  dnl spaces is still tested as a single file name.
+  if test -e "$KSRC26/include/linux/log2.h"; then
+    AC_MSG_RESULT([yes])
+    OVS_DEFINE([HAVE_LOG2_H])
+  else
+    AC_MSG_RESULT([no])
+  fi
+])
+
+dnl OVS_CHECK_LINUX26_COMPAT
+dnl
+dnl Runs various Autoconf checks on the Linux 2.6 kernel source in
+dnl the directory in $KSRC26.
+dnl
+dnl The results are collected as "#define" lines in
+dnl datapath/linux-2.6/kcompat.h.new, which starts out empty.  Each
+dnl OVS_GREP_IFELSE call below defines the named HAVE_ macro when the given
+dnl pattern occurs anywhere in the given kernel header.  At the end the new
+dnl file replaces kcompat.h only if the two differ, so an unchanged
+dnl kcompat.h is left untouched (presumably to avoid needless rebuilds).
+AC_DEFUN([OVS_CHECK_LINUX26_COMPAT], [
+ rm -f datapath/linux-2.6/kcompat.h.new
+ mkdir -p datapath/linux-2.6
+ : > datapath/linux-2.6/kcompat.h.new
+ OVS_GREP_IFELSE([$KSRC26/include/linux/skbuff.h], [skb_transport_header],
+ [OVS_DEFINE([HAVE_SKBUFF_HEADER_HELPERS])])
+ OVS_GREP_IFELSE([$KSRC26/include/linux/skbuff.h], [raw],
+ [OVS_DEFINE([HAVE_MAC_RAW])])
+ OVS_GREP_IFELSE([$KSRC26/include/linux/skbuff.h],
+ [skb_copy_from_linear_data_offset],
+ [OVS_DEFINE([HAVE_SKB_COPY_FROM_LINEAR_DATA_OFFSET])])
+ OVS_GREP_IFELSE([$KSRC26/include/net/netlink.h], [NLA_NUL_STRING],
+ [OVS_DEFINE([HAVE_NLA_NUL_STRING])])
+ OVS_GREP_IFELSE([$KSRC26/include/linux/err.h], [ERR_CAST],
+ [OVS_DEFINE([HAVE_ERR_CAST])])
+ OVS_CHECK_LOG2_H
+ OVS_CHECK_VETH
+ if cmp -s datapath/linux-2.6/kcompat.h.new \
+ datapath/linux-2.6/kcompat.h >/dev/null 2>&1; then
+ rm datapath/linux-2.6/kcompat.h.new
+ else
+ mv datapath/linux-2.6/kcompat.h.new datapath/linux-2.6/kcompat.h
+ fi
+])
+
+dnl OVS_CHECK_IF_PACKET
+dnl
+dnl Checks for net/if_packet.h.
+dnl
+dnl Sets the shell variable HAVE_IF_PACKET to "yes" or "no", defines the
+dnl Automake conditional HAVE_IF_PACKET to match, and, when the header is
+dnl available, defines the C preprocessor macro HAVE_IF_PACKET to 1.
+AC_DEFUN([OVS_CHECK_IF_PACKET],
+ [AC_CHECK_HEADER([net/if_packet.h],
+ [HAVE_IF_PACKET=yes],
+ [HAVE_IF_PACKET=no])
+ AM_CONDITIONAL([HAVE_IF_PACKET], [test "$HAVE_IF_PACKET" = yes])
+ if test "$HAVE_IF_PACKET" = yes; then
+ AC_DEFINE([HAVE_IF_PACKET], [1],
+ [Define to 1 if net/if_packet.h is available.])
+ fi])
+
+dnl OVS_CHECK_DPKG_BUILDPACKAGE
+dnl
+dnl Checks for dpkg-buildpackage.  If this is available then we check
+dnl that the Debian packaging is functional at "make distcheck" time.
+dnl
+dnl Sets the shell variable HAVE_DPKG_BUILDPACKAGE to "yes" or "no" and
+dnl defines the Automake conditional of the same name to match.  The variable
+dnl is quoted in the test so that an empty or multi-word value never
+dnl produces a malformed "test" command.
+AC_DEFUN([OVS_CHECK_DPKG_BUILDPACKAGE],
+  [AC_CHECK_PROG([HAVE_DPKG_BUILDPACKAGE], [dpkg-buildpackage], [yes], [no])
+   AM_CONDITIONAL([HAVE_DPKG_BUILDPACKAGE],
+                  [test "$HAVE_DPKG_BUILDPACKAGE" = yes])])
+
+dnl ----------------------------------------------------------------------
+dnl These macros are from GNU PSPP, with the following original license:
+dnl Copyright (C) 2005, 2006, 2007 Free Software Foundation, Inc.
+dnl This file is free software; the Free Software Foundation
+dnl gives unlimited permission to copy and/or distribute it,
+dnl with or without modifications, as long as this notice is preserved.
+
+dnl OVS_CHECK_CC_OPTION([OPTION], [ACTION-IF-ACCEPTED], [ACTION-IF-REJECTED])
+dnl Check whether the given C compiler OPTION is accepted.
+dnl If so, execute ACTION-IF-ACCEPTED, otherwise ACTION-IF-REJECTED.
+dnl Either action may be omitted.
+dnl
+dnl The test compiles an empty program with OPTION appended to CFLAGS and
+dnl restores CFLAGS afterward.  The result is cached in a shell variable
+dnl named ovs_cv_ followed by OPTION with each "-" and "=" replaced by "_",
+dnl so that options such as -Wformat=2 also yield a valid variable name.
+AC_DEFUN([OVS_CHECK_CC_OPTION],
+[
+  m4_define([ovs_cv_name], [ovs_cv_[]m4_translit([$1], [-=], [__])])dnl
+  AC_CACHE_CHECK([whether $CC accepts $1], [ovs_cv_name],
+    [ovs_save_CFLAGS="$CFLAGS"
+     CFLAGS="$CFLAGS $1"
+     AC_COMPILE_IFELSE([AC_LANG_PROGRAM(,)], [ovs_cv_name[]=yes], [ovs_cv_name[]=no])
+     CFLAGS="$ovs_save_CFLAGS"])
+  if test $ovs_cv_name = yes; then
+    dnl A branch with no command is a shell syntax error, so the no-op ":"
+    dnl stands in for an omitted action in either branch.
+    m4_if([$2], [], [:], [$2])
+  else
+    m4_if([$3], [], [:], [$3])
+  fi
+])
+
+dnl OVS_ENABLE_OPTION([OPTION])
+dnl Check whether the given C compiler OPTION is accepted.
+dnl If so, add it to CFLAGS.
+dnl If the compiler rejects OPTION, CFLAGS is left unchanged.
+dnl Example: OVS_ENABLE_OPTION([-Wdeclaration-after-statement])
+AC_DEFUN([OVS_ENABLE_OPTION],
+ [OVS_CHECK_CC_OPTION([$1], [CFLAGS="$CFLAGS $1"])])
+dnl ----------------------------------------------------------------------
diff --git a/boot.sh b/boot.sh
new file mode 100755
index 000000000..05dd35996
--- /dev/null
+++ b/boot.sh
@@ -0,0 +1,2 @@
+#! /bin/sh
+autoreconf --install --force
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 000000000..a557f0f4f
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,89 @@
+# Copyright (c) 2008, 2009 Nicira Networks
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+# Package identity and Autoconf/Automake setup.
+AC_PREREQ(2.60)
+AC_INIT(openvswitch, 0.90.0, ovs-bugs@openvswitch.org)
+# NOTE(review): NX_BUILDNR is not defined in acinclude.m4; presumably it
+# comes from the m4 directory and sets a build number -- confirm.
+NX_BUILDNR
+AC_CONFIG_SRCDIR([datapath/datapath.c])
+AC_CONFIG_MACRO_DIR([m4])
+AC_CONFIG_AUX_DIR([build-aux])
+AC_CONFIG_HEADERS([config.h])
+AM_INIT_AUTOMAKE
+
+# Toolchain programs.
+AC_PROG_CC
+AM_PROG_CC_C_O
+AC_PROG_CPP
+AC_PROG_RANLIB
+AC_PROG_MKDIR_P
+
+# Perl is mandatory: configure fails if no interpreter is found.  The user
+# may name a specific one through the PERL variable.
+AC_ARG_VAR([PERL], [path to Perl interpreter])
+AC_PATH_PROG([PERL], perl, no)
+if test "$PERL" = no; then
+ AC_MSG_ERROR([Perl interpreter not found in $PATH or $PERL.])
+fi
+
+AC_USE_SYSTEM_EXTENSIONS
+AC_C_BIGENDIAN
+AC_SYS_LARGEFILE
+
+# Feature checks that run whether or not userspace is being built.
+# OVS_CHECK_USERSPACE sets $build_userspace, which is tested below.  Only
+# OVS_CHECK_USERSPACE, OVS_CHECK_IF_PACKET and OVS_CHECK_DPKG_BUILDPACKAGE
+# are defined in acinclude.m4; the others are defined elsewhere.
+OVS_CHECK_USERSPACE
+OVS_CHECK_NDEBUG
+OVS_CHECK_NETLINK
+OVS_CHECK_OPENSSL
+OVS_CHECK_LOGDIR
+OVS_CHECK_CURSES
+OVS_CHECK_LINUX_VT_H
+OVS_CHECK_PCRE
+OVS_CHECK_IF_PACKET
+OVS_CHECK_DPKG_BUILDPACKAGE
+
+# Checks needed only for the userspace programs.  $build_userspace holds
+# the command name "true" or "false", so it is run directly as the
+# condition.
+if $build_userspace; then
+ OVS_CHECK_PKIDIR
+ OVS_CHECK_RUNDIR
+ OVS_CHECK_MALLOC_HOOKS
+ OVS_CHECK_VALGRIND
+ OVS_CHECK_TTY_LOCK_DIR
+ OVS_CHECK_SOCKET_LIBS
+ OVS_CHECK_FAULT_LIBS
+
+ AC_CHECK_FUNCS([strsignal])
+
+ # Each option below is appended to CFLAGS only if the compiler accepts it.
+ OVS_ENABLE_OPTION([-Wall])
+ OVS_ENABLE_OPTION([-Wno-sign-compare])
+ OVS_ENABLE_OPTION([-Wpointer-arith])
+ OVS_ENABLE_OPTION([-Wdeclaration-after-statement])
+ OVS_ENABLE_OPTION([-Wformat-security])
+ OVS_ENABLE_OPTION([-Wswitch-enum])
+ OVS_ENABLE_OPTION([-Wunused-parameter])
+ OVS_ENABLE_OPTION([-Wstrict-aliasing])
+ OVS_ENABLE_OPTION([-Wbad-function-cast])
+ OVS_ENABLE_OPTION([-Wcast-align])
+ OVS_ENABLE_OPTION([-Wstrict-prototypes])
+ OVS_ENABLE_OPTION([-Wold-style-definition])
+ OVS_ENABLE_OPTION([-Wmissing-prototypes])
+ OVS_ENABLE_OPTION([-Wmissing-field-initializers])
+ OVS_ENABLE_OPTION([-Wno-override-init])
+fi
+
+# Kernel module configuration: --with-l26=DIR names a Linux 2.6 build
+# directory, which is stored in KSRC26 and enables the L26_ENABLED
+# conditional.  KARCH optionally names the kernel architecture.
+AC_ARG_VAR(KARCH, [Kernel Architecture String])
+AC_SUBST(KARCH)
+OVS_CHECK_LINUX(l26, 2.6, KSRC26, L26_ENABLED)
+
+# Files that config.status generates from their .in templates.
+AC_CONFIG_FILES([Makefile
+datapath/Makefile
+datapath/linux-2.6/Kbuild
+datapath/linux-2.6/Makefile
+datapath/linux-2.6/Makefile.main])
+
+AC_OUTPUT
diff --git a/datapath/.gitignore b/datapath/.gitignore
new file mode 100644
index 000000000..5a59a0d39
--- /dev/null
+++ b/datapath/.gitignore
@@ -0,0 +1,7 @@
+/Makefile
+/Makefile.in
+*.cmd
+*.ko
+*.mod.c
+Module.symvers
+
diff --git a/datapath/Makefile.am b/datapath/Makefile.am
new file mode 100644
index 000000000..71e2dc485
--- /dev/null
+++ b/datapath/Makefile.am
@@ -0,0 +1,12 @@
+# Descend into linux-2.6 only when the L26_ENABLED conditional is true.
+SUBDIRS =
+if L26_ENABLED
+SUBDIRS += linux-2.6
+endif
+
+# Ship the module sources and headers; both lists are computed in Modules.mk.
+EXTRA_DIST = $(dist_headers) $(dist_sources)
+
+# Suppress warnings about GNU extensions in Modules.mk files.
+AUTOMAKE_OPTIONS = -Wno-portability
+
+include Modules.mk
+include linux-2.6/Modules.mk
diff --git a/datapath/Modules.mk b/datapath/Modules.mk
new file mode 100644
index 000000000..1b5de4aba
--- /dev/null
+++ b/datapath/Modules.mk
@@ -0,0 +1,32 @@
+# Some modules should be built and distributed, e.g. openvswitch.
+#
+# Some modules should be distributed but not built, e.g. we do not build
+# veth if the kernel in question already has it.
+#
+# Some modules should be built but not distributed, e.g. third-party
+# hwtable modules.
+both_modules = openvswitch
+build_modules = $(both_modules) # Modules to build
+dist_modules = $(both_modules) # Modules to distribute
+
+# Per-module file lists.  The variable names must follow the pattern
+# <module>_sources / <module>_headers because the $(foreach) expressions
+# below look them up by module name.
+openvswitch_sources = \
+ actions.c \
+ datapath.c \
+ dp_dev.c \
+ dp_notify.c \
+ flow.c \
+ table.c
+
+openvswitch_headers = \
+ actions.h \
+ compat.h \
+ datapath.h \
+ dp_dev.h \
+ flow.h
+
+# Aggregate lists: concatenate the per-module lists of every module named in
+# dist_modules or build_modules.
+dist_sources = $(foreach module,$(dist_modules),$($(module)_sources))
+dist_headers = $(foreach module,$(dist_modules),$($(module)_headers))
+build_sources = $(foreach module,$(build_modules),$($(module)_sources))
+build_headers = $(foreach module,$(build_modules),$($(module)_headers))
+# Directory-less names of the build sources, and the matching .o names.
+build_links = $(notdir $(build_sources))
+build_objects = $(notdir $(patsubst %.c,%.o,$(build_sources)))
diff --git a/datapath/actions.c b/datapath/actions.c
new file mode 100644
index 000000000..30b840cbb
--- /dev/null
+++ b/datapath/actions.c
@@ -0,0 +1,421 @@
+/*
+ * Distributed under the terms of the GNU GPL version 2.
+ * Copyright (c) 2007, 2008, 2009 Nicira Networks.
+ */
+
+/* Functions for executing flow actions. */
+
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/in6.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include "datapath.h"
+#include "dp_dev.h"
+#include "actions.h"
+#include "openvswitch/datapath-protocol.h"
+
+/* Returns a buffer holding the contents of 'skb' that is safe to modify.
+ * 'skb' is always consumed: the caller must use only the return value.
+ *
+ * A shared or cloned buffer is replaced by a copy made with skb_copy().
+ * Otherwise the data up to the end of a TCP-sized transport header (or the
+ * whole packet, if it is shorter) must pass pskb_may_pull().  Returns NULL,
+ * with 'skb' freed, if the copy or the pull fails. */
+struct sk_buff *
+make_writable(struct sk_buff *skb, gfp_t gfp)
+{
+	unsigned int need;
+
+	if (skb_shared(skb) || skb_cloned(skb)) {
+		/* The original is released whether or not the copy worked. */
+		struct sk_buff *copy = skb_copy(skb, gfp);
+
+		kfree_skb(skb);
+		return copy;
+	}
+
+	need = skb_transport_offset(skb) + sizeof(struct tcphdr);
+	if (!pskb_may_pull(skb, min(need, skb->len))) {
+		kfree_skb(skb);
+		return NULL;
+	}
+	return skb;
+}
+
+
+/* Removes the 802.1Q tag from 'skb', if it has one, by moving the two MAC
+ * addresses forward over the tag and pulling VLAN_HLEN bytes off the front.
+ * Frames whose VLAN protocol field is not ETH_P_8021Q are left untouched.
+ * Always returns 'skb'. */
+static struct sk_buff *
+vlan_pull_tag(struct sk_buff *skb)
+{
+	struct ethhdr *untagged;
+
+	/* Verify we were given a vlan packet */
+	if (vlan_eth_hdr(skb)->h_vlan_proto != htons(ETH_P_8021Q))
+		return skb;
+
+	/* Slide the destination and source addresses over the tag. */
+	memmove(skb->data + VLAN_HLEN, skb->data, 2 * VLAN_ETH_ALEN);
+	untagged = (struct ethhdr *)skb_pull(skb, VLAN_HLEN);
+
+	skb->mac_header += VLAN_HLEN;
+	skb->protocol = untagged->h_proto;
+	return skb;
+}
+
+
+/* Applies an ODPAT_SET_VLAN_VID or ODPAT_SET_VLAN_PCP action 'a' to 'skb'.
+ *
+ * If 'skb' already carries an 802.1Q header, only the bits selected by the
+ * action are rewritten.  Otherwise a new 802.1Q header is inserted.  A GSO
+ * packet is segmented first: every segment but the last is tagged and run
+ * through the remaining 'n_actions' - 1 actions here, and the last segment
+ * is tagged and returned so that the caller continues with it.
+ *
+ * 'skb' is consumed.  Returns the buffer the caller should continue with, or
+ * an ERR_PTR() on failure (in which case nothing is left to free). */
+static struct sk_buff *
+modify_vlan_tci(struct datapath *dp, struct sk_buff *skb,
+		struct odp_flow_key *key, const union odp_action *a,
+		int n_actions, gfp_t gfp)
+{
+	u16 tci, mask;
+
+	if (a->type == ODPAT_SET_VLAN_VID) {
+		tci = ntohs(a->vlan_vid.vlan_vid);
+		mask = VLAN_VID_MASK;
+		key->dl_vlan = htons(tci & mask);
+	} else {
+		tci = a->vlan_pcp.vlan_pcp << 13;
+		mask = VLAN_PCP_MASK;
+	}
+
+	skb = make_writable(skb, gfp);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	if (skb->protocol == htons(ETH_P_8021Q)) {
+		/* Modify vlan id, but maintain other TCI values */
+		struct vlan_ethhdr *vh = vlan_eth_hdr(skb);
+		vh->h_vlan_TCI = htons((ntohs(vh->h_vlan_TCI) & ~mask) | tci);
+	} else {
+		/* Add vlan header */
+
+		/* Set up checksumming pointers for checksum-deferred packets
+		 * on Xen.  Otherwise, dev_queue_xmit() will try to do this
+		 * when we send the packet out on the wire, and it will fail at
+		 * that point because skb_checksum_setup() will not look inside
+		 * an 802.1Q header. */
+		skb_checksum_setup(skb);
+
+		/* GSO is not implemented for packets with an 802.1Q header, so
+		 * we have to do segmentation before we add that header.
+		 *
+		 * GSO does work with hardware-accelerated VLAN tagging, but we
+		 * can't use hardware-accelerated VLAN tagging since it
+		 * requires the device to have a VLAN group configured (with
+		 * e.g. vconfig(8)) and we don't do that.
+		 *
+		 * Having to do this here may be a performance loss, since we
+		 * can't take advantage of TSO hardware support, although it
+		 * does not make a measurable network performance difference
+		 * for 1G Ethernet.  Fixing that would require patching the
+		 * kernel (either to add GSO support to the VLAN protocol or to
+		 * support hardware-accelerated VLAN tagging without VLAN
+		 * groups configured). */
+		if (skb_is_gso(skb)) {
+			struct sk_buff *segs;
+
+			segs = skb_gso_segment(skb, 0);
+			kfree_skb(skb);
+			if (unlikely(IS_ERR(segs)))
+				return ERR_CAST(segs);
+
+			/* Handle every segment except the last one here.  The
+			 * condition is tested before the body so that a
+			 * single-segment result skips the loop instead of
+			 * dereferencing a NULL 'segs' afterward. */
+			while (segs->next) {
+				struct sk_buff *nskb = segs->next;
+				int err;
+
+				segs->next = NULL;
+
+				segs = __vlan_put_tag(segs, tci);
+				err = -ENOMEM;
+				if (segs) {
+					/* Each segment gets its own copy of
+					 * the key, since later actions may
+					 * modify it. */
+					struct odp_flow_key segkey = *key;
+					err = execute_actions(dp, segs,
+							      &segkey, a + 1,
+							      n_actions - 1,
+							      gfp);
+				}
+
+				if (unlikely(err)) {
+					/* Free the segments not yet handled. */
+					while ((segs = nskb)) {
+						nskb = segs->next;
+						segs->next = NULL;
+						kfree_skb(segs);
+					}
+					return ERR_PTR(err);
+				}
+
+				segs = nskb;
+			}
+
+			/* The final segment continues through the caller. */
+			skb = segs;
+		}
+
+		/* The hardware-accelerated version of vlan_put_tag() works
+		 * only for a device that has a VLAN group configured (with
+		 * e.g. vconfig(8)), so call the software-only version
+		 * __vlan_put_tag() directly instead.
+		 */
+		skb = __vlan_put_tag(skb, tci);
+		if (!skb)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	return skb;
+}
+
+/* Implements ODPAT_STRIP_VLAN: removes any 802.1Q tag from 'skb' and records
+ * ODP_VLAN_NONE in 'key'.  Consumes 'skb'; returns the buffer to continue
+ * with, or NULL if it could not be made writable. */
+static struct sk_buff *strip_vlan(struct sk_buff *skb,
+				  struct odp_flow_key *key, gfp_t gfp)
+{
+	struct sk_buff *wskb = make_writable(skb, gfp);
+
+	if (!wskb)
+		return NULL;
+
+	vlan_pull_tag(wskb);
+	key->dl_vlan = htons(ODP_VLAN_NONE);
+	return wskb;
+}
+
+/* Implements ODPAT_SET_DL_SRC and ODPAT_SET_DL_DST: overwrites the Ethernet
+ * source or destination address of 'skb' with 'a->dl_addr'.  Consumes 'skb';
+ * returns the buffer to continue with, or NULL if it could not be made
+ * writable. */
+static struct sk_buff *set_dl_addr(struct sk_buff *skb,
+				   const struct odp_action_dl_addr *a,
+				   gfp_t gfp)
+{
+	struct ethhdr *eh;
+
+	skb = make_writable(skb, gfp);
+	if (!skb)
+		return NULL;
+
+	eh = eth_hdr(skb);
+	if (a->type == ODPAT_SET_DL_SRC)
+		memcpy(eh->h_source, a->dl_addr, ETH_ALEN);
+	else
+		memcpy(eh->h_dest, a->dl_addr, ETH_ALEN);
+	return skb;
+}
+
+/* Updates 'sum', which is a field in 'skb''s data, given that a 4-byte field
+ * covered by the sum has been changed from 'from' to 'to'. If set,
+ * 'pseudohdr' indicates that the field is in the TCP or UDP pseudo-header.
+ * Based on nf_proto_csum_replace4.
+ *
+ * The adjustment is computed from the two words { ~from, to } only, so the
+ * rest of the packet is never re-summed. */
+static void update_csum(__sum16 *sum, struct sk_buff *skb,
+ __be32 from, __be32 to, int pseudohdr)
+{
+ __be32 diff[] = { ~from, to };
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
+ /* Fold the difference into the checksum field itself. */
+ *sum = csum_fold(csum_partial((char *)diff, sizeof(diff),
+ ~csum_unfold(*sum)));
+ /* For CHECKSUM_COMPLETE, a pseudo-header change additionally
+ * adjusts the running sum kept in skb->csum. */
+ if (skb->ip_summed == CHECKSUM_COMPLETE && pseudohdr)
+ skb->csum = ~csum_partial((char *)diff, sizeof(diff),
+ ~skb->csum);
+ } else if (pseudohdr)
+ /* CHECKSUM_PARTIAL: only pseudo-header changes touch the field
+ * (presumably because it then holds just the pseudo-header sum
+ * for checksum offload -- confirm against the kernel docs). */
+ *sum = ~csum_fold(csum_partial((char *)diff, sizeof(diff),
+ csum_unfold(*sum)));
+}
+
+/* Implements ODPAT_SET_NW_SRC and ODPAT_SET_NW_DST: rewrites the IPv4 source
+ * or destination address of 'skb' and adjusts the IP header checksum and, for
+ * TCP and UDP, the transport checksum.  Packets whose 'key' is not IPv4 are
+ * returned unchanged.  Otherwise consumes 'skb' and returns the buffer to
+ * continue with, or NULL if it could not be made writable. */
+static struct sk_buff *set_nw_addr(struct sk_buff *skb,
+				   struct odp_flow_key *key,
+				   const struct odp_action_nw_addr *a,
+				   gfp_t gfp)
+{
+	struct iphdr *nh;
+	u32 *field;
+	u32 old, new;
+
+	if (key->dl_type != htons(ETH_P_IP))
+		return skb;
+
+	skb = make_writable(skb, gfp);
+	if (!skb)
+		return NULL;
+
+	nh = ip_hdr(skb);
+	field = (a->type == ODPAT_SET_NW_SRC) ? &nh->saddr : &nh->daddr;
+	old = *field;
+	new = a->nw_addr;
+
+	/* The addresses are part of the TCP and UDP pseudo-header. */
+	switch (key->nw_proto) {
+	case IPPROTO_TCP:
+		update_csum(&tcp_hdr(skb)->check, skb, old, new, 1);
+		break;
+	case IPPROTO_UDP:
+		update_csum(&udp_hdr(skb)->check, skb, old, new, 1);
+		break;
+	default:
+		break;
+	}
+	update_csum(&nh->check, skb, old, new, 0);
+	*field = new;
+	return skb;
+}
+
+/* Implements ODPAT_SET_TP_SRC and ODPAT_SET_TP_DST: rewrites the TCP or UDP
+ * source or destination port of 'skb' and adjusts the transport checksum.
+ * Packets whose 'key' is not IPv4 TCP or UDP are returned unchanged.
+ * Otherwise consumes 'skb' and returns the buffer to continue with, or NULL
+ * if it could not be made writable. */
+static struct sk_buff *
+set_tp_port(struct sk_buff *skb, struct odp_flow_key *key,
+	    const struct odp_action_tp_port *a,
+	    gfp_t gfp)
+{
+	int check_ofs;
+
+	if (key->dl_type != htons(ETH_P_IP))
+		return skb;
+
+	if (key->nw_proto == IPPROTO_TCP)
+		check_ofs = offsetof(struct tcphdr, check);
+	else if (key->nw_proto == IPPROTO_UDP)
+		check_ofs = offsetof(struct udphdr, check);
+	else
+		return skb;
+
+	skb = make_writable(skb, gfp);
+	if (skb) {
+		/* The port fields sit at the same offsets in the TCP and UDP
+		 * headers, so the UDP view is used for both. */
+		struct udphdr *th = udp_hdr(skb);
+		u16 *f = a->type == ODPAT_SET_TP_SRC ? &th->source : &th->dest;
+		u16 old = *f;
+		u16 new = a->tp_port;
+
+		/* The checksum field is located relative to the transport
+		 * header, not skb->data.  Ports are not part of the
+		 * pseudo-header, hence the final 0. */
+		update_csum((u16*)((u8*)th + check_ofs),
+			    skb, old, new, 0);
+		*f = new;
+	}
+	return skb;
+}
+
+/* Returns the length of 'skb' minus its Ethernet header and, when
+ * skb->protocol is ETH_P_8021Q, minus the VLAN header as well. */
+static inline unsigned packet_length(const struct sk_buff *skb)
+{
+	unsigned hdr_len = ETH_HLEN;
+
+	if (skb->protocol == htons(ETH_P_8021Q))
+		hdr_len += VLAN_HLEN;
+	return skb->len - hdr_len;
+}
+
+/* Queues 'skb' for transmission on skb->dev, consuming it.  A non-GSO packet
+ * longer than the device MTU is dropped with a warning instead.  Returns the
+ * length of 'skb' when it was queued, or -E2BIG when it was dropped. */
+int dp_xmit_skb(struct sk_buff *skb)
+{
+	struct datapath *dp = skb->dev->br_port->dp;
+	const int queued_len = skb->len;
+
+	if (!skb_is_gso(skb) && packet_length(skb) > skb->dev->mtu) {
+		printk(KERN_WARNING "%s: dropped over-mtu packet: %d > %d\n",
+		       dp_name(dp), packet_length(skb), skb->dev->mtu);
+		kfree_skb(skb);
+		return -E2BIG;
+	}
+
+	dev_queue_xmit(skb);
+	return queued_len;
+}
+
+/* Sends 'skb' out port number 'out_port' of 'dp', consuming 'skb'.  A NULL
+ * 'skb' is tolerated, and the packet is dropped if the port does not exist.
+ * Datapath devices receive the packet through dp_dev_recv(); any other
+ * device gets it through dp_xmit_skb(). */
+static void
+do_output(struct datapath *dp, struct sk_buff *skb, int out_port)
+{
+	struct net_bridge_port *port = skb ? dp->ports[out_port] : NULL;
+
+	if (!port) {
+		kfree_skb(skb);
+		return;
+	}
+
+	skb->dev = port->dev;
+	if (is_dp_dev(port->dev))
+		dp_dev_recv(port->dev, skb);
+	else
+		dp_xmit_skb(skb);
+}
+
+/* Sends clones of 'skb' to all but the last eligible port of port group
+ * 'group', skipping missing ports and the port 'skb' arrived on.  Never
+ * consumes 'skb'.  Returns the last eligible port, which the caller should
+ * send 'skb' itself to, or -1 if there is none (no such group, no eligible
+ * port, or a clone could not be allocated). */
+static int output_group(struct datapath *dp, __u16 group,
+			struct sk_buff *skb, gfp_t gfp)
+{
+	struct dp_port_group *grp = rcu_dereference(dp->groups[group]);
+	int pending = -1;
+	int idx;
+
+	if (!grp)
+		return -1;
+
+	for (idx = 0; idx < grp->n_ports; idx++) {
+		struct net_bridge_port *port = dp->ports[grp->ports[idx]];
+
+		if (port && skb->dev != port->dev) {
+			/* Output to the previously found port is deferred
+			 * until another one turns up, so the final port can
+			 * take the original buffer. */
+			if (pending != -1) {
+				struct sk_buff *copy = skb_clone(skb, gfp);
+
+				if (!copy)
+					return -1;
+				do_output(dp, copy, pending);
+			}
+			pending = port->port_no;
+		}
+	}
+	return pending;
+}
+
+/* Passes a clone of 'skb' to dp_output_control() with queue
+ * _ODPL_ACTION_NR and argument 'arg'.  Does not consume 'skb'.  Returns
+ * -ENOMEM if the clone fails, otherwise the result of dp_output_control(). */
+static int
+output_control(struct datapath *dp, struct sk_buff *skb, u32 arg, gfp_t gfp)
+{
+	struct sk_buff *copy = skb_clone(skb, gfp);
+
+	if (!copy)
+		return -ENOMEM;
+	return dp_output_control(dp, copy, _ODPL_ACTION_NR, arg);
+}
+
+/* Execute a list of actions against 'skb'.
+ *
+ * Runs the 'n_actions' actions starting at 'a' in order.  'key' is updated by
+ * the actions that change fields it describes.  'skb' appears to be consumed
+ * on every path: it is sent, freed here, or freed by the helper that failed.
+ *
+ * Returns 0 on success, otherwise a negative errno value. */
+int execute_actions(struct datapath *dp, struct sk_buff *skb,
+ struct odp_flow_key *key,
+ const union odp_action *a, int n_actions,
+ gfp_t gfp)
+{
+ /* Every output action needs a separate clone of 'skb', but the common
+ * case is just a single output action, so that doing a clone and
+ * then freeing the original skbuff is wasteful. So the following code
+ * is slightly obscure just to avoid that. */
+ int prev_port = -1;
+ int err = 0;
+ for (; n_actions > 0; a++, n_actions--) {
+ WARN_ON_ONCE(skb_shared(skb));
+ /* An output deferred by the previous action is carried out on a
+ * clone now that another action follows; do_output() tolerates
+ * a failed (NULL) clone. */
+ if (prev_port != -1) {
+ do_output(dp, skb_clone(skb, gfp), prev_port);
+ prev_port = -1;
+ }
+
+ switch (a->type) {
+ case ODPAT_OUTPUT:
+ prev_port = a->output.port;
+ break;
+
+ case ODPAT_OUTPUT_GROUP:
+ prev_port = output_group(dp, a->output_group.group,
+ skb, gfp);
+ break;
+
+ case ODPAT_CONTROLLER:
+ err = output_control(dp, skb, a->controller.arg, gfp);
+ if (err) {
+ kfree_skb(skb);
+ return err;
+ }
+ break;
+
+ case ODPAT_SET_VLAN_VID:
+ case ODPAT_SET_VLAN_PCP:
+ /* Reports failure with ERR_PTR() rather than NULL, and has
+ * already disposed of the buffer in that case. */
+ skb = modify_vlan_tci(dp, skb, key, a, n_actions, gfp);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+ break;
+
+ case ODPAT_STRIP_VLAN:
+ skb = strip_vlan(skb, key, gfp);
+ break;
+
+ case ODPAT_SET_DL_SRC:
+ case ODPAT_SET_DL_DST:
+ skb = set_dl_addr(skb, &a->dl_addr, gfp);
+ break;
+
+ case ODPAT_SET_NW_SRC:
+ case ODPAT_SET_NW_DST:
+ skb = set_nw_addr(skb, key, &a->nw_addr, gfp);
+ break;
+
+ case ODPAT_SET_TP_SRC:
+ case ODPAT_SET_TP_DST:
+ skb = set_tp_port(skb, key, &a->tp_port, gfp);
+ break;
+ }
+ /* The field-rewriting helpers return NULL after freeing the buffer. */
+ if (!skb)
+ return -ENOMEM;
+ }
+ /* The last pending output takes the original buffer, avoiding a clone. */
+ if (prev_port != -1)
+ do_output(dp, skb, prev_port);
+ else
+ kfree_skb(skb);
+ return err;
+}
diff --git a/datapath/actions.h b/datapath/actions.h
new file mode 100644
index 000000000..410e3ba79
--- /dev/null
+++ b/datapath/actions.h
@@ -0,0 +1,18 @@
+#ifndef ACTIONS_H
+#define ACTIONS_H 1
+
+#include <linux/gfp.h>
+
+/* Forward declarations: only pointers to these types appear below. */
+struct datapath;
+struct sk_buff;
+struct odp_flow_key;
+union odp_action;
+
+/* Functions defined in actions.c. */
+struct sk_buff *make_writable(struct sk_buff *, gfp_t gfp);
+int dp_xmit_skb(struct sk_buff *);
+int execute_actions(struct datapath *dp, struct sk_buff *skb,
+ struct odp_flow_key *key,
+ const union odp_action *, int n_actions,
+ gfp_t gfp);
+
+#endif /* actions.h */
diff --git a/datapath/brc_procfs.c b/datapath/brc_procfs.c
new file mode 100644
index 000000000..733e9a94d
--- /dev/null
+++ b/datapath/brc_procfs.c
@@ -0,0 +1,185 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/genetlink.h>
+#include "openvswitch/brcompat-netlink.h"
+
+/* This code implements a Generic Netlink command BRC_GENL_C_SET_PROC that can
+ * be used to add, modify, and delete arbitrary files in selected
+ * subdirectories of /proc. It's a horrible kluge prompted by the need to
+ * simulate certain /proc/net/vlan and /proc/net/bonding files for software
+ * that wants to read them, and with any luck it will go away eventually.
+ *
+ * The implementation is a kluge too. In particular, we want to release the
+ * strings copied into the 'data' members of proc_dir_entry when the
+ * proc_dir_entry structures are freed, but there doesn't appear to be a way to
+ * hook that, so instead we have to rely on being the only entity modifying the
+ * directories in question.
+ */
+
+/* seq_file "show" callback: emits the string stored in seq->private. */
+static int brc_seq_show(struct seq_file *seq, void *unused)
+{
+	const char *text = seq->private;
+
+	seq_puts(seq, text);
+	return 0;
+}
+
+/* Opens a simulated /proc file as a single-record seq_file whose private
+ * data is the 'data' member of the file's proc_dir_entry. */
+static int brc_seq_open(struct inode *inode, struct file *file)
+{
+	void *contents = PDE(inode)->data;
+
+	return single_open(file, brc_seq_show, contents);
+}
+
+/* File operations shared by every simulated /proc file: a read-only seq_file
+ * opened through brc_seq_open(). */
+static struct file_operations brc_fops = {
+ .owner = THIS_MODULE,
+ .open = brc_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/* Simulated directories under proc_net, created on first use by
+ * brc_open_dir() and NULL until then. */
+static struct proc_dir_entry *proc_vlan_dir;
+static struct proc_dir_entry *proc_bonding_dir;
+
+/* Searches the immediate children of directory 'de' for an entry called
+ * 'name'.  Returns the entry, or NULL if there is none. */
+struct proc_dir_entry *brc_lookup_entry(struct proc_dir_entry *de, const char *name)
+{
+	int namelen = strlen(name);
+	struct proc_dir_entry *child;
+
+	for (child = de->subdir; child; child = child->next) {
+		if (child->namelen == namelen
+		    && !memcmp(name, child->name, child->namelen))
+			return child;
+	}
+	return NULL;
+}
+
+/* Returns the simulated directory cached in '*dirp', creating 'dir_name'
+ * under 'parent' on first use.  Returns NULL if a directory of that name
+ * already exists in 'parent' (so it is not ours to manage) or if
+ * proc_mkdir() fails. */
+static struct proc_dir_entry *brc_open_dir(const char *dir_name,
+					   struct proc_dir_entry *parent,
+					   struct proc_dir_entry **dirp)
+{
+	if (*dirp)
+		return *dirp;
+
+	if (brc_lookup_entry(parent, dir_name)) {
+		printk(KERN_WARNING "%s proc directory exists, can't "
+		       "simulate--probably its real module is "
+		       "loaded\n", dir_name);
+		return NULL;
+	}
+
+	*dirp = proc_mkdir(dir_name, parent);
+	return *dirp;
+}
+
+/* Maximum length of the BRC_GENL_A_PROC_DIR and BRC_GENL_A_PROC_NAME strings.
+ * If we could depend on supporting NLA_NUL_STRING and the .len member in
+ * Generic Netlink policy, then we could just put this in brc_genl_policy (and
+ * simplify brc_genl_set_proc() below too), but upstream 2.6.18 does not have
+ * either. */
+#define BRC_NAME_LEN_MAX 32
+
+/* Generic Netlink handler that adds, replaces, or deletes one simulated /proc
+ * file.
+ *
+ * BRC_GENL_A_PROC_DIR must be "net/vlan" or "net/bonding" and
+ * BRC_GENL_A_PROC_NAME names the file.  If BRC_GENL_A_PROC_DATA is present,
+ * the file is created or its contents replaced; if it is absent, the file is
+ * deleted.
+ *
+ * Returns 0 on success, otherwise a negative errno value: -EINVAL for missing
+ * or malformed attributes, -EBUSY if the directory cannot be simulated,
+ * -ENOENT when deleting a file that does not exist, -ENOMEM or -ENOBUFS on
+ * allocation failure. */
+int brc_genl_set_proc(struct sk_buff *skb, struct genl_info *info)
+{
+ struct proc_dir_entry *dir, *entry;
+ const char *dir_name, *name;
+ char *data;
+
+ /* Directory and name are mandatory; data is optional. */
+ if (!info->attrs[BRC_GENL_A_PROC_DIR] ||
+ VERIFY_NUL_STRING(info->attrs[BRC_GENL_A_PROC_DIR]) ||
+ !info->attrs[BRC_GENL_A_PROC_NAME] ||
+ VERIFY_NUL_STRING(info->attrs[BRC_GENL_A_PROC_NAME]) ||
+ (info->attrs[BRC_GENL_A_PROC_DATA] &&
+ VERIFY_NUL_STRING(info->attrs[BRC_GENL_A_PROC_DATA])))
+ return -EINVAL;
+
+ dir_name = nla_data(info->attrs[BRC_GENL_A_PROC_DIR]);
+ name = nla_data(info->attrs[BRC_GENL_A_PROC_NAME]);
+ /* kill_proc_dir() relies on this limit to size its name buffer. */
+ if (strlen(dir_name) > BRC_NAME_LEN_MAX ||
+ strlen(name) > BRC_NAME_LEN_MAX)
+ return -EINVAL;
+
+ if (!strcmp(dir_name, "net/vlan"))
+ dir = brc_open_dir("vlan", proc_net, &proc_vlan_dir);
+ else if (!strcmp(dir_name, "net/bonding"))
+ dir = brc_open_dir("bonding", proc_net, &proc_bonding_dir);
+ else
+ return -EINVAL;
+ if (!dir) {
+ /* Probably failed because the module that really implements
+ * the function in question is loaded and already owns the
+ * directory in question.*/
+ return -EBUSY;
+ }
+
+ entry = brc_lookup_entry(dir, name);
+ if (!info->attrs[BRC_GENL_A_PROC_DATA]) {
+ /* Delete request. */
+ if (!entry)
+ return -ENOENT;
+
+ /* Save the string first: it is freed only once the entry is
+ * confirmed gone from the directory. */
+ data = entry->data;
+ remove_proc_entry(name, dir);
+ if (brc_lookup_entry(dir, name))
+ return -EBUSY; /* Shouldn't happen */
+
+ kfree(data);
+ } else {
+ /* Create or replace request: take a private copy of the data. */
+ data = kstrdup(nla_data(info->attrs[BRC_GENL_A_PROC_DATA]),
+ GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ if (entry) {
+ /* Existing file: swap in the new contents. */
+ char *old_data = entry->data;
+ entry->data = data;
+ kfree(old_data);
+ return 0;
+ }
+
+ entry = create_proc_entry(name, S_IFREG|S_IRUSR|S_IWUSR, dir);
+ if (!entry) {
+ kfree(data);
+ return -ENOBUFS;
+ }
+ entry->proc_fops = &brc_fops;
+ entry->data = data;
+ }
+ return 0;
+}
+
+/* Removes every entry in simulated directory 'dir', freeing each entry's
+ * data string, and then removes 'dir_name' itself from 'parent'.  Does
+ * nothing if 'dir' is NULL, i.e. the directory was never created. */
+static void kill_proc_dir(const char *dir_name,
+ struct proc_dir_entry *parent,
+ struct proc_dir_entry *dir)
+{
+ if (!dir)
+ return;
+ /* Repeatedly remove the first child until the directory is empty. */
+ for (;;) {
+ struct proc_dir_entry *e;
+ char *data;
+ char name[BRC_NAME_LEN_MAX + 1];
+
+ e = dir->subdir;
+ if (!e)
+ break;
+
+ if (e->namelen >= sizeof name) {
+ /* Can't happen: we prevent adding names this long by
+ * limiting the BRC_GENL_A_PROC_NAME string to
+ * BRC_NAME_LEN_MAX bytes. */
+ WARN_ON(1);
+ break;
+ }
+ /* Copy the name before the entry is removed. */
+ strcpy(name, e->name);
+
+ data = e->data;
+ e->data = NULL;
+ kfree(data);
+
+ remove_proc_entry(name, dir);
+ }
+ remove_proc_entry(dir_name, parent);
+}
+
+/* Tears down both simulated /proc directories and everything in them. */
+void brc_procfs_exit(void)
+{
+ kill_proc_dir("vlan", proc_net, proc_vlan_dir);
+ kill_proc_dir("bonding", proc_net, proc_bonding_dir);
+}
diff --git a/datapath/brc_procfs.h b/datapath/brc_procfs.h
new file mode 100644
index 000000000..93e21cfb0
--- /dev/null
+++ b/datapath/brc_procfs.h
@@ -0,0 +1,11 @@
+#ifndef BRC_PROCFS_H
+#define BRC_PROCFS_H 1
+
+/* Forward declarations: only pointers to these types appear below. */
+struct sk_buff;
+struct genl_info;
+
+/* Functions defined in brc_procfs.c. */
+void brc_procfs_exit(void);
+int brc_genl_set_proc(struct sk_buff *skb, struct genl_info *info);
+
+#endif /* brc_procfs.h */
+
diff --git a/datapath/brc_sysfs.h b/datapath/brc_sysfs.h
new file mode 100644
index 000000000..0c72fb227
--- /dev/null
+++ b/datapath/brc_sysfs.h
@@ -0,0 +1,25 @@
+#ifndef BRC_SYSFS_H
+#define BRC_SYSFS_H 1
+
+/* Forward declarations: only pointers to these types appear below. */
+struct datapath;
+struct net_bridge_port;
+
+/* brc_sysfs_dp.c */
+int brc_sysfs_add_dp(struct datapath *dp);
+int brc_sysfs_del_dp(struct datapath *dp);
+
+/* brc_sysfs_if.c */
+int brc_sysfs_add_if(struct net_bridge_port *p);
+int brc_sysfs_del_if(struct net_bridge_port *p);
+
+/* SUPPORT_SYSFS selects the real implementations; without it
+ * brc_sysfs_dp.c provides stubs for all four functions. */
+#include <linux/version.h>
+#if LINUX_VERSION_CODE == KERNEL_VERSION(2,6,18)
+#define SUPPORT_SYSFS 1
+#else
+/* We only support sysfs on Linux 2.6.18 because that's the only place we
+ * really need it (on Xen, for brcompat) and it's a big pain to try to support
+ * multiple versions. */
+#endif
+
+#endif /* brc_sysfs.h */
+
diff --git a/datapath/brc_sysfs_dp.c b/datapath/brc_sysfs_dp.c
new file mode 100644
index 000000000..fc02f2794
--- /dev/null
+++ b/datapath/brc_sysfs_dp.c
@@ -0,0 +1,532 @@
+#include <linux/version.h>
+
+/*
+ * Sysfs attributes of bridge for Open vSwitch
+ *
+ * This has been shamelessly copied from the kernel sources.
+ */
+
+#include <linux/capability.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/rtnetlink.h>
+#include <linux/spinlock.h>
+#include <linux/times.h>
+#include <linux/version.h>
+
+#include "brc_sysfs.h"
+#include "datapath.h"
+#include "dp_dev.h"
+
+#ifdef SUPPORT_SYSFS
+/* Maps a kobject back to the struct device that embeds it. */
+#define to_dev(obj) container_of(obj, struct device, kobj)
+
+/* Hack to attempt to build on more platforms. */
+/* to_kobj(d) yields the kobject embedded in net_device 'd', and
+ * BRC_DEVICE_ATTR declares an attribute, using whichever of the class-device
+ * or device interfaces this kernel version provides. */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21)
+#define to_kobj(d) &(d)->class_dev.kobj
+#define BRC_DEVICE_ATTR CLASS_DEVICE_ATTR
+#else
+#define to_kobj(d) &(d)->dev.kobj
+#define BRC_DEVICE_ATTR DEVICE_ATTR
+#endif
+
+/*
+ * Common code for storing bridge parameters.
+ *
+ * Requires CAP_NET_ADMIN and a numeric value in 'buf'.  Setting parameters is
+ * not implemented: the 'set' callback is never invoked in the compiled code,
+ * a nonzero value is only logged, and 'len' is returned as if the write had
+ * succeeded.  Returns -EPERM or -EINVAL when the checks fail.
+ */
+static ssize_t store_bridge_parm(struct class_device *d,
+ const char *buf, size_t len,
+ void (*set)(struct datapath *, unsigned long))
+{
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+ char *endp;
+ unsigned long val;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ /* Base 0: accepts decimal, octal, and hexadecimal notation. */
+ val = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ return -EINVAL;
+
+#if 0
+ spin_lock_bh(&br->lock);
+ (*set)(br, val);
+ spin_unlock_bh(&br->lock);
+#else
+ /* xxx We use a default value of 0 for all fields. If the caller is
+ * xxx attempting to set the value to our default, just silently
+ * xxx ignore the request.
+ */
+ if (val != 0) {
+ printk("%s: xxx writing dp parms not supported yet!\n",
+ dp_name(dp));
+ }
+#endif
+ return len;
+}
+
+
+/* "forward_delay" attribute.  Reads always report 0; writes go through
+ * store_bridge_parm().  The bridge-derived code is compiled out. */
+static ssize_t show_forward_delay(struct class_device *d,
+ char *buf)
+{
+#if 0
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+ return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay));
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+
+/* Setter handed to store_bridge_parm(); it only logs the attempt. */
+static void set_forward_delay(struct datapath *dp, unsigned long val)
+{
+#if 0
+ unsigned long delay = clock_t_to_jiffies(val);
+ br->forward_delay = delay;
+ if (br_is_root_bridge(br))
+ br->bridge_forward_delay = delay;
+#else
+ printk("%s: xxx attempt to set_forward_delay()\n", dp_name(dp));
+#endif
+}
+
+static ssize_t store_forward_delay(struct class_device *d,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_forward_delay);
+}
+static BRC_DEVICE_ATTR(forward_delay, S_IRUGO | S_IWUSR,
+ show_forward_delay, store_forward_delay);
+
+/* "hello_time" attribute.  Reads always report 0; writes go through
+ * store_bridge_parm().  The bridge-derived code is compiled out. */
+static ssize_t show_hello_time(struct class_device *d, char *buf)
+{
+#if 0
+ return sprintf(buf, "%lu\n",
+ jiffies_to_clock_t(to_bridge(d)->hello_time));
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+
+/* Setter handed to store_bridge_parm(); it only logs the attempt. */
+static void set_hello_time(struct datapath *dp, unsigned long val)
+{
+#if 0
+ unsigned long t = clock_t_to_jiffies(val);
+ br->hello_time = t;
+ if (br_is_root_bridge(br))
+ br->bridge_hello_time = t;
+#else
+ printk("%s: xxx attempt to set_hello_time()\n", dp_name(dp));
+#endif
+}
+
+static ssize_t store_hello_time(struct class_device *d,
+ const char *buf,
+ size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_hello_time);
+}
+static BRC_DEVICE_ATTR(hello_time, S_IRUGO | S_IWUSR, show_hello_time,
+ store_hello_time);
+
+/* "max_age" attribute.  Reads always report 0; writes go through
+ * store_bridge_parm().  The bridge-derived code is compiled out. */
+static ssize_t show_max_age(struct class_device *d,
+ char *buf)
+{
+#if 0
+ return sprintf(buf, "%lu\n",
+ jiffies_to_clock_t(to_bridge(d)->max_age));
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+
+/* Setter handed to store_bridge_parm(); it only logs the attempt. */
+static void set_max_age(struct datapath *dp, unsigned long val)
+{
+#if 0
+ unsigned long t = clock_t_to_jiffies(val);
+ br->max_age = t;
+ if (br_is_root_bridge(br))
+ br->bridge_max_age = t;
+#else
+ printk("%s: xxx attempt to set_max_age()\n", dp_name(dp));
+#endif
+}
+
+static ssize_t store_max_age(struct class_device *d,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_max_age);
+}
+static BRC_DEVICE_ATTR(max_age, S_IRUGO | S_IWUSR, show_max_age, store_max_age);
+
+/* "ageing_time" attribute.  Reads always report 0; writes go through
+ * store_bridge_parm().  The bridge-derived code is compiled out. */
+static ssize_t show_ageing_time(struct class_device *d,
+ char *buf)
+{
+#if 0
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+ return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time));
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+
+/* Setter handed to store_bridge_parm(); it only logs the attempt. */
+static void set_ageing_time(struct datapath *dp, unsigned long val)
+{
+#if 0
+ br->ageing_time = clock_t_to_jiffies(val);
+#else
+ printk("%s: xxx attempt to set_ageing_time()\n", dp_name(dp));
+#endif
+}
+
+static ssize_t store_ageing_time(struct class_device *d,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_ageing_time);
+}
+static BRC_DEVICE_ATTR(ageing_time, S_IRUGO | S_IWUSR, show_ageing_time,
+ store_ageing_time);
+
+/* "stp_state" attribute.  Reads always report 0.  The bridge-derived code is
+ * compiled out. */
+static ssize_t show_stp_state(struct class_device *d,
+ char *buf)
+{
+#if 0
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+ return sprintf(buf, "%d\n", br->stp_enabled);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+
+
+/* Writes are logged and otherwise ignored; 'len' is returned regardless.
+ * Unlike store_bridge_parm(), the compiled code performs no capability or
+ * parse check. */
+static ssize_t store_stp_state(struct class_device *d,
+ const char *buf,
+ size_t len)
+{
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+#if 0
+ char *endp;
+ unsigned long val;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ val = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ return -EINVAL;
+
+ rtnl_lock();
+ br_stp_set_enabled(br, val);
+ rtnl_unlock();
+#else
+ printk("%s: xxx attempt to set_stp_state()\n", dp_name(dp));
+#endif
+
+ return len;
+}
+static BRC_DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state,
+ store_stp_state);
+
+/* "priority" attribute.  Reads always report 0; writes go through
+ * store_bridge_parm().  The bridge-derived code is compiled out. */
+static ssize_t show_priority(struct class_device *d,
+ char *buf)
+{
+#if 0
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+ return sprintf(buf, "%d\n",
+ (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+
+/* Setter handed to store_bridge_parm(); it only logs the attempt. */
+static void set_priority(struct datapath *dp, unsigned long val)
+{
+#if 0
+ br_stp_set_bridge_priority(br, (u16) val);
+#else
+ printk("%s: xxx attempt to set_priority()\n", dp_name(dp));
+#endif
+}
+
+static ssize_t store_priority(struct class_device *d,
+ const char *buf, size_t len)
+{
+ return store_bridge_parm(d, buf, len, set_priority);
+}
+static BRC_DEVICE_ATTR(priority, S_IRUGO | S_IWUSR, show_priority, store_priority);
+
+/* Read-only "root_id" attribute.  Always reports the fixed placeholder
+ * "0000.010203040506"; the bridge-derived code is compiled out. */
+static ssize_t show_root_id(struct class_device *d,
+ char *buf)
+{
+#if 0
+ return br_show_bridge_id(buf, &to_bridge(d)->designated_root);
+#else
+ return sprintf(buf, "0000.010203040506\n");
+#endif
+}
+static BRC_DEVICE_ATTR(root_id, S_IRUGO, show_root_id, NULL);
+
+/* Read-only "bridge_id" attribute.  Reports "0000." followed by the six
+ * bytes, in hexadecimal, of the device address of the datapath's ODPP_LOCAL
+ * port. */
+static ssize_t show_bridge_id(struct class_device *d,
+ char *buf)
+{
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+ const unsigned char *addr = dp->ports[ODPP_LOCAL]->dev->dev_addr;
+
+ /* xxx Do we need a lock of some sort? */
+ return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n",
+ 0, 0, addr[0], addr[1], addr[2], addr[3], addr[4], addr[5]);
+}
+static BRC_DEVICE_ATTR(bridge_id, S_IRUGO, show_bridge_id, NULL);
+
+/* Read-only spanning-tree status attributes.  Each always reports 0; the
+ * bridge-derived code is compiled out. */
+
+/* "root_port" attribute. */
+static ssize_t show_root_port(struct class_device *d,
+ char *buf)
+{
+#if 0
+ return sprintf(buf, "%d\n", to_bridge(d)->root_port);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRC_DEVICE_ATTR(root_port, S_IRUGO, show_root_port, NULL);
+
+/* "root_path_cost" attribute. */
+static ssize_t show_root_path_cost(struct class_device *d,
+ char *buf)
+{
+#if 0
+ return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRC_DEVICE_ATTR(root_path_cost, S_IRUGO, show_root_path_cost, NULL);
+
+/* "topology_change" attribute. */
+static ssize_t show_topology_change(struct class_device *d,
+ char *buf)
+{
+#if 0
+ return sprintf(buf, "%d\n", to_bridge(d)->topology_change);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRC_DEVICE_ATTR(topology_change, S_IRUGO, show_topology_change, NULL);
+
+/* "topology_change_detected" attribute. */
+static ssize_t show_topology_change_detected(struct class_device *d,
+ char *buf)
+{
+#if 0
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+ return sprintf(buf, "%d\n", br->topology_change_detected);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRC_DEVICE_ATTR(topology_change_detected, S_IRUGO,
+ show_topology_change_detected, NULL);
+
+/* Read-only timer attributes.  Each always reports 0; the bridge-derived
+ * code is compiled out. */
+
+/* "hello_timer" attribute. */
+static ssize_t show_hello_timer(struct class_device *d,
+ char *buf)
+{
+#if 0
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+ return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer));
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRC_DEVICE_ATTR(hello_timer, S_IRUGO, show_hello_timer, NULL);
+
+/* "tcn_timer" attribute. */
+static ssize_t show_tcn_timer(struct class_device *d,
+ char *buf)
+{
+#if 0
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+ return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer));
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRC_DEVICE_ATTR(tcn_timer, S_IRUGO, show_tcn_timer, NULL);
+
+/* "topology_change_timer" attribute. */
+static ssize_t show_topology_change_timer(struct class_device *d,
+ char *buf)
+{
+#if 0
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+ return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer));
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRC_DEVICE_ATTR(topology_change_timer, S_IRUGO, show_topology_change_timer,
+ NULL);
+
+/* "gc_timer" attribute. */
+static ssize_t show_gc_timer(struct class_device *d,
+ char *buf)
+{
+#if 0
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+ return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer));
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRC_DEVICE_ATTR(gc_timer, S_IRUGO, show_gc_timer, NULL);
+
+/* "group_addr" attribute.  Reads always report the fixed placeholder
+ * "00:01:02:03:04:05"; the bridge-derived code is compiled out. */
+static ssize_t show_group_addr(struct class_device *d,
+ char *buf)
+{
+#if 0
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+ return sprintf(buf, "%x:%x:%x:%x:%x:%x\n",
+ br->group_addr[0], br->group_addr[1],
+ br->group_addr[2], br->group_addr[3],
+ br->group_addr[4], br->group_addr[5]);
+#else
+ return sprintf(buf, "00:01:02:03:04:05\n");
+#endif
+}
+
+/* Writes are logged and otherwise ignored; 'len' is returned regardless.
+ * The compiled code performs no capability or format check. */
+static ssize_t store_group_addr(struct class_device *d,
+ const char *buf, size_t len)
+{
+ struct datapath *dp = dp_dev_get_dp(to_net_dev(d));
+#if 0
+ unsigned new_addr[6];
+ int i;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (sscanf(buf, "%x:%x:%x:%x:%x:%x",
+ &new_addr[0], &new_addr[1], &new_addr[2],
+ &new_addr[3], &new_addr[4], &new_addr[5]) != 6)
+ return -EINVAL;
+
+ /* Must be 01:80:c2:00:00:0X */
+ for (i = 0; i < 5; i++)
+ if (new_addr[i] != br_group_address[i])
+ return -EINVAL;
+
+ if (new_addr[5] & ~0xf)
+ return -EINVAL;
+
+ if (new_addr[5] == 1 /* 802.3x Pause address */
+ || new_addr[5] == 2 /* 802.3ad Slow protocols */
+ || new_addr[5] == 3) /* 802.1X PAE address */
+ return -EINVAL;
+
+ spin_lock_bh(&br->lock);
+ for (i = 0; i < 6; i++)
+ br->group_addr[i] = new_addr[i];
+ spin_unlock_bh(&br->lock);
+#else
+ printk("%s: xxx attempt to store_group_addr()\n", dp_name(dp));
+#endif
+ return len;
+}
+
+static BRC_DEVICE_ATTR(group_addr, S_IRUGO | S_IWUSR,
+ show_group_addr, store_group_addr);
+
+/* NULL-terminated list of every attribute declared above.  The
+ * class_device_attr_ prefix matches what CLASS_DEVICE_ATTR generates;
+ * NOTE(review): with the DEVICE_ATTR variant of BRC_DEVICE_ATTR the
+ * generated names presumably differ -- confirm before enabling sysfs on
+ * kernels >= 2.6.21. */
+static struct attribute *bridge_attrs[] = {
+ &class_device_attr_forward_delay.attr,
+ &class_device_attr_hello_time.attr,
+ &class_device_attr_max_age.attr,
+ &class_device_attr_ageing_time.attr,
+ &class_device_attr_stp_state.attr,
+ &class_device_attr_priority.attr,
+ &class_device_attr_bridge_id.attr,
+ &class_device_attr_root_id.attr,
+ &class_device_attr_root_path_cost.attr,
+ &class_device_attr_root_port.attr,
+ &class_device_attr_topology_change.attr,
+ &class_device_attr_topology_change_detected.attr,
+ &class_device_attr_hello_timer.attr,
+ &class_device_attr_tcn_timer.attr,
+ &class_device_attr_topology_change_timer.attr,
+ &class_device_attr_gc_timer.attr,
+ &class_device_attr_group_addr.attr,
+ NULL
+};
+
+/* Attribute group created by brc_sysfs_add_dp() and removed by
+ * brc_sysfs_del_dp(). */
+static struct attribute_group bridge_group = {
+ .name = SYSFS_BRIDGE_ATTR,
+ .attrs = bridge_attrs,
+};
+
+/*
+ * Add entries in sysfs onto the existing network class device
+ * for the bridge.
+ *   Adds a attribute group "bridge" containing tuning parameters.
+ *   Sub directory to hold links to interfaces.
+ *
+ * Note: the ifobj exists only to be a subdirectory
+ *   to hold links.  The ifobj exists in the same data structure
+ *   as its parent the bridge so reference counting works.
+ *
+ * Returns 0 on success, otherwise a negative errno value, in which case
+ * nothing is left registered.
+ */
+int brc_sysfs_add_dp(struct datapath *dp)
+{
+	struct kobject *kobj = to_kobj(dp->ports[ODPP_LOCAL]->dev);
+	int err;
+
+	err = sysfs_create_group(kobj, &bridge_group);
+	if (err) {
+		pr_info("%s: can't create group %s/%s\n",
+			__func__, dp_name(dp), bridge_group.name);
+		goto out1;
+	}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+	kobject_set_name(&dp->ifobj, SYSFS_BRIDGE_PORT_SUBDIR);
+	dp->ifobj.ktype = NULL;
+	dp->ifobj.kset = NULL;
+	dp->ifobj.parent = kobj;
+
+	err = kobject_register(&dp->ifobj);
+	if (err) {
+		pr_info("%s: can't add kobject (directory) %s/%s\n",
+			__FUNCTION__, dp_name(dp), dp->ifobj.name);
+		goto out2;
+	}
+#else
+	/* Use 'dp' here, as brc_sysfs_del_dp() does; no 'br' exists in this
+	 * function. */
+	dp->ifobj = kobject_create_and_add(SYSFS_BRIDGE_PORT_SUBDIR, kobj);
+	if (!dp->ifobj) {
+		pr_info("%s: can't add kobject (directory) %s/%s\n",
+			__func__, dp_name(dp), SYSFS_BRIDGE_PORT_SUBDIR);
+		/* 'err' is still 0 from sysfs_create_group(), so set it or
+		 * the failure would be reported as success. */
+		err = -ENOMEM;
+		goto out2;
+	}
+#endif
+	return 0;
+
+ out2:
+	sysfs_remove_group(kobj, &bridge_group);
+ out1:
+	return err;
+}
+
+/* Undoes brc_sysfs_add_dp(): releases the interface subdirectory kobject and
+ * removes the bridge attribute group.  Always returns 0. */
+int brc_sysfs_del_dp(struct datapath *dp)
+{
+ struct kobject *kobj = to_kobj(dp->ports[ODPP_LOCAL]->dev);
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+ kobject_unregister(&dp->ifobj);
+#else
+ kobject_put(dp->ifobj);
+#endif
+ sysfs_remove_group(kobj, &bridge_group);
+
+ return 0;
+}
+#else /* !SUPPORT_SYSFS */
+/* Stubs used when sysfs support is compiled out.  The add/del datapath and
+ * add-interface calls succeed without doing anything. */
+int brc_sysfs_add_dp(struct datapath *dp) { return 0; }
+int brc_sysfs_del_dp(struct datapath *dp) { return 0; }
+int brc_sysfs_add_if(struct net_bridge_port *p) { return 0; }
+/* Unlike the other stubs this one does real work: it drops the reference on
+ * the port's device and frees the port structure. */
+int brc_sysfs_del_if(struct net_bridge_port *p)
+{
+ dev_put(p->dev);
+ kfree(p);
+ return 0;
+}
+#endif /* !SUPPORT_SYSFS */
diff --git a/datapath/brc_sysfs_if.c b/datapath/brc_sysfs_if.c
new file mode 100644
index 000000000..20bb109b5
--- /dev/null
+++ b/datapath/brc_sysfs_if.c
@@ -0,0 +1,334 @@
+/*
+ * Sysfs attributes of bridge ports for Open vSwitch
+ *
+ * This has been shamelessly copied from the kernel sources.
+ */
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/rtnetlink.h>
+#include <linux/spinlock.h>
+#include "brc_sysfs.h"
+#include "datapath.h"
+
+#ifdef SUPPORT_SYSFS
+
+/*
+ * A sysfs attribute of a bridge port: the generic attribute plus
+ * port-specific accessors.  'show' formats the value into a buffer and
+ * 'store' receives an already-parsed unsigned long; 'store' is NULL for
+ * the read-only attributes defined in this file.
+ */
+struct brport_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct net_bridge_port *, char *);
+ ssize_t (*store)(struct net_bridge_port *, unsigned long);
+};
+
+/*
+ * Define a struct brport_attribute named brport_attr_<_name>, whose sysfs
+ * file name is the stringified _name, with the given mode and show/store
+ * handlers.  Callers prefix the invocation with 'static'.
+ */
+#define BRPORT_ATTR(_name,_mode,_show,_store) \
+struct brport_attribute brport_attr_##_name = { \
+ .attr = {.name = __stringify(_name), \
+ .mode = _mode, \
+ .owner = THIS_MODULE, }, \
+ .show = _show, \
+ .store = _store, \
+};
+
+/* "path_cost" attribute.  The real bridge implementation is compiled out
+ * with #if 0: reads always report 0. */
+static ssize_t show_path_cost(struct net_bridge_port *p, char *buf)
+{
+#if 0
+ return sprintf(buf, "%d\n", p->path_cost);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+/* Writes are accepted but ignored (the STP call is compiled out). */
+static ssize_t store_path_cost(struct net_bridge_port *p, unsigned long v)
+{
+#if 0
+ br_stp_set_path_cost(p, v);
+#endif
+ return 0;
+}
+static BRPORT_ATTR(path_cost, S_IRUGO | S_IWUSR,
+ show_path_cost, store_path_cost);
+
+/* "priority" attribute.  The real bridge implementation is compiled out
+ * with #if 0: reads always report 0. */
+static ssize_t show_priority(struct net_bridge_port *p, char *buf)
+{
+#if 0
+ return sprintf(buf, "%d\n", p->priority);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+/* Writes are accepted but ignored; the range check and the STP call are
+ * both compiled out. */
+static ssize_t store_priority(struct net_bridge_port *p, unsigned long v)
+{
+#if 0
+ if (v >= (1<<(16-BR_PORT_BITS)))
+ return -ERANGE;
+ br_stp_set_port_priority(p, v);
+#endif
+ return 0;
+}
+static BRPORT_ATTR(priority, S_IRUGO | S_IWUSR,
+ show_priority, store_priority);
+
+/* "designated_root" attribute (read-only).  Reports a fixed placeholder
+ * bridge ID; the real lookup is compiled out with #if 0. */
+static ssize_t show_designated_root(struct net_bridge_port *p, char *buf)
+{
+#if 0
+ return br_show_bridge_id(buf, &p->designated_root);
+#else
+ return sprintf(buf, "0000.010203040506\n");
+#endif
+}
+static BRPORT_ATTR(designated_root, S_IRUGO, show_designated_root, NULL);
+
+/* "designated_bridge" attribute (read-only).  Reports a fixed placeholder
+ * bridge ID; the real lookup is compiled out with #if 0. */
+static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf)
+{
+#if 0
+ return br_show_bridge_id(buf, &p->designated_bridge);
+#else
+ return sprintf(buf, "0000.060504030201\n");
+#endif
+}
+static BRPORT_ATTR(designated_bridge, S_IRUGO, show_designated_bridge, NULL);
+
+/* "designated_port" attribute (read-only).  Always reports 0; the real
+ * value is compiled out with #if 0. */
+static ssize_t show_designated_port(struct net_bridge_port *p, char *buf)
+{
+#if 0
+ return sprintf(buf, "%d\n", p->designated_port);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRPORT_ATTR(designated_port, S_IRUGO, show_designated_port, NULL);
+
+/* "designated_cost" attribute (read-only).  Always reports 0; the real
+ * value is compiled out with #if 0. */
+static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf)
+{
+#if 0
+ return sprintf(buf, "%d\n", p->designated_cost);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRPORT_ATTR(designated_cost, S_IRUGO, show_designated_cost, NULL);
+
+/* "port_id" attribute (read-only).  Always reports 0x0; the real value is
+ * compiled out with #if 0. */
+static ssize_t show_port_id(struct net_bridge_port *p, char *buf)
+{
+#if 0
+ return sprintf(buf, "0x%x\n", p->port_id);
+#else
+ return sprintf(buf, "0x%x\n", 0);
+#endif
+}
+static BRPORT_ATTR(port_id, S_IRUGO, show_port_id, NULL);
+
+/* "port_no" attribute (read-only).  Unlike the placeholder attributes
+ * around it, this one reports a real value: the port's number within its
+ * datapath, in hexadecimal. */
+static ssize_t show_port_no(struct net_bridge_port *p, char *buf)
+{
+ return sprintf(buf, "0x%x\n", p->port_no);
+}
+
+static BRPORT_ATTR(port_no, S_IRUGO, show_port_no, NULL);
+
+/* "change_ack" attribute (read-only).  Always reports 0; the real value
+ * is compiled out with #if 0. */
+static ssize_t show_change_ack(struct net_bridge_port *p, char *buf)
+{
+#if 0
+ return sprintf(buf, "%d\n", p->topology_change_ack);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRPORT_ATTR(change_ack, S_IRUGO, show_change_ack, NULL);
+
+/* "config_pending" attribute (read-only).  Always reports 0; the real
+ * value is compiled out with #if 0. */
+static ssize_t show_config_pending(struct net_bridge_port *p, char *buf)
+{
+#if 0
+ return sprintf(buf, "%d\n", p->config_pending);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRPORT_ATTR(config_pending, S_IRUGO, show_config_pending, NULL);
+
+/* "state" attribute (read-only).  Always reports 0; the real port state
+ * is compiled out with #if 0. */
+static ssize_t show_port_state(struct net_bridge_port *p, char *buf)
+{
+#if 0
+ return sprintf(buf, "%d\n", p->state);
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRPORT_ATTR(state, S_IRUGO, show_port_state, NULL);
+
+/* "message_age_timer" attribute (read-only).  Always reports 0; the real
+ * timer read-out is compiled out with #if 0. */
+static ssize_t show_message_age_timer(struct net_bridge_port *p,
+ char *buf)
+{
+#if 0
+ return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer));
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRPORT_ATTR(message_age_timer, S_IRUGO, show_message_age_timer, NULL);
+
+/* "forward_delay_timer" attribute (read-only).  Always reports 0; the
+ * real timer read-out is compiled out with #if 0. */
+static ssize_t show_forward_delay_timer(struct net_bridge_port *p,
+ char *buf)
+{
+#if 0
+ return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer));
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRPORT_ATTR(forward_delay_timer, S_IRUGO, show_forward_delay_timer, NULL);
+
+/* "hold_timer" attribute (read-only).  Always reports 0; the real timer
+ * read-out is compiled out with #if 0. */
+static ssize_t show_hold_timer(struct net_bridge_port *p,
+ char *buf)
+{
+#if 0
+ return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer));
+#else
+ return sprintf(buf, "%d\n", 0);
+#endif
+}
+static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL);
+
+/* NULL-terminated list of every per-port attribute; brc_sysfs_add_if()
+ * walks it to create one sysfs file per entry. */
+static struct brport_attribute *brport_attrs[] = {
+ &brport_attr_path_cost,
+ &brport_attr_priority,
+ &brport_attr_port_id,
+ &brport_attr_port_no,
+ &brport_attr_designated_root,
+ &brport_attr_designated_bridge,
+ &brport_attr_designated_port,
+ &brport_attr_designated_cost,
+ &brport_attr_state,
+ &brport_attr_change_ack,
+ &brport_attr_config_pending,
+ &brport_attr_message_age_timer,
+ &brport_attr_forward_delay_timer,
+ &brport_attr_hold_timer,
+ NULL
+};
+
+/* Map an embedded 'struct attribute' / 'struct kobject' back to the
+ * brport_attribute / net_bridge_port that contains it. */
+#define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr)
+#define to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
+
+static ssize_t brport_show(struct kobject *kobj,
+			   struct attribute *attr, char *buf)
+{
+	/* sysfs hands us the generic kobject and attribute; recover the
+	 * port and the port attribute that embed them and let the
+	 * attribute's own 'show' handler format the value into 'buf'. */
+	return to_brport_attr(attr)->show(to_brport(kobj), buf);
+}
+
+/*
+ * sysfs 'store' entry point for all port attributes.
+ *
+ * Callers without CAP_NET_ADMIN get -EPERM.  For everyone else the write
+ * is currently unsupported: the code that would parse the value and call
+ * the attribute's 'store' handler is compiled out with #if 0, so a
+ * message is logged and -EINVAL is returned.
+ */
+static ssize_t brport_store(struct kobject * kobj,
+ struct attribute * attr,
+ const char * buf, size_t count)
+{
+ struct net_bridge_port * p = to_brport(kobj);
+#if 0
+ struct brport_attribute * brport_attr = to_brport_attr(attr);
+ char *endp;
+ unsigned long val;
+#endif
+ ssize_t ret = -EINVAL;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+#if 0
+ val = simple_strtoul(buf, &endp, 0);
+ if (endp != buf) {
+ rtnl_lock();
+ if (p->dev && p->br && brport_attr->store) {
+ spin_lock_bh(&p->br->lock);
+ ret = brport_attr->store(p, val);
+ spin_unlock_bh(&p->br->lock);
+ if (ret == 0)
+ ret = count;
+ }
+ rtnl_unlock();
+ }
+#else
+ printk("%s: xxx writing port parms not supported yet!\n",
+ dp_name(p->dp));
+#endif
+ return ret;
+}
+
+/* sysfs read/write dispatchers shared by every port attribute. */
+struct sysfs_ops brport_sysfs_ops = {
+ .show = brport_show,
+ .store = brport_store,
+};
+
+static void release_nbp(struct kobject *kobj)
+{
+	/* kobject release callback: the kobject is embedded in the port,
+	 * so freeing the containing net_bridge_port frees both. */
+	kfree(container_of(kobj, struct net_bridge_port, kobj));
+}
+
+/* kobject type of a port's sysfs directory: attribute accesses go through
+ * brport_sysfs_ops, and release_nbp() frees the port when the kobject is
+ * released. */
+struct kobj_type brport_ktype = {
+ .sysfs_ops = &brport_sysfs_ops,
+ .release = release_nbp
+};
+
+/*
+ * Add sysfs entries to ethernet device added to a bridge.
+ * Creates a brport subdirectory with bridge attributes.
+ * Puts symlink in bridge's brport subdirectory
+ */
+int brc_sysfs_add_if(struct net_bridge_port *p)
+{
+ struct datapath *dp = p->dp;
+ struct brport_attribute **a;
+ int err;
+
+ /* Set up the port's kobject as a child of the port device's
+ * class-device kobject. */
+ kobject_init(&p->kobj);
+ kobject_set_name(&p->kobj, SYSFS_BRIDGE_PORT_ATTR);
+ p->kobj.ktype = &brport_ktype;
+ p->kobj.kset = NULL;
+ p->kobj.parent = &(p->dev->class_dev.kobj);
+
+ err = kobject_add(&p->kobj);
+ if (err)
+ goto err_put;
+
+ /* Link from the port directory to the datapath's local port device. */
+ err = sysfs_create_link(&p->kobj,
+ &dp->ports[ODPP_LOCAL]->dev->class_dev.kobj,
+ SYSFS_BRIDGE_PORT_LINK);
+ if (err)
+ goto err_del;
+
+ /* One file per entry of the NULL-terminated brport_attrs[] list. */
+ for (a = brport_attrs; *a; ++a) {
+ err = sysfs_create_file(&p->kobj, &((*a)->attr));
+ if (err)
+ goto err_del;
+ }
+
+ /* Link from the datapath's port subdirectory to this port, named
+ * after the port's device. */
+ err = sysfs_create_link(&dp->ifobj, &p->kobj, p->dev->name);
+ if (err)
+ goto err_del;
+
+ kobject_uevent(&p->kobj, KOBJ_ADD);
+
+ return err;
+
+ /* NOTE(review): the final kobject_put() below can invoke
+ * release_nbp(), which frees 'p'; callers must not use 'p' after a
+ * failure -- confirm that they honor this. */
+err_del:
+ kobject_del(&p->kobj);
+err_put:
+ kobject_put(&p->kobj);
+ return err;
+}
+
+/*
+ * Remove a port's sysfs directory and drop the port's references.
+ * Announces the removal, deletes the kobject from sysfs, releases the
+ * reference on the port's device and finally puts the kobject (whose
+ * release callback, release_nbp(), frees the port).  Always returns 0.
+ */
+int brc_sysfs_del_if(struct net_bridge_port *p)
+{
+ /* Saved up front; 'p' is not dereferenced after the kobject is
+ * deleted except to reach its embedded kobject. */
+ struct net_device *dev = p->dev;
+
+ kobject_uevent(&p->kobj, KOBJ_REMOVE);
+ kobject_del(&p->kobj);
+
+ dev_put(dev);
+
+ kobject_put(&p->kobj);
+
+ return 0;
+}
+#endif /* SUPPORT_SYSFS */
diff --git a/datapath/brcompat.c b/datapath/brcompat.c
new file mode 100644
index 000000000..2e437ccd9
--- /dev/null
+++ b/datapath/brcompat.c
@@ -0,0 +1,519 @@
+#include <linux/kernel.h>
+#include <asm/uaccess.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/etherdevice.h>
+#include <linux/if_bridge.h>
+#include <linux/rculist.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/genetlink.h>
+
+#include "compat.h"
+#include "openvswitch/brcompat-netlink.h"
+#include "brc_procfs.h"
+#include "brc_sysfs.h"
+#include "datapath.h"
+#include "dp_dev.h"
+
+/* Generic Netlink family and multicast group used to talk to userspace.
+ * The family is declared here and initialized further down in this
+ * file. */
+static struct genl_family brc_genl_family;
+static struct genl_multicast_group brc_mc_group;
+
+/* Time to wait for ovs-vswitchd to respond to a datapath action, in
+ * jiffies. */
+#define BRC_TIMEOUT (HZ * 5)
+
+/* Mutex to serialize ovs-brcompatd callbacks. (Some callbacks naturally hold
+ * br_ioctl_mutex, others hold rtnl_lock, but we can't take the former
+ * ourselves and we don't want to hold the latter over a potentially long
+ * period of time.) */
+static DEFINE_MUTEX(brc_serial);
+
+/* Userspace communication. */
+static DEFINE_SPINLOCK(brc_lock); /* Ensure atomic access to these vars. */
+static DECLARE_COMPLETION(brc_done); /* Userspace signaled operation done? */
+static int brc_err; /* Error code from userspace. */
+static u32 brc_seq; /* Sequence number for current op. */
+
+/* Forward declaration: used by the ioctl helpers below, defined after the
+ * Generic Netlink operations. */
+static int brc_send_command(const char *bridge, const char *port, int op);
+
+/* Fills 'indices' with the ifindex of the local port device of each
+ * existing datapath, storing at most 'num' entries.  Returns the number
+ * of entries stored. */
+static int
+get_dp_ifindices(int *indices, int num)
+{
+	int n_found = 0;
+	int dp_idx;
+
+	rcu_read_lock();
+	for (dp_idx = 0; dp_idx < ODP_MAX; dp_idx++) {
+		struct datapath *dp;
+
+		/* Stop as soon as the caller's buffer is full. */
+		if (n_found >= num)
+			break;
+
+		dp = get_dp(dp_idx);
+		if (dp)
+			indices[n_found++] = dp->ports[ODPP_LOCAL]->dev->ifindex;
+	}
+	rcu_read_unlock();
+
+	return n_found;
+}
+
+/* For every port of 'dp' whose port number is below 'num', stores the
+ * ifindex of the port's device in ifindices[port number].  Slots for
+ * which no port exists are left untouched. */
+static void
+get_port_ifindices(struct datapath *dp, int *ifindices, int num)
+{
+	struct net_bridge_port *port;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu (port, &dp->port_list, node) {
+		/* Ports that do not fit in the caller's array are skipped. */
+		if (port->port_no >= num)
+			continue;
+		ifindices[port->port_no] = port->dev->ifindex;
+	}
+	rcu_read_unlock();
+}
+
+/* Asks userspace to add ('add' nonzero) or delete ('add' zero) the bridge
+ * whose name is stored at user address 'uname'.  Returns -EFAULT if the
+ * name cannot be read, otherwise the result of brc_send_command(). */
+static int brc_add_del_bridge(char __user *uname, int add)
+{
+	int op = add ? BRC_GENL_C_DP_ADD : BRC_GENL_C_DP_DEL;
+	char name[IFNAMSIZ];
+
+	if (copy_from_user(name, uname, IFNAMSIZ))
+		return -EFAULT;
+
+	/* The user buffer need not be terminated; force it. */
+	name[IFNAMSIZ - 1] = 0;
+
+	return brc_send_command(name, NULL, op);
+}
+
+/* Copies the ifindexes of up to 'n' datapath local devices to the user
+ * array 'uindices'.  Returns the number of entries copied, -ENOMEM if
+ * 'n' is 2048 or more or the scratch buffer cannot be allocated, or
+ * -EFAULT if the copy to userspace fails. */
+static int brc_get_bridges(int __user *uindices, int n)
+{
+	int *ifindices;
+	int n_copied;
+
+	if (n >= 2048)
+		return -ENOMEM;
+
+	ifindices = kcalloc(n, sizeof(int), GFP_KERNEL);
+	if (!ifindices)
+		return -ENOMEM;
+
+	n_copied = get_dp_ifindices(ifindices, n);
+	if (copy_to_user(uindices, ifindices, n_copied * sizeof(int)))
+		n_copied = -EFAULT;
+
+	kfree(ifindices);
+	return n_copied;
+}
+
+/* Legacy deviceless bridge ioctl's. Called with br_ioctl_mutex. */
+static int
+old_deviceless(void __user *uarg)
+{
+	unsigned long args[3];
+	unsigned long op;
+
+	/* The legacy interface passes { operation, arg1, arg2 }. */
+	if (copy_from_user(args, uarg, sizeof(args)))
+		return -EFAULT;
+	op = args[0];
+
+	if (op == BRCTL_GET_BRIDGES)
+		return brc_get_bridges((int __user *)args[1], args[2]);
+
+	if (op == BRCTL_ADD_BRIDGE || op == BRCTL_DEL_BRIDGE)
+		return brc_add_del_bridge((void __user *)args[1],
+					  op == BRCTL_ADD_BRIDGE);
+
+	return -EOPNOTSUPP;
+}
+
+/* Called with the br_ioctl_mutex. */
+/* Handler for bridge ioctls that are not directed at a particular device.
+ * The signature depends on the kernel version: kernels after 2.6.23 pass
+ * an extra 'struct net *', which is not used here.  Commands other than
+ * the four handled below yield -EOPNOTSUPP. */
+static int
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+brc_ioctl_deviceless_stub(unsigned int cmd, void __user *uarg)
+#else
+brc_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
+#endif
+{
+ switch (cmd) {
+ /* Legacy multiplexed interface: the operation is inside 'uarg'. */
+ case SIOCGIFBR:
+ case SIOCSIFBR:
+ return old_deviceless(uarg);
+
+ /* For these two, 'uarg' is the bridge name itself. */
+ case SIOCBRADDBR:
+ return brc_add_del_bridge(uarg, 1);
+ case SIOCBRDELBR:
+ return brc_add_del_bridge(uarg, 0);
+ }
+
+ return -EOPNOTSUPP;
+}
+
+/* Asks userspace to add ('add' nonzero) or remove ('add' zero) the
+ * network device with ifindex 'port_ifindex' to/from bridge 'dev'.
+ * Returns -EINVAL if no such device exists, otherwise the result of
+ * brc_send_command().  The RTNL lock is released around the userspace
+ * round trip and re-taken before returning. */
+static int
+brc_add_del_port(struct net_device *dev, int port_ifindex, int add)
+{
+	int op = add ? BRC_GENL_C_PORT_ADD : BRC_GENL_C_PORT_DEL;
+	char bridge_name[IFNAMSIZ], port_name[IFNAMSIZ];
+	struct net_device *port_dev;
+	int retval;
+
+	port_dev = __dev_get_by_index(&init_net, port_ifindex);
+	if (port_dev == NULL)
+		return -EINVAL;
+
+	/* Snapshot both names while the RTNL lock is still held: there is
+	 * a race between the rtnl_unlock() and the brc_send_command()
+	 * below. */
+	strcpy(bridge_name, dev->name);
+	strcpy(port_name, port_dev->name);
+
+	rtnl_unlock();
+	retval = brc_send_command(bridge_name, port_name, op);
+	rtnl_lock();
+
+	return retval;
+}
+
+/* Fills the user structure 'ub' with information about bridge 'dev'.
+ * Every field is zero except bridge_id, which holds the device's
+ * Ethernet address in its low-order 48 bits, converted to big-endian.
+ * Returns 0, or -EFAULT if the copy to userspace fails. */
+static int
+brc_get_bridge_info(struct net_device *dev, struct __bridge_info __user *ub)
+{
+	struct __bridge_info info;
+	u64 mac = 0;
+	int i;
+
+	memset(&info, 0, sizeof info);
+
+	/* Fold the address bytes in most-significant first. */
+	for (i = 0; i < ETH_ALEN; i++)
+		mac = (mac << 8) | dev->dev_addr[i];
+
+	info.bridge_id = cpu_to_be64(mac);
+	info.stp_enabled = 0;
+
+	return copy_to_user(ub, &info, sizeof info) ? -EFAULT : 0;
+}
+
+/* Copies an array, indexed by port number, of the ifindexes of the ports
+ * of bridge 'dev' to the user array 'uindices'.  A 'num' of 0 means 256,
+ * and the count is capped at DP_MAX_PORTS; slots without a port are 0.
+ * Returns the number of slots copied, -EINVAL for a negative 'num',
+ * -ENOMEM on allocation failure, or -EFAULT if the copy fails. */
+static int
+brc_get_port_list(struct net_device *dev, int __user *uindices, int num)
+{
+	struct dp_dev *priv = netdev_priv(dev);
+	struct datapath *dp = priv->dp;
+	int *ifindices;
+	int retval;
+
+	if (num < 0)
+		return -EINVAL;
+	if (!num)
+		num = 256;
+	if (num > DP_MAX_PORTS)
+		num = DP_MAX_PORTS;
+
+	ifindices = kcalloc(num, sizeof(int), GFP_KERNEL);
+	if (!ifindices)
+		return -ENOMEM;
+
+	get_port_ifindices(dp, ifindices, num);
+
+	retval = num;
+	if (copy_to_user(uindices, ifindices, num * sizeof(int)))
+		retval = -EFAULT;
+
+	kfree(ifindices);
+	return retval;
+}
+
+/* Legacy ioctl's through SIOCDEVPRIVATE. Called with rtnl_lock. */
+static int
+old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	unsigned long args[4];
+
+	/* The legacy interface passes { operation, arg1, ... } through
+	 * ifr_data. */
+	if (copy_from_user(args, rq->ifr_data, sizeof(args)))
+		return -EFAULT;
+
+	switch (args[0]) {
+	case BRCTL_ADD_IF:
+	case BRCTL_DEL_IF:
+		return brc_add_del_port(dev, args[1],
+					args[0] == BRCTL_ADD_IF);
+
+	case BRCTL_GET_BRIDGE_INFO:
+		return brc_get_bridge_info(dev,
+				(struct __bridge_info __user *)args[1]);
+
+	case BRCTL_GET_PORT_LIST:
+		return brc_get_port_list(dev, (int __user *)args[1], args[2]);
+
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+/* Called with the rtnl_lock. */
+static int
+brc_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	/* Legacy multiplexed interface: the real operation is in ifr_data. */
+	if (cmd == SIOCDEVPRIVATE)
+		return old_dev_ioctl(dev, rq, cmd);
+
+	/* Direct port add/remove requests carry the port's ifindex. */
+	if (cmd == SIOCBRADDIF || cmd == SIOCBRDELIF)
+		return brc_add_del_port(dev, rq->ifr_ifindex,
+					cmd == SIOCBRADDIF);
+
+	return -EOPNOTSUPP;
+}
+
+
+/* The Generic Netlink family for bridge compatibility.  The family ID is
+ * assigned when the family is registered (GENL_ID_GENERATE), and there is
+ * no family-specific header. */
+static struct genl_family brc_genl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = BRC_GENL_FAMILY_NAME,
+ .version = 1,
+ .maxattr = BRC_GENL_A_MAX,
+};
+
+/* Handler for BRC_GENL_C_QUERY_MC: replies to the sender with the ID of
+ * the brcompat multicast group in a BRC_GENL_A_MC_GROUP attribute.
+ * Returns the result of genlmsg_reply(), -ENOMEM if the reply cannot be
+ * allocated or its header does not fit, or -EINVAL if the attribute
+ * cannot be added. */
+static int brc_genl_query(struct sk_buff *skb, struct genl_info *info)
+{
+ int err = -EINVAL;
+ struct sk_buff *ans_skb;
+ void *data;
+
+ ans_skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!ans_skb)
+ return -ENOMEM;
+
+ data = genlmsg_put_reply(ans_skb, info, &brc_genl_family,
+ 0, BRC_GENL_C_QUERY_MC);
+ if (data == NULL) {
+ err = -ENOMEM;
+ goto err;
+ }
+ /* NLA_PUT_U32 jumps to nla_put_failure if the attribute does not
+ * fit; 'err' is still -EINVAL in that case. */
+ NLA_PUT_U32(ans_skb, BRC_GENL_A_MC_GROUP, brc_mc_group.id);
+
+ genlmsg_end(ans_skb, data);
+ return genlmsg_reply(ans_skb, info);
+
+err:
+nla_put_failure:
+ kfree_skb(ans_skb);
+ return err;
+}
+
+/* Operation: userspace asks which multicast group to listen on. */
+static struct genl_ops brc_genl_ops_query_dp = {
+ .cmd = BRC_GENL_C_QUERY_MC,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = NULL,
+ .doit = brc_genl_query,
+ .dumpit = NULL
+};
+
+/* Attribute policy: what each attribute may contain. */
+/* Shared by the DP_RESULT and SET_PROC operations: the error code is a
+ * 32-bit integer, the three proc attributes are NUL-terminated strings. */
+static struct nla_policy brc_genl_policy[BRC_GENL_A_MAX + 1] = {
+ [BRC_GENL_A_ERR_CODE] = { .type = NLA_U32 },
+ [BRC_GENL_A_PROC_DIR] = { .type = NLA_NUL_STRING },
+ [BRC_GENL_A_PROC_NAME] = { .type = NLA_NUL_STRING },
+ [BRC_GENL_A_PROC_DATA] = { .type = NLA_NUL_STRING },
+};
+
+/* Handler for BRC_GENL_C_DP_RESULT: userspace reports the outcome of the
+ * command currently being waited for.  Returns -EINVAL if the message
+ * carries no error code, -ESTALE if its sequence number does not match
+ * the current operation, and otherwise records the error code, wakes the
+ * waiter and returns 0. */
+static int
+brc_genl_dp_result(struct sk_buff *skb, struct genl_info *info)
+{
+	unsigned long int flags;
+	int retval = -ESTALE;
+
+	if (!info->attrs[BRC_GENL_A_ERR_CODE])
+		return -EINVAL;
+
+	spin_lock_irqsave(&brc_lock, flags);
+	if (info->snd_seq == brc_seq) {
+		brc_err = nla_get_u32(info->attrs[BRC_GENL_A_ERR_CODE]);
+		complete(&brc_done);
+		retval = 0;
+	}
+	spin_unlock_irqrestore(&brc_lock, flags);
+
+	return retval;
+}
+
+/* Operation: userspace reports the result of a command sent to it. */
+static struct genl_ops brc_genl_ops_dp_result = {
+ .cmd = BRC_GENL_C_DP_RESULT,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = brc_genl_policy,
+ .doit = brc_genl_dp_result,
+ .dumpit = NULL
+};
+
+/* Operation: BRC_GENL_C_SET_PROC, handled by brc_genl_set_proc(), which
+ * is defined outside this file. */
+static struct genl_ops brc_genl_ops_set_proc = {
+ .cmd = BRC_GENL_C_SET_PROC,
+ .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .policy = brc_genl_policy,
+ .doit = brc_genl_set_proc,
+ .dumpit = NULL
+};
+
+/*
+ * Sends command 'op' for bridge 'bridge' (and, if 'port' is nonnull, port
+ * 'port') to userspace over the brcompat multicast group and waits up to
+ * BRC_TIMEOUT jiffies for the reply.
+ *
+ * Returns the negated error code reported by userspace (0 on success),
+ * -ENOMEM if the message cannot be built, a negative value from
+ * genlmsg_multicast() if it cannot be sent, or -ETIMEDOUT if no reply
+ * arrives in time.
+ *
+ * Commands are serialized by brc_serial, so only one is in flight at a
+ * time.  May sleep.
+ */
+static int brc_send_command(const char *bridge, const char *port, int op)
+{
+	unsigned long int flags;
+	struct sk_buff *skb;
+	void *data;
+	int error;
+	u32 seq;
+
+	mutex_lock(&brc_serial);
+
+	/* Increment sequence number first, so that we ignore any replies
+	 * to stale requests.  Snapshot it while holding the lock so that the
+	 * value put in the message is the one the reply is matched against. */
+	spin_lock_irqsave(&brc_lock, flags);
+	seq = ++brc_seq;
+	INIT_COMPLETION(brc_done);
+	spin_unlock_irqrestore(&brc_lock, flags);
+
+	/* Compose message. */
+	skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	error = -ENOMEM;
+	if (skb == NULL)
+		goto exit_unlock;
+	data = genlmsg_put(skb, 0, seq, &brc_genl_family, 0, op);
+	if (data == NULL) {
+		/* No room for the header: genlmsg_end() must not be handed a
+		 * null pointer, so bail out like an attribute failure. */
+		goto nla_put_failure;
+	}
+
+	/* The NLA_PUT_* macros jump to nla_put_failure on failure. */
+	NLA_PUT_STRING(skb, BRC_GENL_A_DP_NAME, bridge);
+	if (port)
+		NLA_PUT_STRING(skb, BRC_GENL_A_PORT_NAME, port);
+
+	genlmsg_end(skb, data);
+
+	/* Send message. */
+	error = genlmsg_multicast(skb, 0, brc_mc_group.id, GFP_KERNEL);
+	if (error < 0)
+		goto exit_unlock;
+
+	/* Wait for reply. */
+	error = -ETIMEDOUT;
+	if (!wait_for_completion_timeout(&brc_done, BRC_TIMEOUT))
+		goto exit_unlock;
+
+	error = -brc_err;
+	goto exit_unlock;
+
+nla_put_failure:
+	/* 'error' is still -ENOMEM here. */
+	kfree_skb(skb);
+exit_unlock:
+	mutex_unlock(&brc_serial);
+	return error;
+}
+
+/* Datapath-creation hook.  Takes a reference on this module for as long
+ * as the datapath exists (dropped again in brc_del_dp()) and, with sysfs
+ * support, creates the datapath's sysfs entries.  Returns -ENODEV if the
+ * module reference cannot be taken, otherwise 0; a failure of
+ * brc_sysfs_add_dp() is not reported. */
+int brc_add_dp(struct datapath *dp)
+{
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+#ifdef SUPPORT_SYSFS
+ brc_sysfs_add_dp(dp);
+#endif
+
+ return 0;
+}
+
+/* Datapath-deletion hook.  With sysfs support, removes the datapath's
+ * sysfs entries; then drops the module reference taken by brc_add_dp().
+ * Always returns 0. */
+int brc_del_dp(struct datapath *dp)
+{
+#ifdef SUPPORT_SYSFS
+ brc_sysfs_del_dp(dp);
+#endif
+ module_put(THIS_MODULE);
+
+ return 0;
+}
+
+/*
+ * Module initialization.
+ *
+ * Refuses to load (-EEXIST) if any datapath already exists.  Otherwise
+ * installs the bridge ioctl handler and the datapath hooks, then registers
+ * the Generic Netlink family, its three operations and the "brcompat"
+ * multicast group.
+ *
+ * Returns 0 on success.  On a registration failure everything installed
+ * here is undone before the error is returned, so that no hook is left
+ * pointing into a module whose load failed.
+ */
+static int
+__init brc_init(void)
+{
+	int i;
+	int err;
+
+	printk("Open vSwitch Bridge Compatibility, built "__DATE__" "__TIME__"\n");
+
+	rcu_read_lock();
+	for (i=0; i<ODP_MAX; i++) {
+		if (get_dp(i)) {
+			rcu_read_unlock();
+			printk(KERN_EMERG "brcompat: no datapaths may exist!\n");
+			return -EEXIST;
+		}
+	}
+	rcu_read_unlock();
+
+	/* Set the bridge ioctl handler */
+	brioctl_set(brc_ioctl_deviceless_stub);
+
+	/* Set the openvswitch_mod device ioctl handler */
+	dp_ioctl_hook = brc_dev_ioctl;
+
+	/* Register hooks for datapath adds and deletes */
+	dp_add_dp_hook = brc_add_dp;
+	dp_del_dp_hook = brc_del_dp;
+
+	/* Register hooks for interface adds and deletes */
+#ifdef SUPPORT_SYSFS
+	dp_add_if_hook = brc_sysfs_add_if;
+	dp_del_if_hook = brc_sysfs_del_if;
+#endif
+
+	/* Randomize the initial sequence number. This is not a security
+	 * feature; it only helps avoid crossed wires between userspace and
+	 * the kernel when the module is unloaded and reloaded. */
+	brc_seq = net_random();
+
+	/* Register generic netlink family to communicate changes to
+	 * userspace. */
+	err = genl_register_family(&brc_genl_family);
+	if (err)
+		goto error;
+
+	err = genl_register_ops(&brc_genl_family, &brc_genl_ops_query_dp);
+	if (err != 0)
+		goto err_unregister;
+
+	err = genl_register_ops(&brc_genl_family, &brc_genl_ops_dp_result);
+	if (err != 0)
+		goto err_unregister;
+
+	err = genl_register_ops(&brc_genl_family, &brc_genl_ops_set_proc);
+	if (err != 0)
+		goto err_unregister;
+
+	strcpy(brc_mc_group.name, "brcompat");
+	err = genl_register_mc_group(&brc_genl_family, &brc_mc_group);
+	if (err < 0)
+		goto err_unregister;
+
+	return 0;
+
+err_unregister:
+	genl_unregister_family(&brc_genl_family);
+error:
+	/* Uninstall every hook set above, exactly as brc_cleanup() does:
+	 * a failed load must not leave pointers into this module behind. */
+	dp_add_dp_hook = NULL;
+	dp_del_dp_hook = NULL;
+	dp_add_if_hook = NULL;
+	dp_del_if_hook = NULL;
+	dp_ioctl_hook = NULL;
+	brioctl_set(NULL);
+
+	printk(KERN_EMERG "brcompat: failed to install!\n");
+	return err;
+}
+
+/* Module unload: uninstall every hook set by brc_init(), then unregister
+ * the Generic Netlink family and tear down the procfs entries. */
+static void
+brc_cleanup(void)
+{
+ /* Unregister hooks for datapath adds and deletes */
+ dp_add_dp_hook = NULL;
+ dp_del_dp_hook = NULL;
+
+ /* Unregister hooks for interface adds and deletes */
+ dp_add_if_hook = NULL;
+ dp_del_if_hook = NULL;
+
+ /* Unregister ioctl hooks */
+ dp_ioctl_hook = NULL;
+ brioctl_set(NULL);
+
+ genl_unregister_family(&brc_genl_family);
+ brc_procfs_exit();
+}
+
+module_init(brc_init);
+module_exit(brc_cleanup);
+
+MODULE_DESCRIPTION("Open vSwitch bridge compatibility");
+MODULE_AUTHOR("Nicira Networks");
+MODULE_LICENSE("GPL");
diff --git a/datapath/compat.h b/datapath/compat.h
new file mode 100644
index 000000000..12100ae39
--- /dev/null
+++ b/datapath/compat.h
@@ -0,0 +1,17 @@
+#ifndef COMPAT_H
+#define COMPAT_H 1
+
+#include <linux/version.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+
+#include "compat26.h"
+
+#else
+
+#include "compat24.h"
+
+#endif
+
+
+#endif /* compat.h */
diff --git a/datapath/datapath.c b/datapath/datapath.c
new file mode 100644
index 000000000..015edc4bb
--- /dev/null
+++ b/datapath/datapath.c
@@ -0,0 +1,1611 @@
+/*
+ * Distributed under the terms of the GNU GPL version 2.
+ * Copyright (c) 2007, 2008, 2009 Nicira Networks.
+ */
+
+/* Functions for managing the dp interface/device. */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/if_arp.h>
+#include <linux/if_bridge.h>
+#include <linux/if_vlan.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/delay.h>
+#include <linux/time.h>
+#include <linux/etherdevice.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/llc.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/version.h>
+#include <linux/ethtool.h>
+#include <linux/random.h>
+#include <linux/wait.h>
+#include <asm/system.h>
+#include <asm/div64.h>
+#include <asm/bug.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/inetdevice.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/workqueue.h>
+#include <linux/dmi.h>
+#include <net/llc.h>
+
+#include "openvswitch/datapath-protocol.h"
+#include "datapath.h"
+#include "actions.h"
+#include "dp_dev.h"
+#include "flow.h"
+
+#include "compat.h"
+
+
+/* Optional hooks, exported so that another module can install them.
+ * Each is NULL when not installed; the callers in this file check for
+ * NULL before calling. */
+
+/* Device ioctl handler. */
+int (*dp_ioctl_hook)(struct net_device *dev, struct ifreq *rq, int cmd);
+EXPORT_SYMBOL(dp_ioctl_hook);
+
+/* Called after a datapath has been created. */
+int (*dp_add_dp_hook)(struct datapath *dp);
+EXPORT_SYMBOL(dp_add_dp_hook);
+
+/* Called when a datapath is about to be destroyed. */
+int (*dp_del_dp_hook)(struct datapath *dp);
+EXPORT_SYMBOL(dp_del_dp_hook);
+
+/* Called after a port has been added to a datapath. */
+int (*dp_add_if_hook)(struct net_bridge_port *p);
+EXPORT_SYMBOL(dp_add_if_hook);
+
+/* Called when a port other than the local port is deleted; takes over
+ * releasing the port's device reference and freeing the port. */
+int (*dp_del_if_hook)(struct net_bridge_port *p);
+EXPORT_SYMBOL(dp_del_if_hook);
+
+/* Datapaths. Protected on the read side by rcu_read_lock, on the write side
+ * by dp_mutex. dp_mutex is almost completely redundant with genl_mutex
+ * maintained by the Generic Netlink code, but the timeout path needs mutual
+ * exclusion too.
+ *
+ * dp_mutex nests inside the RTNL lock: if you need both you must take the RTNL
+ * lock first.
+ *
+ * It is safe to access the datapath and net_bridge_port structures with just
+ * dp_mutex.
+ */
+static struct datapath *dps[ODP_MAX];
+static DEFINE_MUTEX(dp_mutex);
+
+/* Number of milliseconds between runs of the maintenance thread. */
+#define MAINT_SLEEP_MSECS 1000
+
+/* Forward declaration: create_dp() uses this to attach the local port. */
+static int new_nbp(struct datapath *, struct net_device *, int port_no);
+
+/* Must be called with rcu_read_lock or dp_mutex. */
+struct datapath *get_dp(int dp_idx)
+{
+	/* Only indexes inside the dps[] array can name a datapath. */
+	if (dp_idx >= 0 && dp_idx < ODP_MAX)
+		return rcu_dereference(dps[dp_idx]);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(get_dp);
+
+/* Looks up datapath 'dp_idx' and, if it exists, returns it with its own
+ * mutex (dp->mutex) held; the caller must release that mutex.  Returns
+ * NULL, with no lock held, if there is no such datapath. */
+struct datapath *get_dp_locked(int dp_idx)
+{
+ struct datapath *dp;
+
+ /* dp_mutex is held across the lookup and until dp->mutex has been
+ * taken, and is released before returning. */
+ mutex_lock(&dp_mutex);
+ dp = get_dp(dp_idx);
+ if (dp)
+ mutex_lock(&dp->mutex);
+ mutex_unlock(&dp_mutex);
+ return dp;
+}
+
+/* Returns the buffer size needed for the link notification built by
+ * dp_fill_ifinfo(): the ifinfomsg header plus room for each attribute it
+ * may add. */
+static inline size_t br_nlmsg_size(void)
+{
+	size_t len = NLMSG_ALIGN(sizeof(struct ifinfomsg));
+
+	len += nla_total_size(IFNAMSIZ);	/* IFLA_IFNAME */
+	len += nla_total_size(MAX_ADDR_LEN);	/* IFLA_ADDRESS */
+	len += nla_total_size(4);		/* IFLA_MASTER */
+	len += nla_total_size(4);		/* IFLA_MTU */
+	len += nla_total_size(4);		/* IFLA_LINK */
+	len += nla_total_size(1);		/* IFLA_OPERSTATE */
+
+	return len;
+}
+
+/* Appends to 'skb' a Netlink message of type 'event' describing 'port'
+ * as a bridge port: an AF_BRIDGE ifinfomsg header followed by the
+ * device's name, master (the datapath's local port), MTU, and -- when
+ * applicable -- operational state, hardware address and link.
+ *
+ * Returns the result of nlmsg_end() on success, or -EMSGSIZE if the
+ * message does not fit in 'skb'. */
+static int dp_fill_ifinfo(struct sk_buff *skb,
+ const struct net_bridge_port *port,
+ int event, unsigned int flags)
+{
+ const struct datapath *dp = port->dp;
+ const struct net_device *dev = port->dev;
+ struct ifinfomsg *hdr;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, 0, 0, event, sizeof(*hdr), flags);
+ if (nlh == NULL)
+ return -EMSGSIZE;
+
+ hdr = nlmsg_data(nlh);
+ hdr->ifi_family = AF_BRIDGE;
+ hdr->__ifi_pad = 0;
+ hdr->ifi_type = dev->type;
+ hdr->ifi_index = dev->ifindex;
+ hdr->ifi_flags = dev_get_flags(dev);
+ hdr->ifi_change = 0;
+
+ /* Each NLA_PUT* macro jumps to nla_put_failure if the attribute does
+ * not fit. */
+ NLA_PUT_STRING(skb, IFLA_IFNAME, dev->name);
+ NLA_PUT_U32(skb, IFLA_MASTER, dp->ports[ODPP_LOCAL]->dev->ifindex);
+ NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+#ifdef IFLA_OPERSTATE
+ NLA_PUT_U8(skb, IFLA_OPERSTATE,
+ netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
+#endif
+
+ if (dev->addr_len)
+ NLA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
+
+ /* IFLA_LINK is reported only when it differs from the device's own
+ * ifindex. */
+ if (dev->ifindex != dev->iflink)
+ NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+/* Sends an rtnetlink link notification of type 'event' about 'port' to
+ * the RTNLGRP_LINK group.  Any failure is recorded on the listeners'
+ * sockets with rtnl_set_sk_err(). */
+static void dp_ifinfo_notify(int event, struct net_bridge_port *port)
+{
+	struct net *net = dev_net(port->dev);
+	struct sk_buff *skb;
+	int err;
+
+	skb = nlmsg_new(br_nlmsg_size(), GFP_KERNEL);
+	if (skb == NULL) {
+		err = -ENOBUFS;
+	} else {
+		err = dp_fill_ifinfo(skb, port, event, 0);
+		if (err < 0) {
+			/* -EMSGSIZE implies BUG in br_nlmsg_size() */
+			WARN_ON(err == -EMSGSIZE);
+			kfree_skb(skb);
+		} else {
+			err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
+					  GFP_KERNEL);
+		}
+	}
+
+	if (err < 0)
+		rtnl_set_sk_err(net, RTNLGRP_LINK, err);
+}
+
+/*
+ * Creates datapath number 'dp_idx'.
+ *
+ * 'devnamep', if nonnull, is a user pointer to the name for the datapath's
+ * local network device; otherwise the device is named "of<dp_idx>".
+ *
+ * Takes the RTNL lock and dp_mutex (in that order) for the duration of the
+ * setup and holds a reference on this module for the lifetime of the
+ * datapath.  Once the datapath has been published, dp_add_dp_hook (if
+ * installed) is called with both locks released.
+ *
+ * Returns 0 on success, otherwise a negative errno value:
+ *   -EFAULT  the device name could not be read from userspace
+ *   -ENODEV  the module reference could not be taken
+ *   -EBUSY   a datapath numbered 'dp_idx' already exists
+ *   -ENOMEM  memory allocation failed
+ * or whatever error dp_dev_create() or new_nbp() reported.
+ */
+static int create_dp(int dp_idx, const char __user *devnamep)
+{
+	struct net_device *dp_dev;
+	char devname[IFNAMSIZ];
+	struct datapath *dp;
+	int err;
+	int i;
+
+	if (devnamep) {
+		err = -EFAULT;
+		if (strncpy_from_user(devname, devnamep, IFNAMSIZ - 1) < 0)
+			goto err;
+		devname[IFNAMSIZ - 1] = '\0';
+	} else {
+		snprintf(devname, sizeof devname, "of%d", dp_idx);
+	}
+
+	rtnl_lock();
+	mutex_lock(&dp_mutex);
+	err = -ENODEV;
+	if (!try_module_get(THIS_MODULE))
+		goto err_unlock;
+
+	/* Exit early if a datapath with that number already exists.
+	 * (We don't use -EEXIST because that's ambiguous with 'devname'
+	 * conflicting with an existing network device name.) */
+	err = -EBUSY;
+	if (get_dp(dp_idx))
+		goto err_put_module;
+
+	err = -ENOMEM;
+	dp = kzalloc(sizeof *dp, GFP_KERNEL);
+	if (dp == NULL)
+		goto err_put_module;
+
+	mutex_init(&dp->mutex);
+	dp->dp_idx = dp_idx;
+	for (i = 0; i < DP_N_QUEUES; i++)
+		skb_queue_head_init(&dp->queues[i]);
+	init_waitqueue_head(&dp->waitqueue);
+
+	/* Setup our datapath device */
+	dp_dev = dp_dev_create(dp, devname, ODPP_LOCAL);
+	err = PTR_ERR(dp_dev);
+	if (IS_ERR(dp_dev))
+		goto err_free_dp;
+
+	err = -ENOMEM;
+	rcu_assign_pointer(dp->table, dp_table_create(DP_L1_SIZE));
+	if (!dp->table)
+		goto err_destroy_dp_dev;
+	INIT_LIST_HEAD(&dp->port_list);
+
+	err = new_nbp(dp, dp_dev, ODPP_LOCAL);
+	if (err)
+		goto err_destroy_table;
+
+	dp->drop_frags = 0;
+	dp->stats_percpu = alloc_percpu(struct dp_stats_percpu);
+	if (!dp->stats_percpu) {
+		/* 'err' is 0 here because new_nbp() succeeded; without this
+		 * the caller would be told that a datapath which has just
+		 * been torn down was created successfully. */
+		err = -ENOMEM;
+		goto err_destroy_local_port;
+	}
+
+	/* Publish the datapath; readers may find it from here on. */
+	rcu_assign_pointer(dps[dp_idx], dp);
+	mutex_unlock(&dp_mutex);
+	rtnl_unlock();
+
+	if (dp_add_dp_hook)
+		dp_add_dp_hook(dp);
+
+	return 0;
+
+	/* NOTE(review): dp_del_port() already calls dp_dev_destroy() on a
+	 * datapath device, and this path then falls through to a second
+	 * dp_dev_destroy() on the same device below -- confirm that this
+	 * is safe. */
+err_destroy_local_port:
+	dp_del_port(dp->ports[ODPP_LOCAL], NULL);
+err_destroy_table:
+	dp_table_destroy(dp->table, 0);
+err_destroy_dp_dev:
+	dp_dev_destroy(dp_dev);
+err_free_dp:
+	kfree(dp);
+err_put_module:
+	module_put(THIS_MODULE);
+err_unlock:
+	mutex_unlock(&dp_mutex);
+	rtnl_unlock();
+err:
+	return err;
+}
+
+/* Tears down 'dp': notifies dp_del_dp_hook (if installed), deletes every
+ * port, unpublishes the datapath, waits for RCU readers, then frees its
+ * flow table, queued packets, port groups, statistics and the structure
+ * itself, and drops the module reference taken by create_dp().
+ *
+ * 'dp_devs' is passed on to dp_del_port(), which adds the datapath
+ * devices it destroys to that list.  The caller in this file holds the
+ * RTNL lock and dp_mutex. */
+static void do_destroy_dp(struct datapath *dp, struct list_head *dp_devs)
+{
+ struct net_bridge_port *p, *n;
+ int i;
+
+ if (dp_del_dp_hook)
+ dp_del_dp_hook(dp);
+
+ /* Drop references to DP. */
+ list_for_each_entry_safe (p, n, &dp->port_list, node)
+ dp_del_port(p, dp_devs);
+
+ /* Unpublish, so that no new reader can find this datapath. */
+ rcu_assign_pointer(dps[dp->dp_idx], NULL);
+ synchronize_rcu();
+
+ /* Wait until no longer in use, then destroy it. */
+ synchronize_rcu();
+ dp_table_destroy(dp->table, 1);
+ for (i = 0; i < DP_N_QUEUES; i++)
+ skb_queue_purge(&dp->queues[i]);
+ for (i = 0; i < DP_MAX_GROUPS; i++)
+ kfree(dp->groups[i]);
+ free_percpu(dp->stats_percpu);
+ kfree(dp);
+ module_put(THIS_MODULE);
+}
+
+/* Destroys datapath number 'dp_idx'.  Returns 0 on success or -ENODEV if
+ * no such datapath exists. */
+static int destroy_dp(int dp_idx)
+{
+	struct dp_dev *dp_dev, *next;
+	struct datapath *dp;
+	LIST_HEAD(dp_devs);
+	int err;
+
+	rtnl_lock();
+	mutex_lock(&dp_mutex);
+
+	dp = get_dp(dp_idx);
+	if (dp) {
+		do_destroy_dp(dp, &dp_devs);
+		err = 0;
+	} else {
+		err = -ENODEV;
+	}
+
+	mutex_unlock(&dp_mutex);
+	rtnl_unlock();
+
+	/* The network devices collected during teardown are freed only
+	 * after both locks have been dropped. */
+	list_for_each_entry_safe (dp_dev, next, &dp_devs, list)
+		free_netdev(dp_dev->dev);
+
+	return err;
+}
+
+/* Called with RTNL lock and dp_mutex. */
+/* Attaches 'dev' to 'dp' as port number 'port_no'.
+ *
+ * Allocates the port structure, puts the device into promiscuous mode,
+ * takes a reference on it, publishes the port in dp->ports[] and
+ * dp->port_list, and sends an RTM_NEWLINK notification.
+ *
+ * Returns 0 on success, -EBUSY if the device already has a br_port, or
+ * -ENOMEM if the port structure cannot be allocated. */
+static int new_nbp(struct datapath *dp, struct net_device *dev, int port_no)
+{
+ struct net_bridge_port *p;
+
+ if (dev->br_port != NULL)
+ return -EBUSY;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ dev_set_promiscuity(dev, 1);
+ dev_hold(dev);
+ p->port_no = port_no;
+ p->dp = dp;
+ p->dev = dev;
+ if (!is_dp_dev(dev))
+ rcu_assign_pointer(dev->br_port, p);
+ else {
+ /* It would make sense to assign dev->br_port here too, but
+ * that causes packets received on internal ports to get caught
+ * in dp_frame_hook(). In turn dp_frame_hook() can reject them
+ * back to network stack, but that's a waste of time. */
+ }
+ /* Publish the fully initialized port to RCU readers. */
+ rcu_assign_pointer(dp->ports[port_no], p);
+ list_add_rcu(&p->node, &dp->port_list);
+ dp->n_ports++;
+
+ dp_ifinfo_notify(RTM_NEWLINK, p);
+
+ return 0;
+}
+
+/* Adds the port described by the user structure 'portp' to datapath
+ * 'dp_idx'.  For a port flagged ODP_PORT_INTERNAL a new datapath device
+ * is created; otherwise the existing network device named in the request
+ * is attached.
+ *
+ * Returns 0 on success, otherwise a negative errno value:
+ *   -EFAULT  the request could not be read from userspace
+ *   -EINVAL  the port number is out of range, or the device is a
+ *            loopback device, is not Ethernet, or is a datapath device
+ *   -ENODEV  no such datapath, or no network device with that name
+ *   -EEXIST  the port number is already in use
+ * or whatever error dp_dev_create() or new_nbp() reported. */
+static int add_port(int dp_idx, struct odp_port __user *portp)
+{
+ struct net_device *dev;
+ struct datapath *dp;
+ struct odp_port port;
+ int port_no;
+ int err;
+
+ err = -EFAULT;
+ if (copy_from_user(&port, portp, sizeof port))
+ goto out;
+ port.devname[IFNAMSIZ - 1] = '\0';
+ port_no = port.port;
+
+ err = -EINVAL;
+ if (port_no < 0 || port_no >= DP_MAX_PORTS)
+ goto out;
+
+ rtnl_lock();
+ dp = get_dp_locked(dp_idx);
+ err = -ENODEV;
+ if (!dp)
+ goto out_unlock_rtnl;
+
+ err = -EEXIST;
+ if (dp->ports[port_no])
+ goto out_unlock_dp;
+
+ if (!(port.flags & ODP_PORT_INTERNAL)) {
+ err = -ENODEV;
+ dev = dev_get_by_name(&init_net, port.devname);
+ if (!dev)
+ goto out_unlock_dp;
+
+ err = -EINVAL;
+ if (dev->flags & IFF_LOOPBACK || dev->type != ARPHRD_ETHER ||
+ is_dp_dev(dev))
+ goto out_put;
+ } else {
+ dev = dp_dev_create(dp, port.devname, port_no);
+ err = PTR_ERR(dev);
+ if (IS_ERR(dev))
+ goto out_unlock_dp;
+ /* Balances the dev_put() at out_put, which both branches
+ * share. */
+ dev_hold(dev);
+ }
+
+ /* NOTE(review): if new_nbp() fails for an internal port, the device
+ * created just above is not destroyed on this path -- confirm. */
+ err = new_nbp(dp, dev, port_no);
+ if (err)
+ goto out_put;
+
+ /* The hook's return value is not checked. */
+ if (dp_add_if_hook)
+ dp_add_if_hook(dp->ports[port_no]);
+
+ /* The success path falls through the labels below with err == 0. */
+out_put:
+ dev_put(dev);
+out_unlock_dp:
+ mutex_unlock(&dp->mutex);
+out_unlock_rtnl:
+ rtnl_unlock();
+out:
+ return err;
+}
+
+/* Detaches port 'p' from its datapath and releases it.  Must be called
+ * with the RTNL lock held (asserted below).
+ *
+ * If the port's device is a datapath device it is destroyed here and,
+ * when 'dp_devs' is nonnull, added to that list; the callers in this
+ * file free the listed devices after dropping their locks.
+ *
+ * Always returns 0. */
+int dp_del_port(struct net_bridge_port *p, struct list_head *dp_devs)
+{
+ ASSERT_RTNL();
+
+#ifdef SUPPORT_SYSFS
+ if (p->port_no != ODPP_LOCAL && dp_del_if_hook)
+ sysfs_remove_link(&p->dp->ifobj, p->dev->name);
+#endif
+ dp_ifinfo_notify(RTM_DELLINK, p);
+
+ p->dp->n_ports--;
+
+ if (is_dp_dev(p->dev)) {
+ /* Make sure that no packets arrive from now on, since
+ * dp_dev_xmit() will try to find itself through
+ * p->dp->ports[], and we're about to set that to null. */
+ netif_tx_disable(p->dev);
+ }
+
+ /* First drop references to device. */
+ dev_set_promiscuity(p->dev, -1);
+ list_del_rcu(&p->node);
+ rcu_assign_pointer(p->dp->ports[p->port_no], NULL);
+ rcu_assign_pointer(p->dev->br_port, NULL);
+
+ /* Then wait until no one is still using it, and destroy it. */
+ synchronize_rcu();
+
+ if (is_dp_dev(p->dev)) {
+ dp_dev_destroy(p->dev);
+ if (dp_devs) {
+ struct dp_dev *dp_dev = dp_dev_priv(p->dev);
+ list_add(&dp_dev->list, dp_devs);
+ }
+ }
+ /* Either the hook releases the device reference and the port
+ * structure, or we do it ourselves. */
+ if (p->port_no != ODPP_LOCAL && dp_del_if_hook) {
+ dp_del_if_hook(p);
+ } else {
+ dev_put(p->dev);
+ kfree(p);
+ }
+
+ return 0;
+}
+
+/* Removes port 'port_no' from the datapath with index 'dp_idx' (the
+ * ODP_PORT_DEL ioctl).
+ *
+ * Returns 0 on success, otherwise a negative errno: -EINVAL if 'port_no' is
+ * out of range or is ODPP_LOCAL, -ENODEV if there is no such datapath,
+ * -ENOENT if the datapath has no port with that number. */
+static int del_port(int dp_idx, int port_no)
+{
+ struct dp_dev *dp_dev, *next;
+ struct net_bridge_port *p;
+ struct datapath *dp;
+ LIST_HEAD(dp_devs);
+ int err;
+
+ err = -EINVAL;
+ if (port_no < 0 || port_no >= DP_MAX_PORTS || port_no == ODPP_LOCAL)
+ goto out;
+
+ rtnl_lock();
+ dp = get_dp_locked(dp_idx);
+ err = -ENODEV;
+ if (!dp)
+ goto out_unlock_rtnl;
+
+ p = dp->ports[port_no];
+ err = -ENOENT;
+ if (!p)
+ goto out_unlock_dp;
+
+ err = dp_del_port(p, &dp_devs);
+
+out_unlock_dp:
+ mutex_unlock(&dp->mutex);
+out_unlock_rtnl:
+ rtnl_unlock();
+out:
+ /* 'dp_devs' is only filled in by dp_del_port(); on the early-exit paths
+ * it is still the empty list from LIST_HEAD(), so this loop does
+ * nothing. The devices are freed here, after both locks are dropped. */
+ list_for_each_entry_safe (dp_dev, next, &dp_devs, list)
+ free_netdev(dp_dev->dev);
+ return err;
+}
+
+/* Feeds a packet received on port 'p' into the datapath.
+ *
+ * Must be called with rcu_read_lock. */
+static void
+do_port_input(struct net_bridge_port *p, struct sk_buff *skb)
+{
+	struct sk_buff *own;
+
+	/* Earlier receivers (e.g. tcpdump via AF_PACKET) may still hold a
+	 * reference to this packet, so obtain a private copy before
+	 * modifying it.  Nobody sees the packet after us, because
+	 * handle_bridge() is told that we consumed it. */
+	own = skb_share_check(skb, GFP_ATOMIC);
+	if (own) {
+		/* Restore the Ethernet header in front of the payload. */
+		skb_push(own, ETH_HLEN);
+		skb_reset_mac_header(own);
+		dp_process_received_packet(own, p);
+	}
+}
+
+/* Looks up the flow for 'skb', which arrived on port 'p', and either executes
+ * the flow's actions (counting a hit) or passes the packet to userspace on the
+ * miss queue (counting a miss). Consumes 'skb' on the paths visible here.
+ *
+ * Must be called with rcu_read_lock and with bottom-halves disabled. */
+void dp_process_received_packet(struct sk_buff *skb, struct net_bridge_port *p)
+{
+ struct datapath *dp = p->dp;
+ struct dp_stats_percpu *stats;
+ struct odp_flow_key key;
+ struct sw_flow *flow;
+
+ WARN_ON_ONCE(skb_shared(skb));
+ WARN_ON_ONCE(skb->destructor);
+
+ /* BHs are off so we don't have to use get_cpu()/put_cpu() here. */
+ stats = percpu_ptr(dp->stats_percpu, smp_processor_id());
+
+ /* NOTE(review): 'p' was already dereferenced for p->dp above, so the
+ * null test below can never select ODPP_NONE. A nonzero return from
+ * flow_extract() presumably marks an IP fragment (inferred from the
+ * drop_frags/n_frags names) -- confirm in flow.c. */
+ if (flow_extract(skb, p ? p->port_no : ODPP_NONE, &key)) {
+ if (dp->drop_frags) {
+ kfree_skb(skb);
+ stats->n_frags++;
+ return;
+ }
+ }
+
+ flow = dp_table_lookup(rcu_dereference(dp->table), &key);
+ if (flow) {
+ struct sw_flow_actions *acts = rcu_dereference(flow->sf_acts);
+ flow_used(flow, skb);
+ execute_actions(dp, skb, &key, acts->actions, acts->n_actions,
+ GFP_ATOMIC);
+ stats->n_hit++;
+ } else {
+ stats->n_missed++;
+ dp_output_control(dp, skb, _ODPL_MISS_NR, 0);
+ }
+}
+
+/*
+ * Used as br_handle_frame_hook. (Cannot run bridge at the same time, even on
+ * different set of devices!)
+ *
+ * Two variants are provided because the hook's signature differs between
+ * kernel versions; both hand the packet to do_port_input() and then report
+ * to the caller that the packet was taken.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
+/* Called with rcu_read_lock and bottom-halves disabled. */
+static struct sk_buff *dp_frame_hook(struct net_bridge_port *p,
+ struct sk_buff *skb)
+{
+ do_port_input(p, skb);
+ return NULL;
+}
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+/* Called with rcu_read_lock and bottom-halves disabled. */
+static int dp_frame_hook(struct net_bridge_port *p, struct sk_buff **pskb)
+{
+ do_port_input(p, *pskb);
+ return 1;
+}
+#else
+/* Kernels older than 2.6.0 are not supported. */
+#error
+#endif
+
+#ifdef CONFIG_XEN
+/* This code is copied verbatim from net/dev/core.c in Xen's
+ * linux-2.6.18-92.1.10.el5.xs5.0.0.394.644. We can't call those functions
+ * directly because they aren't exported. */
+/* Ensures that the packet data up to 'ptr' is available in the area before
+ * skb->tail. Returns 1 if 'ptr' already lies below skb->tail or if
+ * __pskb_pull_tail() succeeds in pulling the missing bytes, 0 otherwise. */
+static int skb_pull_up_to(struct sk_buff *skb, void *ptr)
+{
+ if (ptr < (void *)skb->tail)
+ return 1;
+ if (__pskb_pull_tail(skb,
+ ptr - (void *)skb->data - skb_headlen(skb))) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+/* Prepares a packet whose skb->proto_csum_blank flag is set for checksumming:
+ * sets the transport header pointer from the IP header length, stores the
+ * offset of the TCP or UDP checksum field in skb->csum, marks the packet
+ * CHECKSUM_HW and clears proto_csum_blank.
+ *
+ * Returns 0 on success or if proto_csum_blank is not set. Returns -EPROTO
+ * if the packet is not IPv4, is neither TCP nor UDP, or the needed headers
+ * cannot be pulled into place. */
+int skb_checksum_setup(struct sk_buff *skb)
+{
+ if (skb->proto_csum_blank) {
+ if (skb->protocol != htons(ETH_P_IP))
+ goto out;
+ if (!skb_pull_up_to(skb, skb->nh.iph + 1))
+ goto out;
+ skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl;
+ switch (skb->nh.iph->protocol) {
+ case IPPROTO_TCP:
+ skb->csum = offsetof(struct tcphdr, check);
+ break;
+ case IPPROTO_UDP:
+ skb->csum = offsetof(struct udphdr, check);
+ break;
+ default:
+ if (net_ratelimit())
+ printk(KERN_ERR "Attempting to checksum a non-"
+ "TCP/UDP packet, dropping a protocol"
+ " %d packet", skb->nh.iph->protocol);
+ goto out;
+ }
+ /* The 2-byte checksum field itself must be reachable too. */
+ if (!skb_pull_up_to(skb, skb->h.raw + skb->csum + 2))
+ goto out;
+ skb->ip_summed = CHECKSUM_HW;
+ skb->proto_csum_blank = 0;
+ }
+ return 0;
+out:
+ return -EPROTO;
+}
+#endif
+
+/* Queues 'skb' on dp->queues[queue_no] for delivery to userspace, prefixed by
+ * a struct odp_msg header that carries 'queue_no', the total length, the input
+ * port number and 'arg'. A GSO packet is first split into segments, each of
+ * which is queued separately. 'queue_no' must be _ODPL_MISS_NR or
+ * _ODPL_ACTION_NR (enforced by BUG_ON()).
+ *
+ * Takes ownership of 'skb': it is queued on success and freed on failure.
+ *
+ * Returns 0 on success. On failure increments the per-CPU n_lost counter and
+ * returns a negative errno: -ENOBUFS if the queue already holds
+ * DP_MAX_QUEUE_LEN packets, or the error reported by skb_checksum_setup(),
+ * skb_checksum_help(), skb_gso_segment() or skb_cow(). */
+int
+dp_output_control(struct datapath *dp, struct sk_buff *skb, int queue_no,
+ u32 arg)
+{
+ struct dp_stats_percpu *stats;
+ struct sk_buff_head *queue;
+ int port_no;
+ int err;
+
+ WARN_ON_ONCE(skb_shared(skb));
+ BUG_ON(queue_no != _ODPL_MISS_NR && queue_no != _ODPL_ACTION_NR);
+
+ queue = &dp->queues[queue_no];
+ err = -ENOBUFS;
+ if (skb_queue_len(queue) >= DP_MAX_QUEUE_LEN)
+ goto err_kfree_skb;
+
+ /* If a checksum-deferred packet is forwarded to the controller,
+ * correct the pointers and checksum. This happens on a regular basis
+ * only on Xen (the CHECKSUM_HW case), on which VMs can pass up packets
+ * that do not have their checksum computed. We also implement it for
+ * the non-Xen case, but it is difficult to trigger or test this case
+ * there, hence the WARN_ON_ONCE().
+ */
+ err = skb_checksum_setup(skb);
+ if (err)
+ goto err_kfree_skb;
+#ifndef CHECKSUM_HW
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ WARN_ON_ONCE(1);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
+ /* Until 2.6.22, the start of the transport header was also the
+ * start of data to be checksummed. Linux 2.6.22 introduced
+ * the csum_start field for this purpose, but we should point
+ * the transport header to it anyway for backward
+ * compatibility, as dev_queue_xmit() does even in 2.6.28. */
+ skb_set_transport_header(skb, skb->csum_start -
+ skb_headroom(skb));
+#endif
+ err = skb_checksum_help(skb);
+ if (err)
+ goto err_kfree_skb;
+ }
+#else
+ if (skb->ip_summed == CHECKSUM_HW) {
+ err = skb_checksum_help(skb, 0);
+ if (err)
+ goto err_kfree_skb;
+ }
+#endif
+
+ /* Break apart GSO packets into their component pieces. Otherwise
+ * userspace may try to stuff a 64kB packet into a 1500-byte MTU. */
+ if (skb_is_gso(skb)) {
+ struct sk_buff *nskb = skb_gso_segment(skb, 0);
+ if (nskb) {
+ /* 'nskb' is either the segment list or an error
+ * pointer; the original packet is released in both
+ * cases, so the error path must skip kfree_skb(). */
+ kfree_skb(skb);
+ skb = nskb;
+ if (unlikely(IS_ERR(skb))) {
+ err = PTR_ERR(skb);
+ goto err;
+ }
+ } else {
+ /* XXX This case might not be possible. It's hard to
+ * tell from the skb_gso_segment() code and comment. */
+ }
+ }
+
+ /* Figure out port number. */
+ port_no = ODPP_LOCAL;
+ if (skb->dev) {
+ if (skb->dev->br_port)
+ port_no = skb->dev->br_port->port_no;
+ else if (is_dp_dev(skb->dev))
+ port_no = dp_dev_priv(skb->dev)->port_no;
+ }
+
+ /* Append each packet to queue. There will be only one packet unless
+ * we broke up a GSO packet above. */
+ do {
+ struct odp_msg *header;
+ struct sk_buff *nskb = skb->next;
+ skb->next = NULL;
+
+ err = skb_cow(skb, sizeof *header);
+ if (err) {
+ /* Free the remaining segments; the last one is
+ * released at err_kfree_skb. */
+ while (nskb) {
+ kfree_skb(skb);
+ skb = nskb;
+ nskb = skb->next;
+ }
+ goto err_kfree_skb;
+ }
+
+ header = (struct odp_msg*)__skb_push(skb, sizeof *header);
+ header->type = queue_no;
+ /* skb->len already includes the header pushed above. */
+ header->length = skb->len;
+ header->port = port_no;
+ header->reserved = 0;
+ header->arg = arg;
+ skb_queue_tail(queue, skb);
+
+ skb = nskb;
+ } while (skb);
+
+ wake_up_interruptible(&dp->waitqueue);
+ return 0;
+
+err_kfree_skb:
+ kfree_skb(skb);
+err:
+ stats = percpu_ptr(dp->stats_percpu, get_cpu());
+ stats->n_lost++;
+ put_cpu();
+
+ return err;
+}
+
+/* Removes every flow from 'dp''s flow table (the ODP_FLOW_FLUSH ioctl).
+ *
+ * Returns the result of dp_table_flush().  The flow count is reset only when
+ * the flush succeeds, so that a failed flush does not leave dp->n_flows at
+ * zero while the table still holds flows (NOTE(review): assumes that
+ * dp_table_flush() leaves the table untouched on failure -- confirm in
+ * table.c). */
+static int flush_flows(struct datapath *dp)
+{
+	int error = dp_table_flush(dp);
+
+	if (!error)
+		dp->n_flows = 0;
+	return error;
+}
+
+/* Checks the operands of each action in 'actions'.
+ *
+ * Returns 0 if all actions are acceptable, -EINVAL if an output port, port
+ * group, VLAN VID or VLAN PCP operand is out of range, or -EOPNOTSUPP if an
+ * action type is not below ODPAT_N_ACTIONS. */
+static int validate_actions(const struct sw_flow_actions *actions)
+{
+ unsigned int i;
+
+ for (i = 0; i < actions->n_actions; i++) {
+ const union odp_action *a = &actions->actions[i];
+ switch (a->type) {
+ case ODPAT_OUTPUT:
+ if (a->output.port >= DP_MAX_PORTS)
+ return -EINVAL;
+ break;
+
+ case ODPAT_OUTPUT_GROUP:
+ if (a->output_group.group >= DP_MAX_GROUPS)
+ return -EINVAL;
+ break;
+
+ case ODPAT_SET_VLAN_VID:
+ if (a->vlan_vid.vlan_vid & htons(~VLAN_VID_MASK))
+ return -EINVAL;
+ break;
+
+ case ODPAT_SET_VLAN_PCP:
+ /* NOTE(review): VLAN_PCP_MASK is 0xe000 (datapath.h). If
+ * vlan_pcp holds a raw 0..7 priority rather than
+ * pre-shifted TCI bits, this test rejects every nonzero
+ * priority -- confirm against the odp_action definition. */
+ if (a->vlan_pcp.vlan_pcp & ~VLAN_PCP_MASK)
+ return -EINVAL;
+ break;
+
+ default:
+ /* Other known action types have no operands checked
+ * here. */
+ if (a->type >= ODPAT_N_ACTIONS)
+ return -EOPNOTSUPP;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/* Allocates a sw_flow_actions, fills it with the flow->n_actions actions
+ * copied from the userspace buffer flow->actions, and validates them.
+ *
+ * Returns the new actions on success; the caller owns them. On failure
+ * returns an ERR_PTR(): the error from flow_actions_alloc(), -EFAULT if the
+ * copy from userspace fails, or the error from validate_actions(). */
+static struct sw_flow_actions *get_actions(const struct odp_flow *flow)
+{
+ struct sw_flow_actions *actions;
+ int error;
+
+ actions = flow_actions_alloc(flow->n_actions);
+ error = PTR_ERR(actions);
+ if (IS_ERR(actions))
+ goto error;
+
+ error = -EFAULT;
+ if (copy_from_user(actions->actions, flow->actions,
+ flow->n_actions * sizeof(union odp_action)))
+ goto error_free_actions;
+ error = validate_actions(actions);
+ if (error)
+ goto error_free_actions;
+
+ return actions;
+
+error_free_actions:
+ kfree(actions);
+error:
+ return ERR_PTR(error);
+}
+
+/* Copies the statistics of 'flow' into 'stats'.  A flow whose last-used
+ * seconds field is zero is reported with both time fields zero. */
+static void get_stats(struct sw_flow *flow, struct odp_flow_stats *stats)
+{
+	/* When tv_sec is zero this stores zero, which matches the "never
+	 * used" report; only the nanoseconds need an explicit test. */
+	stats->used_sec = flow->used.tv_sec;
+	stats->used_nsec = flow->used.tv_sec ? flow->used.tv_nsec : 0;
+
+	stats->tcp_flags = flow->tcp_flags;
+	stats->ip_tos = flow->ip_tos;
+	stats->n_bytes = flow->byte_count;
+	stats->n_packets = flow->packet_count;
+}
+
+/* Resets every statistic kept in 'flow' to zero. */
+static void clear_stats(struct sw_flow *flow)
+{
+	flow->packet_count = 0;
+	flow->byte_count = 0;
+	flow->ip_tos = 0;
+	flow->tcp_flags = 0;
+	flow->used.tv_nsec = 0;
+	flow->used.tv_sec = 0;
+}
+
+/* Creates a flow or replaces the actions of an existing flow in 'dp' (the
+ * ODP_FLOW_PUT ioctl), as directed by the ODPPF_CREATE, ODPPF_MODIFY and
+ * ODPPF_ZERO_STATS bits in the flags read from 'ufp'.
+ *
+ * On success stores the flow's statistics in ufp->flow.stats (all zeros for a
+ * newly created flow; the values from before any zeroing for a modified
+ * flow) and returns 0.
+ *
+ * Returns a negative errno on failure: -EFAULT if userspace memory cannot be
+ * accessed, -ENOENT if the flow does not exist and ODPPF_CREATE is not set,
+ * -EXFULL if it does not exist and no slot is available for it, -EEXIST if
+ * it exists and ODPPF_MODIFY is not set, -ENOMEM if the flow cannot be
+ * allocated, or the error from dp_table_expand() or get_actions(). */
+static int put_flow(struct datapath *dp, struct odp_flow_put __user *ufp)
+{
+ struct odp_flow_put uf;
+ struct sw_flow *flow, **bucket;
+ struct dp_table *table;
+ struct odp_flow_stats stats;
+ int error;
+
+ error = -EFAULT;
+ if (copy_from_user(&uf, ufp, sizeof(struct odp_flow_put)))
+ goto error;
+ uf.flow.key.reserved = 0;
+
+retry:
+ table = rcu_dereference(dp->table);
+ bucket = dp_table_lookup_for_insert(table, &uf.flow.key);
+ if (!bucket) {
+ /* No such flow, and the slots where it could go are full. */
+ error = uf.flags & ODPPF_CREATE ? -EXFULL : -ENOENT;
+ goto error;
+ } else if (!*bucket) {
+ /* No such flow, but we found an available slot for it. */
+ struct sw_flow_actions *acts;
+
+ error = -ENOENT;
+ if (!(uf.flags & ODPPF_CREATE))
+ goto error;
+
+ /* Expand table, if necessary, to make room. */
+ if (dp->n_flows * 4 >= table->n_buckets &&
+ table->n_buckets < DP_MAX_BUCKETS) {
+ error = dp_table_expand(dp);
+ if (error)
+ goto error;
+
+ /* The bucket's location has changed. Try again. */
+ goto retry;
+ }
+
+ /* Allocate flow. */
+ error = -ENOMEM;
+ flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
+ if (flow == NULL)
+ goto error;
+ flow->key = uf.flow.key;
+ spin_lock_init(&flow->lock);
+ clear_stats(flow);
+
+ /* Obtain actions. */
+ acts = get_actions(&uf.flow);
+ error = PTR_ERR(acts);
+ if (IS_ERR(acts))
+ goto error_free_flow;
+ rcu_assign_pointer(flow->sf_acts, acts);
+
+ /* Put flow in bucket. */
+ rcu_assign_pointer(*bucket, flow);
+ dp->n_flows++;
+ memset(&stats, 0, sizeof(struct odp_flow_stats));
+ } else {
+ /* We found a matching flow. */
+ /* (This 'flow' shadows the function-level variable, which is
+ * only used on the creation path above.) */
+ struct sw_flow *flow = *rcu_dereference(bucket);
+ struct sw_flow_actions *old_acts, *new_acts;
+ unsigned long int flags;
+
+ /* Bail out if we're not allowed to modify an existing flow. */
+ error = -EEXIST;
+ if (!(uf.flags & ODPPF_MODIFY))
+ goto error;
+
+ /* Swap actions. */
+ new_acts = get_actions(&uf.flow);
+ error = PTR_ERR(new_acts);
+ if (IS_ERR(new_acts))
+ goto error;
+ old_acts = rcu_dereference(flow->sf_acts);
+ /* Only publish the new actions if they differ from the old
+ * ones; otherwise the fresh copy is simply discarded. */
+ if (old_acts->n_actions != new_acts->n_actions ||
+ memcmp(old_acts->actions, new_acts->actions,
+ sizeof(union odp_action) * old_acts->n_actions)) {
+ rcu_assign_pointer(flow->sf_acts, new_acts);
+ flow_deferred_free_acts(old_acts);
+ } else {
+ kfree(new_acts);
+ }
+
+ /* Fetch stats, then clear them if necessary. */
+ spin_lock_irqsave(&flow->lock, flags);
+ get_stats(flow, &stats);
+ if (uf.flags & ODPPF_ZERO_STATS)
+ clear_stats(flow);
+ spin_unlock_irqrestore(&flow->lock, flags);
+ }
+
+ /* Copy stats to userspace. */
+ if (__copy_to_user(&ufp->flow.stats, &stats,
+ sizeof(struct odp_flow_stats)))
+ return -EFAULT;
+ return 0;
+
+error_free_flow:
+ kmem_cache_free(flow_cache, flow);
+error:
+ return error;
+}
+
+/* Copies the actions of 'flow' into the userspace buffer described by 'ufp'.
+ *
+ * ufp->actions is the userspace buffer and ufp->n_actions its capacity, in
+ * actions; both are read from userspace.  If the capacity is zero nothing is
+ * written.  Otherwise the flow's actual number of actions is stored in
+ * ufp->n_actions and, if the buffer pointer is nonnull, as many actions as
+ * fit in the buffer are copied out.
+ *
+ * Returns 0 on success, -EFAULT if userspace memory cannot be accessed, or
+ * -EINVAL if the capacity is too large for its size in bytes to be
+ * represented. */
+static int put_actions(const struct sw_flow *flow, struct odp_flow __user *ufp)
+{
+	union odp_action __user *actions;
+	struct sw_flow_actions *sf_acts;
+	u32 n_actions;
+
+	if (__get_user(actions, &ufp->actions) ||
+	    __get_user(n_actions, &ufp->n_actions))
+		return -EFAULT;
+
+	if (!n_actions)
+		return 0;
+	/* Test the copy fetched above: 'ufp' points into userspace and must
+	 * not be dereferenced directly from kernel code. */
+	if (n_actions > INT_MAX / sizeof(union odp_action))
+		return -EINVAL;
+
+	sf_acts = rcu_dereference(flow->sf_acts);
+	if (__put_user(sf_acts->n_actions, &ufp->n_actions) ||
+	    (actions && copy_to_user(actions, sf_acts->actions,
+				     sizeof(union odp_action) *
+				     min(sf_acts->n_actions, n_actions))))
+		return -EFAULT;
+
+	return 0;
+}
+
+/* Reports 'flow' to userspace: stores a snapshot of its statistics in
+ * ufp->stats and then copies out its actions with put_actions().
+ *
+ * Returns 0 on success, -EFAULT if the statistics cannot be stored, or the
+ * result of put_actions(). */
+static int answer_query(struct sw_flow *flow, struct odp_flow __user *ufp)
+{
+	struct odp_flow_stats snapshot;
+	unsigned long int irq_flags;
+
+	/* Take the snapshot under the flow's lock so that it is consistent. */
+	spin_lock_irqsave(&flow->lock, irq_flags);
+	get_stats(flow, &snapshot);
+	spin_unlock_irqrestore(&flow->lock, irq_flags);
+
+	if (!__copy_to_user(&ufp->stats, &snapshot, sizeof snapshot))
+		return put_actions(flow, ufp);
+	return -EFAULT;
+}
+
+/* Implements the ODP_FLOW_GET and ODP_FLOW_DEL ioctls: looks up the flow whose
+ * key is given in 'ufp' and reports its statistics and actions back through
+ * 'ufp'. If 'cmd' is ODP_FLOW_DEL the flow is also removed from the table
+ * and queued for deferred freeing.
+ *
+ * Returns 0 on success, -EFAULT if 'ufp' cannot be read, -ENOENT if there is
+ * no matching flow, or the error from dp_table_delete() or answer_query(). */
+static int del_or_query_flow(struct datapath *dp,
+ struct odp_flow __user *ufp,
+ unsigned int cmd)
+{
+ struct dp_table *table = rcu_dereference(dp->table);
+ struct odp_flow uf;
+ struct sw_flow *flow;
+ int error;
+
+ error = -EFAULT;
+ if (copy_from_user(&uf, ufp, sizeof uf))
+ goto error;
+ uf.key.reserved = 0;
+
+ flow = dp_table_lookup(table, &uf.key);
+ error = -ENOENT;
+ if (!flow)
+ goto error;
+
+ if (cmd == ODP_FLOW_DEL) {
+ /* XXX redundant lookup */
+ error = dp_table_delete(table, flow);
+ if (error)
+ goto error;
+
+ /* XXX These statistics might lose a few packets, since other
+ * CPUs can be using this flow. We used to synchronize_rcu()
+ * to make sure that we get completely accurate stats, but that
+ * blows our performance, badly. */
+ dp->n_flows--;
+ /* The flow stays deleted even if reporting it fails. */
+ error = answer_query(flow, ufp);
+ flow_deferred_free(flow);
+ } else {
+ error = answer_query(flow, ufp);
+ }
+
+error:
+ return error;
+}
+
+/* Implements ODP_FLOW_GET_MULTIPLE: for each of the flowvec->n_flows entries
+ * in the userspace array flowvec->flows, looks up the flow with the entry's
+ * key and reports its statistics and actions in that entry. An entry whose
+ * flow does not exist has its statistics zeroed instead.
+ *
+ * The caller (do_flowvec_ioctl()) has already checked the array with
+ * access_ok(), which is why the unchecked __copy_from_user()/__clear_user()
+ * variants are used here.
+ *
+ * Returns flowvec->n_flows on success, or -EFAULT if any entry cannot be
+ * read or answered (every answer_query() failure is reported as -EFAULT). */
+static int query_multiple_flows(struct datapath *dp,
+ const struct odp_flowvec *flowvec)
+{
+ struct dp_table *table = rcu_dereference(dp->table);
+ int i;
+ for (i = 0; i < flowvec->n_flows; i++) {
+ struct __user odp_flow *ufp = &flowvec->flows[i];
+ struct odp_flow uf;
+ struct sw_flow *flow;
+ int error;
+
+ if (__copy_from_user(&uf, ufp, sizeof uf))
+ return -EFAULT;
+ uf.key.reserved = 0;
+
+ flow = dp_table_lookup(table, &uf.key);
+ if (!flow)
+ error = __clear_user(&ufp->stats, sizeof ufp->stats);
+ else
+ error = answer_query(flow, ufp);
+ if (error)
+ return -EFAULT;
+ }
+ return flowvec->n_flows;
+}
+
+/* State shared between list_flows() and its per-flow callback list_flow(). */
+struct list_flows_cbdata {
+ struct odp_flow __user *uflows; /* Userspace array to fill in. */
+ int n_flows; /* Capacity of 'uflows', in flows. */
+ int listed_flows; /* Number of entries filled in so far. */
+};
+
+/* dp_table_foreach() callback used by list_flows(): stores the key,
+ * statistics and actions of 'flow' in the next free entry of the userspace
+ * array described by 'cbdata_'.
+ *
+ * Returns 0 to continue, a negative errno on failure, or the (positive)
+ * number of flows listed once the array is full. NOTE(review): presumably
+ * dp_table_foreach() stops at the first nonzero return and passes it back --
+ * confirm in table.c. */
+static int list_flow(struct sw_flow *flow, void *cbdata_)
+{
+ struct list_flows_cbdata *cbdata = cbdata_;
+ struct odp_flow __user *ufp = &cbdata->uflows[cbdata->listed_flows++];
+ int error;
+
+ if (__copy_to_user(&ufp->key, &flow->key, sizeof flow->key))
+ return -EFAULT;
+ error = answer_query(flow, ufp);
+ if (error)
+ return error;
+
+ if (cbdata->listed_flows >= cbdata->n_flows)
+ return cbdata->listed_flows;
+ return 0;
+}
+
+/* Implements ODP_FLOW_LIST: fills the userspace array flowvec->flows with up
+ * to flowvec->n_flows of the flows in 'dp''s flow table.
+ *
+ * Returns the nonzero value produced by the table walk if there is one (a
+ * negative errno, or the number of flows listed when the array filled up),
+ * otherwise the number of flows listed. */
+static int list_flows(struct datapath *dp, const struct odp_flowvec *flowvec)
+{
+	struct list_flows_cbdata walk;
+	int result;
+
+	/* An empty destination array cannot receive anything. */
+	if (flowvec->n_flows == 0)
+		return 0;
+
+	walk.listed_flows = 0;
+	walk.n_flows = flowvec->n_flows;
+	walk.uflows = flowvec->flows;
+
+	result = dp_table_foreach(rcu_dereference(dp->table),
+				  list_flow, &walk);
+	if (result)
+		return result;
+	return walk.listed_flows;
+}
+
+/* Common front end for the ioctls that take a struct odp_flowvec
+ * (ODP_FLOW_GET_MULTIPLE and ODP_FLOW_LIST). Reads the flowvec at userspace
+ * address 'argp', checks that the flow array it describes is writable, and
+ * calls 'function' on it.
+ *
+ * 'function' returns a negative errno or the number of flows it processed.
+ * If that number differs from the requested n_flows it is written back to the
+ * userspace flowvec's n_flows member.
+ *
+ * Returns 0 on success, -EFAULT if userspace memory is inaccessible, -EINVAL
+ * if n_flows is too large, or the negative value returned by 'function'. */
+static int do_flowvec_ioctl(struct datapath *dp, unsigned long argp,
+ int (*function)(struct datapath *,
+ const struct odp_flowvec *))
+{
+ struct odp_flowvec __user *uflowvec;
+ struct odp_flowvec flowvec;
+ int retval;
+
+ uflowvec = (struct odp_flowvec __user *)argp;
+ if (!access_ok(VERIFY_WRITE, uflowvec, sizeof *uflowvec) ||
+ copy_from_user(&flowvec, uflowvec, sizeof flowvec))
+ return -EFAULT;
+
+ /* Keeps the byte count computed below from overflowing. */
+ if (flowvec.n_flows > INT_MAX / sizeof(struct odp_flow))
+ return -EINVAL;
+
+ if (!access_ok(VERIFY_WRITE, flowvec.flows,
+ flowvec.n_flows * sizeof(struct odp_flow)))
+ return -EFAULT;
+
+ retval = function(dp, &flowvec);
+ return (retval < 0 ? retval
+ : retval == flowvec.n_flows ? 0
+ : __put_user(retval, &uflowvec->n_flows));
+}
+
+/* Implements ODP_EXECUTE: builds a packet from the userspace data described
+ * by 'executep' and executes the given userspace-supplied actions on it.
+ *
+ * The packet length must be between ETH_HLEN and 65535 bytes.  If
+ * execute.in_port names an existing port, the packet is attributed to that
+ * port's device.
+ *
+ * Returns the result of execute_actions() on success.  Otherwise returns a
+ * negative errno: -EFAULT if userspace memory cannot be read, -EINVAL if the
+ * packet length is out of range, the error from flow_actions_alloc() or
+ * validate_actions(), or -ENOMEM if the packet cannot be allocated. */
+static int do_execute(struct datapath *dp, const struct odp_execute *executep)
+{
+	struct odp_execute execute;
+	struct odp_flow_key key;
+	struct sk_buff *skb;
+	struct sw_flow_actions *actions;
+	int err;
+
+	err = -EFAULT;
+	if (copy_from_user(&execute, executep, sizeof execute))
+		goto error;
+
+	err = -EINVAL;
+	if (execute.length < ETH_HLEN || execute.length > 65535)
+		goto error;
+
+	/* flow_actions_alloc() reports failure with an ERR_PTR(), as
+	 * get_actions() also assumes, so a null test would never catch it. */
+	actions = flow_actions_alloc(execute.n_actions);
+	if (IS_ERR(actions)) {
+		err = PTR_ERR(actions);
+		goto error;
+	}
+
+	err = -EFAULT;
+	if (copy_from_user(actions->actions, execute.actions,
+			   execute.n_actions * sizeof *execute.actions))
+		goto error_free_actions;
+
+	err = validate_actions(actions);
+	if (err)
+		goto error_free_actions;
+
+	err = -ENOMEM;
+	skb = alloc_skb(execute.length, GFP_KERNEL);
+	if (!skb)
+		goto error_free_actions;
+	if (execute.in_port < DP_MAX_PORTS) {
+		struct net_bridge_port *p = dp->ports[execute.in_port];
+		if (p)
+			skb->dev = p->dev;
+	}
+
+	err = -EFAULT;
+	if (copy_from_user(skb_put(skb, execute.length), execute.data,
+			   execute.length))
+		goto error_free_skb;
+
+	/* From here on 'skb' is handed to execute_actions() and is not
+	 * freed by this function. */
+	flow_extract(skb, execute.in_port, &key);
+	err = execute_actions(dp, skb, &key, actions->actions,
+			      actions->n_actions, GFP_KERNEL);
+	kfree(actions);
+	return err;
+
+error_free_skb:
+	kfree_skb(skb);
+error_free_actions:
+	kfree(actions);
+error:
+	return err;
+}
+
+/* Implements ODP_DP_STATS: stores 'dp''s statistics and capacity limits in the
+ * userspace struct at 'statsp'. The packet counters are the sums of the
+ * per-CPU counters over all possible CPUs.
+ *
+ * Returns 0 on success or -EFAULT if 'statsp' cannot be written. */
+static int
+get_dp_stats(struct datapath *dp, struct odp_stats __user *statsp)
+{
+ /* NOTE(review): 'stats' is not zeroed first, so any member not assigned
+ * below, or any padding, would reach userspace uninitialized --
+ * confirm against the struct odp_stats definition. */
+ struct odp_stats stats;
+ int i;
+
+ stats.n_flows = dp->n_flows;
+ stats.cur_capacity = rcu_dereference(dp->table)->n_buckets * 2;
+ stats.max_capacity = DP_MAX_BUCKETS * 2;
+ stats.n_ports = dp->n_ports;
+ stats.max_ports = DP_MAX_PORTS;
+ stats.max_groups = DP_MAX_GROUPS;
+ stats.n_frags = stats.n_hit = stats.n_missed = stats.n_lost = 0;
+ for_each_possible_cpu(i) {
+ const struct dp_stats_percpu *s;
+ s = percpu_ptr(dp->stats_percpu, i);
+ stats.n_frags += s->n_frags;
+ stats.n_hit += s->n_hit;
+ stats.n_missed += s->n_missed;
+ stats.n_lost += s->n_lost;
+ }
+ stats.max_miss_queue = DP_MAX_QUEUE_LEN;
+ stats.max_action_queue = DP_MAX_QUEUE_LEN;
+ return copy_to_user(statsp, &stats, sizeof stats) ? -EFAULT : 0;
+}
+
+/* Describes port 'p' (device name, port number, and whether it is a
+ * datapath-internal device) in the userspace struct at 'uop'.
+ *
+ * Returns 0 on success or -EFAULT if 'uop' cannot be written. */
+static int
+put_port(const struct net_bridge_port *p, struct odp_port __user *uop)
+{
+	struct odp_port reply;
+
+	/* Start from all-zero bytes so that nothing uninitialized is copied
+	 * out to userspace. */
+	memset(&reply, 0, sizeof reply);
+	reply.port = p->port_no;
+	if (is_dp_dev(p->dev))
+		reply.flags = ODP_PORT_INTERNAL;
+	else
+		reply.flags = 0;
+	strncpy(reply.devname, p->dev->name, sizeof reply.devname);
+
+	if (copy_to_user(uop, &reply, sizeof reply))
+		return -EFAULT;
+	return 0;
+}
+
+/* Implements ODP_PORT_QUERY: looks up a port of 'dp' either by device name
+ * (if the devname read from 'uport' is nonempty) or by port number, and
+ * stores its description back in 'uport'.
+ *
+ * Returns 0 on success, -EFAULT if 'uport' cannot be accessed, -ENODEV if no
+ * device has the given name, -ENOENT if the device or port number is not a
+ * port of 'dp', or -EINVAL if the port number is out of range. */
+static int
+query_port(struct datapath *dp, struct odp_port __user *uport)
+{
+ struct odp_port port;
+
+ if (copy_from_user(&port, uport, sizeof port))
+ return -EFAULT;
+ if (port.devname[0]) {
+ struct net_bridge_port *p;
+ struct net_device *dev;
+ int err;
+
+ /* The name came from userspace; force termination. */
+ port.devname[IFNAMSIZ - 1] = '\0';
+
+ dev = dev_get_by_name(&init_net, port.devname);
+ if (!dev)
+ return -ENODEV;
+
+ p = dev->br_port;
+ /* A datapath-internal device without br_port set is looked up
+ * through its dp_dev private data instead. */
+ if (!p && is_dp_dev(dev)) {
+ struct dp_dev *dp_dev = dp_dev_priv(dev);
+ if (dp_dev->dp == dp)
+ p = dp->ports[dp_dev->port_no];
+ }
+ err = p && p->dp == dp ? put_port(p, uport) : -ENOENT;
+ dev_put(dev);
+
+ return err;
+ } else {
+ if (port.port >= DP_MAX_PORTS)
+ return -EINVAL;
+ if (!dp->ports[port.port])
+ return -ENOENT;
+ return put_port(dp->ports[port.port], uport);
+ }
+}
+
+/* Implements ODP_PORT_LIST: stores descriptions of up to pv.n_ports of 'dp''s
+ * ports in the userspace array pv.ports, where 'pv' is the struct read from
+ * 'pvp', and then stores the number of entries written in pvp->n_ports.
+ *
+ * Returns 0 on success or -EFAULT if userspace memory cannot be accessed. */
+static int
+list_ports(struct datapath *dp, struct odp_portvec __user *pvp)
+{
+	struct odp_portvec pv;
+	struct net_bridge_port *p;
+	int idx;
+
+	if (copy_from_user(&pv, pvp, sizeof pv))
+		return -EFAULT;
+
+	idx = 0;
+	list_for_each_entry_rcu (p, &dp->port_list, node) {
+		/* Check the capacity before writing, so that no entry is
+		 * ever stored past the end of the caller's array.  This
+		 * also covers a capacity of zero. */
+		if (idx >= pv.n_ports)
+			break;
+		if (put_port(p, &pv.ports[idx]))
+			return -EFAULT;
+		idx++;
+	}
+	return put_user(idx, &pvp->n_ports);
+}
+
+/* RCU callback: frees the dp_port_group that embeds 'rcu'. */
+static void free_port_group(struct rcu_head *rcu)
+{
+	kfree(container_of(rcu, struct dp_port_group, rcu));
+}
+
+/* Implements ODP_PORT_GROUP_SET: replaces port group 'pg.group' of 'dp' with
+ * the 'pg.n_ports' port numbers in the userspace array 'pg.ports', where 'pg'
+ * is the struct read from 'upg'. The previous group, if any, is freed after
+ * an RCU grace period.
+ *
+ * Returns 0 on success, -EFAULT if userspace memory cannot be read, -EINVAL
+ * if n_ports exceeds DP_MAX_PORTS or the group number is not below
+ * DP_MAX_GROUPS, or -ENOMEM if the new group cannot be allocated. */
+static int
+set_port_group(struct datapath *dp, const struct odp_port_group __user *upg)
+{
+ struct odp_port_group pg;
+ struct dp_port_group *new_group, *old_group;
+ int error;
+
+ error = -EFAULT;
+ if (copy_from_user(&pg, upg, sizeof pg))
+ goto error;
+
+ error = -EINVAL;
+ if (pg.n_ports > DP_MAX_PORTS || pg.group >= DP_MAX_GROUPS)
+ goto error;
+
+ error = -ENOMEM;
+ new_group = kmalloc(sizeof *new_group + sizeof(u16) * pg.n_ports,
+ GFP_KERNEL);
+ if (!new_group)
+ goto error;
+
+ new_group->n_ports = pg.n_ports;
+ error = -EFAULT;
+ if (copy_from_user(new_group->ports, pg.ports,
+ sizeof(u16) * pg.n_ports))
+ goto error_free;
+
+ /* Publish the new group, then retire the old one once readers are
+ * done with it. */
+ old_group = rcu_dereference(dp->groups[pg.group]);
+ rcu_assign_pointer(dp->groups[pg.group], new_group);
+ if (old_group)
+ call_rcu(&old_group->rcu, free_port_group);
+ return 0;
+
+error_free:
+ kfree(new_group);
+error:
+ return error;
+}
+
+/* Implements ODP_PORT_GROUP_GET: copies the port numbers in port group
+ * 'pg.group' of 'dp' into the userspace array 'pg.ports', which has room for
+ * 'pg.n_ports' entries, where 'pg' is the struct read from 'upg'. The
+ * group's actual size (0 if the group is not set) is then stored in
+ * upg->n_ports, so the caller can tell whether the copy was truncated.
+ *
+ * 'upg' is a userspace pointer, although it lacks the __user annotation.
+ *
+ * Returns 0 on success, -EFAULT if userspace memory cannot be accessed, or
+ * -EINVAL if the group number is not below DP_MAX_GROUPS. */
+static int
+get_port_group(struct datapath *dp, struct odp_port_group *upg)
+{
+ struct odp_port_group pg;
+ struct dp_port_group *g;
+ u16 n_copy;
+
+ if (copy_from_user(&pg, upg, sizeof pg))
+ return -EFAULT;
+
+ if (pg.group >= DP_MAX_GROUPS)
+ return -EINVAL;
+
+ g = dp->groups[pg.group];
+ n_copy = g ? min_t(int, g->n_ports, pg.n_ports) : 0;
+ if (n_copy && copy_to_user(pg.ports, g->ports, n_copy * sizeof(u16)))
+ return -EFAULT;
+
+ if (put_user(g ? g->n_ports : 0, &upg->n_ports))
+ return -EFAULT;
+
+ return 0;
+}
+
+/* ioctl handler for the openvswitch character device.  The datapath operated
+ * on is selected by the minor number of the device file 'f'; 'cmd' is one of
+ * the ODP_* commands and 'argp' its userspace argument.
+ *
+ * ODP_DP_CREATE, ODP_DP_DESTROY, ODP_PORT_ADD and ODP_PORT_DEL do their own
+ * locking and are dispatched first.  Every other command runs on the
+ * datapath returned by get_dp_locked(), whose mutex is released before
+ * returning.
+ *
+ * Returns the result of the selected operation, -ENODEV if the datapath does
+ * not exist, or -ENOIOCTLCMD if 'cmd' is not recognized. */
+static long openvswitch_ioctl(struct file *f, unsigned int cmd,
+			      unsigned long argp)
+{
+	int dp_idx = iminor(f->f_dentry->d_inode);
+	struct datapath *dp;
+	int drop_frags, listeners, port_no;
+	int err;
+
+	/* Handle commands with special locking requirements up front. */
+	switch (cmd) {
+	case ODP_DP_CREATE:
+		return create_dp(dp_idx, (char __user *)argp);
+
+	case ODP_DP_DESTROY:
+		return destroy_dp(dp_idx);
+
+	case ODP_PORT_ADD:
+		return add_port(dp_idx, (struct odp_port __user *)argp);
+
+	case ODP_PORT_DEL:
+		/* Report a faulting argument directly.  Leaving this switch
+		 * instead would fall through to the second switch below,
+		 * which does not know ODP_PORT_DEL and would replace the
+		 * fault with -ENOIOCTLCMD. */
+		err = get_user(port_no, (int __user *)argp);
+		if (err)
+			return err;
+		return del_port(dp_idx, port_no);
+	}
+
+	dp = get_dp_locked(dp_idx);
+	if (!dp)
+		return -ENODEV;
+
+	switch (cmd) {
+	case ODP_DP_STATS:
+		err = get_dp_stats(dp, (struct odp_stats __user *)argp);
+		break;
+
+	case ODP_GET_DROP_FRAGS:
+		err = put_user(dp->drop_frags, (int __user *)argp);
+		break;
+
+	case ODP_SET_DROP_FRAGS:
+		err = get_user(drop_frags, (int __user *)argp);
+		if (err)
+			break;
+		err = -EINVAL;
+		if (drop_frags != 0 && drop_frags != 1)
+			break;
+		dp->drop_frags = drop_frags;
+		err = 0;
+		break;
+
+	case ODP_GET_LISTEN_MASK:
+		/* The listen mask is kept per open file, in private_data. */
+		err = put_user((int)f->private_data, (int __user *)argp);
+		break;
+
+	case ODP_SET_LISTEN_MASK:
+		err = get_user(listeners, (int __user *)argp);
+		if (err)
+			break;
+		err = -EINVAL;
+		if (listeners & ~ODPL_ALL)
+			break;
+		err = 0;
+		f->private_data = (void*)listeners;
+		break;
+
+	case ODP_PORT_QUERY:
+		err = query_port(dp, (struct odp_port __user *)argp);
+		break;
+
+	case ODP_PORT_LIST:
+		err = list_ports(dp, (struct odp_portvec __user *)argp);
+		break;
+
+	case ODP_PORT_GROUP_SET:
+		err = set_port_group(dp, (struct odp_port_group __user *)argp);
+		break;
+
+	case ODP_PORT_GROUP_GET:
+		err = get_port_group(dp, (struct odp_port_group __user *)argp);
+		break;
+
+	case ODP_FLOW_FLUSH:
+		err = flush_flows(dp);
+		break;
+
+	case ODP_FLOW_PUT:
+		err = put_flow(dp, (struct odp_flow_put __user *)argp);
+		break;
+
+	case ODP_FLOW_DEL:
+	case ODP_FLOW_GET:
+		err = del_or_query_flow(dp, (struct odp_flow __user *)argp,
+					cmd);
+		break;
+
+	case ODP_FLOW_GET_MULTIPLE:
+		err = do_flowvec_ioctl(dp, argp, query_multiple_flows);
+		break;
+
+	case ODP_FLOW_LIST:
+		err = do_flowvec_ioctl(dp, argp, list_flows);
+		break;
+
+	case ODP_EXECUTE:
+		err = do_execute(dp, (struct odp_execute __user *)argp);
+		break;
+
+	default:
+		err = -ENOIOCTLCMD;
+		break;
+	}
+	mutex_unlock(&dp->mutex);
+	return err;
+}
+
+/* Returns 1 if any queue of 'dp' whose bit is set in the 'listeners' mask
+ * holds at least one packet, otherwise 0.  Bit N of 'listeners' selects
+ * dp->queues[N]. */
+static int dp_has_packet_of_interest(struct datapath *dp, int listeners)
+{
+	int queue_no;
+
+	for (queue_no = 0; queue_no < DP_N_QUEUES; queue_no++) {
+		if (!(listeners & (1 << queue_no)))
+			continue;
+		if (!skb_queue_empty(&dp->queues[queue_no]))
+			return 1;
+	}
+	return 0;
+}
+
+/* read() handler for the openvswitch character device. Dequeues one packet
+ * from the first nonempty queue selected by the file's listen mask (kept in
+ * f->private_data) and copies up to 'nbytes' bytes of it to 'buf'; any
+ * remainder of the packet is discarded along with it.
+ *
+ * If no packet is available, returns -EAGAIN for an O_NONBLOCK file and
+ * otherwise sleeps until one arrives or a signal is pending (-ERESTARTSYS).
+ *
+ * Returns the number of bytes copied, 0 if 'nbytes' is 0 or the listen mask
+ * is empty, -ENODEV if the datapath does not exist, or the error from
+ * skb_copy_datagram_iovec(). */
+ssize_t openvswitch_read(struct file *f, char __user *buf, size_t nbytes,
+ loff_t *ppos)
+{
+ int listeners = (int) f->private_data;
+ int dp_idx = iminor(f->f_dentry->d_inode);
+ struct datapath *dp = get_dp(dp_idx);
+ struct sk_buff *skb;
+ struct iovec __user iov;
+ size_t copy_bytes;
+ int retval;
+
+ if (!dp)
+ return -ENODEV;
+
+ if (nbytes == 0 || !listeners)
+ return 0;
+
+ for (;;) {
+ int i;
+
+ /* Lower-numbered queues are served first. */
+ for (i = 0; i < DP_N_QUEUES; i++) {
+ if (listeners & (1 << i)) {
+ skb = skb_dequeue(&dp->queues[i]);
+ if (skb)
+ goto success;
+ }
+ }
+
+ if (f->f_flags & O_NONBLOCK) {
+ retval = -EAGAIN;
+ goto error;
+ }
+
+ wait_event_interruptible(dp->waitqueue,
+ dp_has_packet_of_interest(dp,
+ listeners));
+
+ if (signal_pending(current)) {
+ retval = -ERESTARTSYS;
+ goto error;
+ }
+ }
+success:
+ copy_bytes = min(skb->len, nbytes);
+ iov.iov_base = buf;
+ iov.iov_len = copy_bytes;
+ retval = skb_copy_datagram_iovec(skb, 0, &iov, iov.iov_len);
+ if (!retval)
+ retval = copy_bytes;
+ kfree_skb(skb);
+
+error:
+ return retval;
+}
+
+/* poll() handler for the openvswitch character device.  Reports the file as
+ * readable when a queue selected by its listen mask holds a packet.  If the
+ * datapath does not exist, reports readable plus hang-up. */
+static unsigned int openvswitch_poll(struct file *file, poll_table *wait)
+{
+	struct datapath *dp = get_dp(iminor(file->f_dentry->d_inode));
+	unsigned int mask = 0;
+
+	if (!dp)
+		return POLLIN | POLLRDNORM | POLLHUP;
+
+	poll_wait(file, &dp->waitqueue, wait);
+	if (dp_has_packet_of_interest(dp, (int)file->private_data))
+		mask = POLLIN | POLLRDNORM;
+	return mask;
+}
+
+/* File operations of the "openvswitch" character device that dp_init()
+ * registers. */
+struct file_operations openvswitch_fops = {
+ /* XXX .aio_read = openvswitch_aio_read, */
+ .read = openvswitch_read,
+ .poll = openvswitch_poll,
+ .unlocked_ioctl = openvswitch_ioctl,
+ /* XXX .fasync = openvswitch_fasync, */
+};
+
+static int major;
+static struct llc_sap *dp_stp_sap;
+
+/* Receive handler passed to llc_sap_open() for the STP SAP in dp_init().
+ * Frees 'skb' and returns 0; the other arguments are unused. */
+static int dp_stp_rcv(struct sk_buff *skb, struct net_device *dev,
+ struct packet_type *pt, struct net_device *orig_dev)
+{
+ /* We don't really care about STP packets, we just listen for them for
+ * mutual exclusion with the bridge module, so this just discards
+ * them. */
+ kfree_skb(skb);
+ return 0;
+}
+
+/* Module initialization: claims the STP SAP, initializes the flow module,
+ * registers the netdevice notifier and the "openvswitch" character device,
+ * and finally installs dp_frame_hook as br_handle_frame_hook.
+ *
+ * Returns 0 on success.  On failure everything set up so far is undone and a
+ * negative errno is returned: -EADDRINUSE if the STP SAP is already taken,
+ * otherwise the error from flow_init(), register_netdevice_notifier() or
+ * register_chrdev(). */
+static int __init dp_init(void)
+{
+	int err;
+
+	printk("Open vSwitch %s, built "__DATE__" "__TIME__"\n", VERSION BUILDNR);
+
+	/* Register to receive STP packets because the bridge module also
+	 * attempts to do so.  Since there can only be a single listener for a
+	 * given protocol, this provides mutual exclusion against the bridge
+	 * module, preventing both of them from being loaded at the same
+	 * time. */
+	dp_stp_sap = llc_sap_open(LLC_SAP_BSPAN, dp_stp_rcv);
+	if (!dp_stp_sap) {
+		printk(KERN_ERR "openvswitch: can't register sap for STP (probably the bridge module is loaded)\n");
+		return -EADDRINUSE;
+	}
+
+	err = flow_init();
+	if (err)
+		goto error;
+
+	err = register_netdevice_notifier(&dp_device_notifier);
+	if (err)
+		goto error_flow_exit;
+
+	/* With a requested major of 0, register_chrdev() returns the
+	 * dynamically assigned major number or a negative errno, so it is
+	 * 'major', not 'err', that carries a failure here. */
+	major = register_chrdev(0, "openvswitch", &openvswitch_fops);
+	if (major < 0) {
+		err = major;
+		goto error_unreg_notifier;
+	}
+
+	/* Hook into callback used by the bridge to intercept packets.
+	 * Parasites we are. */
+	br_handle_frame_hook = dp_frame_hook;
+
+	return 0;
+
+error_unreg_notifier:
+	unregister_netdevice_notifier(&dp_device_notifier);
+error_flow_exit:
+	flow_exit();
+error:
+	/* Release the STP SAP claimed above, as dp_cleanup() does. */
+	llc_sap_put(dp_stp_sap);
+	return err;
+}
+
+/* Module exit: undoes what dp_init() set up. */
+static void dp_cleanup(void)
+{
+ /* NOTE(review): presumably waits for outstanding call_rcu() callbacks
+ * such as free_port_group() to finish before the module text goes away
+ * -- confirm. */
+ rcu_barrier();
+ unregister_chrdev(major, "openvswitch");
+ unregister_netdevice_notifier(&dp_device_notifier);
+ flow_exit();
+ br_handle_frame_hook = NULL;
+ llc_sap_put(dp_stp_sap);
+}
+
+module_init(dp_init);
+module_exit(dp_cleanup);
+
+MODULE_DESCRIPTION("Open vSwitch switching datapath");
+MODULE_LICENSE("GPL");
diff --git a/datapath/datapath.h b/datapath/datapath.h
new file mode 100644
index 000000000..102b27f31
--- /dev/null
+++ b/datapath/datapath.h
@@ -0,0 +1,139 @@
+/* Interface exported by openvswitch_mod. */
+
+#ifndef DATAPATH_H
+#define DATAPATH_H 1
+
+#include <asm/page.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/netlink.h>
+#include <linux/netdevice.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include "flow.h"
+#include "brc_sysfs.h"
+
+struct sk_buff;
+
+/* Mask for the priority bits in a vlan header. If we ever merge upstream
+ * then this should go into include/linux/if_vlan.h. */
+#define VLAN_PCP_MASK 0xe000
+
+#define DP_MAX_PORTS 256 /* Size of datapath.ports[]; port numbers are below this. */
+#define DP_MAX_GROUPS 16 /* Size of datapath.groups[]; group numbers are below this. */
+
+#define DP_L2_BITS (PAGE_SHIFT - ilog2(sizeof(struct sw_flow*))) /* log2 of the number of flow pointers that fit in one page. */
+#define DP_L2_SIZE (1 << DP_L2_BITS)
+#define DP_L2_SHIFT 0
+
+#define DP_L1_BITS (PAGE_SHIFT - ilog2(sizeof(struct sw_flow**))) /* log2 of the number of second-level pointers that fit in one page. */
+#define DP_L1_SIZE (1 << DP_L1_BITS)
+#define DP_L1_SHIFT DP_L2_BITS
+
+#define DP_MAX_BUCKETS (DP_L1_SIZE * DP_L2_SIZE) /* put_flow() stops expanding the table at this many buckets. */
+
+struct dp_table {
+ unsigned int n_buckets; /* Bucket count; compared with DP_MAX_BUCKETS by put_flow(). */
+ struct sw_flow ***flows[2]; /* NOTE(review): presumably two two-level bucket arrays -- confirm in table.c. */
+ struct rcu_head rcu; /* NOTE(review): presumably for deferred destruction -- confirm in table.c. */
+};
+
+#define DP_N_QUEUES 2 /* Size of datapath.queues[]. */
+#define DP_MAX_QUEUE_LEN 100 /* dp_output_control() drops packets once a queue holds this many. */
+
+struct dp_stats_percpu {
+ u64 n_frags; /* Packets dropped by dp_process_received_packet() because drop_frags is set. */
+ u64 n_hit; /* Received packets that matched a flow. */
+ u64 n_missed; /* Received packets that matched no flow. */
+ u64 n_lost; /* Packets that dp_output_control() failed to queue. */
+};
+
+struct dp_port_group {
+ struct rcu_head rcu; /* Used by set_port_group() to free a replaced group via call_rcu(). */
+ int n_ports; /* Number of elements in 'ports'. */
+ u16 ports[]; /* Port numbers, copied from userspace by set_port_group(). */
+};
+
+struct datapath {
+ struct mutex mutex; /* Unlocked by callers of get_dp_locked() when they are done. */
+ int dp_idx;
+
+#ifdef SUPPORT_SYSFS
+ struct kobject ifobj;
+#endif
+
+ int drop_frags; /* 0 or 1; set through ODP_SET_DROP_FRAGS. */
+
+ /* Queued data. */
+ struct sk_buff_head queues[DP_N_QUEUES]; /* Filled by dp_output_control(), drained by openvswitch_read(). */
+ wait_queue_head_t waitqueue; /* Woken when a packet is queued. */
+
+ /* Flow table. */
+ unsigned int n_flows; /* Maintained by put_flow(), del_or_query_flow(), flush_flows(). */
+ struct dp_table *table; /* Read with rcu_dereference(). */
+
+ /* Port groups. */
+ struct dp_port_group *groups[DP_MAX_GROUPS]; /* Null for a group that was never set. */
+
+ /* Switch ports. */
+ unsigned int n_ports;
+ struct net_bridge_port *ports[DP_MAX_PORTS]; /* Indexed by port number; null if no such port. */
+ struct list_head port_list; /* All ports, including local_port. */
+
+ /* Stats. */
+ struct dp_stats_percpu *stats_percpu; /* Accessed through percpu_ptr(). */
+};
+
+struct net_bridge_port {
+ u16 port_no; /* Index of this port in dp->ports[]. */
+ struct datapath *dp; /* Datapath that this port belongs to. */
+ struct net_device *dev; /* Network device behind this port. */
+#ifdef SUPPORT_SYSFS
+ struct kobject kobj;
+#endif
+ struct list_head node; /* Element in datapath.port_list. */
+};
+
+extern struct notifier_block dp_device_notifier;
+extern int (*dp_ioctl_hook)(struct net_device *dev, struct ifreq *rq, int cmd);
+extern int (*dp_add_dp_hook)(struct datapath *dp);
+extern int (*dp_del_dp_hook)(struct datapath *dp);
+extern int (*dp_add_if_hook)(struct net_bridge_port *p);
+extern int (*dp_del_if_hook)(struct net_bridge_port *p);
+
+/* Flow table. */
+struct dp_table *dp_table_create(unsigned int n_buckets);
+void dp_table_destroy(struct dp_table *, int free_flows);
+struct sw_flow *dp_table_lookup(struct dp_table *, const struct odp_flow_key *);
+struct sw_flow **dp_table_lookup_for_insert(struct dp_table *, const struct odp_flow_key *);
+int dp_table_delete(struct dp_table *, struct sw_flow *);
+int dp_table_expand(struct datapath *);
+int dp_table_flush(struct datapath *);
+int dp_table_foreach(struct dp_table *table,
+ int (*callback)(struct sw_flow *flow, void *aux),
+ void *aux);
+
+void dp_process_received_packet(struct sk_buff *, struct net_bridge_port *);
+int dp_del_port(struct net_bridge_port *, struct list_head *);
+int dp_output_port(struct datapath *, struct sk_buff *, int out_port,
+ int ignore_no_fwd);
+int dp_output_control(struct datapath *, struct sk_buff *, int, u32 arg);
+void dp_set_origin(struct datapath *, u16, struct sk_buff *);
+
+struct datapath *get_dp(int dp_idx);
+
+static inline const char *dp_name(const struct datapath *dp)
+{
+ return dp->ports[ODPP_LOCAL]->dev->name; /* A datapath is named after the device on its ODPP_LOCAL port. */
+}
+
+#ifdef CONFIG_XEN
+int skb_checksum_setup(struct sk_buff *skb);
+#else /* Without CONFIG_XEN this is a stub that always reports success. */
+static inline int skb_checksum_setup(struct sk_buff *skb)
+{
+ return 0;
+}
+#endif
+
+#endif /* datapath.h */
diff --git a/datapath/dp_dev.c b/datapath/dp_dev.c
new file mode 100644
index 000000000..8a749dbc4
--- /dev/null
+++ b/datapath/dp_dev.c
@@ -0,0 +1,210 @@
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+
+#include "datapath.h"
+#include "dp_dev.h"
+
+struct datapath *dp_dev_get_dp(struct net_device *netdev)
+{
+ return dp_dev_priv(netdev)->dp;
+}
+EXPORT_SYMBOL(dp_dev_get_dp);
+
+static struct net_device_stats *dp_dev_get_stats(struct net_device *netdev)
+{
+ struct dp_dev *dp_dev = dp_dev_priv(netdev);
+ return &dp_dev->stats;
+}
+
+int dp_dev_recv(struct net_device *netdev, struct sk_buff *skb)
+{
+ struct dp_dev *dp_dev = dp_dev_priv(netdev);
+ int len;
+ len = skb->len; /* Read now: skb is passed on below and not used again. */
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = eth_type_trans(skb, netdev);
+ if (in_interrupt()) /* Pick the netif_rx variant by calling context. */
+ netif_rx(skb);
+ else
+ netif_rx_ni(skb);
+ netdev->last_rx = jiffies;
+ dp_dev->stats.rx_packets++;
+ dp_dev->stats.rx_bytes += len;
+ return len; /* The length sampled on entry. */
+}
+
+static int dp_dev_mac_addr(struct net_device *dev, void *p)
+{
+ struct sockaddr *addr = p;
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+ memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+ return 0;
+}
+
+static int dp_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct dp_dev *dp_dev = dp_dev_priv(netdev);
+
+ /* By orphaning 'skb' we will screw up socket accounting slightly, but
+ * the effect is limited to the device queue length. If we don't
+ * do this, then the sk_buff will be destructed eventually, but it is
+ * harder to predict when. */
+ skb_orphan(skb);
+
+ /* We are going to modify 'skb', by sticking it on &dp_dev->xmit_queue,
+ * so we need to have our own clone. (At any rate, fwd_port_input() --
+ * NOTE(review): likely now dp_process_received_packet() -- will need its
+ * own clone, so there's no benefit to queuing any other way.) */
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return 0; /* No private copy could be obtained; nothing is queued. */
+
+ dp_dev->stats.tx_packets++;
+ dp_dev->stats.tx_bytes += skb->len;
+
+ if (skb_queue_len(&dp_dev->xmit_queue) >= netdev->tx_queue_len) {
+ /* Queue overflow. Stop transmitter. */
+ netif_stop_queue(netdev);
+
+ /* We won't see all dropped packets individually, so overrun
+ * error is appropriate. */
+ dp_dev->stats.tx_fifo_errors++;
+ }
+ skb_queue_tail(&dp_dev->xmit_queue, skb); /* Queued even on overflow. */
+ netdev->trans_start = jiffies;
+
+ schedule_work(&dp_dev->xmit_work); /* Runs dp_dev_do_xmit(). */
+
+ return 0;
+}
+
+static void dp_dev_do_xmit(struct work_struct *work)
+{
+ struct dp_dev *dp_dev = container_of(work, struct dp_dev, xmit_work);
+ struct datapath *dp = dp_dev->dp;
+ struct sk_buff *skb;
+
+ while ((skb = skb_dequeue(&dp_dev->xmit_queue)) != NULL) { /* Drain. */
+ skb_reset_mac_header(skb);
+ rcu_read_lock_bh(); /* Held across the datapath call below. */
+ dp_process_received_packet(skb, dp->ports[dp_dev->port_no]);
+ rcu_read_unlock_bh();
+ }
+ netif_wake_queue(dp_dev->dev); /* dp_dev_xmit() may have stopped it. */
+}
+
+static int dp_dev_open(struct net_device *netdev)
+{
+ netif_start_queue(netdev);
+ return 0;
+}
+
+static int dp_dev_stop(struct net_device *netdev)
+{
+ netif_stop_queue(netdev);
+ return 0;
+}
+
+static void dp_getinfo(struct net_device *netdev, struct ethtool_drvinfo *info)
+{
+ struct dp_dev *dp_dev = dp_dev_priv(netdev);
+ strcpy(info->driver, "openvswitch");
+ sprintf(info->bus_info, "%d", dp_dev->dp->dp_idx);
+}
+
+static struct ethtool_ops dp_ethtool_ops = {
+ .get_drvinfo = dp_getinfo,
+ .get_link = ethtool_op_get_link,
+ .get_sg = ethtool_op_get_sg,
+ .get_tx_csum = ethtool_op_get_tx_csum,
+ .get_tso = ethtool_op_get_tso,
+};
+
+static void
+do_setup(struct net_device *netdev)
+{
+ ether_setup(netdev);
+
+ netdev->do_ioctl = dp_ioctl_hook;
+ netdev->get_stats = dp_dev_get_stats;
+ netdev->hard_start_xmit = dp_dev_xmit;
+ netdev->open = dp_dev_open;
+ SET_ETHTOOL_OPS(netdev, &dp_ethtool_ops);
+ netdev->stop = dp_dev_stop;
+ netdev->tx_queue_len = 100;
+ netdev->set_mac_address = dp_dev_mac_addr;
+
+ netdev->flags = IFF_BROADCAST | IFF_MULTICAST;
+
+ random_ether_addr(netdev->dev_addr);
+
+ /* Set the OUI to the Nicira one. */
+ netdev->dev_addr[0] = 0x00;
+ netdev->dev_addr[1] = 0x23;
+ netdev->dev_addr[2] = 0x20;
+
+ /* Set the top bits to indicate random Nicira address. */
+ netdev->dev_addr[3] |= 0xc0;
+}
+
+/* Create a datapath device associated with 'dp'. If 'dp_name' is null,
+ * the device name will be of the form 'of<dp_idx>'. Returns the new device
+ * or an ERR_PTR()-encoded error; private state is set up before registering.
+ *
+ * Called with RTNL lock and dp_mutex. */
+struct net_device *dp_dev_create(struct datapath *dp, const char *dp_name, int port_no)
+{
+ struct dp_dev *dp_dev;
+ struct net_device *netdev;
+ char dev_name[IFNAMSIZ];
+ int err;
+
+ if (dp_name) {
+ if (strlen(dp_name) >= IFNAMSIZ)
+ return ERR_PTR(-EINVAL);
+ strncpy(dev_name, dp_name, sizeof(dev_name));
+ } else
+ snprintf(dev_name, sizeof dev_name, "of%d", dp->dp_idx);
+
+ netdev = alloc_netdev(sizeof(struct dp_dev), dev_name, do_setup);
+ if (!netdev)
+ return ERR_PTR(-ENOMEM);
+
+ /* Fill in private state before registration exposes the device. */
+ dp_dev = dp_dev_priv(netdev);
+ dp_dev->dp = dp;
+ dp_dev->port_no = port_no;
+ dp_dev->dev = netdev;
+ skb_queue_head_init(&dp_dev->xmit_queue);
+ INIT_WORK(&dp_dev->xmit_work, dp_dev_do_xmit);
+ err = register_netdevice(netdev);
+ if (err) {
+ free_netdev(netdev);
+ return ERR_PTR(err);
+ }
+ return netdev;
+}
+
+/* Called with RTNL lock and dp_mutex. */
+void dp_dev_destroy(struct net_device *netdev)
+{
+ struct dp_dev *dp_dev = dp_dev_priv(netdev);
+
+ netif_tx_disable(netdev); /* Stop further dp_dev_xmit() calls. */
+ synchronize_net();
+ skb_queue_purge(&dp_dev->xmit_queue); /* Free anything still queued. */
+ unregister_netdevice(netdev); /* NOTE(review): xmit_work is not cancelled here. */
+}
+
+int is_dp_dev(struct net_device *netdev)
+{
+ return netdev->open == dp_dev_open;
+}
+EXPORT_SYMBOL(is_dp_dev);
diff --git a/datapath/dp_dev.h b/datapath/dp_dev.h
new file mode 100644
index 000000000..848743900
--- /dev/null
+++ b/datapath/dp_dev.h
@@ -0,0 +1,27 @@
+#ifndef DP_DEV_H
+#define DP_DEV_H 1
+
+struct dp_dev {
+ struct datapath *dp;
+ int port_no;
+
+ struct net_device *dev;
+ struct net_device_stats stats;
+ struct sk_buff_head xmit_queue;
+ struct work_struct xmit_work;
+
+ struct list_head list;
+};
+
+static inline struct dp_dev *dp_dev_priv(struct net_device *netdev)
+{
+ return netdev_priv(netdev);
+}
+
+struct net_device *dp_dev_create(struct datapath *, const char *, int port_no);
+void dp_dev_destroy(struct net_device *);
+int dp_dev_recv(struct net_device *, struct sk_buff *);
+int is_dp_dev(struct net_device *);
+struct datapath *dp_dev_get_dp(struct net_device *);
+
+#endif /* dp_dev.h */
diff --git a/datapath/dp_notify.c b/datapath/dp_notify.c
new file mode 100644
index 000000000..56d5c3c98
--- /dev/null
+++ b/datapath/dp_notify.c
@@ -0,0 +1,29 @@
+/*
+ * Distributed under the terms of the GNU GPL version 2.
+ * Copyright (c) 2007, 2008, 2009 Nicira Networks.
+ */
+
+/* Handle changes to managed devices */
+
+#include <linux/netdevice.h>
+
+#include "datapath.h"
+
+
+static int dp_device_event(struct notifier_block *unused, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = ptr;
+ struct net_bridge_port *p = dev->br_port;
+ if (event == NETDEV_UNREGISTER && p) {
+ struct datapath *dp = p->dp;
+ mutex_lock(&dp->mutex);
+ dp_del_port(p, NULL);
+ mutex_unlock(&dp->mutex);
+ }
+ return NOTIFY_DONE;
+}
+
+struct notifier_block dp_device_notifier = {
+ .notifier_call = dp_device_event
+};
diff --git a/datapath/flow.c b/datapath/flow.c
new file mode 100644
index 000000000..b24c242c5
--- /dev/null
+++ b/datapath/flow.c
@@ -0,0 +1,301 @@
+/*
+ * Distributed under the terms of the GNU GPL version 2.
+ * Copyright (c) 2007, 2008, 2009 Nicira Networks.
+ */
+
+#include "flow.h"
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/llc_pdu.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/llc.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/rcupdate.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/ip.h>
+
+#include "compat.h"
+
+struct kmem_cache *flow_cache;
+
+static inline int iphdr_ok(struct sk_buff *skb)
+{
+ int nh_ofs = skb_network_offset(skb);
+ if (skb->len >= nh_ofs + sizeof(struct iphdr)) {
+ int ip_len = ip_hdrlen(skb);
+ return (ip_len >= sizeof(struct iphdr)
+ && pskb_may_pull(skb, nh_ofs + ip_len));
+ }
+ return 0;
+}
+
+static inline int tcphdr_ok(struct sk_buff *skb)
+{
+ int th_ofs = skb_transport_offset(skb);
+ if (pskb_may_pull(skb, th_ofs + sizeof(struct tcphdr))) {
+ int tcp_len = tcp_hdrlen(skb);
+ return (tcp_len >= sizeof(struct tcphdr)
+ && skb->len >= th_ofs + tcp_len);
+ }
+ return 0;
+}
+
+static inline int udphdr_ok(struct sk_buff *skb)
+{
+ int th_ofs = skb_transport_offset(skb);
+ return pskb_may_pull(skb, th_ofs + sizeof(struct udphdr));
+}
+
+static inline int icmphdr_ok(struct sk_buff *skb)
+{
+ int th_ofs = skb_transport_offset(skb);
+ return pskb_may_pull(skb, th_ofs + sizeof(struct icmphdr));
+}
+
+#define TCP_FLAGS_OFFSET 13
+#define TCP_FLAG_MASK 0x3f
+
+static inline struct ovs_tcphdr *ovs_tcp_hdr(const struct sk_buff *skb)
+{
+ return (struct ovs_tcphdr *)skb_transport_header(skb);
+}
+
+void flow_used(struct sw_flow *flow, struct sk_buff *skb)
+{
+ unsigned long flags;
+ u8 tcp_flags = 0;
+
+ if (flow->key.dl_type == htons(ETH_P_IP) && iphdr_ok(skb)) {
+ struct iphdr *nh = ip_hdr(skb);
+ flow->ip_tos = nh->tos; /* Stored before flow->lock is taken. */
+ if (flow->key.nw_proto == IPPROTO_TCP && tcphdr_ok(skb)) {
+ u8 *tcp = (u8 *)tcp_hdr(skb);
+ tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK; /* Low 6 bits of header byte 13. */
+ }
+ }
+
+ spin_lock_irqsave(&flow->lock, flags); /* Guards the updates below. */
+ getnstimeofday(&flow->used);
+ flow->packet_count++;
+ flow->byte_count += skb->len;
+ flow->tcp_flags |= tcp_flags; /* Accumulates flags across packets. */
+ spin_unlock_irqrestore(&flow->lock, flags);
+}
+
+struct sw_flow_actions *flow_actions_alloc(size_t n_actions)
+{
+ struct sw_flow_actions *sfa;
+
+ if (n_actions > (PAGE_SIZE - sizeof *sfa) / sizeof(union odp_action))
+ return ERR_PTR(-EINVAL); /* Header plus actions would exceed PAGE_SIZE. */
+
+ sfa = kmalloc(sizeof *sfa + n_actions * sizeof(union odp_action),
+ GFP_KERNEL);
+ if (!sfa)
+ return ERR_PTR(-ENOMEM);
+
+ sfa->n_actions = n_actions; /* actions[] is not initialized here. */
+ return sfa;
+}
+
+
+/* Frees 'flow' immediately. */
+void flow_free(struct sw_flow *flow)
+{
+ if (unlikely(!flow))
+ return;
+ kfree(flow->sf_acts);
+ kmem_cache_free(flow_cache, flow);
+}
+
+/* RCU callback used by flow_deferred_free. */
+static void rcu_free_flow_callback(struct rcu_head *rcu)
+{
+ struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
+ flow_free(flow);
+}
+
+/* Schedules 'flow' to be freed after the next RCU grace period.
+ * The caller must hold rcu_read_lock for this to be sensible. */
+void flow_deferred_free(struct sw_flow *flow)
+{
+ call_rcu(&flow->rcu, rcu_free_flow_callback);
+}
+
+/* RCU callback used by flow_deferred_free_acts. */
+static void rcu_free_acts_callback(struct rcu_head *rcu)
+{
+ struct sw_flow_actions *sf_acts = container_of(rcu,
+ struct sw_flow_actions, rcu);
+ kfree(sf_acts);
+}
+
+/* Schedules 'sf_acts' to be freed after the next RCU grace period.
+ * The caller must hold rcu_read_lock for this to be sensible. */
+void flow_deferred_free_acts(struct sw_flow_actions *sf_acts)
+{
+ call_rcu(&sf_acts->rcu, rcu_free_acts_callback);
+}
+
+#define SNAP_OUI_LEN 3
+
+struct eth_snap_hdr
+{
+ struct ethhdr eth;
+ u8 dsap; /* Always 0xAA */
+ u8 ssap; /* Always 0xAA */
+ u8 ctrl;
+ u8 oui[SNAP_OUI_LEN];
+ u16 ethertype;
+} __attribute__ ((packed));
+
+static int is_snap(const struct eth_snap_hdr *esh)
+{
+ return (esh->dsap == LLC_SAP_SNAP /* Nonzero iff both SAPs are SNAP... */
+ && esh->ssap == LLC_SAP_SNAP
+ && !memcmp(esh->oui, "\0\0\0", SNAP_OUI_LEN)); /* ...and the OUI is all zeros. */
+}
+
+/* Parses the Ethernet frame in 'skb', which was received on 'in_port',
+ * fills in 'key' to match, and (unless it bails out early) sets skb's header
+ * offsets. Returns 1 if 'skb' contains an IP fragment, 0 otherwise. */
+int flow_extract(struct sk_buff *skb, u16 in_port, struct odp_flow_key *key)
+{
+ struct ethhdr *eth;
+ struct eth_snap_hdr *esh;
+ int retval = 0;
+ int nh_ofs;
+
+ memset(key, 0, sizeof *key);
+ key->dl_vlan = htons(ODP_VLAN_NONE);
+ key->in_port = in_port;
+
+ if (skb->len < sizeof *eth)
+ return 0;
+ if (!pskb_may_pull(skb, skb->len >= 64 ? 64 : skb->len)) {
+ return 0;
+ }
+
+ skb_reset_mac_header(skb);
+ eth = eth_hdr(skb);
+ esh = (struct eth_snap_hdr *) eth;
+ nh_ofs = sizeof *eth;
+ if (likely(ntohs(eth->h_proto) >= ODP_DL_TYPE_ETH2_CUTOFF))
+ key->dl_type = eth->h_proto;
+ else if (skb->len >= sizeof *esh && is_snap(esh)) {
+ key->dl_type = esh->ethertype;
+ nh_ofs = sizeof *esh;
+ } else {
+ key->dl_type = htons(ODP_DL_TYPE_NOT_ETH_TYPE);
+ if (skb->len >= nh_ofs + sizeof(struct llc_pdu_un)) {
+ nh_ofs += sizeof(struct llc_pdu_un);
+ }
+ }
+
+ /* Check for a VLAN tag */
+ if (key->dl_type == htons(ETH_P_8021Q) &&
+ skb->len >= nh_ofs + sizeof(struct vlan_hdr)) {
+ struct vlan_hdr *vh = (struct vlan_hdr*)(skb->data + nh_ofs);
+ key->dl_type = vh->h_vlan_encapsulated_proto;
+ key->dl_vlan = vh->h_vlan_TCI & htons(VLAN_VID_MASK);
+ nh_ofs += sizeof(struct vlan_hdr);
+ }
+ memcpy(key->dl_src, eth->h_source, ETH_ALEN);
+ memcpy(key->dl_dst, eth->h_dest, ETH_ALEN);
+ skb_set_network_header(skb, nh_ofs);
+
+ /* Network layer. */
+ if (key->dl_type == htons(ETH_P_IP) && iphdr_ok(skb)) {
+ struct iphdr *nh = ip_hdr(skb);
+ int th_ofs = nh_ofs + nh->ihl * 4;
+ key->nw_src = nh->saddr;
+ key->nw_dst = nh->daddr;
+ key->nw_proto = nh->protocol;
+ skb_set_transport_header(skb, th_ofs);
+
+ /* Transport layer. */
+ if (!(nh->frag_off & htons(IP_MF | IP_OFFSET))) {
+ if (key->nw_proto == IPPROTO_TCP) {
+ if (tcphdr_ok(skb)) {
+ struct tcphdr *tcp = tcp_hdr(skb);
+ key->tp_src = tcp->source;
+ key->tp_dst = tcp->dest;
+ } else {
+ /* Avoid tricking other code into
+ * thinking that this packet has an L4
+ * header. */
+ key->nw_proto = 0;
+ }
+ } else if (key->nw_proto == IPPROTO_UDP) {
+ if (udphdr_ok(skb)) {
+ struct udphdr *udp = udp_hdr(skb);
+ key->tp_src = udp->source;
+ key->tp_dst = udp->dest;
+ } else {
+ /* Avoid tricking other code into
+ * thinking that this packet has an L4
+ * header. */
+ key->nw_proto = 0;
+ }
+ } else if (key->nw_proto == IPPROTO_ICMP) {
+ if (icmphdr_ok(skb)) {
+ struct icmphdr *icmp = icmp_hdr(skb);
+ /* The ICMP type and code fields use the 16-bit
+ * transport port fields, so we need to store them
+ * in 16-bit network byte order. */
+ key->tp_src = htons(icmp->type);
+ key->tp_dst = htons(icmp->code);
+ } else {
+ /* Avoid tricking other code into
+ * thinking that this packet has an L4
+ * header. */
+ key->nw_proto = 0;
+ }
+ }
+ } else {
+ retval = 1;
+ }
+ } else {
+ skb_reset_transport_header(skb);
+ }
+ return retval;
+}
+
+/* Initializes the flow module.
+ * Returns zero if successful or a negative error code. */
+int flow_init(void)
+{
+ flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
+ 0, NULL);
+ if (flow_cache == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/* Uninitializes the flow module. */
+void flow_exit(void)
+{
+ kmem_cache_destroy(flow_cache);
+}
+
+void print_flow(const struct odp_flow_key *key)
+{
+#define MAC_FMT "%02x:%02x:%02x:%02x:%02x:%02x"
+#define MAC_ARG(x) ((u8*)(x))[0],((u8*)(x))[1],((u8*)(x))[2],((u8*)(x))[3],((u8*)(x))[4],((u8*)(x))[5]
+ printk("port%04x:vlan%d mac"MAC_FMT"->"MAC_FMT" "
+ "type%04x proto%d ip%x->%x port%d->%d\n",
+ key->in_port, ntohs(key->dl_vlan),
+ MAC_ARG(key->dl_src), MAC_ARG(key->dl_dst),
+ ntohs(key->dl_type), key->nw_proto,
+ key->nw_src, key->nw_dst,
+ ntohs(key->tp_src), ntohs(key->tp_dst));
+}
diff --git a/datapath/flow.h b/datapath/flow.h
new file mode 100644
index 000000000..55efede1b
--- /dev/null
+++ b/datapath/flow.h
@@ -0,0 +1,49 @@
+#ifndef FLOW_H
+#define FLOW_H 1
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+#include <linux/rcupdate.h>
+#include <linux/gfp.h>
+
+#include "openvswitch/datapath-protocol.h"
+
+struct sk_buff;
+
+struct sw_flow_actions {
+ struct rcu_head rcu;
+ unsigned int n_actions;
+ union odp_action actions[];
+};
+
+struct sw_flow {
+ struct rcu_head rcu;
+ struct odp_flow_key key;
+ struct sw_flow_actions *sf_acts;
+
+ struct timespec used; /* Last used time. */
+
+ u8 ip_tos; /* IP TOS value. */
+
+ spinlock_t lock; /* Lock for values below. */
+ u64 packet_count; /* Number of packets matched. */
+ u64 byte_count; /* Number of bytes matched. */
+ u8 tcp_flags; /* Union of seen TCP flags. */
+};
+
+extern struct kmem_cache *flow_cache;
+
+struct sw_flow_actions *flow_actions_alloc(size_t n_actions);
+void flow_free(struct sw_flow *);
+void flow_deferred_free(struct sw_flow *);
+void flow_deferred_free_acts(struct sw_flow_actions *);
+int flow_extract(struct sk_buff *, u16 in_port, struct odp_flow_key *);
+void flow_used(struct sw_flow *, struct sk_buff *);
+
+void print_flow(const struct odp_flow_key *);
+
+int flow_init(void);
+void flow_exit(void);
+
+#endif /* flow.h */
diff --git a/datapath/linux-2.6/.gitignore b/datapath/linux-2.6/.gitignore
new file mode 100644
index 000000000..af5821a22
--- /dev/null
+++ b/datapath/linux-2.6/.gitignore
@@ -0,0 +1,20 @@
+/Kbuild
+/Makefile
+/Makefile.main
+/actions.c
+/brcompat.c
+/brc_sysfs_dp.c
+/brc_sysfs_if.c
+/datapath.c
+/dp_dev.c
+/dp_notify.c
+/flow.c
+/genetlink-brcompat.c
+/genetlink-openvswitch.c
+/kcompat.h
+/linux-2.6
+/modules.order
+/random32.c
+/table.c
+/tmp
+/veth.c
diff --git a/datapath/linux-2.6/Kbuild.in b/datapath/linux-2.6/Kbuild.in
new file mode 100644
index 000000000..f08eb9c5e
--- /dev/null
+++ b/datapath/linux-2.6/Kbuild.in
@@ -0,0 +1,34 @@
+# -*- makefile -*-
+export builddir = @abs_builddir@
+export srcdir = @abs_srcdir@
+export top_srcdir = @abs_top_srcdir@
+export VERSION = @VERSION@
+export BUILDNR = @BUILDNR@
+
+include $(srcdir)/../Modules.mk
+include $(srcdir)/Modules.mk
+
+EXTRA_CFLAGS := -DVERSION=\"$(VERSION)\"
+EXTRA_CFLAGS += -I$(srcdir)/..
+EXTRA_CFLAGS += -I$(builddir)/..
+EXTRA_CFLAGS += -I$(top_srcdir)/include
+ifeq '$(BUILDNR)' '0'
+EXTRA_CFLAGS += -DBUILDNR=\"\"
+else
+EXTRA_CFLAGS += -DBUILDNR=\"+build$(BUILDNR)\"
+endif
+EXTRA_CFLAGS += -g
+EXTRA_CFLAGS += -include $(builddir)/kcompat.h
+
+# These include directories have to go before -I$(KSRC)/include.
+# NOSTDINC_FLAGS just happens to be a variable that goes in the
+# right place, even though it's conceptually incorrect.
+NOSTDINC_FLAGS += -I$(srcdir)/compat-2.6 -I$(srcdir)/compat-2.6/include
+
+obj-m := $(patsubst %,%_mod.o,$(build_modules))
+
+define module_template
+$(1)_mod-y = $$(notdir $$(patsubst %.c,%.o,$($(1)_sources)))
+endef
+
+$(foreach module,$(build_modules),$(eval $(call module_template,$(module))))
diff --git a/datapath/linux-2.6/Makefile.in b/datapath/linux-2.6/Makefile.in
new file mode 100644
index 000000000..efc1663e4
--- /dev/null
+++ b/datapath/linux-2.6/Makefile.in
@@ -0,0 +1,9 @@
+ifeq ($(KERNELRELEASE),)
+# We're being called directly by running make in this directory.
+include Makefile.main
+else
+# We're being included by the Linux kernel build system
+include Kbuild
+endif
+
+
diff --git a/datapath/linux-2.6/Makefile.main.in b/datapath/linux-2.6/Makefile.main.in
new file mode 100644
index 000000000..0005ec4fa
--- /dev/null
+++ b/datapath/linux-2.6/Makefile.main.in
@@ -0,0 +1,82 @@
+# -*- makefile -*-
+export builddir = @abs_builddir@
+export srcdir = @abs_srcdir@
+export top_srcdir = @abs_top_srcdir@
+export KSRC = @KSRC26@
+export VERSION = @VERSION@
+export BUILD_VETH = @BUILD_VETH@
+
+include $(srcdir)/../Modules.mk
+include $(srcdir)/Modules.mk
+
+default: $(build_links)
+
+$(foreach s,$(sort $(foreach m,$(build_modules),$($(m)_sources))), \
+ $(eval $(notdir $(s)): ; ln -s $(srcdir)/../$(s) $@))
+
+distclean: clean
+ rm -f kcompat.h
+distdir: clean
+install:
+all: default
+check: all
+clean:
+ rm -f *.o *.ko *_mod.* Module.symvers *.cmd kcompat.h.new
+ for d in $(build_links); do if test -h $$d; then rm $$d; fi; done
+
+ifneq ($(KSRC),)
+
+ifeq (/lib/modules/$(shell uname -r)/source, $(KSRC))
+ KOBJ := /lib/modules/$(shell uname -r)/build
+else
+ KOBJ := $(KSRC)
+endif
+
+ifneq ($(shell grep -c 'PATCHLEVEL = 6' $(KSRC)/Makefile),1)
+ $(error Linux kernel source in $(KSRC) not 2.6)
+endif
+
+VERSION_FILE := $(KOBJ)/include/linux/version.h
+ifeq (,$(wildcard $(VERSION_FILE)))
+ $(error Linux kernel source not configured - missing version.h)
+endif
+
+CONFIG_FILE := $(KSRC)/include/linux/autoconf.h
+ifeq (,$(wildcard $(CONFIG_FILE)))
+ $(error Linux kernel source not configured - missing autoconf.h)
+endif
+
+default:
+ $(MAKE) -C $(KSRC) M=$(builddir) modules
+endif
+
+# Much of the kernel build system in this file is derived from Intel's
+# e1000 distribution, with the following license:
+
+################################################################################
+#
+# Intel PRO/1000 Linux driver
+# Copyright(c) 1999 - 2007, 2009 Intel Corporation.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms and conditions of the GNU General Public License,
+# version 2, as published by the Free Software Foundation.
+#
+# This program is distributed in the hope it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+# more details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# The full GNU General Public License is included in this distribution in
+# the file called "COPYING".
+#
+# Contact Information:
+# Linux NICS <linux.nics@intel.com>
+# e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
+# Intel Corporation, 5200 N.E. Elam Young Parkway, Hillsboro, OR 97124-6497
+#
+################################################################################
diff --git a/datapath/linux-2.6/Modules.mk b/datapath/linux-2.6/Modules.mk
new file mode 100644
index 000000000..bbc4c72f4
--- /dev/null
+++ b/datapath/linux-2.6/Modules.mk
@@ -0,0 +1,50 @@
+openvswitch_sources += \
+ linux-2.6/compat-2.6/genetlink-openvswitch.c \
+ linux-2.6/compat-2.6/random32.c
+openvswitch_headers += \
+ linux-2.6/compat-2.6/compat26.h \
+ linux-2.6/compat-2.6/include/asm-generic/bug.h \
+ linux-2.6/compat-2.6/include/linux/dmi.h \
+ linux-2.6/compat-2.6/include/linux/err.h \
+ linux-2.6/compat-2.6/include/linux/icmp.h \
+ linux-2.6/compat-2.6/include/linux/if_arp.h \
+ linux-2.6/compat-2.6/include/linux/ip.h \
+ linux-2.6/compat-2.6/include/linux/ipv6.h \
+ linux-2.6/compat-2.6/include/linux/jiffies.h \
+ linux-2.6/compat-2.6/include/linux/kernel.h \
+ linux-2.6/compat-2.6/include/linux/log2.h \
+ linux-2.6/compat-2.6/include/linux/lockdep.h \
+ linux-2.6/compat-2.6/include/linux/mutex.h \
+ linux-2.6/compat-2.6/include/linux/netdevice.h \
+ linux-2.6/compat-2.6/include/linux/netfilter_bridge.h \
+ linux-2.6/compat-2.6/include/linux/netfilter_ipv4.h \
+ linux-2.6/compat-2.6/include/linux/netlink.h \
+ linux-2.6/compat-2.6/include/linux/percpu.h \
+ linux-2.6/compat-2.6/include/linux/random.h \
+ linux-2.6/compat-2.6/include/linux/rculist.h \
+ linux-2.6/compat-2.6/include/linux/rtnetlink.h \
+ linux-2.6/compat-2.6/include/linux/skbuff.h \
+ linux-2.6/compat-2.6/include/linux/tcp.h \
+ linux-2.6/compat-2.6/include/linux/timer.h \
+ linux-2.6/compat-2.6/include/linux/types.h \
+ linux-2.6/compat-2.6/include/linux/udp.h \
+ linux-2.6/compat-2.6/include/linux/workqueue.h \
+ linux-2.6/compat-2.6/include/net/checksum.h \
+ linux-2.6/compat-2.6/include/net/genetlink.h \
+ linux-2.6/compat-2.6/include/net/netlink.h
+
+both_modules += brcompat
+brcompat_sources = \
+ linux-2.6/compat-2.6/genetlink-brcompat.c \
+ brcompat.c \
+ brc_procfs.c \
+ brc_sysfs_dp.c \
+ brc_sysfs_if.c
+brcompat_headers = \
+ brc_procfs.h \
+ brc_sysfs.h
+
+dist_modules += veth
+build_modules += $(if $(BUILD_VETH),veth)
+veth_sources = linux-2.6/compat-2.6/veth.c
+veth_headers =
diff --git a/datapath/linux-2.6/compat-2.6/compat26.h b/datapath/linux-2.6/compat-2.6/compat26.h
new file mode 100644
index 000000000..61448d635
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/compat26.h
@@ -0,0 +1,37 @@
+#ifndef __COMPAT26_H
+#define __COMPAT26_H 1
+
+#include <linux/version.h>
+
+#if defined(CONFIG_PREEMPT) && LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,21)
+#error "CONFIG_PREEMPT is broken with 2.6.x before 2.6.21--see commit 4498121ca3, \"[NET]: Handle disabled preemption in gfp_any()\""
+#endif
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
+/*----------------------------------------------------------------------------
+ * In 2.6.24, a namespace argument became required for dev_get_by_name. */
+
+#define dev_get_by_name(net, name) \
+ dev_get_by_name((name))
+
+#define dev_get_by_index(net, ifindex) \
+ dev_get_by_index((ifindex))
+
+#define __dev_get_by_name(net, name) \
+ __dev_get_by_name((name))
+
+#define __dev_get_by_index(net, ifindex) \
+ __dev_get_by_index((ifindex))
+
+#endif /* linux kernel <= 2.6.23 */
+
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,22)
+/*----------------------------------------------------------------------------
+ * In 2.6.23, the last argument was dropped from kmem_cache_create. */
+#define kmem_cache_create(n, s, a, f, c) \
+ kmem_cache_create((n), (s), (a), (f), (c), NULL)
+
+#endif /* linux kernel <= 2.6.22 */
+
+#endif /* compat26.h */
diff --git a/datapath/linux-2.6/compat-2.6/genetlink-brcompat.c b/datapath/linux-2.6/compat-2.6/genetlink-brcompat.c
new file mode 100644
index 000000000..c43b3ce46
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/genetlink-brcompat.c
@@ -0,0 +1,20 @@
+#include "net/genetlink.h"
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+
+/* We fix grp->id to 32 so that it doesn't collide with any of the multicast
+ * groups selected by openvswitch_mod, which uses groups 16 through 31.
+ * Collision isn't fatal--multicast listeners should check that the family is
+ * the one that they want and discard others--but it wastes time and memory to
+ * receive unwanted messages. */
+int genl_register_mc_group(struct genl_family *family,
+ struct genl_multicast_group *grp)
+{
+ grp->id = 32;
+ grp->family = family;
+
+ return 0;
+}
+
+#endif /* kernel < 2.6.23 */
diff --git a/datapath/linux-2.6/compat-2.6/genetlink-openvswitch.c b/datapath/linux-2.6/compat-2.6/genetlink-openvswitch.c
new file mode 100644
index 000000000..9e09215f5
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/genetlink-openvswitch.c
@@ -0,0 +1,22 @@
+#include "net/genetlink.h"
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+
+/* We use multicast groups 16 through 31 to avoid colliding with the multicast
+ * group selected by brcompat_mod, which uses group 32. Collision isn't
+ * fatal--multicast listeners should check that the family is the one that they
+ * want and discard others--but it wastes time and memory to receive unwanted
+ * messages. */
+int genl_register_mc_group(struct genl_family *family,
+ struct genl_multicast_group *grp)
+{
+ /* This code is called single-threaded. */
+ static unsigned int next_id = 0;
+ grp->id = next_id++ % 16 + 16;
+ grp->family = family;
+
+ return 0;
+}
+
+#endif /* kernel < 2.6.23 */
diff --git a/datapath/linux-2.6/compat-2.6/include/asm-generic/bug.h b/datapath/linux-2.6/compat-2.6/include/asm-generic/bug.h
new file mode 100644
index 000000000..1d9b31401
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/asm-generic/bug.h
@@ -0,0 +1,19 @@
+#ifndef __ASM_GENERIC_BUG_WRAPPER_H
+#define __ASM_GENERIC_BUG_WRAPPER_H
+
+#include_next <asm-generic/bug.h>
+
+#ifndef WARN_ON_ONCE
+#define WARN_ON_ONCE(condition) ({ \
+ static int __warned; \
+ int __ret_warn_once = !!(condition); \
+ \
+ if (unlikely(__ret_warn_once) && !__warned) { \
+ WARN_ON(1); \
+ __warned = 1; \
+ } \
+ unlikely(__ret_warn_once); \
+})
+#endif
+
+#endif
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/cpumask.h b/datapath/linux-2.6/compat-2.6/include/linux/cpumask.h
new file mode 100644
index 000000000..48c73aa8f
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/cpumask.h
@@ -0,0 +1,11 @@
+#ifndef __LINUX_CPUMASK_WRAPPER_H
+#define __LINUX_CPUMASK_WRAPPER_H
+
+#include_next <linux/cpumask.h>
+
+/* for_each_cpu was renamed for_each_possible_cpu in 2.6.18. */
+#ifndef for_each_possible_cpu
+#define for_each_possible_cpu for_each_cpu
+#endif
+
+#endif /* linux/cpumask.h wrapper */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/dmi.h b/datapath/linux-2.6/compat-2.6/include/linux/dmi.h
new file mode 100644
index 000000000..52916fec8
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/dmi.h
@@ -0,0 +1,114 @@
+#ifndef __LINUX_DMI_WRAPPER_H
+#define __LINUX_DMI_WRAPPER_H 1
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
+
+#include_next <linux/dmi.h>
+
+#else /* linux kernel < 2.6.23 */
+
+#ifndef __DMI_H__
+#define __DMI_H__
+
+#include <linux/list.h>
+
+enum dmi_field {
+ DMI_NONE,
+ DMI_BIOS_VENDOR,
+ DMI_BIOS_VERSION,
+ DMI_BIOS_DATE,
+ DMI_SYS_VENDOR,
+ DMI_PRODUCT_NAME,
+ DMI_PRODUCT_VERSION,
+ DMI_PRODUCT_SERIAL,
+ DMI_PRODUCT_UUID,
+ DMI_BOARD_VENDOR,
+ DMI_BOARD_NAME,
+ DMI_BOARD_VERSION,
+ DMI_BOARD_SERIAL,
+ DMI_BOARD_ASSET_TAG,
+ DMI_CHASSIS_VENDOR,
+ DMI_CHASSIS_TYPE,
+ DMI_CHASSIS_VERSION,
+ DMI_CHASSIS_SERIAL,
+ DMI_CHASSIS_ASSET_TAG,
+ DMI_STRING_MAX,
+};
+
+enum dmi_device_type {
+ DMI_DEV_TYPE_ANY = 0,
+ DMI_DEV_TYPE_OTHER,
+ DMI_DEV_TYPE_UNKNOWN,
+ DMI_DEV_TYPE_VIDEO,
+ DMI_DEV_TYPE_SCSI,
+ DMI_DEV_TYPE_ETHERNET,
+ DMI_DEV_TYPE_TOKENRING,
+ DMI_DEV_TYPE_SOUND,
+ DMI_DEV_TYPE_IPMI = -1,
+ DMI_DEV_TYPE_OEM_STRING = -2
+};
+
+struct dmi_header {
+ u8 type;
+ u8 length;
+ u16 handle;
+};
+
+/*
+ * DMI callbacks for problem boards
+ */
+struct dmi_strmatch {
+ u8 slot;
+ char *substr;
+};
+
+struct dmi_system_id {
+ int (*callback)(struct dmi_system_id *);
+ const char *ident;
+ struct dmi_strmatch matches[4];
+ void *driver_data;
+};
+
+#define DMI_MATCH(a, b) { a, b }
+
+struct dmi_device {
+ struct list_head list;
+ int type;
+ const char *name;
+ void *device_data; /* Type specific data */
+};
+
+/* No CONFIG_DMI before 2.6.16 */
+#if defined(CONFIG_DMI) || defined(CONFIG_X86_32)
+
+extern int dmi_check_system(struct dmi_system_id *list);
+extern char * dmi_get_system_info(int field);
+extern struct dmi_device * dmi_find_device(int type, const char *name,
+ struct dmi_device *from);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16)
+extern void dmi_scan_machine(void);
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,17)
+extern int dmi_get_year(int field);
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,19)
+extern int dmi_name_in_vendors(char *str);
+#endif
+
+#else
+
+static inline int dmi_check_system(struct dmi_system_id *list) { return 0; }
+static inline char * dmi_get_system_info(int field) { return NULL; }
+static inline struct dmi_device * dmi_find_device(int type, const char *name,
+ struct dmi_device *from) { return NULL; }
+static inline int dmi_get_year(int year) { return 0; }
+static inline int dmi_name_in_vendors(char *s) { return 0; }
+
+#endif
+
+#endif /* __DMI_H__ */
+
+#endif /* linux kernel < 2.6.23 */
+
+#endif
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/err.h b/datapath/linux-2.6/compat-2.6/include/linux/err.h
new file mode 100644
index 000000000..50faf2a11
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/err.h
@@ -0,0 +1,21 @@
+#ifndef __LINUX_ERR_WRAPPER_H
+#define __LINUX_ERR_WRAPPER_H 1
+
+#include_next <linux/err.h>
+
+#ifndef HAVE_ERR_CAST
+/**
+ * ERR_CAST - Explicitly cast an error-valued pointer to another pointer type
+ * @ptr: The pointer to cast.
+ *
+ * Explicitly cast an error-valued pointer to another pointer type in such a
+ * way as to make it clear that's what's going on.
+ */
+static inline void *ERR_CAST(const void *ptr)
+{
+ /* cast away the const */
+ return (void *) ptr;
+}
+#endif /* HAVE_ERR_CAST */
+
+#endif
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/icmp.h b/datapath/linux-2.6/compat-2.6/include/linux/icmp.h
new file mode 100644
index 000000000..89b354e4c
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/icmp.h
@@ -0,0 +1,13 @@
+#ifndef __LINUX_ICMP_WRAPPER_H
+#define __LINUX_ICMP_WRAPPER_H 1
+
+#include_next <linux/icmp.h>
+
+#ifndef HAVE_SKBUFF_HEADER_HELPERS /* supply icmp_hdr() ourselves */
+#include <linux/skbuff.h> /* struct sk_buff, skb_transport_header() */
+static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb)
+{
+ return (struct icmphdr *)skb_transport_header(skb);
+}
+#endif /* !HAVE_SKBUFF_HEADER_HELPERS */
+#endif /* __LINUX_ICMP_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/if_arp.h b/datapath/linux-2.6/compat-2.6/include/linux/if_arp.h
new file mode 100644
index 000000000..e48d6ba0d
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/if_arp.h
@@ -0,0 +1,15 @@
+#ifndef __LINUX_IF_ARP_WRAPPER_H
+#define __LINUX_IF_ARP_WRAPPER_H 1
+
+#include_next <linux/if_arp.h>
+
+#ifndef HAVE_SKBUFF_HEADER_HELPERS
+#include <linux/skbuff.h> /* struct sk_buff, skb_network_header() */
+/* Fallback arp_hdr(): the ARP header is read at the skb's network header. */
+static inline struct arphdr *arp_hdr(const struct sk_buff *skb)
+{
+ return (struct arphdr *)skb_network_header(skb);
+}
+#endif /* !HAVE_SKBUFF_HEADER_HELPERS */
+
+#endif /* __LINUX_IF_ARP_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/ip.h b/datapath/linux-2.6/compat-2.6/include/linux/ip.h
new file mode 100644
index 000000000..36765396b
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/ip.h
@@ -0,0 +1,18 @@
+#ifndef __LINUX_IP_WRAPPER_H
+#define __LINUX_IP_WRAPPER_H 1
+
+#include_next <linux/ip.h>
+
+#ifndef HAVE_SKBUFF_HEADER_HELPERS /* supply ip_hdr() and ip_hdrlen() */
+#include <linux/skbuff.h> /* struct sk_buff, skb_network_header() */
+static inline struct iphdr *ip_hdr(const struct sk_buff *skb)
+{
+ return (struct iphdr *)skb_network_header(skb);
+}
+
+static inline unsigned int ip_hdrlen(const struct sk_buff *skb)
+{
+ return ip_hdr(skb)->ihl * 4;
+}
+#endif /* !HAVE_SKBUFF_HEADER_HELPERS */
+#endif /* __LINUX_IP_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/ipv6.h b/datapath/linux-2.6/compat-2.6/include/linux/ipv6.h
new file mode 100644
index 000000000..25a5431af
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/ipv6.h
@@ -0,0 +1,13 @@
+#ifndef __LINUX_IPV6_WRAPPER_H
+#define __LINUX_IPV6_WRAPPER_H 1
+
+#include_next <linux/ipv6.h>
+
+#ifndef HAVE_SKBUFF_HEADER_HELPERS /* supply ipv6_hdr() ourselves */
+#include <linux/skbuff.h> /* struct sk_buff, skb_network_header() */
+static inline struct ipv6hdr *ipv6_hdr(const struct sk_buff *skb)
+{
+ return (struct ipv6hdr *)skb_network_header(skb);
+}
+#endif /* !HAVE_SKBUFF_HEADER_HELPERS */
+#endif /* __LINUX_IPV6_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/jiffies.h b/datapath/linux-2.6/compat-2.6/include/linux/jiffies.h
new file mode 100644
index 000000000..3286e6346
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/jiffies.h
@@ -0,0 +1,26 @@
+#ifndef __LINUX_JIFFIES_WRAPPER_H
+#define __LINUX_JIFFIES_WRAPPER_H 1
+
+#include_next <linux/jiffies.h>
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+
+/* Ordering tests for 64-bit jiffies values, done by signed subtraction on
+ * platform independent 64bit types. These must be used when utilizing
+ * jiffies_64 (i.e. the return value of get_jiffies_64()). */
+#define time_after64(a,b) \
+ (typecheck(__u64, a) && \
+ typecheck(__u64, b) && \
+ ((__s64)(b) - (__s64)(a) < 0))
+#define time_before64(a,b) time_after64(b,a)
+
+#define time_after_eq64(a,b) \
+ (typecheck(__u64, a) && \
+ typecheck(__u64, b) && \
+ ((__s64)(a) - (__s64)(b) >= 0))
+#define time_before_eq64(a,b) time_after_eq64(b,a)
+
+#endif /* linux kernel < 2.6.19 */
+
+#endif /* __LINUX_JIFFIES_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/kernel.h b/datapath/linux-2.6/compat-2.6/include/linux/kernel.h
new file mode 100644
index 000000000..9459155d6
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/kernel.h
@@ -0,0 +1,9 @@
+#ifndef __KERNEL_H_WRAPPER
+#define __KERNEL_H_WRAPPER 1
+
+#include_next <linux/kernel.h>
+#ifndef HAVE_LOG2_H
+#include <linux/log2.h> /* NOTE(review): expected to resolve to the compat log2.h wrapper -- confirm */
+#endif /* !HAVE_LOG2_H */
+
+#endif /* linux/kernel.h */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/lockdep.h b/datapath/linux-2.6/compat-2.6/include/linux/lockdep.h
new file mode 100644
index 000000000..1c839423a
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/lockdep.h
@@ -0,0 +1,450 @@
+/*
+ * Runtime locking correctness validator
+ *
+ * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ * see Documentation/lockdep-design.txt for more details.
+ */
+#ifndef __LINUX_LOCKDEP_WRAPPER_H
+#define __LINUX_LOCKDEP_WRAPPER_H
+
+#include_next <linux/lockdep.h>
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18)
+
+struct task_struct;
+struct lockdep_map;
+
+#ifdef CONFIG_LOCKDEP
+
+#include <linux/linkage.h>
+#include <linux/list.h>
+#include <linux/debug_locks.h>
+#include <linux/stacktrace.h>
+
+/*
+ * Lock-class usage-state bits:
+ */
+enum lock_usage_bit
+{
+ LOCK_USED = 0,
+ LOCK_USED_IN_HARDIRQ,
+ LOCK_USED_IN_SOFTIRQ,
+ LOCK_ENABLED_SOFTIRQS,
+ LOCK_ENABLED_HARDIRQS,
+ LOCK_USED_IN_HARDIRQ_READ,
+ LOCK_USED_IN_SOFTIRQ_READ,
+ LOCK_ENABLED_SOFTIRQS_READ,
+ LOCK_ENABLED_HARDIRQS_READ,
+ LOCK_USAGE_STATES
+};
+
+/*
+ * Usage-state bitmasks:
+ */
+#define LOCKF_USED (1 << LOCK_USED)
+#define LOCKF_USED_IN_HARDIRQ (1 << LOCK_USED_IN_HARDIRQ)
+#define LOCKF_USED_IN_SOFTIRQ (1 << LOCK_USED_IN_SOFTIRQ)
+#define LOCKF_ENABLED_HARDIRQS (1 << LOCK_ENABLED_HARDIRQS)
+#define LOCKF_ENABLED_SOFTIRQS (1 << LOCK_ENABLED_SOFTIRQS)
+
+#define LOCKF_ENABLED_IRQS (LOCKF_ENABLED_HARDIRQS | LOCKF_ENABLED_SOFTIRQS)
+#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
+
+#define LOCKF_USED_IN_HARDIRQ_READ (1 << LOCK_USED_IN_HARDIRQ_READ)
+#define LOCKF_USED_IN_SOFTIRQ_READ (1 << LOCK_USED_IN_SOFTIRQ_READ)
+#define LOCKF_ENABLED_HARDIRQS_READ (1 << LOCK_ENABLED_HARDIRQS_READ)
+#define LOCKF_ENABLED_SOFTIRQS_READ (1 << LOCK_ENABLED_SOFTIRQS_READ)
+
+#define LOCKF_ENABLED_IRQS_READ \
+ (LOCKF_ENABLED_HARDIRQS_READ | LOCKF_ENABLED_SOFTIRQS_READ)
+#define LOCKF_USED_IN_IRQ_READ \
+ (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
+
+#define MAX_LOCKDEP_SUBCLASSES 8UL
+
+/*
+ * Lock-classes are keyed via unique addresses, by embedding the
+ * lockclass-key into the kernel (or module) .data section. (For
+ * static locks we use the lock address itself as the key.)
+ */
+struct lockdep_subclass_key {
+ char __one_byte;
+} __attribute__ ((__packed__));
+
+struct lock_class_key {
+ struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES];
+};
+
+/*
+ * The lock-class itself:
+ */
+struct lock_class {
+ /*
+ * class-hash:
+ */
+ struct list_head hash_entry;
+
+ /*
+ * global list of all lock-classes:
+ */
+ struct list_head lock_entry;
+
+ struct lockdep_subclass_key *key;
+ unsigned int subclass;
+
+ /*
+ * IRQ/softirq usage tracking bits:
+ */
+ unsigned long usage_mask;
+ struct stack_trace usage_traces[LOCK_USAGE_STATES];
+
+ /*
+ * These fields represent a directed graph of lock dependencies,
+ * to every node we attach a list of "forward" and a list of
+ * "backward" graph nodes.
+ */
+ struct list_head locks_after, locks_before;
+
+ /*
+ * Generation counter, when doing certain classes of graph walking,
+ * to ensure that we check one node only once:
+ */
+ unsigned int version;
+
+ /*
+ * Statistics counter:
+ */
+ unsigned long ops;
+
+ const char *name;
+ int name_version;
+
+#ifdef CONFIG_LOCK_STAT
+ unsigned long contention_point[4];
+#endif
+};
+
+#ifdef CONFIG_LOCK_STAT
+struct lock_time {
+ s64 min;
+ s64 max;
+ s64 total;
+ unsigned long nr;
+};
+
+enum bounce_type {
+ bounce_acquired_write,
+ bounce_acquired_read,
+ bounce_contended_write,
+ bounce_contended_read,
+ nr_bounce_types,
+
+ bounce_acquired = bounce_acquired_write,
+ bounce_contended = bounce_contended_write,
+};
+
+struct lock_class_stats {
+ unsigned long contention_point[4];
+ struct lock_time read_waittime;
+ struct lock_time write_waittime;
+ struct lock_time read_holdtime;
+ struct lock_time write_holdtime;
+ unsigned long bounces[nr_bounce_types];
+};
+
+struct lock_class_stats lock_stats(struct lock_class *class);
+void clear_lock_stats(struct lock_class *class);
+#endif
+
+/*
+ * Map the lock object (the lock instance) to the lock-class object.
+ * This is embedded into specific lock instances:
+ */
+struct lockdep_map {
+ struct lock_class_key *key;
+ struct lock_class *class_cache;
+ const char *name;
+#ifdef CONFIG_LOCK_STAT
+ int cpu;
+#endif
+};
+
+/*
+ * Every lock has a list of other locks that were taken after it.
+ * We only grow the list, never remove from it:
+ */
+struct lock_list {
+ struct list_head entry;
+ struct lock_class *class;
+ struct stack_trace trace;
+ int distance;
+};
+
+/*
+ * We record lock dependency chains, so that we can cache them:
+ */
+struct lock_chain {
+ struct list_head entry;
+ u64 chain_key;
+};
+
+struct held_lock {
+ /*
+ * One-way hash of the dependency chain up to this point. We
+ * hash the hashes step by step as the dependency chain grows.
+ *
+ * We use it for dependency-caching and we skip detection
+ * passes and dependency-updates if there is a cache-hit, so
+ * it is absolutely critical for 100% coverage of the validator
+ * to have a unique key value for every unique dependency path
+ * that can occur in the system, to make a unique hash value
+ * as likely as possible - hence the 64-bit width.
+ *
+ * The task struct holds the current hash value (initialized
+ * with zero), here we store the previous hash value:
+ */
+ u64 prev_chain_key;
+ struct lock_class *class;
+ unsigned long acquire_ip;
+ struct lockdep_map *instance;
+
+#ifdef CONFIG_LOCK_STAT
+ u64 waittime_stamp;
+ u64 holdtime_stamp;
+#endif
+ /*
+ * The lock-stack is unified in that the lock chains of interrupt
+ * contexts nest on top of process context chains, but we 'separate'
+ * the hashes by starting with 0 if we cross into an interrupt
+ * context, and we also do not add cross-context lock
+ * dependencies - the lock usage graph walking covers that area
+ * anyway, and we'd just unnecessarily increase the number of
+ * dependencies otherwise. [Note: hardirq and softirq contexts
+ * are separated from each other too.]
+ *
+ * The following field is used to detect when we cross into an
+ * interrupt context:
+ */
+ int irq_context;
+ int trylock;
+ int read;
+ int check;
+ int hardirqs_off;
+};
+
+/*
+ * Initialization, self-test and debugging-output methods:
+ */
+extern void lockdep_init(void);
+extern void lockdep_info(void);
+extern void lockdep_reset(void);
+extern void lockdep_reset_lock(struct lockdep_map *lock);
+extern void lockdep_free_key_range(void *start, unsigned long size);
+
+extern void lockdep_off(void);
+extern void lockdep_on(void);
+
+/*
+ * These methods are used by specific locking variants (spinlocks,
+ * rwlocks, mutexes and rwsems) to pass init/acquire/release events
+ * to lockdep:
+ */
+
+extern void lockdep_init_map(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass);
+
+/*
+ * Reinitialize a lock key - for cases where there is special locking or
+ * special initialization of locks so that the validator gets the scope
+ * of dependencies wrong: they are either too broad (they need a class-split)
+ * or they are too narrow (they suffer from a false class-split):
+ */
+#define lockdep_set_class(lock, key) \
+ lockdep_init_map(&(lock)->dep_map, #key, key, 0)
+#define lockdep_set_class_and_name(lock, key, name) \
+ lockdep_init_map(&(lock)->dep_map, name, key, 0)
+#define lockdep_set_class_and_subclass(lock, key, sub) \
+ lockdep_init_map(&(lock)->dep_map, #key, key, sub)
+#define lockdep_set_subclass(lock, sub) \
+ lockdep_init_map(&(lock)->dep_map, #lock, \
+ (lock)->dep_map.key, sub)
+
+/*
+ * Acquire a lock.
+ *
+ * Values for "read":
+ *
+ * 0: exclusive (write) acquire
+ * 1: read-acquire (no recursion allowed)
+ * 2: read-acquire with same-instance recursion allowed
+ *
+ * Values for check:
+ *
+ * 0: disabled
+ * 1: simple checks (freeing, held-at-exit-time, etc.)
+ * 2: full validation
+ */
+extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check, unsigned long ip);
+
+extern void lock_release(struct lockdep_map *lock, int nested,
+ unsigned long ip);
+
+# define INIT_LOCKDEP .lockdep_recursion = 0,
+
+#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0)
+
+#else /* !LOCKDEP */
+
+static inline void lockdep_off(void)
+{
+}
+
+static inline void lockdep_on(void)
+{
+}
+
+# define lock_acquire(l, s, t, r, c, i) do { } while (0)
+# define lock_release(l, n, i) do { } while (0)
+# define lockdep_init() do { } while (0)
+# define lockdep_info() do { } while (0)
+# define lockdep_init_map(lock, name, key, sub) do { (void)(key); } while (0)
+# define lockdep_set_class(lock, key) do { (void)(key); } while (0)
+# define lockdep_set_class_and_name(lock, key, name) \
+ do { (void)(key); } while (0)
+#define lockdep_set_class_and_subclass(lock, key, sub) \
+ do { (void)(key); } while (0)
+#define lockdep_set_subclass(lock, sub) do { } while (0)
+
+# define INIT_LOCKDEP
+# define lockdep_reset() do { debug_locks = 1; } while (0)
+# define lockdep_free_key_range(start, size) do { } while (0)
+/*
+ * The class key takes no space if lockdep is disabled:
+ */
+struct lock_class_key { };
+
+#define lockdep_depth(tsk) (0)
+
+#endif /* !LOCKDEP */
+
+#ifdef CONFIG_LOCK_STAT
+
+extern void lock_contended(struct lockdep_map *lock, unsigned long ip);
+extern void lock_acquired(struct lockdep_map *lock);
+
+#define LOCK_CONTENDED(_lock, try, lock) \
+do { \
+ if (!try(_lock)) { \
+ lock_contended(&(_lock)->dep_map, _RET_IP_); \
+ lock(_lock); \
+ } \
+ lock_acquired(&(_lock)->dep_map); \
+} while (0)
+
+#else /* CONFIG_LOCK_STAT */
+
+#define lock_contended(lockdep_map, ip) do {} while (0)
+#define lock_acquired(lockdep_map) do {} while (0)
+
+#define LOCK_CONTENDED(_lock, try, lock) \
+ lock(_lock)
+
+#endif /* CONFIG_LOCK_STAT */
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS)
+extern void early_init_irq_lock_class(void);
+#else
+static inline void early_init_irq_lock_class(void)
+{
+}
+#endif
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+extern void early_boot_irqs_off(void);
+extern void early_boot_irqs_on(void);
+extern void print_irqtrace_events(struct task_struct *curr);
+#else
+static inline void early_boot_irqs_off(void)
+{
+}
+static inline void early_boot_irqs_on(void)
+{
+}
+static inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+/*
+ * For trivial one-depth nesting of a lock-class, the following
+ * global define can be used. (Subsystems with multiple levels
+ * of nesting should define their own lock-nesting subclasses.)
+ */
+#define SINGLE_DEPTH_NESTING 1
+
+/*
+ * Map the dependency ops to NOP or to real lockdep ops, depending
+ * on the per lock-class debug mode:
+ */
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# ifdef CONFIG_PROVE_LOCKING
+# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i)
+# else
+# define spin_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i)
+# endif
+# define spin_release(l, n, i) lock_release(l, n, i)
+#else
+# define spin_acquire(l, s, t, i) do { } while (0)
+# define spin_release(l, n, i) do { } while (0)
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# ifdef CONFIG_PROVE_LOCKING
+# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i)
+# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 2, i)
+# else
+# define rwlock_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i)
+# define rwlock_acquire_read(l, s, t, i) lock_acquire(l, s, t, 2, 1, i)
+# endif
+# define rwlock_release(l, n, i) lock_release(l, n, i)
+#else
+# define rwlock_acquire(l, s, t, i) do { } while (0)
+# define rwlock_acquire_read(l, s, t, i) do { } while (0)
+# define rwlock_release(l, n, i) do { } while (0)
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# ifdef CONFIG_PROVE_LOCKING
+# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i)
+# else
+# define mutex_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i)
+# endif
+# define mutex_release(l, n, i) lock_release(l, n, i)
+#else
+# define mutex_acquire(l, s, t, i) do { } while (0)
+# define mutex_release(l, n, i) do { } while (0)
+#endif
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+# ifdef CONFIG_PROVE_LOCKING
+# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 2, i)
+# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 2, i)
+# else
+# define rwsem_acquire(l, s, t, i) lock_acquire(l, s, t, 0, 1, i)
+# define rwsem_acquire_read(l, s, t, i) lock_acquire(l, s, t, 1, 1, i)
+# endif
+# define rwsem_release(l, n, i) lock_release(l, n, i)
+#else
+# define rwsem_acquire(l, s, t, i) do { } while (0)
+# define rwsem_acquire_read(l, s, t, i) do { } while (0)
+# define rwsem_release(l, n, i) do { } while (0)
+#endif
+
+#endif /* linux kernel < 2.6.18 */
+
+#endif /* __LINUX_LOCKDEP_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/log2.h b/datapath/linux-2.6/compat-2.6/include/linux/log2.h
new file mode 100644
index 000000000..69abae5e8
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/log2.h
@@ -0,0 +1,17 @@
+#ifndef __LINUX_LOG2_WRAPPER
+#define __LINUX_LOG2_WRAPPER
+
+#ifdef HAVE_LOG2_H
+#include_next <linux/log2.h>
+#else
+/* This is very stripped down because log2.h has far too many dependencies. */
+/* Declared only; NOTE(review): presumably never defined, so an unsupported n fails at link time -- confirm. */
+extern __attribute__((const, noreturn))
+int ____ilog2_NaN(void);
+/* Handles n == 4 and n == 8 only; any other value falls through to ____ilog2_NaN(). */
+#define ilog2(n) ((n) == 4 ? 2 : \
+ (n) == 8 ? 3 : \
+ ____ilog2_NaN())
+#endif /* HAVE_LOG2_H */
+
+#endif /* __LINUX_LOG2_WRAPPER */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/mutex.h b/datapath/linux-2.6/compat-2.6/include/linux/mutex.h
new file mode 100644
index 000000000..93dfa3b2b
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/mutex.h
@@ -0,0 +1,59 @@
+#ifndef __LINUX_MUTEX_WRAPPER_H
+#define __LINUX_MUTEX_WRAPPER_H
+
+/* Before 2.6.16 this header builds struct mutex on a semaphore; otherwise it defers to the kernel's own linux/mutex.h. */
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16)
+
+#include <asm/semaphore.h>
+
+struct mutex {
+ struct semaphore sema;
+};
+
+#define mutex_init(mutex) init_MUTEX(&(mutex)->sema)
+#define mutex_destroy(mutex) do { } while (0)
+
+#define __MUTEX_INITIALIZER(name) \
+ __SEMAPHORE_INITIALIZER(name,1)
+
+#define DEFINE_MUTEX(mutexname) \
+ struct mutex mutexname = { __MUTEX_INITIALIZER(mutexname.sema) }
+
+/*
+ * Semaphore-backed stand-ins for the mutex API. The real API is documented in
+ * kernel/mutex.c and Documentation/mutex-design.txt of kernels that have it.
+ */
+static inline void mutex_lock(struct mutex *lock)
+{
+ down(&lock->sema);
+}
+
+static inline int mutex_lock_interruptible(struct mutex *lock)
+{
+ return down_interruptible(&lock->sema);
+}
+
+#define mutex_lock_nested(lock, subclass) mutex_lock(lock) /* subclass is ignored */
+#define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock)
+
+/*
+ * NOTE: mutex_trylock() follows the spin_trylock() convention,
+ * not the down_trylock() convention!
+ */
+static inline int mutex_trylock(struct mutex *lock)
+{
+ return !down_trylock(&lock->sema); /* result of down_trylock() is negated */
+}
+
+static inline void mutex_unlock(struct mutex *lock)
+{
+ up(&lock->sema);
+}
+#else
+
+#include_next <linux/mutex.h>
+
+#endif /* linux version < 2.6.16 */
+
+#endif /* __LINUX_MUTEX_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/netdevice.h b/datapath/linux-2.6/compat-2.6/include/linux/netdevice.h
new file mode 100644
index 000000000..32e1735dc
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/netdevice.h
@@ -0,0 +1,35 @@
+#ifndef __LINUX_NETDEVICE_WRAPPER_H
+#define __LINUX_NETDEVICE_WRAPPER_H 1
+
+#include_next <linux/netdevice.h>
+#include <linux/version.h> /* LINUX_VERSION_CODE, KERNEL_VERSION() */
+
+struct net;
+
+#ifndef to_net_dev
+#define to_net_dev(class) container_of(class, struct net_device, class_dev)
+#endif
+
+/* Kernels before 2.6.26 get a dev_net() stub that always returns NULL. */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
+static inline
+struct net *dev_net(const struct net_device *dev)
+{
+ return NULL;
+}
+#endif /* linux kernel < 2.6.26 */
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24)
+#define proc_net init_net.proc_net
+#endif
+
+#ifndef for_each_netdev
+/* Linux before 2.6.22 didn't have for_each_netdev at all. */
+#define for_each_netdev(net, d) for (d = dev_base; d; d = d->next)
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24)
+/* Linux 2.6.24 added a network namespace pointer to the macro. */
+#undef for_each_netdev
+#define for_each_netdev(net,d) list_for_each_entry(d, &dev_base_head, dev_list)
+#endif
+
+#endif /* __LINUX_NETDEVICE_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/netfilter_bridge.h b/datapath/linux-2.6/compat-2.6/include/linux/netfilter_bridge.h
new file mode 100644
index 000000000..1c8183c86
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/netfilter_bridge.h
@@ -0,0 +1,24 @@
+#ifndef __LINUX_NETFILTER_BRIDGE_WRAPPER_H
+#define __LINUX_NETFILTER_BRIDGE_WRAPPER_H
+
+#include_next <linux/netfilter_bridge.h>
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
+
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+/* Encapsulation header length of skb: VLAN_HLEN when skb->protocol is 802.1Q, otherwise 0. */
+static inline unsigned int nf_bridge_encap_header_len(const struct sk_buff *skb)
+{
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_8021Q):
+ return VLAN_HLEN;
+ default:
+ return 0; /* every other protocol, PPPoE included, reports no encapsulation */
+ }
+}
+
+#endif /* linux version < 2.6.22 */
+
+#endif /* __LINUX_NETFILTER_BRIDGE_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/netfilter_ipv4.h b/datapath/linux-2.6/compat-2.6/include/linux/netfilter_ipv4.h
new file mode 100644
index 000000000..ed8a5d948
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/netfilter_ipv4.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_NETFILTER_IPV4_WRAPPER_H
+#define __LINUX_NETFILTER_IPV4_WRAPPER_H 1
+
+#include_next <linux/netfilter_ipv4.h>
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+
+#ifdef __KERNEL__
+/* Before 2.6.25, spell the NF_INET_* hook names in terms of the NF_IP_* names. */
+#define NF_INET_PRE_ROUTING NF_IP_PRE_ROUTING
+#define NF_INET_POST_ROUTING NF_IP_POST_ROUTING
+#define NF_INET_FORWARD NF_IP_FORWARD
+
+#endif /* __KERNEL__ */
+
+#endif /* linux kernel < 2.6.25 */
+
+#endif /* __LINUX_NETFILTER_IPV4_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/netlink.h b/datapath/linux-2.6/compat-2.6/include/linux/netlink.h
new file mode 100644
index 000000000..c5f83bd07
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/netlink.h
@@ -0,0 +1,24 @@
+#ifndef __LINUX_NETLINK_WRAPPER_H
+#define __LINUX_NETLINK_WRAPPER_H 1
+
+#include <linux/skbuff.h>
+#include_next <linux/netlink.h>
+#include <net/netlink.h>
+
+#include <linux/version.h>
+
+#ifndef NLMSG_DEFAULT_SIZE
+#define NLMSG_DEFAULT_SIZE (NLMSG_GOODSIZE - NLMSG_HDRLEN)
+#endif
+/* Before 2.6.19, route nlmsg_new(size, flags) to a helper that calls alloc_skb(size, flags). */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+#define nlmsg_new(s, f) nlmsg_new_proper((s), (f))
+static inline struct sk_buff *nlmsg_new_proper(int size, gfp_t flags)
+{
+ return alloc_skb(size, flags); /* size is passed through unchanged */
+}
+
+#endif /* linux kernel < 2.6.19 */
+
+
+#endif /* __LINUX_NETLINK_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/percpu.h b/datapath/linux-2.6/compat-2.6/include/linux/percpu.h
new file mode 100644
index 000000000..0f68bb253
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/percpu.h
@@ -0,0 +1,10 @@
+#ifndef __LINUX_PERCPU_H_WRAPPER
+#define __LINUX_PERCPU_H_WRAPPER 1
+
+#include_next <linux/percpu.h>
+
+#ifndef percpu_ptr
+#define percpu_ptr per_cpu_ptr /* alias used when the kernel header has no percpu_ptr macro */
+#endif /* !percpu_ptr */
+
+#endif /* linux/percpu.h wrapper */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/random.h b/datapath/linux-2.6/compat-2.6/include/linux/random.h
new file mode 100644
index 000000000..4e4932c9c
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/random.h
@@ -0,0 +1,17 @@
+#ifndef __LINUX_RANDOM_WRAPPER_H
+#define __LINUX_RANDOM_WRAPPER_H 1
+
+#include_next <linux/random.h>
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+/* Prototypes only. NOTE(review): the definitions presumably live in a compat source file -- confirm. */
+#ifdef __KERNEL__
+u32 random32(void);
+void srandom32(u32 seed);
+#endif /* __KERNEL__ */
+
+#endif /* linux kernel < 2.6.19 */
+
+
+#endif /* __LINUX_RANDOM_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/rculist.h b/datapath/linux-2.6/compat-2.6/include/linux/rculist.h
new file mode 100644
index 000000000..4164c0e99
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/rculist.h
@@ -0,0 +1,12 @@
+#ifndef __LINUX_RCULIST_WRAPPER_H
+#define __LINUX_RCULIST_WRAPPER_H
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
+#include_next <linux/rculist.h>
+#else
+/* Prior to 2.6.26, the contents of rculist.h were part of list.h. */
+#include <linux/list.h>
+#endif /* linux kernel >= 2.6.26 */
+
+#endif /* __LINUX_RCULIST_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/rtnetlink.h b/datapath/linux-2.6/compat-2.6/include/linux/rtnetlink.h
new file mode 100644
index 000000000..8bc51560f
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/rtnetlink.h
@@ -0,0 +1,29 @@
+#ifndef __RTNETLINK_WRAPPER_H
+#define __RTNETLINK_WRAPPER_H 1
+
+#include_next <linux/rtnetlink.h>
+#include <linux/version.h> /* LINUX_VERSION_CODE, KERNEL_VERSION() */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) /* emulate both; net, pid, flags unused */
+static inline int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid,
+ u32 group, struct nlmsghdr *nlh, gfp_t flags)
+{
+ BUG_ON(nlh); /* not implemented */
+ if (group) {
+ /* errors reported via destination sk->sk_err */
+ nlmsg_multicast(rtnl, skb, 0, group);
+ }
+ return 0;
+}
+
+static inline void rtnl_set_sk_err(struct net *net, u32 group, int error)
+{
+ netlink_set_err(rtnl, 0, group, error);
+}
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) /* evaluate and discard net, then call the net-less function */
+#define rtnl_notify(skb, net, pid, group, nlh, flags) \
+ ((void) (net), rtnl_notify(skb, pid, group, nlh, flags))
+#define rtnl_set_sk_err(net, group, error) \
+ ((void) (net), rtnl_set_sk_err(group, error))
+#endif /* linux kernel < 2.6.25 */
+#endif /* linux/rtnetlink.h wrapper */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/skbuff.h b/datapath/linux-2.6/compat-2.6/include/linux/skbuff.h
new file mode 100644
index 000000000..666ef8504
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/skbuff.h
@@ -0,0 +1,170 @@
+#ifndef __LINUX_SKBUFF_WRAPPER_H
+#define __LINUX_SKBUFF_WRAPPER_H 1
+
+#include_next <linux/skbuff.h>
+
+#include <linux/version.h>
+
+#ifndef HAVE_SKB_COPY_FROM_LINEAR_DATA_OFFSET
+static inline void skb_copy_from_linear_data_offset(const struct sk_buff *skb,
+ const int offset, void *to,
+ const unsigned int len)
+{
+ memcpy(to, skb->data + offset, len); /* copy len bytes out of skb->data, starting at offset */
+}
+
+static inline void skb_copy_to_linear_data_offset(struct sk_buff *skb,
+ const int offset,
+ const void *from,
+ const unsigned int len)
+{
+ memcpy(skb->data + offset, from, len); /* copy len bytes into skb->data, starting at offset */
+}
+
+#endif /* !HAVE_SKB_COPY_FROM_LINEAR_DATA_OFFSET */
+
+/*
+ * The networking layer reserves some headroom in skb data (via
+ * dev_alloc_skb). This is used to avoid having to reallocate skb data when
+ * the header has to grow. In the default case, if the header has to grow
+ * 16 bytes or less we avoid the reallocation.
+ *
+ * Unfortunately this headroom changes the DMA alignment of the resulting
+ * network packet. As for NET_IP_ALIGN, this unaligned DMA is expensive
+ * on some architectures. An architecture can override this value,
+ * perhaps setting it to a cacheline in size (since that will maintain
+ * cacheline alignment of the DMA). It must be a power of 2.
+ *
+ * Various parts of the networking layer expect at least 16 bytes of
+ * headroom, you should not reduce this.
+ */
+#ifndef NET_SKB_PAD
+#define NET_SKB_PAD 16
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom,
+ int cloned)
+{
+ int delta = 0; /* extra headroom to request, if any */
+
+ if (headroom < NET_SKB_PAD)
+ headroom = NET_SKB_PAD; /* NET_SKB_PAD is the minimum asked for */
+ if (headroom > skb_headroom(skb))
+ delta = headroom - skb_headroom(skb); /* shortfall versus current headroom */
+
+ if (delta || cloned)
+ return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0,
+ GFP_ATOMIC);
+ return 0; /* enough headroom and cloned == 0: nothing to do */
+}
+
+static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom)
+{
+ return __skb_cow(skb, headroom, skb_header_cloned(skb));
+}
+#endif /* linux < 2.6.23 */
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+/* Emulate Linux 2.6.17 and later behavior, in which kfree_skb silently ignores
+ * null pointer arguments. */
+#define kfree_skb(skb) kfree_skb_maybe_null(skb)
+static inline void kfree_skb_maybe_null(struct sk_buff *skb)
+{
+ if (likely(skb != NULL))
+ (kfree_skb)(skb); /* parenthesized name is not macro-expanded: calls the real function */
+}
+#endif /* linux kernel < 2.6.17 */
+
+
+#ifndef CHECKSUM_PARTIAL
+/* Note that CHECKSUM_PARTIAL is not implemented, but this allows us to at
+ * least test against it: see update_csum() in forward.c. */
+#define CHECKSUM_PARTIAL 3
+#endif
+#ifndef CHECKSUM_COMPLETE
+#define CHECKSUM_COMPLETE CHECKSUM_HW /* fall back to the CHECKSUM_HW name */
+#endif
+
+#ifdef HAVE_MAC_RAW /* NOTE(review): presumably set when sk_buff still has mac.raw / nh.raw -- confirm */
+#define mac_header mac.raw /* lets code write skb->mac_header */
+#define network_header nh.raw /* lets code write skb->network_header */
+#endif /* HAVE_MAC_RAW */
+
+#ifndef HAVE_SKBUFF_HEADER_HELPERS /* accessors over the h.raw / nh.raw / mac.raw members */
+static inline unsigned char *skb_transport_header(const struct sk_buff *skb)
+{
+ return skb->h.raw;
+}
+
+static inline void skb_reset_transport_header(struct sk_buff *skb)
+{
+ skb->h.raw = skb->data;
+}
+
+static inline void skb_set_transport_header(struct sk_buff *skb,
+ const int offset)
+{
+ skb->h.raw = skb->data + offset;
+}
+
+static inline unsigned char *skb_network_header(const struct sk_buff *skb)
+{
+ return skb->nh.raw;
+}
+
+static inline void skb_set_network_header(struct sk_buff *skb, const int offset)
+{
+ skb->nh.raw = skb->data + offset;
+}
+
+static inline unsigned char *skb_mac_header(const struct sk_buff *skb)
+{
+ return skb->mac.raw;
+}
+
+static inline void skb_reset_mac_header(struct sk_buff *skb)
+{
+ skb->mac.raw = skb->data; /* same member skb_mac_header() reads; no reliance on the HAVE_MAC_RAW alias */
+}
+
+static inline void skb_set_mac_header(struct sk_buff *skb, const int offset)
+{
+ skb->mac.raw = skb->data + offset;
+}
+
+static inline int skb_transport_offset(const struct sk_buff *skb)
+{
+ return skb_transport_header(skb) - skb->data; /* offset relative to skb->data */
+}
+
+static inline int skb_network_offset(const struct sk_buff *skb)
+{
+ return skb_network_header(skb) - skb->data; /* offset relative to skb->data */
+}
+
+static inline void skb_copy_to_linear_data(struct sk_buff *skb,
+ const void *from,
+ const unsigned int len)
+{
+ memcpy(skb->data, from, len);
+}
+#endif /* !HAVE_SKBUFF_HEADER_HELPERS */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18)
+#warning "TSO/UFO not supported on kernels earlier than 2.6.18"
+/* Stubs: no skb is ever reported as GSO, and segmentation always yields NULL. */
+static inline int skb_is_gso(const struct sk_buff *skb)
+{
+ return 0;
+}
+
+static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb,
+ int features)
+{
+ return NULL;
+}
+#endif /* before 2.6.18 */
+
+#endif
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/tcp.h b/datapath/linux-2.6/compat-2.6/include/linux/tcp.h
new file mode 100644
index 000000000..6fad1933b
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/tcp.h
@@ -0,0 +1,18 @@
+#ifndef __LINUX_TCP_WRAPPER_H
+#define __LINUX_TCP_WRAPPER_H 1
+
+#include_next <linux/tcp.h>
+
+#ifndef HAVE_SKBUFF_HEADER_HELPERS /* supply tcp_hdr() and tcp_hdrlen() */
+#include <linux/skbuff.h> /* struct sk_buff, skb_transport_header() */
+static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
+{
+ return (struct tcphdr *)skb_transport_header(skb);
+}
+
+static inline unsigned int tcp_hdrlen(const struct sk_buff *skb)
+{
+ return tcp_hdr(skb)->doff * 4;
+}
+#endif /* !HAVE_SKBUFF_HEADER_HELPERS */
+#endif /* __LINUX_TCP_WRAPPER_H */
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/timer.h b/datapath/linux-2.6/compat-2.6/include/linux/timer.h
new file mode 100644
index 000000000..6c3a9b0f5
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/timer.h
@@ -0,0 +1,96 @@
+#ifndef __LINUX_TIMER_WRAPPER_H
+#define __LINUX_TIMER_WRAPPER_H 1
+
+#include_next <linux/timer.h>
+
+#include <linux/version.h>
+
+#ifndef RHEL_RELEASE_VERSION
+#define RHEL_RELEASE_VERSION(X,Y) ( 0 )
+#endif
+#if ((LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)) && \
+ (!defined(RHEL_RELEASE_CODE) || \
+ (RHEL_RELEASE_CODE < RHEL_RELEASE_VERSION(5,1))))
+
+extern unsigned long volatile jiffies;
+
+/**
+ * __round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ * @cpu: the processor number on which the timeout will happen
+ *
+ * __round_jiffies() rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The exact rounding is skewed for each processor to avoid all
+ * processors firing at the exact same time, which could lead
+ * to lock contention or spurious cache line bouncing.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+static inline unsigned long __round_jiffies(unsigned long j, int cpu)
+{
+	int rem;
+	unsigned long original = j;
+
+	/*
+	 * We don't want all cpus firing their timers at once hitting the
+	 * same lock or cachelines, so we skew each extra cpu with an extra
+	 * 3 jiffies. This 3 jiffies came originally from the mm/ code which
+	 * already did this.
+	 * The skew is done by adding 3*cpunr, then round, then subtract this
+	 * extra offset again.
+	 */
+	j += cpu * 3;
+
+	rem = j % HZ;
+
+	/*
+	 * If the target jiffie is just after a whole second (which can happen
+	 * due to delays of the timer irq, long irq off times etc etc) then
+	 * we should round down to the whole second, not up. Use 1/4th second
+	 * as cutoff for this rounding as an extreme upper bound for this.
+	 */
+	if (rem < HZ/4) /* round down */
+		j = j - rem;
+	else /* round up */
+		j = j - rem + HZ;
+
+	/* now that we have rounded, subtract the extra skew again */
+	j -= cpu * 3;
+
+	/*
+	 * Rounding ate our timeout entirely if 'j' is no longer in the
+	 * future.  Compare via the signed difference so that the test stays
+	 * correct when the jiffies counter wraps around; a plain
+	 * "j <= jiffies" gives the wrong answer near the wrap point.
+	 */
+	if ((long)(j - jiffies) <= 0)
+		return original;
+	return j;
+}
+
+
+/**
+ * round_jiffies - function to round jiffies to a full second
+ * @j: the time in (absolute) jiffies that should be rounded
+ *
+ * round_jiffies() rounds an absolute time in the future (in jiffies)
+ * up or down to (approximately) full seconds. This is useful for timers
+ * for which the exact time they fire does not matter too much, as long as
+ * they fire approximately every X seconds.
+ *
+ * By rounding these timers to whole seconds, all such timers will fire
+ * at the same time, rather than at various times spread out. The goal
+ * of this is to have the CPU wake up less, which saves power.
+ *
+ * The return value is the rounded version of the @j parameter.
+ */
+static inline unsigned long round_jiffies(unsigned long j)
+{
+ /* Passes 0 as the cpu number, so __round_jiffies() applies no per-CPU
+ * skew here regardless of which processor is running. */
+ return __round_jiffies(j, 0); // FIXME
+}
+
+#endif /* linux kernel < 2.6.20 */
+
+#endif
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/types.h b/datapath/linux-2.6/compat-2.6/include/linux/types.h
new file mode 100644
index 000000000..c1f375eb3
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/types.h
@@ -0,0 +1,14 @@
+#ifndef __LINUX_TYPES_WRAPPER_H
+#define __LINUX_TYPES_WRAPPER_H 1
+
+#include_next <linux/types.h>
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
+
+typedef __u16 __bitwise __sum16;
+typedef __u32 __bitwise __wsum;
+
+#endif /* linux kernel < 2.6.20 */
+
+#endif
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/udp.h b/datapath/linux-2.6/compat-2.6/include/linux/udp.h
new file mode 100644
index 000000000..6fe4721bf
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/udp.h
@@ -0,0 +1,13 @@
+#ifndef __LINUX_UDP_WRAPPER_H
+#define __LINUX_UDP_WRAPPER_H 1
+
+#include_next <linux/udp.h>
+
+#ifndef HAVE_SKBUFF_HEADER_HELPERS
+/* Returns the transport header of 'skb' viewed as a UDP header. */
+static inline struct udphdr *udp_hdr(const struct sk_buff *skb)
+{
+	unsigned char *uh = skb_transport_header(skb);
+
+	return (struct udphdr *)uh;
+}
+#endif /* !HAVE_SKBUFF_HEADER_HELPERS */
+
+#endif
diff --git a/datapath/linux-2.6/compat-2.6/include/linux/workqueue.h b/datapath/linux-2.6/compat-2.6/include/linux/workqueue.h
new file mode 100644
index 000000000..1ac3b6ecb
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/linux/workqueue.h
@@ -0,0 +1,42 @@
+#ifndef __LINUX_WORKQUEUE_WRAPPER_H
+#define __LINUX_WORKQUEUE_WRAPPER_H 1
+
+#include_next <linux/workqueue.h>
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
+
+#ifdef __KERNEL__
+/*
+ * initialize a work-struct's func and data pointers:
+ *
+ * '_func' is cast to the void (*)(void *) callback type and the work
+ * structure itself is stored as the callback's data argument.  Both macro
+ * arguments are parenthesized so that arbitrary expressions can be passed.
+ */
+#undef PREPARE_WORK
+#define PREPARE_WORK(_work, _func)				\
+	do {							\
+		(_work)->func = (void(*)(void*)) (_func);	\
+		(_work)->data = (_work);			\
+	} while (0)
+
+/*
+ * initialize all of a work-struct:
+ *
+ * Replaces any existing INIT_WORK definition with a two-argument form.
+ * It resets the list entry, clears the pending flag, installs '_func'
+ * through PREPARE_WORK() and initializes the embedded timer.
+ */
+#undef INIT_WORK
+#define INIT_WORK(_work, _func) \
+ do { \
+ INIT_LIST_HEAD(&(_work)->entry); \
+ (_work)->pending = 0; \
+ PREPARE_WORK((_work), (_func)); \
+ init_timer(&(_work)->timer); \
+ } while (0)
+
+#endif /* __KERNEL__ */
+
+#endif /* linux kernel < 2.6.20 */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
+/* There is no equivalent to cancel_work_sync() so just flush all
+ * pending work. The '_work' argument is ignored: the replacement calls
+ * flush_scheduled_work() with no arguments. */
+#define cancel_work_sync(_work) flush_scheduled_work()
+#endif
+
+#endif
diff --git a/datapath/linux-2.6/compat-2.6/include/net/checksum.h b/datapath/linux-2.6/compat-2.6/include/net/checksum.h
new file mode 100644
index 000000000..c64c6bd0c
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/net/checksum.h
@@ -0,0 +1,16 @@
+#ifndef __NET_CHECKSUM_WRAPPER_H
+#define __NET_CHECKSUM_WRAPPER_H 1
+
+#include_next <net/checksum.h>
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
+
+/* Reinterprets the __sum16 value 'n' as a __wsum via a __force cast. */
+static inline __wsum csum_unfold(__sum16 n)
+{
+	__wsum wide = (__force __wsum)n;
+
+	return wide;
+}
+
+#endif /* linux kernel < 2.6.20 */
+
+#endif /* checksum.h */
diff --git a/datapath/linux-2.6/compat-2.6/include/net/genetlink.h b/datapath/linux-2.6/compat-2.6/include/net/genetlink.h
new file mode 100644
index 000000000..57a47316d
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/net/genetlink.h
@@ -0,0 +1,123 @@
+#ifndef __NET_GENERIC_NETLINK_WRAPPER_H
+#define __NET_GENERIC_NETLINK_WRAPPER_H 1
+
+
+#include <linux/netlink.h>
+#include_next <net/genetlink.h>
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23)
+
+#include <linux/genetlink.h>
+
+/*----------------------------------------------------------------------------
+ * In 2.6.23, registering of multicast groups was added. Our compatibility
+ * layer just supports registering a single group, since that's all we
+ * need.
+ */
+
+/**
+ * struct genl_multicast_group - generic netlink multicast group
+ * @name: name of the multicast group, names are per-family
+ * @id: multicast group ID, assigned by the core, to use with
+ * genlmsg_multicast().
+ * @list: list entry for linking
+ * @family: pointer to family, need not be set before registering
+ */
+/* Compatibility definition, compiled only for kernels before 2.6.23. */
+struct genl_multicast_group
+{
+ struct genl_family *family; /* private */
+ struct list_head list; /* private */
+ char name[GENL_NAMSIZ]; /* group name, at most GENL_NAMSIZ bytes */
+ u32 id; /* multicast group ID */
+};
+
+int genl_register_mc_group(struct genl_family *family,
+ struct genl_multicast_group *grp);
+#endif /* linux kernel < 2.6.23 */
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+/**
+ * genlmsg_msg_size - length of genetlink message not including padding
+ * @payload: length of message payload
+ */
+static inline int genlmsg_msg_size(int payload)
+{
+	/* Generic netlink header plus the payload, unpadded. */
+	int size = GENL_HDRLEN;
+
+	size += payload;
+	return size;
+}
+
+/**
+ * genlmsg_total_size - length of genetlink message including padding
+ * @payload: length of message payload
+ */
+static inline int genlmsg_total_size(int payload)
+{
+	/* Unpadded message size, rounded up by NLMSG_ALIGN(). */
+	int unpadded = genlmsg_msg_size(payload);
+
+	return NLMSG_ALIGN(unpadded);
+}
+
+/* genlmsg_multicast() is provided as an alias for the function below. */
+#define genlmsg_multicast(s, p, g, f) \
+		genlmsg_multicast_flags((s), (p), (g), (f))
+
+/*
+ * Broadcasts 'skb' on genl_sock to multicast group 'group', recording the
+ * group in the skb's netlink control block first.  A positive result from
+ * netlink_broadcast() is reported as 0; zero and negative results are
+ * returned unchanged.
+ */
+static inline int genlmsg_multicast_flags(struct sk_buff *skb, u32 pid,
+					  unsigned int group, gfp_t flags)
+{
+	int rc;
+
+	NETLINK_CB(skb).dst_group = group;
+	rc = netlink_broadcast(genl_sock, skb, pid, group, flags);
+
+	return rc > 0 ? 0 : rc;
+}
+#endif /* linux kernel < 2.6.19 */
+
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20)
+
+/*
+ * Six-argument genlmsg_put() taking a family pointer 'fam'. It forwards
+ * to the underlying genlmsg_put() with the family's id, hdrsize and
+ * version fields spelled out. The inner name is not expanded again
+ * because a macro is never re-expanded inside its own expansion.
+ * Note that 'fam' is evaluated three times.
+ */
+#define genlmsg_put(skb, p, seq, fam, flg, c) \
+ genlmsg_put((skb), (p), (seq), (fam)->id, (fam)->hdrsize, \
+ (flg), (c), (fam)->version)
+
+/**
+ * genlmsg_put_reply - Add generic netlink header to a reply message
+ * @skb: socket buffer holding the message
+ * @info: receiver info
+ * @family: generic netlink family
+ * @flags: netlink message flags
+ * @cmd: generic netlink command
+ *
+ * Returns pointer to user specific header
+ */
+static inline void *genlmsg_put_reply(struct sk_buff *skb,
+			struct genl_info *info, struct genl_family *family,
+			int flags, u8 cmd)
+{
+	void *user_hdr;
+
+	/* Address the reply to the sender and sequence number in 'info'. */
+	user_hdr = genlmsg_put(skb, info->snd_pid, info->snd_seq, family,
+			       flags, cmd);
+	return user_hdr;
+}
+
+/**
+ * genlmsg_reply - reply to a request
+ * @skb: netlink message to be sent back
+ * @info: receiver information
+ */
+static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info)
+{
+	/* Unicast 'skb' back to the pid recorded in 'info'. */
+	int rc = genlmsg_unicast(skb, info->snd_pid);
+
+	return rc;
+}
+
+/**
+ * genlmsg_new - Allocate a new generic netlink message
+ * @payload: size of the message payload
+ * @flags: the type of memory to allocate.
+ */
+static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags)
+{
+	/* Size the buffer for the payload plus generic netlink header. */
+	int total = genlmsg_total_size(payload);
+
+	return nlmsg_new(total, flags);
+}
+#endif /* linux kernel < 2.6.20 */
+
+#endif /* genetlink.h */
diff --git a/datapath/linux-2.6/compat-2.6/include/net/netlink.h b/datapath/linux-2.6/compat-2.6/include/net/netlink.h
new file mode 100644
index 000000000..e0d594d78
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/include/net/netlink.h
@@ -0,0 +1,22 @@
+#ifndef __NET_NETLINK_WRAPPER_H
+#define __NET_NETLINK_WRAPPER_H 1
+
+#include_next <net/netlink.h>
+
+#ifndef HAVE_NLA_NUL_STRING
+#define NLA_NUL_STRING NLA_STRING
+
+/*
+ * Fallback check used when NLA_NUL_STRING is unavailable.  Returns 0 if
+ * 'attr' is absent, or if it is non-empty and contains a NUL byte
+ * somewhere in its payload; otherwise returns EINVAL (a positive value).
+ */
+static inline int VERIFY_NUL_STRING(struct nlattr *attr)
+{
+	if (!attr)
+		return 0;
+	if (nla_len(attr) && memchr(nla_data(attr), '\0', nla_len(attr)))
+		return 0;
+	return EINVAL;
+}
+#else
+/* Variant used when HAVE_NLA_NUL_STRING is defined: performs no check of
+ * its own and always returns 0. */
+static inline int VERIFY_NUL_STRING(struct nlattr *attr)
+{
+ return 0;
+}
+#endif /* !HAVE_NLA_NUL_STRING */
+
+#endif /* net/netlink.h */
diff --git a/datapath/linux-2.6/compat-2.6/random32.c b/datapath/linux-2.6/compat-2.6/random32.c
new file mode 100644
index 000000000..b0dd2a32b
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/random32.c
@@ -0,0 +1,144 @@
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19)
+
+/*
+ This is a maximally equidistributed combined Tausworthe generator
+ based on code from GNU Scientific Library 1.5 (30 Jun 2004)
+
+ x_n = (s1_n ^ s2_n ^ s3_n)
+
+ s1_{n+1} = (((s1_n & 4294967294) <<12) ^ (((s1_n <<13) ^ s1_n) >>19))
+ s2_{n+1} = (((s2_n & 4294967288) << 4) ^ (((s2_n << 2) ^ s2_n) >>25))
+ s3_{n+1} = (((s3_n & 4294967280) <<17) ^ (((s3_n << 3) ^ s3_n) >>11))
+
+ The period of this generator is about 2^88.
+
+ From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe
+ Generators", Mathematics of Computation, 65, 213 (1996), 203--213.
+
+ This is available on the net from L'Ecuyer's home page,
+
+ http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps
+ ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps
+
+ There is an erratum in the paper "Tables of Maximally
+ Equidistributed Combined LFSR Generators", Mathematics of
+ Computation, 68, 225 (1999), 261--269:
+ http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps
+
+ ... the k_j most significant bits of z_j must be non-
+ zero, for each j. (Note: this restriction also applies to the
+ computer code given in [4], but was mistakenly not mentioned in
+ that paper.)
+
+ This affects the seeding procedure by imposing the requirement
+ s1 > 1, s2 > 7, s3 > 15.
+
+*/
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/smp.h>
+
+#include "compat26.h"
+
+/* State of one generator: the three component words s1, s2 and s3 that
+ * __random32() advances and XORs together. */
+struct rnd_state {
+ u32 s1, s2, s3;
+};
+
+/* One generator state per possible CPU, indexed by CPU number. */
+static struct rnd_state net_rand_state[NR_CPUS];
+
+/*
+ * Advances all three components of 'state' by one step and returns their
+ * XOR, following the recurrences in the comment at the top of this file.
+ */
+static u32 __random32(struct rnd_state *state)
+{
+/* One step of a single component: s is the current value, a and b are
+ * shift counts, c is the mask applied before the left shift by d. The
+ * expansion is not wrapped in outer parentheses, so it is only safe as a
+ * complete right-hand side, as used below. */
+#define TAUSWORTHE(s,a,b,c,d) ((s&c)<<d) ^ (((s <<a) ^ s)>>b)
+
+ state->s1 = TAUSWORTHE(state->s1, 13, 19, 4294967294UL, 12);
+ state->s2 = TAUSWORTHE(state->s2, 2, 25, 4294967288UL, 4);
+ state->s3 = TAUSWORTHE(state->s3, 3, 11, 4294967280UL, 17);
+
+ return (state->s1 ^ state->s2 ^ state->s3);
+}
+
+/*
+ * Seeds 'state' from 's'.  The three components are derived by chaining
+ * the LCG() multiplication, then the first six outputs are discarded.
+ */
+static void __set_random32(struct rnd_state *state, unsigned long s)
+{
+	int round;
+
+	/* default seed is 1 */
+	if (!s)
+		s = 1;
+
+#define LCG(n) (69069 * n)
+	state->s1 = LCG(s);
+	state->s2 = LCG(state->s1);
+	state->s3 = LCG(state->s2);
+
+	/* "warm it up" */
+	for (round = 0; round < 6; round++)
+		__random32(state);
+}
+
+/**
+ *	random32 - pseudo random number generator
+ *
+ *	A 32 bit pseudo-random number is generated using a fast
+ *	algorithm suitable for simulation. This algorithm is NOT
+ *	considered safe for cryptographic use.
+ *
+ *	The per-CPU state is selected with get_cpu(), which keeps preemption
+ *	disabled until put_cpu(), so the caller cannot migrate to another
+ *	CPU between picking the state and updating it.
+ */
+u32 random32(void)
+{
+	u32 r;
+
+	r = __random32(&net_rand_state[get_cpu()]);
+	put_cpu();
+	return r;
+}
+
+/**
+ *	srandom32 - add entropy to pseudo random number generator
+ *	@entropy: value mixed into the current CPU's generator state
+ *
+ *	Add some additional seeding to the random32() pool.
+ *	Note: this pool is per cpu so it only affects current CPU.
+ *
+ *	The state is reseeded from its own s1 component XORed with @entropy.
+ *	Preemption is disabled around the update (get_cpu()/put_cpu()) so
+ *	the state that is modified really belongs to the running CPU.
+ */
+void srandom32(u32 entropy)
+{
+	struct rnd_state *state = &net_rand_state[get_cpu()];
+
+	__set_random32(state, state->s1 ^ entropy);
+	put_cpu();
+}
+
+static int __init random32_reseed(void);
+
+/*
+ *	Give every per-CPU state an initial, weak seed (CPU index plus the
+ *	current jiffies value) so that random32() can be used, then reseed
+ *	all of them through random32_reseed().  Always returns 0.
+ */
+int __init random32_init(void)
+{
+	int cpu = 0;
+
+	while (cpu < NR_CPUS) {
+		__set_random32(&net_rand_state[cpu], cpu + jiffies);
+		cpu++;
+	}
+
+	random32_reseed();
+	return 0;
+}
+
+/*
+ *	Reseed every per-CPU state with a value obtained from
+ *	get_random_bytes(), giving better seeds than the jiffies-based ones
+ *	set up in random32_init().  Always returns 0.
+ */
+static int __init random32_reseed(void)
+{
+	unsigned long seed;
+	int cpu;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		get_random_bytes(&seed, sizeof(seed));
+		__set_random32(&net_rand_state[cpu], seed);
+	}
+
+	return 0;
+}
+
+#endif /* kernel < 2.6.19 */
diff --git a/datapath/linux-2.6/compat-2.6/veth.c b/datapath/linux-2.6/compat-2.6/veth.c
new file mode 100644
index 000000000..3cda33651
--- /dev/null
+++ b/datapath/linux-2.6/compat-2.6/veth.c
@@ -0,0 +1,537 @@
+/* veth driver port to Linux 2.6.18 */
+
+/*
+ * drivers/net/veth.c
+ *
+ * Copyright (C) 2007, 2009 OpenVZ http://openvz.org, SWsoft Inc
+ *
+ * Author: Pavel Emelianov <xemul@openvz.org>
+ * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/etherdevice.h>
+
+#include <net/dst.h>
+#include <net/xfrm.h>
+
+#define DRV_NAME "veth"
+#define DRV_VERSION "1.0"
+
+/* Counters kept per CPU for each device (allocated with alloc_percpu() in
+ * veth_dev_init() and summed in veth_get_stats()). */
+struct veth_net_stats {
+ unsigned long rx_packets;
+ unsigned long tx_packets;
+ unsigned long rx_bytes;
+ unsigned long tx_bytes;
+ unsigned long tx_dropped;
+};
+
+/* Private data of one end of a veth pair. */
+struct veth_priv {
+ struct net_device *peer; /* other end of the pair; set in veth_newlink() */
+ struct net_device *dev; /* the device this structure belongs to */
+ struct list_head list; /* on veth_list for the first device of a pair */
+ struct veth_net_stats *stats; /* per-CPU counters */
+ unsigned ip_summed; /* copied to skb->ip_summed of packets sent to us */
+ struct net_device_stats dev_stats; /* totals filled in by veth_get_stats() */
+};
+
+static LIST_HEAD(veth_list);
+
+/*
+ * ethtool interface
+ */
+
+/* Names copied out by veth_get_strings() for ETH_SS_STATS; the single
+ * entry matches the one value written by veth_get_ethtool_stats(). */
+static struct {
+ const char string[ETH_GSTRING_LEN];
+} ethtool_stats_keys[] = {
+ { "peer_ifindex" },
+};
+
+/* ethtool get_settings handler: fills 'cmd' with fixed values and
+ * returns 0. */
+static int veth_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	/* Link description. */
+	cmd->speed       = SPEED_10000;
+	cmd->duplex      = DUPLEX_FULL;
+	cmd->autoneg     = AUTONEG_DISABLE;
+
+	/* Port and transceiver. */
+	cmd->port        = PORT_TP;
+	cmd->transceiver = XCVR_INTERNAL;
+	cmd->phy_address = 0;
+
+	/* Nothing is supported or advertised, and no packet limits. */
+	cmd->supported   = 0;
+	cmd->advertising = 0;
+	cmd->maxtxpkt    = 0;
+	cmd->maxrxpkt    = 0;
+
+	return 0;
+}
+
+/* ethtool get_drvinfo handler: reports the driver name and version
+ * constants and "N/A" as the firmware version. */
+static void veth_get_drvinfo(struct net_device *dev,
+			     struct ethtool_drvinfo *info)
+{
+	strcpy(info->fw_version, "N/A");
+	strcpy(info->version, DRV_VERSION);
+	strcpy(info->driver, DRV_NAME);
+}
+
+/* ethtool get_strings handler: for ETH_SS_STATS copies the statistic
+ * names into 'buf'; any other string set leaves 'buf' untouched. */
+static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
+{
+	if (stringset == ETH_SS_STATS)
+		memcpy(buf, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
+}
+
+/* ethtool statistics handler: stores the peer's ifindex in data[0]. */
+static void veth_get_ethtool_stats(struct net_device *dev,
+				   struct ethtool_stats *stats, u64 *data)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *peer = priv->peer;
+
+	data[0] = peer->ifindex;
+}
+
+/* Returns 1 if the device's ip_summed setting is CHECKSUM_UNNECESSARY,
+ * otherwise 0. */
+static u32 veth_get_rx_csum(struct net_device *dev)
+{
+	const struct veth_priv *priv = netdev_priv(dev);
+
+	if (priv->ip_summed == CHECKSUM_UNNECESSARY)
+		return 1;
+	return 0;
+}
+
+/* Sets the device's ip_summed to CHECKSUM_UNNECESSARY when 'data' is
+ * non-zero and to CHECKSUM_NONE otherwise.  Always returns 0. */
+static int veth_set_rx_csum(struct net_device *dev, u32 data)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+
+	if (data)
+		priv->ip_summed = CHECKSUM_UNNECESSARY;
+	else
+		priv->ip_summed = CHECKSUM_NONE;
+
+	return 0;
+}
+
+/* Returns 1 if NETIF_F_NO_CSUM is set in the device features, else 0. */
+static u32 veth_get_tx_csum(struct net_device *dev)
+{
+	return !!(dev->features & NETIF_F_NO_CSUM);
+}
+
+/* Sets (data != 0) or clears (data == 0) NETIF_F_NO_CSUM in the device
+ * features.  Always returns 0. */
+static int veth_set_tx_csum(struct net_device *dev, u32 data)
+{
+	if (!data) {
+		dev->features &= ~NETIF_F_NO_CSUM;
+		return 0;
+	}
+
+	dev->features |= NETIF_F_NO_CSUM;
+	return 0;
+}
+
+/* ethtool operations installed on every veth device by veth_setup().
+ * Link state and scatter/gather use the generic ethtool_op_* helpers;
+ * the remaining handlers are defined above. */
+static struct ethtool_ops veth_ethtool_ops = {
+ .get_settings = veth_get_settings,
+ .get_drvinfo = veth_get_drvinfo,
+ .get_link = ethtool_op_get_link,
+ .get_rx_csum = veth_get_rx_csum,
+ .set_rx_csum = veth_set_rx_csum,
+ .get_tx_csum = veth_get_tx_csum,
+ .set_tx_csum = veth_set_tx_csum,
+ .get_sg = ethtool_op_get_sg,
+ .set_sg = ethtool_op_set_sg,
+ .get_strings = veth_get_strings,
+ .get_ethtool_stats = veth_get_ethtool_stats,
+};
+
+/*
+ * xmit
+ */
+
+/*
+ * Transmit handler (installed as dev->hard_start_xmit by veth_setup()).
+ * Hands 'skb' to the peer device's receive path with netif_rx() and
+ * updates the per-CPU counters of both ends. If the peer is not IFF_UP
+ * the packet is freed and counted as tx_dropped instead. Always
+ * returns 0.
+ */
+static int veth_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct net_device *rcv = NULL;
+ struct veth_priv *priv, *rcv_priv;
+ struct veth_net_stats *stats;
+ int length, cpu;
+
+ skb_orphan(skb);
+
+ priv = netdev_priv(dev);
+ rcv = priv->peer;
+ rcv_priv = netdev_priv(rcv);
+
+ /* 'stats' starts out as the sender's counters for this CPU. */
+ cpu = smp_processor_id();
+ stats = per_cpu_ptr(priv->stats, cpu);
+
+ if (!(rcv->flags & IFF_UP))
+ goto outf;
+
+ /* Make the packet look as if it had been received on the peer. */
+ skb->dev = rcv;
+ skb->pkt_type = PACKET_HOST;
+ skb->protocol = eth_type_trans(skb, rcv);
+ if (dev->features & NETIF_F_NO_CSUM)
+ skb->ip_summed = rcv_priv->ip_summed;
+
+ /* Drop state attached on the sending side. */
+ dst_release(skb->dst);
+ skb->dst = NULL;
+ secpath_reset(skb);
+ nf_reset(skb);
+
+ /* NOTE(review): read after eth_type_trans(); presumably this no longer
+ * includes the Ethernet header - confirm that is the intended count. */
+ length = skb->len;
+
+ stats->tx_bytes += length;
+ stats->tx_packets++;
+
+ /* Switch to the receiver's counters for the same CPU. */
+ stats = per_cpu_ptr(rcv_priv->stats, cpu);
+ stats->rx_bytes += length;
+ stats->rx_packets++;
+
+ netif_rx(skb);
+ return 0;
+
+outf:
+ /* 'stats' still refers to the sender's counters here. */
+ kfree_skb(skb);
+ stats->tx_dropped++;
+ return 0;
+}
+
+/*
+ * general routines
+ */
+
+/*
+ * get_stats handler: recomputes the totals in priv->dev_stats by summing
+ * the per-CPU counters of every online CPU, and returns a pointer to them.
+ */
+static struct net_device_stats *veth_get_stats(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device_stats *total = &priv->dev_stats;
+	int cpu;
+
+	/* Start from zero; only these five fields are maintained. */
+	total->rx_packets = 0;
+	total->tx_packets = 0;
+	total->rx_bytes = 0;
+	total->tx_bytes = 0;
+	total->tx_dropped = 0;
+
+	for_each_online_cpu(cpu) {
+		const struct veth_net_stats *pcpu;
+
+		pcpu = per_cpu_ptr(priv->stats, cpu);
+
+		total->rx_packets += pcpu->rx_packets;
+		total->tx_packets += pcpu->tx_packets;
+		total->rx_bytes += pcpu->rx_bytes;
+		total->tx_bytes += pcpu->tx_bytes;
+		total->tx_dropped += pcpu->tx_dropped;
+	}
+
+	return total;
+}
+
+/*
+ * open handler.  Fails with -ENOTCONN if the device has no peer.  If the
+ * peer is already IFF_UP, carrier is turned on for both ends.
+ */
+static int veth_open(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *peer = priv->peer;
+
+	if (!peer)
+		return -ENOTCONN;
+
+	if (!(peer->flags & IFF_UP))
+		return 0;
+
+	netif_carrier_on(dev);
+	netif_carrier_on(peer);
+	return 0;
+}
+
+/* init handler: allocates the per-CPU counters for 'dev'.  Returns 0 on
+ * success or -ENOMEM if the allocation fails. */
+static int veth_dev_init(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct veth_net_stats *pcpu_stats;
+
+	pcpu_stats = alloc_percpu(struct veth_net_stats);
+	if (!pcpu_stats)
+		return -ENOMEM;
+
+	priv->stats = pcpu_stats;
+	return 0;
+}
+
+/* destructor handler: releases the per-CPU counters, then the device. */
+static void veth_dev_free(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+
+	free_percpu(priv->stats);
+	free_netdev(dev);
+}
+
+/* Setup callback passed to alloc_netdev(): applies the Ethernet defaults
+ * and then installs the veth handlers and the NETIF_F_LLTX feature. */
+static void veth_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	/* Lifetime. */
+	dev->init = veth_dev_init;
+	dev->destructor = veth_dev_free;
+
+	/* Data path and control. */
+	dev->open = veth_open;
+	dev->hard_start_xmit = veth_xmit;
+	dev->get_stats = veth_get_stats;
+	dev->ethtool_ops = &veth_ethtool_ops;
+
+	dev->features |= NETIF_F_LLTX;
+}
+
+/* Makes the carrier state of 'dev' follow that of its peer; does nothing
+ * if the two already agree. */
+static void veth_change_state(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *peer = priv->peer;
+	int peer_up = netif_carrier_ok(peer);
+	int dev_up = netif_carrier_ok(dev);
+
+	if (!peer_up == !dev_up)
+		return;
+
+	if (peer_up)
+		netif_carrier_on(dev);
+	else
+		netif_carrier_off(dev);
+}
+
+/*
+ * Netdevice notifier callback.  Only NETDEV_CHANGE events for veth
+ * devices (recognized by their open handler) are acted upon; everything
+ * else is ignored.  Always returns NOTIFY_DONE.
+ */
+static int veth_device_event(struct notifier_block *unused,
+			     unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	if (dev->open == veth_open && event == NETDEV_CHANGE)
+		veth_change_state(dev);
+
+	return NOTIFY_DONE;
+}
+
+/* Notifier registered in veth_init() and removed in veth_exit(). */
+static struct notifier_block veth_notifier_block __read_mostly = {
+ .notifier_call = veth_device_event,
+};
+
+/*
+ * netlink interface
+ */
+
+/*
+ * Creates and registers a pair of veth devices named 'devname' and
+ * 'peername' and links them to each other. A name containing '%' is
+ * treated as a template and resolved with dev_alloc_name().
+ *
+ * Returns 0 on success or a negative errno value. On failure every
+ * device created so far is unregistered or freed again.
+ *
+ * NOTE(review): calls register_netdevice()/unregister_netdevice(); the
+ * visible caller holds rtnl_lock() around this function - presumably
+ * required of every caller, confirm.
+ */
+static int veth_newlink(const char *devname, const char *peername)
+{
+ int err;
+ const char *names[2];
+ struct net_device *devs[2];
+ int i;
+
+ names[0] = devname;
+ names[1] = peername;
+ devs[0] = devs[1] = NULL;
+
+ for (i = 0; i < 2; i++) {
+ struct net_device *dev;
+
+ /* Preset for the allocation failure case just below. */
+ err = -ENOMEM;
+ devs[i] = alloc_netdev(sizeof(struct veth_priv),
+ names[i], veth_setup);
+ if (!devs[i]) {
+ goto err;
+ }
+
+ dev = devs[i];
+
+ if (strchr(dev->name, '%')) {
+ err = dev_alloc_name(dev, dev->name);
+ if (err < 0)
+ goto err;
+ }
+ random_ether_addr(dev->dev_addr);
+
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto err;
+
+ netif_carrier_off(dev);
+ }
+
+ /*
+ * tie the devices together
+ */
+
+ for (i = 0; i < 2; i++) {
+ struct veth_priv *priv = netdev_priv(devs[i]);
+ priv->dev = devs[i];
+ priv->peer = devs[!i];
+ /* Only the first device of the pair goes on veth_list; the
+ * second gets an empty, self-linked list head. */
+ if (!i)
+ list_add(&priv->list, &veth_list);
+ else
+ INIT_LIST_HEAD(&priv->list);
+ }
+ return 0;
+
+err:
+ /* Undo whatever was set up: devices that got past the
+ * NETREG_UNINITIALIZED state are unregistered, the others are
+ * simply freed. */
+ for (i = 0; i < 2; i++) {
+ if (devs[i]) {
+ if (devs[i]->reg_state != NETREG_UNINITIALIZED)
+ unregister_netdevice(devs[i]);
+ else
+ free_netdev(devs[i]);
+ }
+ }
+ return err;
+}
+
+/* Removes both ends of the pair that 'dev' belongs to: takes them off
+ * any list they are on and unregisters 'dev' and then its peer. */
+static void veth_dellink(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *peer = priv->peer;
+	struct veth_priv *peer_priv = netdev_priv(peer);
+
+	if (!list_empty(&priv->list))
+		list_del(&priv->list);
+	if (!list_empty(&peer_priv->list))
+		list_del(&peer_priv->list);
+
+	unregister_netdevice(dev);
+	unregister_netdevice(peer);
+}
+
+/*
+ * sysfs
+ */
+
+/*
+ * "show" function for the veth_pairs attribute.
+ * The class parameter is ignored.
+ *
+ * Writes one "dev,peer " entry for every pair on veth_list, followed by a
+ * newline, and returns the number of characters written (not counting
+ * the terminating NUL).  Output is limited to PAGE_SIZE bytes; if the
+ * remaining pairs do not fit, "++more++" is appended directly after the
+ * last complete entry.
+ */
+static ssize_t veth_show_veth_pairs(struct class *cls, char *buffer)
+{
+	/* Longest possible entry: two names of at most IFNAMSIZ - 1
+	 * characters each, the comma and the trailing space. */
+	const size_t entry_max = IFNAMSIZ * 2;
+	/* Always kept free for "++more++", the newline and the NUL. */
+	const size_t reserve = sizeof("++more++\n");
+	struct veth_priv *priv;
+	size_t res = 0;
+
+	list_for_each_entry(priv, &veth_list, list) {
+		if (res + entry_max + reserve > PAGE_SIZE) {
+			/* not enough space for another pair of names */
+			res += scnprintf(buffer + res, PAGE_SIZE - res,
+					 "++more++");
+			break;
+		}
+		res += scnprintf(buffer + res, PAGE_SIZE - res, "%s,%s ",
+				 priv->dev->name, priv->peer->name);
+	}
+	res += scnprintf(buffer + res, PAGE_SIZE - res, "\n");
+	return res;
+}
+
+/*
+ * "store" function for the veth_pairs attribute.  This is what
+ * creates and deletes veth pairs.
+ *
+ * The class parameter is ignored.
+ *
+ * Accepted commands (an optional trailing newline is ignored):
+ *   "+ifname,peername"  create a new pair with the given names
+ *   "-ifname"           delete the pair that 'ifname' belongs to
+ *
+ * Returns 'count' on success.  Returns -EPERM if no valid command was
+ * found, -EINVAL if a name to create is IFNAMSIZ characters or longer or
+ * if the device to delete is not a veth device, -ENODEV if the device to
+ * delete does not exist, or the error from veth_newlink().
+ */
+static ssize_t veth_store_veth_pairs(struct class *cls, const char *buffer,
+				     size_t count)
+{
+	int c = *buffer++;
+	int retval;
+
+	if (c == '+') {
+		/* Zero-initialized, so the copies below stay terminated. */
+		char devname[IFNAMSIZ] = "";
+		char peername[IFNAMSIZ] = "";
+		const char *comma = strchr(buffer, ',');
+		size_t devlen, peerlen;
+
+		if (!comma)
+			goto err_no_cmd;
+
+		devlen = comma - buffer;
+		peerlen = strcspn(comma + 1, "\n");
+		/* A device name must fit in IFNAMSIZ bytes including its
+		 * terminator; reject longer ones instead of truncating. */
+		if (devlen >= IFNAMSIZ || peerlen >= IFNAMSIZ)
+			return -EINVAL;
+		memcpy(devname, buffer, devlen);
+		memcpy(peername, comma + 1, peerlen);
+
+		if (!dev_valid_name(devname) || !dev_valid_name(peername))
+			goto err_no_cmd;
+
+		rtnl_lock();
+		retval = veth_newlink(devname, peername);
+		rtnl_unlock();
+		return retval ? retval : count;
+	} else if (c == '-') {
+		char devname[IFNAMSIZ] = "";
+		size_t devlen = strcspn(buffer, "\n");
+		struct net_device *dev;
+
+		/* No device can have a name this long. */
+		if (devlen >= IFNAMSIZ)
+			return -ENODEV;
+		memcpy(devname, buffer, devlen);
+
+		rtnl_lock();
+		dev = dev_get_by_name(devname);
+		if (!dev) {
+			retval = -ENODEV;
+		} else {
+			if (dev->init != veth_dev_init) {
+				retval = -EINVAL;
+			} else {
+				veth_dellink(dev);
+				retval = count;
+			}
+			/* Drop the reference taken by dev_get_by_name().
+			 * Without this the device's reference count never
+			 * returns to zero after it is unregistered. */
+			dev_put(dev);
+		}
+		rtnl_unlock();
+
+		return retval;
+	}
+
+err_no_cmd:
+	printk(KERN_ERR DRV_NAME ": no command found in veth_pairs.  Use +ifname,peername or -ifname.\n");
+	return -EPERM;
+}
+
+/* class attribute for veth_pairs file. This ends up in /sys/class/net */
+static CLASS_ATTR(veth_pairs, S_IWUSR | S_IRUGO,
+ veth_show_veth_pairs, veth_store_veth_pairs);
+
+static struct class *netdev_class;
+
+/*
+ * Initialize sysfs.  This sets up the veth_pairs file in
+ * /sys/class/net.
+ *
+ * The network device class is found by looking at the "lo" device.
+ * Returns 0 on success, -ESRCH if "lo" does not exist, -ENODEV if it has
+ * no class, or the error from class_create_file().
+ */
+int veth_create_sysfs(void)
+{
+	struct net_device *dev = dev_get_by_name("lo");
+
+	if (!dev)
+		return -ESRCH;
+
+	netdev_class = dev->class_dev.class;
+	/* Only the class pointer is needed; release the reference that
+	 * dev_get_by_name() took on "lo" so that it is not leaked. */
+	dev_put(dev);
+	if (!netdev_class)
+		return -ENODEV;
+
+	return class_create_file(netdev_class, &class_attr_veth_pairs);
+}
+
+/*
+ * Remove /sys/class/net/veth_pairs.
+ *
+ * Uses the class pointer saved in netdev_class by veth_create_sysfs().
+ */
+void veth_destroy_sysfs(void)
+{
+ class_remove_file(netdev_class, &class_attr_veth_pairs);
+}
+
+
+
+/*
+ * init/fini
+ */
+
+/*
+ * Module initialization: creates the veth_pairs sysfs file and registers
+ * the netdevice notifier.  If the notifier cannot be registered the sysfs
+ * file is removed again, so a failed load leaves nothing behind.
+ * Returns 0 on success or a negative errno value.
+ */
+static __init int veth_init(void)
+{
+	int retval = veth_create_sysfs();
+
+	if (retval)
+		return retval;
+
+	retval = register_netdevice_notifier(&veth_notifier_block);
+	if (retval)
+		veth_destroy_sysfs();
+	return retval;
+}
+
+/*
+ * Module cleanup.  Everything that refers to code in this module has to
+ * be gone before the module text is: the sysfs file (its show/store
+ * handlers live here), all remaining veth pairs (their device handlers
+ * live here) and the netdevice notifier.
+ */
+static __exit void veth_exit(void)
+{
+	struct veth_priv *priv, *next;
+
+	/* Remove the control file first so no new pairs can be created. */
+	veth_destroy_sysfs();
+
+	/* veth_dellink() unlinks the entry being visited, hence the _safe
+	 * iterator.  Each list entry stands for one complete pair. */
+	rtnl_lock();
+	list_for_each_entry_safe(priv, next, &veth_list, list)
+		veth_dellink(priv->dev);
+	rtnl_unlock();
+
+	unregister_netdevice_notifier(&veth_notifier_block);
+}
+
+module_init(veth_init);
+module_exit(veth_exit);
+
+MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
+MODULE_LICENSE("GPL v2");
diff --git a/datapath/linux-2.6/config/config-linux-2.6.23-rc9-kvm b/datapath/linux-2.6/config/config-linux-2.6.23-rc9-kvm
new file mode 100644
index 000000000..f287cf724
--- /dev/null
+++ b/datapath/linux-2.6/config/config-linux-2.6.23-rc9-kvm
@@ -0,0 +1,1408 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.23-rc9
+# Fri Oct 19 15:08:37 2007
+#
+CONFIG_X86_32=y
+CONFIG_GENERIC_TIME=y
+CONFIG_GENERIC_CMOS_UPDATE=y
+CONFIG_CLOCKSOURCE_WATCHDOG=y
+CONFIG_GENERIC_CLOCKEVENTS=y
+CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
+CONFIG_LOCKDEP_SUPPORT=y
+CONFIG_STACKTRACE_SUPPORT=y
+CONFIG_SEMAPHORE_SLEEPERS=y
+CONFIG_X86=y
+CONFIG_MMU=y
+CONFIG_ZONE_DMA=y
+CONFIG_QUICKLIST=y
+CONFIG_GENERIC_ISA_DMA=y
+CONFIG_GENERIC_IOMAP=y
+CONFIG_GENERIC_BUG=y
+CONFIG_GENERIC_HWEIGHT=y
+CONFIG_ARCH_MAY_HAVE_PC_FDC=y
+CONFIG_DMI=y
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
+
+#
+# General setup
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_LOCK_KERNEL=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+CONFIG_LOCALVERSION=""
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_SYSVIPC_SYSCTL=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_BSD_PROCESS_ACCT=y
+# CONFIG_BSD_PROCESS_ACCT_V3 is not set
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+# CONFIG_USER_NS is not set
+# CONFIG_AUDIT is not set
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=14
+# CONFIG_CPUSETS is not set
+CONFIG_SYSFS_DEPRECATED=y
+CONFIG_RELAY=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_INITRAMFS_SOURCE=""
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL=y
+# CONFIG_EMBEDDED is not set
+CONFIG_UID16=y
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_KALLSYMS=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_KALLSYMS_EXTRA_PASS=y
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_ANON_INODES=y
+CONFIG_EPOLL=y
+CONFIG_SIGNALFD=y
+CONFIG_EVENTFD=y
+CONFIG_SHMEM=y
+CONFIG_VM_EVENT_COUNTERS=y
+CONFIG_SLAB=y
+# CONFIG_SLUB is not set
+# CONFIG_SLOB is not set
+CONFIG_RT_MUTEXES=y
+# CONFIG_TINY_SHMEM is not set
+CONFIG_BASE_SMALL=0
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+# CONFIG_MODVERSIONS is not set
+# CONFIG_MODULE_SRCVERSION_ALL is not set
+CONFIG_KMOD=y
+CONFIG_STOP_MACHINE=y
+CONFIG_BLOCK=y
+CONFIG_LBD=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_LSF=y
+# CONFIG_BLK_DEV_BSG is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+CONFIG_DEFAULT_AS=y
+# CONFIG_DEFAULT_DEADLINE is not set
+# CONFIG_DEFAULT_CFQ is not set
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="anticipatory"
+
+#
+# Processor type and features
+#
+# CONFIG_TICK_ONESHOT is not set
+# CONFIG_NO_HZ is not set
+# CONFIG_HIGH_RES_TIMERS is not set
+CONFIG_SMP=y
+CONFIG_X86_PC=y
+# CONFIG_X86_ELAN is not set
+# CONFIG_X86_VOYAGER is not set
+# CONFIG_X86_NUMAQ is not set
+# CONFIG_X86_SUMMIT is not set
+# CONFIG_X86_BIGSMP is not set
+# CONFIG_X86_VISWS is not set
+# CONFIG_X86_GENERICARCH is not set
+# CONFIG_X86_ES7000 is not set
+# CONFIG_PARAVIRT is not set
+# CONFIG_M386 is not set
+CONFIG_M486=y
+# CONFIG_M586 is not set
+# CONFIG_M586TSC is not set
+# CONFIG_M586MMX is not set
+# CONFIG_M686 is not set
+# CONFIG_MPENTIUMII is not set
+# CONFIG_MPENTIUMIII is not set
+# CONFIG_MPENTIUMM is not set
+# CONFIG_MCORE2 is not set
+# CONFIG_MPENTIUM4 is not set
+# CONFIG_MK6 is not set
+# CONFIG_MK7 is not set
+# CONFIG_MK8 is not set
+# CONFIG_MCRUSOE is not set
+# CONFIG_MEFFICEON is not set
+# CONFIG_MWINCHIPC6 is not set
+# CONFIG_MWINCHIP2 is not set
+# CONFIG_MWINCHIP3D is not set
+# CONFIG_MGEODEGX1 is not set
+# CONFIG_MGEODE_LX is not set
+# CONFIG_MCYRIXIII is not set
+# CONFIG_MVIAC3_2 is not set
+# CONFIG_MVIAC7 is not set
+CONFIG_X86_GENERIC=y
+CONFIG_X86_CMPXCHG=y
+CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_XADD=y
+CONFIG_RWSEM_XCHGADD_ALGORITHM=y
+# CONFIG_ARCH_HAS_ILOG2_U32 is not set
+# CONFIG_ARCH_HAS_ILOG2_U64 is not set
+CONFIG_GENERIC_CALIBRATE_DELAY=y
+CONFIG_X86_PPRO_FENCE=y
+CONFIG_X86_F00F_BUG=y
+CONFIG_X86_WP_WORKS_OK=y
+CONFIG_X86_INVLPG=y
+CONFIG_X86_BSWAP=y
+CONFIG_X86_POPAD_OK=y
+CONFIG_X86_ALIGNMENT_16=y
+CONFIG_X86_INTEL_USERCOPY=y
+CONFIG_X86_MINIMUM_CPU_FAMILY=4
+# CONFIG_HPET_TIMER is not set
+CONFIG_NR_CPUS=8
+# CONFIG_SCHED_SMT is not set
+CONFIG_SCHED_MC=y
+CONFIG_PREEMPT_NONE=y
+# CONFIG_PREEMPT_VOLUNTARY is not set
+# CONFIG_PREEMPT is not set
+CONFIG_PREEMPT_BKL=y
+CONFIG_X86_LOCAL_APIC=y
+CONFIG_X86_IO_APIC=y
+# CONFIG_X86_MCE is not set
+CONFIG_VM86=y
+# CONFIG_TOSHIBA is not set
+# CONFIG_I8K is not set
+# CONFIG_X86_REBOOTFIXUPS is not set
+# CONFIG_MICROCODE is not set
+# CONFIG_X86_MSR is not set
+# CONFIG_X86_CPUID is not set
+
+#
+# Firmware Drivers
+#
+# CONFIG_EDD is not set
+# CONFIG_DELL_RBU is not set
+# CONFIG_DCDBAS is not set
+CONFIG_DMIID=y
+# CONFIG_NOHIGHMEM is not set
+CONFIG_HIGHMEM4G=y
+# CONFIG_HIGHMEM64G is not set
+CONFIG_PAGE_OFFSET=0xC0000000
+CONFIG_HIGHMEM=y
+CONFIG_ARCH_FLATMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_ARCH_SELECT_MEMORY_MODEL=y
+CONFIG_ARCH_POPULATES_NODE_MAP=y
+CONFIG_SELECT_MEMORY_MODEL=y
+CONFIG_FLATMEM_MANUAL=y
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+# CONFIG_SPARSEMEM_MANUAL is not set
+CONFIG_FLATMEM=y
+CONFIG_FLAT_NODE_MEM_MAP=y
+CONFIG_SPARSEMEM_STATIC=y
+CONFIG_SPLIT_PTLOCK_CPUS=4
+# CONFIG_RESOURCES_64BIT is not set
+CONFIG_ZONE_DMA_FLAG=1
+CONFIG_BOUNCE=y
+CONFIG_NR_QUICK=1
+CONFIG_VIRT_TO_BUS=y
+# CONFIG_HIGHPTE is not set
+# CONFIG_MATH_EMULATION is not set
+# CONFIG_MTRR is not set
+CONFIG_IRQBALANCE=y
+CONFIG_SECCOMP=y
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
+# CONFIG_HZ_300 is not set
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=250
+# CONFIG_KEXEC is not set
+# CONFIG_CRASH_DUMP is not set
+CONFIG_PHYSICAL_START=0x100000
+# CONFIG_RELOCATABLE is not set
+CONFIG_PHYSICAL_ALIGN=0x100000
+CONFIG_HOTPLUG_CPU=y
+CONFIG_COMPAT_VDSO=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
+
+#
+# Power management options (ACPI, APM)
+#
+CONFIG_PM=y
+# CONFIG_PM_LEGACY is not set
+# CONFIG_PM_DEBUG is not set
+CONFIG_PM_SLEEP_SMP=y
+CONFIG_PM_SLEEP=y
+CONFIG_SUSPEND_SMP_POSSIBLE=y
+CONFIG_SUSPEND=y
+CONFIG_HIBERNATION_SMP_POSSIBLE=y
+# CONFIG_HIBERNATION is not set
+# CONFIG_ACPI is not set
+CONFIG_APM=y
+# CONFIG_APM_IGNORE_USER_SUSPEND is not set
+# CONFIG_APM_DO_ENABLE is not set
+# CONFIG_APM_CPU_IDLE is not set
+# CONFIG_APM_DISPLAY_BLANK is not set
+# CONFIG_APM_ALLOW_INTS is not set
+# CONFIG_APM_REAL_MODE_POWER_OFF is not set
+
+#
+# CPU Frequency scaling
+#
+# CONFIG_CPU_FREQ is not set
+
+#
+# Bus options (PCI, PCMCIA, EISA, MCA, ISA)
+#
+CONFIG_PCI=y
+# CONFIG_PCI_GOBIOS is not set
+# CONFIG_PCI_GOMMCONFIG is not set
+# CONFIG_PCI_GODIRECT is not set
+CONFIG_PCI_GOANY=y
+CONFIG_PCI_BIOS=y
+CONFIG_PCI_DIRECT=y
+# CONFIG_PCIEPORTBUS is not set
+CONFIG_ARCH_SUPPORTS_MSI=y
+# CONFIG_PCI_MSI is not set
+# CONFIG_PCI_DEBUG is not set
+CONFIG_HT_IRQ=y
+CONFIG_ISA_DMA_API=y
+CONFIG_ISA=y
+# CONFIG_EISA is not set
+# CONFIG_MCA is not set
+# CONFIG_SCx200 is not set
+
+#
+# PCCARD (PCMCIA/CardBus) support
+#
+# CONFIG_PCCARD is not set
+# CONFIG_HOTPLUG_PCI is not set
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_ELF=y
+# CONFIG_BINFMT_AOUT is not set
+CONFIG_BINFMT_MISC=m
+
+#
+# Networking
+#
+CONFIG_NET=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+CONFIG_PACKET_MMAP=y
+CONFIG_UNIX=y
+CONFIG_XFRM=y
+CONFIG_XFRM_USER=m
+CONFIG_XFRM_SUB_POLICY=y
+CONFIG_XFRM_MIGRATE=y
+CONFIG_NET_KEY=m
+CONFIG_NET_KEY_MIGRATE=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_ASK_IP_FIB_HASH=y
+# CONFIG_IP_FIB_TRIE is not set
+CONFIG_IP_FIB_HASH=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+# CONFIG_IP_ROUTE_VERBOSE is not set
+# CONFIG_IP_PNP is not set
+CONFIG_NET_IPIP=m
+CONFIG_NET_IPGRE=m
+CONFIG_NET_IPGRE_BROADCAST=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+# CONFIG_ARPD is not set
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+CONFIG_INET_XFRM_TUNNEL=m
+CONFIG_INET_TUNNEL=m
+CONFIG_INET_XFRM_MODE_TRANSPORT=m
+CONFIG_INET_XFRM_MODE_TUNNEL=m
+CONFIG_INET_XFRM_MODE_BEET=m
+CONFIG_INET_DIAG=y
+CONFIG_INET_TCP_DIAG=y
+CONFIG_TCP_CONG_ADVANCED=y
+CONFIG_TCP_CONG_BIC=m
+CONFIG_TCP_CONG_CUBIC=y
+CONFIG_TCP_CONG_WESTWOOD=m
+CONFIG_TCP_CONG_HTCP=m
+CONFIG_TCP_CONG_HSTCP=m
+CONFIG_TCP_CONG_HYBLA=m
+CONFIG_TCP_CONG_VEGAS=m
+CONFIG_TCP_CONG_SCALABLE=m
+CONFIG_TCP_CONG_LP=m
+CONFIG_TCP_CONG_VENO=m
+CONFIG_TCP_CONG_YEAH=m
+CONFIG_TCP_CONG_ILLINOIS=m
+# CONFIG_DEFAULT_BIC is not set
+CONFIG_DEFAULT_CUBIC=y
+# CONFIG_DEFAULT_HTCP is not set
+# CONFIG_DEFAULT_VEGAS is not set
+# CONFIG_DEFAULT_WESTWOOD is not set
+# CONFIG_DEFAULT_RENO is not set
+CONFIG_DEFAULT_TCP_CONG="cubic"
+CONFIG_TCP_MD5SIG=y
+# CONFIG_IP_VS is not set
+CONFIG_IPV6=m
+CONFIG_IPV6_PRIVACY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+# CONFIG_IPV6_OPTIMISTIC_DAD is not set
+CONFIG_INET6_AH=m
+CONFIG_INET6_ESP=m
+CONFIG_INET6_IPCOMP=m
+# CONFIG_IPV6_MIP6 is not set
+CONFIG_INET6_XFRM_TUNNEL=m
+CONFIG_INET6_TUNNEL=m
+CONFIG_INET6_XFRM_MODE_TRANSPORT=m
+CONFIG_INET6_XFRM_MODE_TUNNEL=m
+CONFIG_INET6_XFRM_MODE_BEET=m
+# CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION is not set
+CONFIG_IPV6_SIT=m
+CONFIG_IPV6_TUNNEL=m
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_SUBTREES=y
+CONFIG_NETWORK_SECMARK=y
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_DEBUG is not set
+CONFIG_BRIDGE_NETFILTER=y
+
+#
+# Core Netfilter Configuration
+#
+CONFIG_NETFILTER_NETLINK=m
+CONFIG_NETFILTER_NETLINK_QUEUE=m
+CONFIG_NETFILTER_NETLINK_LOG=m
+CONFIG_NF_CONNTRACK_ENABLED=m
+CONFIG_NF_CONNTRACK=m
+CONFIG_NF_CT_ACCT=y
+CONFIG_NF_CONNTRACK_MARK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_GRE=m
+CONFIG_NF_CT_PROTO_SCTP=m
+# CONFIG_NF_CT_PROTO_UDPLITE is not set
+CONFIG_NF_CONNTRACK_AMANDA=m
+CONFIG_NF_CONNTRACK_FTP=m
+CONFIG_NF_CONNTRACK_H323=m
+CONFIG_NF_CONNTRACK_IRC=m
+CONFIG_NF_CONNTRACK_NETBIOS_NS=m
+CONFIG_NF_CONNTRACK_PPTP=m
+CONFIG_NF_CONNTRACK_SANE=m
+CONFIG_NF_CONNTRACK_SIP=m
+CONFIG_NF_CONNTRACK_TFTP=m
+CONFIG_NF_CT_NETLINK=m
+CONFIG_NETFILTER_XTABLES=m
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m
+# CONFIG_NETFILTER_XT_TARGET_CONNMARK is not set
+# CONFIG_NETFILTER_XT_TARGET_DSCP is not set
+CONFIG_NETFILTER_XT_TARGET_MARK=m
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m
+CONFIG_NETFILTER_XT_TARGET_NFLOG=m
+# CONFIG_NETFILTER_XT_TARGET_NOTRACK is not set
+# CONFIG_NETFILTER_XT_TARGET_TRACE is not set
+CONFIG_NETFILTER_XT_TARGET_SECMARK=m
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=m
+CONFIG_NETFILTER_XT_MATCH_COMMENT=m
+CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m
+# CONFIG_NETFILTER_XT_MATCH_CONNLIMIT is not set
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=m
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m
+CONFIG_NETFILTER_XT_MATCH_DCCP=m
+CONFIG_NETFILTER_XT_MATCH_DSCP=m
+CONFIG_NETFILTER_XT_MATCH_ESP=m
+CONFIG_NETFILTER_XT_MATCH_HELPER=m
+CONFIG_NETFILTER_XT_MATCH_LENGTH=m
+CONFIG_NETFILTER_XT_MATCH_LIMIT=m
+CONFIG_NETFILTER_XT_MATCH_MAC=m
+CONFIG_NETFILTER_XT_MATCH_MARK=m
+CONFIG_NETFILTER_XT_MATCH_POLICY=m
+CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
+# CONFIG_NETFILTER_XT_MATCH_PHYSDEV is not set
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m
+CONFIG_NETFILTER_XT_MATCH_QUOTA=m
+CONFIG_NETFILTER_XT_MATCH_REALM=m
+CONFIG_NETFILTER_XT_MATCH_SCTP=m
+CONFIG_NETFILTER_XT_MATCH_STATE=m
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=m
+CONFIG_NETFILTER_XT_MATCH_STRING=m
+CONFIG_NETFILTER_XT_MATCH_TCPMSS=m
+# CONFIG_NETFILTER_XT_MATCH_U32 is not set
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m
+
+#
+# IP: Netfilter Configuration
+#
+CONFIG_NF_CONNTRACK_IPV4=m
+CONFIG_NF_CONNTRACK_PROC_COMPAT=y
+# CONFIG_IP_NF_QUEUE is not set
+CONFIG_IP_NF_IPTABLES=m
+CONFIG_IP_NF_MATCH_IPRANGE=m
+CONFIG_IP_NF_MATCH_TOS=m
+CONFIG_IP_NF_MATCH_RECENT=m
+CONFIG_IP_NF_MATCH_ECN=m
+CONFIG_IP_NF_MATCH_AH=m
+CONFIG_IP_NF_MATCH_TTL=m
+CONFIG_IP_NF_MATCH_OWNER=m
+CONFIG_IP_NF_MATCH_ADDRTYPE=m
+CONFIG_IP_NF_FILTER=m
+CONFIG_IP_NF_TARGET_REJECT=m
+CONFIG_IP_NF_TARGET_LOG=m
+CONFIG_IP_NF_TARGET_ULOG=m
+CONFIG_NF_NAT=m
+CONFIG_NF_NAT_NEEDED=y
+CONFIG_IP_NF_TARGET_MASQUERADE=m
+CONFIG_IP_NF_TARGET_REDIRECT=m
+CONFIG_IP_NF_TARGET_NETMAP=m
+CONFIG_IP_NF_TARGET_SAME=m
+CONFIG_NF_NAT_SNMP_BASIC=m
+CONFIG_NF_NAT_PROTO_GRE=m
+CONFIG_NF_NAT_FTP=m
+CONFIG_NF_NAT_IRC=m
+CONFIG_NF_NAT_TFTP=m
+CONFIG_NF_NAT_AMANDA=m
+CONFIG_NF_NAT_PPTP=m
+CONFIG_NF_NAT_H323=m
+CONFIG_NF_NAT_SIP=m
+CONFIG_IP_NF_MANGLE=m
+CONFIG_IP_NF_TARGET_TOS=m
+CONFIG_IP_NF_TARGET_ECN=m
+CONFIG_IP_NF_TARGET_TTL=m
+CONFIG_IP_NF_TARGET_CLUSTERIP=m
+CONFIG_IP_NF_RAW=m
+CONFIG_IP_NF_ARPTABLES=m
+CONFIG_IP_NF_ARPFILTER=m
+CONFIG_IP_NF_ARP_MANGLE=m
+
+#
+# IPv6: Netfilter Configuration (EXPERIMENTAL)
+#
+CONFIG_NF_CONNTRACK_IPV6=m
+# CONFIG_IP6_NF_QUEUE is not set
+CONFIG_IP6_NF_IPTABLES=m
+CONFIG_IP6_NF_MATCH_RT=m
+CONFIG_IP6_NF_MATCH_OPTS=m
+CONFIG_IP6_NF_MATCH_FRAG=m
+CONFIG_IP6_NF_MATCH_HL=m
+CONFIG_IP6_NF_MATCH_OWNER=m
+CONFIG_IP6_NF_MATCH_IPV6HEADER=m
+CONFIG_IP6_NF_MATCH_AH=m
+CONFIG_IP6_NF_MATCH_MH=m
+CONFIG_IP6_NF_MATCH_EUI64=m
+CONFIG_IP6_NF_FILTER=m
+CONFIG_IP6_NF_TARGET_LOG=m
+CONFIG_IP6_NF_TARGET_REJECT=m
+CONFIG_IP6_NF_MANGLE=m
+CONFIG_IP6_NF_TARGET_HL=m
+CONFIG_IP6_NF_RAW=m
+
+#
+# DECnet: Netfilter Configuration
+#
+# CONFIG_DECNET_NF_GRABULATOR is not set
+
+#
+# Bridge: Netfilter Configuration
+#
+# CONFIG_BRIDGE_NF_EBTABLES is not set
+CONFIG_IP_DCCP=m
+CONFIG_INET_DCCP_DIAG=m
+CONFIG_IP_DCCP_ACKVEC=y
+
+#
+# DCCP CCIDs Configuration (EXPERIMENTAL)
+#
+CONFIG_IP_DCCP_CCID2=m
+# CONFIG_IP_DCCP_CCID2_DEBUG is not set
+CONFIG_IP_DCCP_CCID3=m
+CONFIG_IP_DCCP_TFRC_LIB=m
+# CONFIG_IP_DCCP_CCID3_DEBUG is not set
+CONFIG_IP_DCCP_CCID3_RTO=100
+
+#
+# DCCP Kernel Hacking
+#
+# CONFIG_IP_DCCP_DEBUG is not set
+CONFIG_IP_SCTP=m
+# CONFIG_SCTP_DBG_MSG is not set
+# CONFIG_SCTP_DBG_OBJCNT is not set
+# CONFIG_SCTP_HMAC_NONE is not set
+# CONFIG_SCTP_HMAC_SHA1 is not set
+CONFIG_SCTP_HMAC_MD5=y
+CONFIG_TIPC=m
+CONFIG_TIPC_ADVANCED=y
+CONFIG_TIPC_ZONES=3
+CONFIG_TIPC_CLUSTERS=1
+CONFIG_TIPC_NODES=255
+CONFIG_TIPC_SLAVE_NODES=0
+CONFIG_TIPC_PORTS=8191
+CONFIG_TIPC_LOG=0
+# CONFIG_TIPC_DEBUG is not set
+CONFIG_ATM=m
+CONFIG_ATM_CLIP=m
+# CONFIG_ATM_CLIP_NO_ICMP is not set
+CONFIG_ATM_LANE=m
+# CONFIG_ATM_MPOA is not set
+CONFIG_ATM_BR2684=m
+CONFIG_ATM_BR2684_IPFILTER=y
+CONFIG_BRIDGE=m
+CONFIG_VLAN_8021Q=m
+CONFIG_DECNET=m
+# CONFIG_DECNET_ROUTER is not set
+CONFIG_LLC=m
+CONFIG_LLC2=m
+CONFIG_IPX=m
+CONFIG_IPX_INTERN=y
+CONFIG_ATALK=m
+CONFIG_DEV_APPLETALK=m
+# CONFIG_LTPC is not set
+# CONFIG_COPS is not set
+CONFIG_IPDDP=m
+CONFIG_IPDDP_ENCAP=y
+CONFIG_IPDDP_DECAP=y
+CONFIG_X25=m
+CONFIG_LAPB=m
+CONFIG_ECONET=m
+CONFIG_ECONET_AUNUDP=y
+CONFIG_ECONET_NATIVE=y
+CONFIG_WAN_ROUTER=m
+
+#
+# QoS and/or fair queueing
+#
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_FIFO=y
+
+#
+# Queueing/Scheduling
+#
+CONFIG_NET_SCH_CBQ=m
+CONFIG_NET_SCH_HTB=m
+CONFIG_NET_SCH_HFSC=m
+CONFIG_NET_SCH_ATM=m
+CONFIG_NET_SCH_PRIO=m
+# CONFIG_NET_SCH_RR is not set
+CONFIG_NET_SCH_RED=m
+CONFIG_NET_SCH_SFQ=m
+CONFIG_NET_SCH_TEQL=m
+CONFIG_NET_SCH_TBF=m
+CONFIG_NET_SCH_GRED=m
+CONFIG_NET_SCH_DSMARK=m
+CONFIG_NET_SCH_NETEM=m
+CONFIG_NET_SCH_INGRESS=m
+
+#
+# Classification
+#
+CONFIG_NET_CLS=y
+CONFIG_NET_CLS_BASIC=m
+CONFIG_NET_CLS_TCINDEX=m
+CONFIG_NET_CLS_ROUTE4=m
+CONFIG_NET_CLS_ROUTE=y
+CONFIG_NET_CLS_FW=m
+CONFIG_NET_CLS_U32=m
+CONFIG_CLS_U32_PERF=y
+CONFIG_CLS_U32_MARK=y
+CONFIG_NET_CLS_RSVP=m
+CONFIG_NET_CLS_RSVP6=m
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_STACK=32
+CONFIG_NET_EMATCH_CMP=m
+CONFIG_NET_EMATCH_NBYTE=m
+CONFIG_NET_EMATCH_U32=m
+CONFIG_NET_EMATCH_META=m
+CONFIG_NET_EMATCH_TEXT=m
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_ACT_POLICE=m
+CONFIG_NET_ACT_GACT=m
+CONFIG_GACT_PROB=y
+CONFIG_NET_ACT_MIRRED=m
+CONFIG_NET_ACT_IPT=m
+CONFIG_NET_ACT_PEDIT=m
+CONFIG_NET_ACT_SIMP=m
+# CONFIG_NET_CLS_POLICE is not set
+CONFIG_NET_CLS_IND=y
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+# CONFIG_HAMRADIO is not set
+# CONFIG_IRDA is not set
+# CONFIG_BT is not set
+CONFIG_AF_RXRPC=m
+# CONFIG_AF_RXRPC_DEBUG is not set
+CONFIG_RXKAD=m
+CONFIG_FIB_RULES=y
+
+#
+# Wireless
+#
+# CONFIG_CFG80211 is not set
+# CONFIG_WIRELESS_EXT is not set
+# CONFIG_MAC80211 is not set
+# CONFIG_IEEE80211 is not set
+# CONFIG_RFKILL is not set
+# CONFIG_NET_9P is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+# CONFIG_FW_LOADER is not set
+# CONFIG_DEBUG_DRIVER is not set
+# CONFIG_DEBUG_DEVRES is not set
+# CONFIG_SYS_HYPERVISOR is not set
+CONFIG_CONNECTOR=m
+# CONFIG_MTD is not set
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+# CONFIG_PARPORT_SERIAL is not set
+# CONFIG_PARPORT_PC_FIFO is not set
+# CONFIG_PARPORT_PC_SUPERIO is not set
+# CONFIG_PARPORT_GSC is not set
+# CONFIG_PARPORT_AX88796 is not set
+# CONFIG_PARPORT_1284 is not set
+# CONFIG_PNP is not set
+CONFIG_BLK_DEV=y
+# CONFIG_BLK_DEV_FD is not set
+# CONFIG_BLK_DEV_XD is not set
+# CONFIG_PARIDE is not set
+# CONFIG_BLK_CPQ_DA is not set
+# CONFIG_BLK_CPQ_CISS_DA is not set
+# CONFIG_BLK_DEV_DAC960 is not set
+# CONFIG_BLK_DEV_UMEM is not set
+# CONFIG_BLK_DEV_COW_COMMON is not set
+CONFIG_BLK_DEV_LOOP=m
+CONFIG_BLK_DEV_CRYPTOLOOP=m
+CONFIG_BLK_DEV_NBD=m
+# CONFIG_BLK_DEV_SX8 is not set
+CONFIG_BLK_DEV_RAM=m
+CONFIG_BLK_DEV_RAM_COUNT=16
+CONFIG_BLK_DEV_RAM_SIZE=4096
+CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024
+# CONFIG_CDROM_PKTCDVD is not set
+# CONFIG_ATA_OVER_ETH is not set
+CONFIG_MISC_DEVICES=y
+# CONFIG_IBM_ASM is not set
+# CONFIG_PHANTOM is not set
+# CONFIG_EEPROM_93CX6 is not set
+# CONFIG_SGI_IOC4 is not set
+# CONFIG_TIFM_CORE is not set
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDE=y
+
+#
+# Please see Documentation/ide.txt for help/info on IDE drives
+#
+# CONFIG_BLK_DEV_IDE_SATA is not set
+# CONFIG_BLK_DEV_HD_IDE is not set
+CONFIG_BLK_DEV_IDEDISK=y
+# CONFIG_IDEDISK_MULTI_MODE is not set
+CONFIG_BLK_DEV_IDECD=y
+# CONFIG_BLK_DEV_IDETAPE is not set
+# CONFIG_BLK_DEV_IDEFLOPPY is not set
+# CONFIG_IDE_TASK_IOCTL is not set
+CONFIG_IDE_PROC_FS=y
+
+#
+# IDE chipset support/bugfixes
+#
+CONFIG_IDE_GENERIC=y
+# CONFIG_BLK_DEV_CMD640 is not set
+CONFIG_BLK_DEV_IDEPCI=y
+# CONFIG_IDEPCI_SHARE_IRQ is not set
+CONFIG_IDEPCI_PCIBUS_ORDER=y
+# CONFIG_BLK_DEV_OFFBOARD is not set
+# CONFIG_BLK_DEV_GENERIC is not set
+# CONFIG_BLK_DEV_OPTI621 is not set
+# CONFIG_BLK_DEV_RZ1000 is not set
+# CONFIG_BLK_DEV_IDEDMA_PCI is not set
+# CONFIG_IDE_ARM is not set
+# CONFIG_IDE_CHIPSETS is not set
+# CONFIG_BLK_DEV_IDEDMA is not set
+# CONFIG_BLK_DEV_HD is not set
+
+#
+# SCSI device support
+#
+# CONFIG_RAID_ATTRS is not set
+# CONFIG_SCSI is not set
+# CONFIG_SCSI_DMA is not set
+# CONFIG_SCSI_NETLINK is not set
+# CONFIG_ATA is not set
+# CONFIG_MD is not set
+
+#
+# Fusion MPT device support
+#
+# CONFIG_FUSION is not set
+
+#
+# IEEE 1394 (FireWire) support
+#
+# CONFIG_FIREWIRE is not set
+# CONFIG_IEEE1394 is not set
+# CONFIG_I2O is not set
+# CONFIG_MACINTOSH_DRIVERS is not set
+CONFIG_NETDEVICES=y
+# CONFIG_NETDEVICES_MULTIQUEUE is not set
+# CONFIG_IFB is not set
+CONFIG_DUMMY=m
+# CONFIG_BONDING is not set
+# CONFIG_MACVLAN is not set
+# CONFIG_EQUALIZER is not set
+CONFIG_TUN=m
+# CONFIG_ARCNET is not set
+# CONFIG_PHYLIB is not set
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=y
+# CONFIG_HAPPYMEAL is not set
+# CONFIG_SUNGEM is not set
+# CONFIG_CASSINI is not set
+# CONFIG_NET_VENDOR_3COM is not set
+# CONFIG_LANCE is not set
+# CONFIG_NET_VENDOR_SMC is not set
+# CONFIG_NET_VENDOR_RACAL is not set
+# CONFIG_NET_TULIP is not set
+# CONFIG_AT1700 is not set
+# CONFIG_DEPCA is not set
+# CONFIG_HP100 is not set
+# CONFIG_NET_ISA is not set
+CONFIG_NET_PCI=y
+CONFIG_PCNET32=y
+# CONFIG_PCNET32_NAPI is not set
+# CONFIG_AMD8111_ETH is not set
+# CONFIG_ADAPTEC_STARFIRE is not set
+# CONFIG_AC3200 is not set
+# CONFIG_APRICOT is not set
+# CONFIG_B44 is not set
+# CONFIG_FORCEDETH is not set
+# CONFIG_CS89x0 is not set
+# CONFIG_DGRS is not set
+# CONFIG_EEPRO100 is not set
+# CONFIG_E100 is not set
+# CONFIG_FEALNX is not set
+# CONFIG_NATSEMI is not set
+CONFIG_NE2K_PCI=y
+CONFIG_8139CP=y
+# CONFIG_8139TOO is not set
+# CONFIG_SIS900 is not set
+# CONFIG_EPIC100 is not set
+# CONFIG_SUNDANCE is not set
+# CONFIG_TLAN is not set
+# CONFIG_VIA_RHINE is not set
+# CONFIG_SC92031 is not set
+# CONFIG_NET_POCKET is not set
+# CONFIG_NETDEV_1000 is not set
+# CONFIG_NETDEV_10000 is not set
+# CONFIG_TR is not set
+
+#
+# Wireless LAN
+#
+# CONFIG_WLAN_PRE80211 is not set
+# CONFIG_WLAN_80211 is not set
+# CONFIG_WAN is not set
+CONFIG_ATM_DRIVERS=y
+# CONFIG_ATM_DUMMY is not set
+# CONFIG_ATM_TCP is not set
+# CONFIG_ATM_LANAI is not set
+# CONFIG_ATM_ENI is not set
+# CONFIG_ATM_FIRESTREAM is not set
+# CONFIG_ATM_ZATM is not set
+# CONFIG_ATM_NICSTAR is not set
+# CONFIG_ATM_IDT77252 is not set
+# CONFIG_ATM_AMBASSADOR is not set
+# CONFIG_ATM_HORIZON is not set
+# CONFIG_ATM_IA is not set
+# CONFIG_ATM_FORE200E_MAYBE is not set
+# CONFIG_ATM_HE is not set
+# CONFIG_FDDI is not set
+CONFIG_HIPPI=y
+# CONFIG_ROADRUNNER is not set
+# CONFIG_PLIP is not set
+# CONFIG_PPP is not set
+# CONFIG_SLIP is not set
+# CONFIG_SHAPER is not set
+# CONFIG_NETCONSOLE is not set
+# CONFIG_NETPOLL is not set
+# CONFIG_NET_POLL_CONTROLLER is not set
+# CONFIG_ISDN is not set
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+# CONFIG_INPUT_FF_MEMLESS is not set
+# CONFIG_INPUT_POLLDEV is not set
+
+#
+# Userland interfaces
+#
+CONFIG_INPUT_MOUSEDEV=y
+CONFIG_INPUT_MOUSEDEV_PSAUX=y
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768
+# CONFIG_INPUT_JOYDEV is not set
+# CONFIG_INPUT_TSDEV is not set
+# CONFIG_INPUT_EVDEV is not set
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+CONFIG_KEYBOARD_ATKBD=y
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+# CONFIG_KEYBOARD_STOWAWAY is not set
+CONFIG_INPUT_MOUSE=y
+CONFIG_MOUSE_PS2=y
+CONFIG_MOUSE_PS2_ALPS=y
+CONFIG_MOUSE_PS2_LOGIPS2PP=y
+CONFIG_MOUSE_PS2_SYNAPTICS=y
+CONFIG_MOUSE_PS2_LIFEBOOK=y
+CONFIG_MOUSE_PS2_TRACKPOINT=y
+# CONFIG_MOUSE_PS2_TOUCHKIT is not set
+# CONFIG_MOUSE_SERIAL is not set
+# CONFIG_MOUSE_APPLETOUCH is not set
+# CONFIG_MOUSE_INPORT is not set
+# CONFIG_MOUSE_LOGIBM is not set
+# CONFIG_MOUSE_PC110PAD is not set
+# CONFIG_MOUSE_VSXXXAA is not set
+# CONFIG_INPUT_JOYSTICK is not set
+# CONFIG_INPUT_TABLET is not set
+# CONFIG_INPUT_TOUCHSCREEN is not set
+# CONFIG_INPUT_MISC is not set
+
+#
+# Hardware I/O ports
+#
+CONFIG_SERIO=y
+CONFIG_SERIO_I8042=y
+CONFIG_SERIO_SERPORT=y
+# CONFIG_SERIO_CT82C710 is not set
+# CONFIG_SERIO_PARKBD is not set
+# CONFIG_SERIO_PCIPS2 is not set
+CONFIG_SERIO_LIBPS2=y
+# CONFIG_SERIO_RAW is not set
+# CONFIG_GAMEPORT is not set
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+# CONFIG_VT_HW_CONSOLE_BINDING is not set
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_FIX_EARLYCON_MEM=y
+CONFIG_SERIAL_8250_PCI=y
+CONFIG_SERIAL_8250_NR_UARTS=4
+CONFIG_SERIAL_8250_RUNTIME_UARTS=4
+# CONFIG_SERIAL_8250_EXTENDED is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_SERIAL_JSM is not set
+CONFIG_UNIX98_PTYS=y
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=256
+# CONFIG_PRINTER is not set
+# CONFIG_PPDEV is not set
+# CONFIG_TIPAR is not set
+# CONFIG_IPMI_HANDLER is not set
+CONFIG_WATCHDOG=y
+CONFIG_WATCHDOG_NOWAYOUT=y
+
+#
+# Watchdog Device Drivers
+#
+CONFIG_SOFT_WATCHDOG=y
+# CONFIG_ACQUIRE_WDT is not set
+# CONFIG_ADVANTECH_WDT is not set
+# CONFIG_ALIM1535_WDT is not set
+# CONFIG_ALIM7101_WDT is not set
+# CONFIG_SC520_WDT is not set
+# CONFIG_EUROTECH_WDT is not set
+# CONFIG_IB700_WDT is not set
+# CONFIG_IBMASR is not set
+# CONFIG_WAFER_WDT is not set
+# CONFIG_I6300ESB_WDT is not set
+# CONFIG_ITCO_WDT is not set
+# CONFIG_SC1200_WDT is not set
+# CONFIG_PC87413_WDT is not set
+# CONFIG_60XX_WDT is not set
+# CONFIG_SBC8360_WDT is not set
+# CONFIG_CPU5_WDT is not set
+# CONFIG_SMSC37B787_WDT is not set
+# CONFIG_W83627HF_WDT is not set
+# CONFIG_W83697HF_WDT is not set
+# CONFIG_W83877F_WDT is not set
+# CONFIG_W83977F_WDT is not set
+# CONFIG_MACHZ_WDT is not set
+# CONFIG_SBC_EPX_C3_WATCHDOG is not set
+
+#
+# ISA-based Watchdog Cards
+#
+# CONFIG_PCWATCHDOG is not set
+# CONFIG_MIXCOMWD is not set
+# CONFIG_WDT is not set
+
+#
+# PCI-based Watchdog Cards
+#
+# CONFIG_PCIPCWATCHDOG is not set
+# CONFIG_WDTPCI is not set
+# CONFIG_HW_RANDOM is not set
+# CONFIG_NVRAM is not set
+# CONFIG_RTC is not set
+# CONFIG_GEN_RTC is not set
+# CONFIG_DTLK is not set
+# CONFIG_R3964 is not set
+# CONFIG_APPLICOM is not set
+# CONFIG_SONYPI is not set
+# CONFIG_AGP is not set
+# CONFIG_DRM is not set
+# CONFIG_MWAVE is not set
+# CONFIG_PC8736x_GPIO is not set
+# CONFIG_NSC_GPIO is not set
+# CONFIG_CS5535_GPIO is not set
+CONFIG_RAW_DRIVER=m
+CONFIG_MAX_RAW_DEVS=256
+# CONFIG_HANGCHECK_TIMER is not set
+# CONFIG_TCG_TPM is not set
+# CONFIG_TELCLOCK is not set
+CONFIG_DEVPORT=y
+# CONFIG_I2C is not set
+
+#
+# SPI support
+#
+# CONFIG_SPI is not set
+# CONFIG_SPI_MASTER is not set
+# CONFIG_W1 is not set
+# CONFIG_POWER_SUPPLY is not set
+CONFIG_HWMON=y
+# CONFIG_HWMON_VID is not set
+# CONFIG_SENSORS_ABITUGURU is not set
+# CONFIG_SENSORS_ABITUGURU3 is not set
+# CONFIG_SENSORS_K8TEMP is not set
+# CONFIG_SENSORS_F71805F is not set
+# CONFIG_SENSORS_CORETEMP is not set
+# CONFIG_SENSORS_IT87 is not set
+# CONFIG_SENSORS_PC87360 is not set
+# CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_SIS5595 is not set
+# CONFIG_SENSORS_SMSC47M1 is not set
+# CONFIG_SENSORS_SMSC47B397 is not set
+# CONFIG_SENSORS_VIA686A is not set
+# CONFIG_SENSORS_VT1211 is not set
+# CONFIG_SENSORS_VT8231 is not set
+# CONFIG_SENSORS_W83627HF is not set
+# CONFIG_SENSORS_W83627EHF is not set
+# CONFIG_SENSORS_HDAPS is not set
+# CONFIG_SENSORS_APPLESMC is not set
+# CONFIG_HWMON_DEBUG_CHIP is not set
+
+#
+# Multifunction device drivers
+#
+# CONFIG_MFD_SM501 is not set
+
+#
+# Multimedia devices
+#
+# CONFIG_VIDEO_DEV is not set
+# CONFIG_DVB_CORE is not set
+# CONFIG_DAB is not set
+
+#
+# Graphics support
+#
+# CONFIG_BACKLIGHT_LCD_SUPPORT is not set
+
+#
+# Display device support
+#
+# CONFIG_DISPLAY_SUPPORT is not set
+# CONFIG_VGASTATE is not set
+CONFIG_VIDEO_OUTPUT_CONTROL=m
+# CONFIG_FB is not set
+
+#
+# Console display driver support
+#
+CONFIG_VGA_CONSOLE=y
+# CONFIG_VGACON_SOFT_SCROLLBACK is not set
+# CONFIG_VIDEO_SELECT is not set
+# CONFIG_MDA_CONSOLE is not set
+CONFIG_DUMMY_CONSOLE=y
+
+#
+# Sound
+#
+# CONFIG_SOUND is not set
+CONFIG_HID_SUPPORT=y
+# CONFIG_HID is not set
+CONFIG_USB_SUPPORT=y
+CONFIG_USB_ARCH_HAS_HCD=y
+CONFIG_USB_ARCH_HAS_OHCI=y
+CONFIG_USB_ARCH_HAS_EHCI=y
+# CONFIG_USB is not set
+
+#
+# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
+#
+
+#
+# USB Gadget Support
+#
+# CONFIG_USB_GADGET is not set
+# CONFIG_MMC is not set
+# CONFIG_NEW_LEDS is not set
+# CONFIG_INFINIBAND is not set
+# CONFIG_EDAC is not set
+# CONFIG_RTC_CLASS is not set
+
+#
+# DMA Engine support
+#
+# CONFIG_DMA_ENGINE is not set
+
+#
+# DMA Clients
+#
+
+#
+# DMA Devices
+#
+# CONFIG_AUXDISPLAY is not set
+CONFIG_VIRTUALIZATION=y
+# CONFIG_KVM is not set
+
+#
+# Userspace I/O
+#
+# CONFIG_UIO is not set
+
+#
+# File systems
+#
+# CONFIG_EXT2_FS is not set
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+# CONFIG_EXT4DEV_FS is not set
+CONFIG_JBD=y
+# CONFIG_JBD_DEBUG is not set
+CONFIG_FS_MBCACHE=y
+# CONFIG_REISERFS_FS is not set
+# CONFIG_JFS_FS is not set
+CONFIG_FS_POSIX_ACL=y
+# CONFIG_XFS_FS is not set
+# CONFIG_GFS2_FS is not set
+# CONFIG_OCFS2_FS is not set
+# CONFIG_MINIX_FS is not set
+CONFIG_ROMFS_FS=m
+CONFIG_INOTIFY=y
+CONFIG_INOTIFY_USER=y
+# CONFIG_QUOTA is not set
+CONFIG_DNOTIFY=y
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+# CONFIG_FUSE_FS is not set
+CONFIG_GENERIC_ACL=y
+
+#
+# CD-ROM/DVD Filesystems
+#
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_UDF_FS=y
+CONFIG_UDF_NLS=y
+
+#
+# DOS/FAT/NT Filesystems
+#
+# CONFIG_MSDOS_FS is not set
+# CONFIG_VFAT_FS is not set
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_PROC_SYSCTL=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+# CONFIG_HUGETLBFS is not set
+# CONFIG_HUGETLB_PAGE is not set
+CONFIG_RAMFS=y
+CONFIG_CONFIGFS_FS=m
+
+#
+# Miscellaneous filesystems
+#
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_ECRYPT_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_HFSPLUS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+CONFIG_CRAMFS=m
+# CONFIG_VXFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UFS_FS is not set
+
+#
+# Network File Systems
+#
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+# CONFIG_NFS_V3_ACL is not set
+# CONFIG_NFS_V4 is not set
+# CONFIG_NFS_DIRECTIO is not set
+# CONFIG_NFSD is not set
+CONFIG_LOCKD=y
+CONFIG_LOCKD_V4=y
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=y
+# CONFIG_SUNRPC_BIND34 is not set
+# CONFIG_RPCSEC_GSS_KRB5 is not set
+# CONFIG_RPCSEC_GSS_SPKM3 is not set
+# CONFIG_SMB_FS is not set
+# CONFIG_CIFS is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_CODA_FS is not set
+# CONFIG_AFS_FS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+
+#
+# Native Language Support
+#
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=m
+CONFIG_NLS_CODEPAGE_737=m
+CONFIG_NLS_CODEPAGE_775=m
+CONFIG_NLS_CODEPAGE_850=m
+CONFIG_NLS_CODEPAGE_852=m
+CONFIG_NLS_CODEPAGE_855=m
+CONFIG_NLS_CODEPAGE_857=m
+CONFIG_NLS_CODEPAGE_860=m
+CONFIG_NLS_CODEPAGE_861=m
+CONFIG_NLS_CODEPAGE_862=m
+CONFIG_NLS_CODEPAGE_863=m
+CONFIG_NLS_CODEPAGE_864=m
+CONFIG_NLS_CODEPAGE_865=m
+CONFIG_NLS_CODEPAGE_866=m
+CONFIG_NLS_CODEPAGE_869=m
+CONFIG_NLS_CODEPAGE_936=m
+CONFIG_NLS_CODEPAGE_950=m
+CONFIG_NLS_CODEPAGE_932=m
+CONFIG_NLS_CODEPAGE_949=m
+CONFIG_NLS_CODEPAGE_874=m
+CONFIG_NLS_ISO8859_8=m
+CONFIG_NLS_CODEPAGE_1250=m
+CONFIG_NLS_CODEPAGE_1251=m
+CONFIG_NLS_ASCII=m
+CONFIG_NLS_ISO8859_1=m
+CONFIG_NLS_ISO8859_2=m
+CONFIG_NLS_ISO8859_3=m
+CONFIG_NLS_ISO8859_4=m
+CONFIG_NLS_ISO8859_5=m
+CONFIG_NLS_ISO8859_6=m
+CONFIG_NLS_ISO8859_7=m
+CONFIG_NLS_ISO8859_9=m
+CONFIG_NLS_ISO8859_13=m
+CONFIG_NLS_ISO8859_14=m
+CONFIG_NLS_ISO8859_15=m
+CONFIG_NLS_KOI8_R=m
+CONFIG_NLS_KOI8_U=m
+CONFIG_NLS_UTF8=m
+
+#
+# Distributed Lock Manager
+#
+# CONFIG_DLM is not set
+CONFIG_INSTRUMENTATION=y
+# CONFIG_PROFILING is not set
+# CONFIG_KPROBES is not set
+
+#
+# Kernel hacking
+#
+CONFIG_TRACE_IRQFLAGS_SUPPORT=y
+# CONFIG_PRINTK_TIME is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_MAGIC_SYSRQ=y
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_DEBUG_FS=y
+# CONFIG_HEADERS_CHECK is not set
+CONFIG_DEBUG_KERNEL=y
+# CONFIG_DEBUG_SHIRQ is not set
+CONFIG_DETECT_SOFTLOCKUP=y
+CONFIG_SCHED_DEBUG=y
+# CONFIG_SCHEDSTATS is not set
+# CONFIG_TIMER_STATS is not set
+CONFIG_DEBUG_SLAB=y
+CONFIG_DEBUG_SLAB_LEAK=y
+CONFIG_DEBUG_RT_MUTEXES=y
+CONFIG_DEBUG_PI_LIST=y
+# CONFIG_RT_MUTEX_TESTER is not set
+CONFIG_DEBUG_SPINLOCK=y
+CONFIG_DEBUG_MUTEXES=y
+CONFIG_DEBUG_LOCK_ALLOC=y
+CONFIG_PROVE_LOCKING=y
+CONFIG_LOCKDEP=y
+CONFIG_LOCK_STAT=y
+# CONFIG_DEBUG_LOCKDEP is not set
+CONFIG_TRACE_IRQFLAGS=y
+CONFIG_DEBUG_SPINLOCK_SLEEP=y
+# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
+CONFIG_DEBUG_KOBJECT=y
+CONFIG_DEBUG_HIGHMEM=y
+CONFIG_DEBUG_BUGVERBOSE=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_VM=y
+CONFIG_DEBUG_LIST=y
+CONFIG_FRAME_POINTER=y
+CONFIG_FORCED_INLINING=y
+CONFIG_RCU_TORTURE_TEST=m
+# CONFIG_FAULT_INJECTION is not set
+CONFIG_EARLY_PRINTK=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+# CONFIG_DEBUG_STACK_USAGE is not set
+CONFIG_DEBUG_PAGEALLOC=y
+CONFIG_DEBUG_RODATA=y
+CONFIG_4KSTACKS=y
+CONFIG_X86_FIND_SMP_CONFIG=y
+CONFIG_X86_MPPARSE=y
+CONFIG_DOUBLEFAULT=y
+
+#
+# Security options
+#
+CONFIG_KEYS=y
+# CONFIG_KEYS_DEBUG_PROC_KEYS is not set
+# CONFIG_SECURITY is not set
+CONFIG_CRYPTO=y
+CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_BLKCIPHER=m
+CONFIG_CRYPTO_HASH=m
+CONFIG_CRYPTO_MANAGER=m
+CONFIG_CRYPTO_HMAC=m
+# CONFIG_CRYPTO_XCBC is not set
+CONFIG_CRYPTO_NULL=m
+CONFIG_CRYPTO_MD4=m
+CONFIG_CRYPTO_MD5=y
+CONFIG_CRYPTO_SHA1=m
+CONFIG_CRYPTO_SHA256=m
+# CONFIG_CRYPTO_SHA512 is not set
+# CONFIG_CRYPTO_WP512 is not set
+# CONFIG_CRYPTO_TGR192 is not set
+CONFIG_CRYPTO_GF128MUL=m
+# CONFIG_CRYPTO_ECB is not set
+CONFIG_CRYPTO_CBC=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_LRW=m
+# CONFIG_CRYPTO_CRYPTD is not set
+CONFIG_CRYPTO_DES=m
+CONFIG_CRYPTO_FCRYPT=m
+# CONFIG_CRYPTO_BLOWFISH is not set
+# CONFIG_CRYPTO_TWOFISH is not set
+# CONFIG_CRYPTO_TWOFISH_586 is not set
+# CONFIG_CRYPTO_SERPENT is not set
+CONFIG_CRYPTO_AES=m
+# CONFIG_CRYPTO_AES_586 is not set
+# CONFIG_CRYPTO_CAST5 is not set
+# CONFIG_CRYPTO_CAST6 is not set
+CONFIG_CRYPTO_TEA=m
+# CONFIG_CRYPTO_ARC4 is not set
+# CONFIG_CRYPTO_KHAZAD is not set
+# CONFIG_CRYPTO_ANUBIS is not set
+CONFIG_CRYPTO_DEFLATE=m
+# CONFIG_CRYPTO_MICHAEL_MIC is not set
+# CONFIG_CRYPTO_CRC32C is not set
+# CONFIG_CRYPTO_CAMELLIA is not set
+# CONFIG_CRYPTO_TEST is not set
+CONFIG_CRYPTO_HW=y
+# CONFIG_CRYPTO_DEV_PADLOCK is not set
+# CONFIG_CRYPTO_DEV_GEODE is not set
+
+#
+# Library routines
+#
+CONFIG_BITREVERSE=y
+CONFIG_CRC_CCITT=m
+CONFIG_CRC16=m
+CONFIG_CRC_ITU_T=m
+CONFIG_CRC32=y
+# CONFIG_CRC7 is not set
+CONFIG_LIBCRC32C=m
+CONFIG_ZLIB_INFLATE=y
+CONFIG_ZLIB_DEFLATE=m
+CONFIG_TEXTSEARCH=y
+CONFIG_TEXTSEARCH_KMP=m
+CONFIG_TEXTSEARCH_BM=m
+CONFIG_TEXTSEARCH_FSM=m
+CONFIG_PLIST=y
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_IRQ_PROBE=y
+CONFIG_GENERIC_PENDING_IRQ=y
+CONFIG_X86_SMP=y
+CONFIG_X86_HT=y
+CONFIG_X86_BIOS_REBOOT=y
+CONFIG_X86_TRAMPOLINE=y
+CONFIG_KTIME_SCALAR=y
diff --git a/datapath/table.c b/datapath/table.c
new file mode 100644
index 000000000..c0885b705
--- /dev/null
+++ b/datapath/table.c
@@ -0,0 +1,240 @@
+#include "flow.h"
+#include "datapath.h"
+
+#include <linux/gfp.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <asm/pgtable.h>
+
+/*
+ * Releases a two-level bucket array: every second-level page reachable from
+ * 'flows' (n_buckets >> DP_L1_BITS of them) and then the first-level array
+ * itself.  When 'free_flows' is nonzero, each non-null flow stored in those
+ * pages is passed to flow_free() before its page is released.
+ */
+static void free_table(struct sw_flow ***flows, unsigned int n_buckets,
+		       int free_flows)
+{
+	const unsigned int n_pages = n_buckets >> DP_L1_BITS;
+	unsigned int page;
+
+	for (page = 0; page < n_pages; page++) {
+		struct sw_flow **slots = flows[page];
+		unsigned int slot;
+
+		/* The slot walk is skipped entirely unless flows must go too. */
+		for (slot = 0; free_flows && slot < DP_L1_SIZE; slot++) {
+			if (slots[slot] != NULL)
+				flow_free(slots[slot]);
+		}
+		free_page((unsigned long)slots);
+	}
+	kfree(flows);
+}
+
+/*
+ * Allocates a two-level bucket array for 'n_buckets' buckets: a first-level
+ * array of n_buckets >> DP_L1_BITS pointers, each pointing at one zeroed
+ * page of flow pointers.
+ *
+ * Returns the first-level array, or NULL if any allocation fails; on failure
+ * everything allocated so far is released again.
+ */
+static struct sw_flow ***alloc_table(unsigned int n_buckets)
+{
+	const unsigned int n_pages = n_buckets >> DP_L1_BITS;
+	struct sw_flow ***l1;
+	unsigned int page;
+
+	l1 = kmalloc(n_pages * sizeof(struct sw_flow**), GFP_KERNEL);
+	if (l1 == NULL)
+		return NULL;
+
+	for (page = 0; page < n_pages; page++) {
+		l1[page] = (struct sw_flow **)get_zeroed_page(GFP_KERNEL);
+		if (l1[page] != NULL)
+			continue;
+
+		/* Only the first 'page' entries are valid; free just those. */
+		free_table(l1, page << DP_L1_BITS, 0);
+		return NULL;
+	}
+	return l1;
+}
+
+/*
+ * Creates an empty flow table with 'n_buckets' buckets in each of its two
+ * bucket arrays (table->flows[0] and table->flows[1]).
+ *
+ * Returns the new table, or NULL if any allocation fails; nothing is leaked
+ * on failure.
+ *
+ * NOTE(review): find_bucket() masks hashes with n_buckets - 1, so n_buckets
+ * is presumably expected to be a power of two - confirm against callers.
+ */
+struct dp_table *dp_table_create(unsigned int n_buckets)
+{
+	struct dp_table *table;
+
+	table = kzalloc(sizeof *table, GFP_KERNEL);
+	if (!table)
+		goto err;
+
+	table->n_buckets = n_buckets;
+	table->flows[0] = alloc_table(n_buckets);
+	/* Test the pointer that was just stored.  "table[0].flows" names the
+	 * flows[] array member itself, which can never be null, so testing it
+	 * would let an allocation failure slip through. */
+	if (!table->flows[0])
+		goto err_free_tables;
+
+	table->flows[1] = alloc_table(n_buckets);
+	if (!table->flows[1])
+		goto err_free_flows0;
+
+	return table;
+
+err_free_flows0:
+	/* The table is still empty, so there are no flows to free. */
+	free_table(table->flows[0], table->n_buckets, 0);
+err_free_tables:
+	kfree(table);
+err:
+	return NULL;
+}
+
+/*
+ * Frees both bucket arrays of 'table' and then 'table' itself.  A nonzero
+ * 'free_flows' is handed on to free_table() so that the flows stored in the
+ * buckets are freed as well.
+ */
+void dp_table_destroy(struct dp_table *table, int free_flows)
+{
+	int which = 0;
+
+	while (which < 2) {
+		free_table(table->flows[which], table->n_buckets, free_flows);
+		which++;
+	}
+	kfree(table);
+}
+
+/*
+ * Maps 'hash' to the address of its bucket within the two-level array
+ * 'flows'.  The hash is first masked with table->n_buckets - 1; the bits
+ * from DP_L1_SHIFT upward pick the second-level page and the low
+ * DP_L2_BITS bits pick the slot within that page.
+ */
+static struct sw_flow **find_bucket(struct dp_table *table,
+				    struct sw_flow ***flows, u32 hash)
+{
+	unsigned int masked = hash & (table->n_buckets - 1);
+	struct sw_flow **page = flows[masked >> DP_L1_SHIFT];
+
+	return &page[hash & ((1 << DP_L2_BITS) - 1)];
+}
+
+/*
+ * Looks in the single bucket that 'hash' selects within 'flows' and returns
+ * the flow stored there if its key is byte-for-byte equal to 'key';
+ * otherwise returns NULL.
+ */
+static struct sw_flow *lookup_table(struct dp_table *table,
+				    struct sw_flow ***flows, u32 hash,
+				    const struct odp_flow_key *key)
+{
+	struct sw_flow **slot = find_bucket(table, flows, hash);
+	struct sw_flow *candidate = rcu_dereference(*slot);
+
+	if (!candidate)
+		return NULL;
+	if (memcmp(&candidate->key, key, sizeof(struct odp_flow_key)))
+		return NULL;
+	return candidate;
+}
+
+/* Hashes 'key' as an array of 32-bit words with jhash2(), using the seed
+ * that belongs to the first bucket array. */
+static u32 flow_hash0(const struct odp_flow_key *key)
+{
+	const u32 n_words = sizeof *key / sizeof(u32);
+
+	return jhash2((u32*)key, n_words, 0xaaaaaaaa);
+}
+
+/* Hashes 'key' as an array of 32-bit words with jhash2(), using the seed
+ * that belongs to the second bucket array. */
+static u32 flow_hash1(const struct odp_flow_key *key)
+{
+	const u32 n_words = sizeof *key / sizeof(u32);
+
+	return jhash2((u32*)key, n_words, 0x55555555);
+}
+
+/*
+ * Stores into 'buckets' the two places where a flow with 'key' may live:
+ * buckets[0] is the slot in table->flows[0] chosen by flow_hash0() and
+ * buckets[1] is the slot in table->flows[1] chosen by flow_hash1().
+ */
+static void find_buckets(struct dp_table *table,
+			 const struct odp_flow_key *key,
+			 struct sw_flow **buckets[2])
+{
+	u32 hash0 = flow_hash0(key);
+	u32 hash1 = flow_hash1(key);
+
+	buckets[0] = find_bucket(table, table->flows[0], hash0);
+	buckets[1] = find_bucket(table, table->flows[1], hash1);
+}
+
+/*
+ * Returns the flow in 'table' whose key equals 'key', or NULL if there is
+ * none.  The first bucket array is consulted first; the second one is only
+ * hashed and probed when the first probe misses.
+ */
+struct sw_flow *dp_table_lookup(struct dp_table *table,
+				const struct odp_flow_key *key)
+{
+	struct sw_flow *hit;
+
+	hit = lookup_table(table, table->flows[0], flow_hash0(key), key);
+	if (hit)
+		return hit;
+	return lookup_table(table, table->flows[1], flow_hash1(key), key);
+}
+
+/*
+ * Calls 'callback(flow, aux)' for every non-null flow in both bucket arrays
+ * of 'table'.  The walk stops at the first nonzero value returned by the
+ * callback and that value is returned; otherwise returns 0.
+ */
+int dp_table_foreach(struct dp_table *table,
+		     int (*callback)(struct sw_flow *flow, void *aux),
+		     void *aux)
+{
+	unsigned int which, page, slot;
+
+	for (which = 0; which < 2; which++) {
+		for (page = 0; page < table->n_buckets >> DP_L1_BITS; page++) {
+			struct sw_flow **slots = table->flows[which][page];
+
+			for (slot = 0; slot < DP_L1_SIZE; slot++) {
+				struct sw_flow *flow = rcu_dereference(slots[slot]);
+				int error;
+
+				if (!flow)
+					continue;
+				error = callback(flow, aux);
+				if (error)
+					return error;
+			}
+		}
+	}
+	return 0;
+}
+
+/*
+ * dp_table_foreach() callback used by dp_table_expand(): stores 'flow' in
+ * the first empty one of its two candidate buckets in the table passed as
+ * 'new_table_' (a struct dp_table *).
+ *
+ * Always returns 0 so that the walk continues.  If both candidate buckets
+ * are already occupied the flow is not inserted and a one-time warning is
+ * emitted.
+ */
+static int insert_flow(struct sw_flow *flow, void *new_table_)
+{
+ struct dp_table *new_table = new_table_;
+ struct sw_flow **buckets[2];
+ int i;
+
+ find_buckets(new_table, &flow->key, buckets);
+ for (i = 0; i < 2; i++) {
+ if (!*buckets[i]) {
+ rcu_assign_pointer(*buckets[i], flow);
+ return 0;
+ }
+ }
+ /* Neither candidate bucket was free: the flow is left out of the new table. */
+ WARN_ON_ONCE(1);
+ return 0;
+}
+
+/* RCU callback: destroys the dp_table that embeds 'rcu', leaving the flows
+ * it referenced untouched. */
+static void dp_free_table_rcu(struct rcu_head *rcu)
+{
+	dp_table_destroy(container_of(rcu, struct dp_table, rcu), 0);
+}
+
+/*
+ * Replaces dp->table with a new table that has twice as many buckets,
+ * re-inserting every flow of the old table into the new one.
+ *
+ * Returns 0 on success, or -ENOMEM if the new table cannot be allocated, in
+ * which case dp->table is left unchanged.
+ *
+ * NOTE(review): no locking is visible here; presumably the caller
+ * serializes table updates - confirm.
+ */
+int dp_table_expand(struct datapath *dp)
+{
+ struct dp_table *old_table = rcu_dereference(dp->table);
+ struct dp_table *new_table = dp_table_create(old_table->n_buckets * 2);
+ if (!new_table)
+ return -ENOMEM;
+ /* insert_flow() always returns 0, so the walk never stops early. */
+ dp_table_foreach(old_table, insert_flow, new_table);
+ /* Publish the new table first, then queue the old one for freeing;
+  * dp_free_table_rcu() frees the old bucket arrays but not the flows. */
+ rcu_assign_pointer(dp->table, new_table);
+ call_rcu(&old_table->rcu, dp_free_table_rcu);
+ return 0;
+}
+
+/* RCU callback: destroys the dp_table that embeds 'rcu' together with every
+ * flow still stored in it. */
+static void dp_free_table_and_flows_rcu(struct rcu_head *rcu)
+{
+	dp_table_destroy(container_of(rcu, struct dp_table, rcu), 1);
+}
+
+/*
+ * Discards every flow in dp's flow table by swapping in a fresh, empty
+ * table with DP_L1_SIZE buckets.
+ *
+ * Returns 0 on success, or -ENOMEM if the replacement table cannot be
+ * allocated, in which case dp->table is left unchanged.
+ *
+ * NOTE(review): no locking is visible here; presumably the caller
+ * serializes table updates - confirm.
+ */
+int dp_table_flush(struct datapath *dp)
+{
+ struct dp_table *old_table = rcu_dereference(dp->table);
+ struct dp_table *new_table = dp_table_create(DP_L1_SIZE);
+ if (!new_table)
+ return -ENOMEM;
+ /* Publish the empty table first, then queue the old table and all of
+  * its flows for freeing via dp_free_table_and_flows_rcu(). */
+ rcu_assign_pointer(dp->table, new_table);
+ call_rcu(&old_table->rcu, dp_free_table_and_flows_rcu);
+ return 0;
+}
+
+/*
+ * Finds the bucket in which a flow with key 'target' is, or could be,
+ * stored.
+ *
+ * Returns the bucket that already holds a flow whose key equals 'target' if
+ * there is one; otherwise the first empty candidate bucket; otherwise NULL
+ * (both candidate buckets hold flows with other keys).
+ */
+struct sw_flow **
+dp_table_lookup_for_insert(struct dp_table *table,
+			   const struct odp_flow_key *target)
+{
+	struct sw_flow **candidates[2];
+	struct sw_flow **vacant = NULL;
+	int i;
+
+	find_buckets(table, target, candidates);
+	for (i = 0; i < 2; i++) {
+		struct sw_flow **slot = candidates[i];
+		struct sw_flow *occupant = rcu_dereference(*slot);
+
+		if (!occupant) {
+			/* Remember only the first free slot seen. */
+			if (!vacant)
+				vacant = slot;
+			continue;
+		}
+		if (!memcmp(&occupant->key, target, sizeof(struct odp_flow_key)))
+			return slot;
+	}
+	return vacant;
+}
+
+/*
+ * Removes 'target' from 'table' by clearing whichever of its two candidate
+ * buckets currently points at it.  The flow itself is not freed here.
+ *
+ * Returns 0 if the flow was found and unlinked, -ENOENT if neither
+ * candidate bucket points at 'target'.
+ */
+int dp_table_delete(struct dp_table *table, struct sw_flow *target)
+{
+	struct sw_flow **candidates[2];
+	int i;
+
+	find_buckets(table, &target->key, candidates);
+	for (i = 0; i < 2; i++) {
+		struct sw_flow **slot = candidates[i];
+
+		if (rcu_dereference(*slot) != target)
+			continue;
+		rcu_assign_pointer(*slot, NULL);
+		return 0;
+	}
+	return -ENOENT;
+}
diff --git a/debian/.gitignore b/debian/.gitignore
new file mode 100644
index 000000000..2053c5c10
--- /dev/null
+++ b/debian/.gitignore
@@ -0,0 +1,19 @@
+*.debhelper
+*.debhelper.log
+*.substvars
+/control
+/corekeeper
+/files
+/nicira-switch
+/openvswitch
+/openvswitch-common
+/openvswitch-common.copyright
+/openvswitch-controller
+/openvswitch-datapath-source
+/openvswitch-dbg
+/openvswitch-monitor
+/openvswitch-pki
+/openvswitch-pki-server
+/openvswitch-switch
+/openvswitch-switch-config
+/openvswitch-switch.copyright
diff --git a/debian/automake.mk b/debian/automake.mk
new file mode 100644
index 000000000..813987e73
--- /dev/null
+++ b/debian/automake.mk
@@ -0,0 +1,50 @@
+EXTRA_DIST += \
+ debian/changelog \
+ debian/commands/reconfigure \
+ debian/commands/update \
+ debian/compat \
+ debian/control \
+ debian/control.modules.in \
+ debian/copyright \
+ debian/corekeeper.cron.daily \
+ debian/corekeeper.init \
+ debian/dirs \
+ debian/ovs-switch-setup \
+ debian/ovs-switch-setup.8 \
+ debian/openvswitch-common.dirs \
+ debian/openvswitch-common.install \
+ debian/openvswitch-common.manpages \
+ debian/openvswitch-controller.README.Debian \
+ debian/openvswitch-controller.default \
+ debian/openvswitch-controller.dirs \
+ debian/openvswitch-controller.init \
+ debian/openvswitch-controller.install \
+ debian/openvswitch-controller.manpages \
+ debian/openvswitch-controller.postinst \
+ debian/openvswitch-datapath-module-_KVERS_.postinst.modules.in \
+ debian/openvswitch-datapath-source.README.Debian \
+ debian/openvswitch-datapath-source.copyright \
+ debian/openvswitch-datapath-source.dirs \
+ debian/openvswitch-datapath-source.install \
+ debian/openvswitch-pki-server.apache2 \
+ debian/openvswitch-pki-server.dirs \
+ debian/openvswitch-pki-server.install \
+ debian/openvswitch-pki-server.postinst \
+ debian/openvswitch-pki.postinst \
+ debian/openvswitch-switch-config.dirs \
+ debian/openvswitch-switch-config.install \
+ debian/openvswitch-switch-config.manpages \
+ debian/openvswitch-switch-config.overrides \
+ debian/openvswitch-switch-config.templates \
+ debian/openvswitch-switch.README.Debian \
+ debian/openvswitch-switch.dirs \
+ debian/openvswitch-switch.init \
+ debian/openvswitch-switch.install \
+ debian/openvswitch-switch.logrotate \
+ debian/openvswitch-switch.manpages \
+ debian/openvswitch-switch.postinst \
+ debian/openvswitch-switch.postrm \
+ debian/openvswitch-switch.template \
+ debian/po/POTFILES.in \
+ debian/po/templates.pot \
+ debian/rules
diff --git a/debian/changelog b/debian/changelog
new file mode 100644
index 000000000..4aa1f90b4
--- /dev/null
+++ b/debian/changelog
@@ -0,0 +1,5 @@
+openvswitch (0.90.0) unstable; urgency=low
+
+ * Development version.
+
+ -- Open vSwitch developers <ovs-dev@openvswitch.org> Mon, 19 Nov 2007 14:57:52 -0800
diff --git a/debian/commands/reconfigure b/debian/commands/reconfigure
new file mode 100755
index 000000000..dc493a188
--- /dev/null
+++ b/debian/commands/reconfigure
@@ -0,0 +1,128 @@
+#! /usr/bin/perl
+
+use POSIX;
+use strict;
+use warnings;
+
+my $default = '/etc/default/openvswitch-switch';
+
+# Load the current settings, apply each KEY=VALUE argument (an empty VALUE
+# removes KEY), and rewrite the file only when arguments were given.  The
+# resulting settings are always printed, sorted by key.
+my (%config) = load_config($default);
+if (@ARGV) {
+    for my $arg (@ARGV) {
+        $arg =~ /^([^=]+)=(.*)/ or die "bad argument '$arg'\n";
+        my ($key, $value) = ($1, $2);
+        if ($value eq '') {
+            delete $config{$key};
+        } else {
+            $config{$key} = $value;
+        }
+    }
+    save_config($default, %config);
+}
+foreach my $name (sort(keys(%config))) {
+    print "$name=$config{$name}\n";
+}
+
+# load_config($file)
+#
+# Returns a hash of the variables that the shell fragment $file defines.
+# The fragment is sourced in a shell with "set -a" so that everything it
+# assigns is exported and shows up in the output of "env"; variables that the
+# shell exports even without sourcing the file are then subtracted.
+sub load_config {
+    my ($file) = @_;
+
+    # Get the list of the variables that the shell sets automatically.
+    my (%auto_vars) = read_vars("set -a && env");
+
+    # Get the variables from $file (the parameter, not the global $default).
+    my (%config) = read_vars("set -a && . '$file' && env");
+
+    # Subtract.
+    delete @config{keys %auto_vars};
+
+    return %config;
+}
+
+# read_vars($cmd)
+#
+# Runs $cmd through a pipe and parses its output as NAME=VALUE lines,
+# returning them as a hash.  Lines that do not look like NAME=VALUE are
+# skipped.  If the command cannot be started, a message is printed on stderr
+# and an empty list is returned.
+sub read_vars {
+    my ($cmd) = @_;
+    # NOTE(review): this localizes the array @ENV, not the %ENV environment
+    # hash -- confirm whether clearing the environment was intended.
+    local @ENV;
+    my (%vars);
+    unless (open(VARS, '-|', $cmd)) {
+        print STDERR "$cmd: failed to execute: $!\n";
+        return ();
+    }
+    while (<VARS>) {
+        next unless /^([^=]+)=(.*)$/;
+        $vars{$1} = $2;
+    }
+    close(VARS);
+    return %vars;
+}
+
+# shell_escape($string)
+#
+# Returns $string quoted so that a POSIX shell reads it back as one word:
+#   - the empty string becomes "";
+#   - a string made only of characters that need no quoting is returned
+#     unchanged;
+#   - anything else is wrapped in single quotes, with every embedded single
+#     quote rewritten as '\'' (close quote, escaped quote, reopen quote).
+sub shell_escape {
+    local $_ = $_[0];
+    if ($_ eq '') {
+        return '""';
+    } elsif (m&^[-a-zA-Z0-9:./%^_+,]*\z&) {
+        # \z rather than $ so that a trailing newline is not treated as safe.
+        return $_;
+    } else {
+        # /g: escape every single quote, not just the first one.
+        s/'/'\\''/g;
+        return "'$_'";
+    }
+}
+
+# shell_assign($var, $value)
+#
+# Returns the text of a shell assignment "VAR=VALUE", with VALUE passed
+# through shell_escape().
+sub shell_assign {
+    my ($name, $raw) = @_;
+    return join('=', $name, shell_escape($raw));
+}
+
+# save_config($file, %config)
+#
+# Rewrites $file so that it assigns the variables in %config:
+#   1. every existing assignment line is either replaced with the new value
+#      or, if its variable is not in %config, commented out;
+#   2. each variable still unplaced replaces the last commented-out
+#      assignment to it, or else is inserted after the comment block that
+#      follows a "# VAR:" line, or else is appended to the end.
+# The result is written to "$file.tmp" and renamed over $file.  Dies if the
+# temporary file cannot be created, written, closed, or renamed.
+sub save_config {
+    my ($file, %config) = @_;
+    my (@lines);
+    if (open(FILE, '<', $file)) {
+        @lines = <FILE>;
+        chomp @lines;
+        close(FILE);
+    }
+
+    # Replace all existing variable assignments.
+    for (my ($i) = 0; $i <= $#lines; $i++) {
+        local $_ = $lines[$i];
+        my ($var, $value) = /^\s*([^=#]+)=(.*)$/ or next;
+        if (exists($config{$var})) {
+            $lines[$i] = shell_assign($var, $config{$var});
+            delete $config{$var};
+        } else {
+            $lines[$i] = "#$lines[$i]";
+        }
+    }
+
+    # Find a place to put any remaining variable assignments.
+  VAR:
+    for my $var (keys(%config)) {
+        my $assign = shell_assign($var, $config{$var});
+
+        # Replace the last commented-out variable assignment to $var, if any.
+        # \Q..\E keeps regex metacharacters in the name from being
+        # interpreted as a pattern.
+        for (my ($i) = $#lines; $i >= 0; $i--) {
+            local $_ = $lines[$i];
+            if (/^\s*#\s*\Q$var\E=/) {
+                $lines[$i] = $assign;
+                next VAR;
+            }
+        }
+
+        # Find a place to add the var: after the final commented line
+        # just after a line that contains "$var:".
+        for (my ($i) = 0; $i <= $#lines; $i++) {
+            if ($lines[$i] =~ /^\s*#\s*\Q$var\E:/) {
+                for (my ($j) = $i + 1; $j <= $#lines; $j++) {
+                    if ($lines[$j] !~ /^\s*#/) {
+                        splice(@lines, $j, 0, $assign);
+                        next VAR;
+                    }
+                }
+            }
+        }
+
+        # Just append it.
+        push(@lines, $assign);
+    }
+
+    # Write the new contents beside the original and rename into place.
+    # Check the write and the close so that a truncated temporary file is
+    # never renamed over the real configuration.
+    open(NEWFILE, '>', "$file.tmp") or die "$file.tmp: create: $!\n";
+    print NEWFILE join('', map("$_\n", @lines))
+        or die "$file.tmp: write: $!\n";
+    close(NEWFILE) or die "$file.tmp: close: $!\n";
+    rename("$file.tmp", $file) or die "$file.tmp: rename to $file: $!\n";
+}
diff --git a/debian/commands/update b/debian/commands/update
new file mode 100755
index 000000000..545e3c233
--- /dev/null
+++ b/debian/commands/update
@@ -0,0 +1,4 @@
+#! /bin/sh
+set -e
+apt-get update -qy
+apt-get upgrade -qy
diff --git a/debian/compat b/debian/compat
new file mode 100644
index 000000000..7ed6ff82d
--- /dev/null
+++ b/debian/compat
@@ -0,0 +1 @@
+5
diff --git a/debian/control b/debian/control
new file mode 100644
index 000000000..09eda1148
--- /dev/null
+++ b/debian/control
@@ -0,0 +1,143 @@
+Source: openvswitch
+Section: net
+Priority: extra
+Maintainer: Open vSwitch developers <ovs-dev@openvswitch.org>
+Build-Depends: debhelper (>= 5), autoconf (>= 2.60), automake1.10, libssl-dev, pkg-config (>= 0.21), po-debconf, bzip2, openssl, libncurses5-dev, libpcre3-dev
+Standards-Version: 3.7.3
+
+Package: openvswitch-datapath-source
+Architecture: all
+Depends: module-assistant, bzip2, debhelper (>= 5.0.37)
+Suggests: openvswitch-switch
+Description: Source code for Open vSwitch datapath Linux module
+ This package provides the Open vSwitch datapath module source code
+ that is needed by openvswitch-switch. The kernel module can be built
+ from it using module-assistant or make-kpkg. README.Debian in this
+ package provides further instructions.
+ .
+ Open vSwitch is a software-based Ethernet switch targeted at virtual
+ servers.
+
+Package: openvswitch-common
+Architecture: any
+Depends: ${shlibs:Depends}, openssl
+Description: Open vSwitch common components
+ openvswitch-common provides components required by both openvswitch-switch
+ and openvswitch-controller.
+ .
+ Open vSwitch is a software-based Ethernet switch targeted at virtual
+ servers.
+
+Package: openvswitch-switch
+Architecture: any
+Suggests: openvswitch-datapath-module
+Depends: ${shlibs:Depends}, ${misc:Depends}, openvswitch-common, dhcp3-client, module-init-tools, dmidecode, procps, debianutils
+Description: Open vSwitch switch implementations
+ openvswitch-switch provides the userspace components and utilities for
+ the Open vSwitch kernel-based switch.
+ .
+ Open vSwitch is a software-based Ethernet switch targeted at virtual
+ servers.
+
+Package: openvswitch-switch-config
+Architecture: any
+Depends: ${shlibs:Depends}, ${misc:Depends}, openvswitch-switch, libwww-perl, libdigest-sha1-perl
+Description: Open vSwitch switch implementations
+ openvswitch-switch-config provides a utility for interactively configuring
+ the Open vSwitch switch provided in the openvswitch-switch package.
+ .
+ Open vSwitch is a software-based Ethernet switch targeted at virtual
+ servers.
+
+Package: openvswitch-switchui
+Architecture: any
+Recommends: openvswitch-switch
+Depends: ${shlibs:Depends}, ${misc:Depends}, console-tools
+Description: Monitoring utility for OpenFlow switches
+ The ovs-switchui utility included in this package provides a
+ "front-panel display" to allow administrators to view the status of
+ an OpenFlow switch at a glance.
+ .
+ The ezio-term utility, also included, provides a VT100-compatible
+ terminal interface for EZIO3 (aka MTB-134) 16x2 LCD displays found on
+ server appliances made by Portwell. It allows ovs-switchui to work
+ with such displays.
+
+Package: openvswitch-pki
+Architecture: all
+Depends: ${shlibs:Depends}, ${misc:Depends}, openvswitch-common
+Description: Open vSwitch public key infrastructure
+ openvswitch-pki provides PKI (public key infrastructure) support for
+ Open vSwitch switches and controllers, reducing the risk of
+ man-in-the-middle attacks on the Open vSwitch network infrastructure.
+ .
+ Open vSwitch is a software-based Ethernet switch targeted at virtual
+ servers.
+
+Package: openvswitch-pki-server
+Architecture: all
+Depends: ${shlibs:Depends}, ${misc:Depends}, ${perl:Depends}, openvswitch-pki, apache2
+Description: Open vSwitch public key infrastructure (HTTP server support)
+ openvswitch-pki-server provides HTTP access to the Open vSwitch PKI (public
+ key infrastructure) maintained on the local machine by the
+ openvswitch-pki package. This HTTP access is needed for secure and
+ convenient OpenFlow switch setup using the ovs-switch-setup program
+ in the openvswitch-switch package.
+ .
+ Open vSwitch is a software-based Ethernet switch targeted at virtual
+ servers.
+
+Package: openvswitch-controller
+Architecture: any
+Depends: ${shlibs:Depends}, openvswitch-common, openvswitch-pki
+Description: Open vSwitch controller implementation
+ The Open vSwitch controller enables OpenFlow switches that connect to it
+ to act as MAC-learning Ethernet switches.
+ .
+ Open vSwitch is a software-based Ethernet switch targeted at virtual
+ servers.
+
+Package: corekeeper
+Architecture: all
+Depends: tmpreaper
+Description: Core file centralizer and reaper
+ The corekeeper package configures the system to dump all core files to
+ /var/log/core. It also deletes core files older than 7 days.
+
+Package: openvswitch-dbg
+Architecture: any
+Depends: ${shlibs:Depends}
+Description: Debug symbols for Open vSwitch packages
+ This package contains the debug symbols for all the other openvswitch-*
+ packages. Install it to debug one of them or to examine a core dump
+ produced by one of them.
+
+Package: openvswitch-monitor
+Architecture: any
+Recommends: openvswitch-switch
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Description: Monitor utility for Open vSwitch switches
+ The ovs-monitor utility included in this package monitors the secure
+ channel and datapath. If either becomes unresponsive, the switch is
+ rebooted.
+
+Package: openvswitch-wdt
+Architecture: any
+Recommends: openvswitch-switch
+Depends: ${shlibs:Depends}, ${misc:Depends}
+Description: Watchdog utility for Open vSwitch switches
+ The ovs-wdt program included in this package manages the hardware
+ watchdog timer in switches based on the Portwell NAR-5520 hardware.
+
+Package: nicira-switch
+Architecture: all
+Depends:
+ openvswitch-common (= ${source:Version}),
+ openvswitch-switch (= ${source:Version}),
+ openvswitch-switchui (= ${source:Version}),
+ openvswitch-datapath-module (= ${source:Version}),
+ corekeeper, openvswitch-monitor, openvswitch-wdt
+Description: Metapackage for installing a Nicira Open vSwitch switch
+ Installing this package will install everything needed for a Nicira
+ Portwell-based Open vSwitch switch, including monitoring and the switch UI.
+
diff --git a/debian/control.modules.in b/debian/control.modules.in
new file mode 100644
index 000000000..4da85b40d
--- /dev/null
+++ b/debian/control.modules.in
@@ -0,0 +1,20 @@
+Source: openvswitch
+Section: net
+Priority: extra
+Maintainer: Open vSwitch developers <ovs-dev@openvswitch.org>
+Build-Depends: debhelper (>= 5.0.37)
+Standards-Version: 3.7.3
+
+Package: openvswitch-datapath-module-_KVERS_
+Architecture: any
+Recommends: kernel-image-_KVERS_, openvswitch-switch
+Provides: openvswitch-datapath-module
+Description: Open vSwitch Linux datapath kernel module
+ This package contains the Open vSwitch loadable datapath kernel modules for
+ the kernel-image-_KVERS_ package.
+ .
+ If you compiled a custom kernel, you will most likely need to compile
+ a custom version of this module as well. The
+ openvswitch-datapath-source package has been provided for this
+ purpose. Refer to README.Debian provided in that package for further
+ instructions.
diff --git a/debian/copyright b/debian/copyright
new file mode 100644
index 000000000..0f89e828d
--- /dev/null
+++ b/debian/copyright
@@ -0,0 +1,21 @@
+Upstream Authors:
+
+ Nicira Networks
+
+Copyright:
+
+ Copyright (C) 2008 Nicira Networks.
+
+License:
+
+ Permission to use, copy, modify, and/or distribute this software for any
+ purpose with or without fee is hereby granted, provided that the above
+ copyright notice and this permission notice appear in all copies.
+
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/debian/corekeeper.cron.daily b/debian/corekeeper.cron.daily
new file mode 100755
index 000000000..badc192d0
--- /dev/null
+++ b/debian/corekeeper.cron.daily
@@ -0,0 +1,5 @@
+#! /bin/sh
+
+PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
+
+tmpreaper 7d --mtime --all /var/log/core
diff --git a/debian/corekeeper.init b/debian/corekeeper.init
new file mode 100755
index 000000000..27d62a124
--- /dev/null
+++ b/debian/corekeeper.init
@@ -0,0 +1,63 @@
+#!/bin/sh
+#
+# Example init.d script with LSB support.
+#
+# Please read this init.d carefully and modify the sections to
+# adjust it to the program you want to run.
+#
+# Copyright (c) 2007 Javier Fernandez-Sanguino <jfs@debian.org>
+#
+# This is free software; you may redistribute it and/or modify
+# it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2,
+# or (at your option) any later version.
+#
+# This is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License with
+# the Debian operating system, in /usr/share/common-licenses/GPL; if
+# not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA 02111-1307 USA
+#
+### BEGIN INIT INFO
+# Provides: corekeeper
+# Required-Start:
+# Required-Stop:
+# Should-Start: $syslog
+# Should-Stop:
+# Default-Start: 2 3 4 5
+# Default-Stop: 0 1 6
+# Short-Description: Configure core file dump location
+### END INIT INFO
+
+PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
+
+. /lib/lsb/init-functions
+
+set -e
+
+# Dispatch on the init action.  Only "start" does any work: it writes the
+# core file pattern to /proc/sys/kernel/core_pattern.  The other recognized
+# actions succeed without doing anything.
+case "$1" in
+    start)
+        log_daemon_msg "Initializing core dump location..."
+        if echo "/var/log/core/core.%e.%t" > /proc/sys/kernel/core_pattern
+        then
+            log_progress_msg "success"
+            log_end_msg 0
+            exit 0
+        else
+            log_end_msg 1
+            exit 1
+        fi
+        ;;
+    stop|restart|force-reload|status|reload)
+        exit 0
+        ;;
+    *)
+        # This script never sets $NAME, so spell the script path out instead
+        # of printing "/etc/init.d/".
+        N=/etc/init.d/corekeeper
+        echo "Usage: $N {start|stop|restart|force-reload|status}" >&2
+        exit 1
+        ;;
+esac
diff --git a/debian/dirs b/debian/dirs
new file mode 100644
index 000000000..ca882bbb7
--- /dev/null
+++ b/debian/dirs
@@ -0,0 +1,2 @@
+usr/bin
+usr/sbin
diff --git a/debian/openvswitch-common.dirs b/debian/openvswitch-common.dirs
new file mode 100644
index 000000000..be9ed2f03
--- /dev/null
+++ b/debian/openvswitch-common.dirs
@@ -0,0 +1 @@
+var/log/openvswitch
diff --git a/debian/openvswitch-common.install b/debian/openvswitch-common.install
new file mode 100644
index 000000000..1967ccc10
--- /dev/null
+++ b/debian/openvswitch-common.install
@@ -0,0 +1,3 @@
+_debian/utilities/ovs-appctl usr/sbin
+_debian/utilities/ovs-parse-leaks usr/bin
+_debian/utilities/ovs-pki usr/sbin
diff --git a/debian/openvswitch-common.manpages b/debian/openvswitch-common.manpages
new file mode 100644
index 000000000..99c48bd08
--- /dev/null
+++ b/debian/openvswitch-common.manpages
@@ -0,0 +1,2 @@
+_debian/utilities/ovs-appctl.8
+_debian/utilities/ovs-pki.8
diff --git a/debian/openvswitch-controller.README.Debian b/debian/openvswitch-controller.README.Debian
new file mode 100644
index 000000000..18819a79a
--- /dev/null
+++ b/debian/openvswitch-controller.README.Debian
@@ -0,0 +1,12 @@
+README.Debian for openvswitch-controller
+-------------------------------------
+
+* To (re)configure the controller, edit /etc/default/openvswitch-controller
+ and run "/etc/init.d/openvswitch-controller restart".
+
+* To enable OpenFlow switches to automatically discover the location
+ of the controller, you must install and configure a DHCP server.
+ The secchan(8) manpage (found in the openvswitch-switch package) gives
+ a working example configuration file for the ISC DHCP server.
+
+ -- Ben Pfaff <blp@nicira.com>, Mon, 11 May 2009 13:26:38 -0700
diff --git a/debian/openvswitch-controller.default b/debian/openvswitch-controller.default
new file mode 100644
index 000000000..1d9f92613
--- /dev/null
+++ b/debian/openvswitch-controller.default
@@ -0,0 +1,29 @@
+# This is a POSIX shell fragment -*- sh -*-
+
+# LISTEN: What OpenFlow connection methods should the controller listen on?
+#
+# This is a space-delimited list of connection methods:
+#
+# * "pssl:[PORT]": Listen for SSL connections on the specified PORT
+# (default: 6633). The private key, certificate, and CA certificate
+# must be specified below.
+#
+# * "ptcp:[PORT]": Listen for TCP connections on the specified PORT
+# (default: 6633). Not recommended for security reasons.
+#
+LISTEN="pssl:"
+
+# PRIVKEY: Name of file containing controller's private key.
+# Required if SSL enabled.
+PRIVKEY=/etc/openvswitch-controller/privkey.pem
+
+# CERT: Name of file containing certificate for private key.
+# Required if SSL enabled.
+CERT=/etc/openvswitch-controller/cert.pem
+
+# CACERT: Name of file containing switch CA certificate.
+# Required if SSL enabled.
+CACERT=/etc/openvswitch-controller/cacert.pem
+
+# Additional options to pass to controller, e.g. "--hub"
+DAEMON_OPTS=""
diff --git a/debian/openvswitch-controller.dirs b/debian/openvswitch-controller.dirs
new file mode 100644
index 000000000..4ada77c69
--- /dev/null
+++ b/debian/openvswitch-controller.dirs
@@ -0,0 +1 @@
+etc/openvswitch-controller
diff --git a/debian/openvswitch-controller.init b/debian/openvswitch-controller.init
new file mode 100755
index 000000000..ee9c44d5d
--- /dev/null
+++ b/debian/openvswitch-controller.init
@@ -0,0 +1,269 @@
+#!/bin/sh
+#
+# Copyright (c) 2007, 2009 Javier Fernandez-Sanguino <jfs@debian.org>
+#
+# This is free software; you may redistribute it and/or modify
+# it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2,
+# or (at your option) any later version.
+#
+# This is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License with
+# the Debian operating system, in /usr/share/common-licenses/GPL; if
+# not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA 02111-1307 USA
+#
+### BEGIN INIT INFO
+# Provides: openvswitch-controller
+# Required-Start: $network $local_fs
+# Required-Stop:
+# Should-Start: $named
+# Should-Stop:
+# Default-Start: 2 3 4 5
+# Default-Stop: 0 1 6
+# Short-Description: Open vSwitch controller
+### END INIT INFO
+
+PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
+
+DAEMON=/usr/sbin/controller # Introduce the server's location here
+NAME=ovs-controller # Introduce the short server's name here
+DESC=ovs-controller # Introduce a short description here
+LOGDIR=/var/log/openvswitch # Log directory to use
+
+PIDFILE=/var/run/$NAME.pid
+
+test -x $DAEMON || exit 0
+
+. /lib/lsb/init-functions
+
+# Default options; these can be overridden by the settings in
+# /etc/default/openvswitch-controller.
+DAEMON_OPTS="" # Additional options given to the server
+
+DIETIME=10 # Time to wait for the server to die, in seconds.
+ # If this value is set too low you might not
+ # let some servers die gracefully and
+ # 'restart' will not work.  force_stop and the
+ # restart action read this variable by this name.
+
+LOGFILE=$LOGDIR/$NAME.log # Server logfile
+#DAEMONUSER= # User to run the daemons as. If this value
+ # is set start-stop-daemon will chuid the server
+
+# Include defaults if available
+default=/etc/default/openvswitch-controller
+if [ -f $default ] ; then
+ . $default
+fi
+
+# Check that the user exists (if we set a user)
+# Does the user exist?
+if [ -n "$DAEMONUSER" ] ; then
+ if getent passwd | grep -q "^$DAEMONUSER:"; then
+ # Obtain the uid and gid
+ DAEMONUID=`getent passwd |grep "^$DAEMONUSER:" | awk -F : '{print $3}'`
+ DAEMONGID=`getent passwd |grep "^$DAEMONUSER:" | awk -F : '{print $4}'`
+ else
+ log_failure_msg "The user $DAEMONUSER, required to run $NAME does not exist."
+ exit 1
+ fi
+fi
+
+
+set -e
+
+running_pid() {
+# Check if a given process pid's cmdline matches a given name
+ pid=$1
+ name=$2
+ [ -z "$pid" ] && return 1
+ [ ! -d /proc/$pid ] && return 1
+ cmd=`cat /proc/$pid/cmdline | tr "\000" "\n"|head -n 1 |cut -d : -f 1`
+ # Is this the expected server
+ [ "$cmd" != "$name" ] && return 1
+ return 0
+}
+
+running() {
+# Check if the process is running looking at /proc
+# (works for all users)
+
+ # No pidfile, probably no daemon present
+ [ ! -f "$PIDFILE" ] && return 1
+ pid=`cat $PIDFILE`
+ running_pid $pid $DAEMON || return 1
+ return 0
+}
+
+start_server() {
+    # Starts $DAEMON through start-stop-daemon, listening on the methods in
+    # $LISTEN.  Exits 0 if $LISTEN is empty, and exits 1 if an SSL method is
+    # requested but a key or certificate file is missing.  Otherwise returns
+    # the status of start-stop-daemon.
+    if [ -z "$LISTEN" ]; then
+        echo "$default: No connection methods configured, controller disabled" >&2
+        exit 0
+    fi
+
+    SSL_OPTS=
+    case $LISTEN in
+        *ssl*)
+            # Fall back to the packaged locations for anything unset.
+            : ${PRIVKEY:=/etc/openvswitch-controller/privkey.pem}
+            : ${CERT:=/etc/openvswitch-controller/cert.pem}
+            : ${CACERT:=/etc/openvswitch-controller/cacert.pem}
+
+            # Report every missing file before giving up.
+            ssl_files_ok=yes
+            if test ! -e "$PRIVKEY"; then
+                echo "$PRIVKEY: private key missing" >&2
+                ssl_files_ok=no
+            fi
+            if test ! -e "$CERT"; then
+                echo "$CERT: certificate for private key missing" >&2
+                ssl_files_ok=no
+            fi
+            if test ! -e "$CACERT"; then
+                echo "$CACERT: CA certificate missing" >&2
+                ssl_files_ok=no
+            fi
+            if test "$ssl_files_ok" = no; then
+                exit 1
+            fi
+            SSL_OPTS="--private-key=$PRIVKEY --certificate=$CERT --ca-cert=$CACERT"
+            ;;
+    esac
+
+    # Start the process using the wrapper
+    if [ -n "$DAEMONUSER" ] ; then
+        # A daemon user is configured, so change the user id
+        start-stop-daemon --start --quiet --pidfile $PIDFILE \
+            --chuid $DAEMONUSER --exec $DAEMON -- \
+            --detach --pidfile=$PIDFILE $LISTEN $DAEMON_OPTS \
+            $SSL_OPTS
+    else
+        start-stop-daemon --start --pidfile $PIDFILE \
+            --exec $DAEMON -- --detach --pidfile=$PIDFILE \
+            $LISTEN $DAEMON_OPTS $SSL_OPTS
+    fi
+    errcode=$?
+    return $errcode
+}
+
+stop_server() {
+    # Stops the daemon recorded in $PIDFILE through start-stop-daemon and
+    # returns its status.  With a daemon user configured, only processes
+    # owned by that user are matched.
+    if [ -n "$DAEMONUSER" ] ; then
+        start-stop-daemon --stop --quiet --pidfile $PIDFILE \
+            --user $DAEMONUSER --exec $DAEMON
+    else
+        start-stop-daemon --stop --quiet --pidfile $PIDFILE \
+            --exec $DAEMON
+    fi
+    errcode=$?
+
+    return $errcode
+}
+
+reload_server() {
+ [ ! -f "$PIDFILE" ] && return 1
+ pid=`cat $PIDFILE` # This is the daemon's pid
+ # Send a SIGHUP
+ kill -1 $pid
+ return $?
+}
+
+force_stop() {
+    # Kills the daemon by hand: SIGTERM first, then SIGKILL if it is still
+    # alive after the grace period.  Exits 1 if it survives both signals;
+    # otherwise removes $PIDFILE.  Does nothing when there is no pidfile.
+    # Relies on running() to set the global pid.
+    [ ! -e "$PIDFILE" ] && return
+
+    # Grace period in seconds.  Fall back to DODTIME and then to 10 so that
+    # an unset DIETIME does not turn the sleeps below into "sleep s".
+    grace=${DIETIME:-${DODTIME:-10}}
+
+    if running ; then
+        kill -15 $pid
+        # Is it really dead?
+        sleep "${grace}s"
+        if running ; then
+            kill -9 $pid
+            sleep "${grace}s"
+            if running ; then
+                echo "Cannot kill $NAME (pid=$pid)!"
+                exit 1
+            fi
+        fi
+    fi
+    rm -f $PIDFILE
+}
+
+
+# Dispatch on the init action given as $1.
+#
+# NOTE(review): the script runs under "set -e", so a non-zero return from a
+# command that is not part of an "if" condition (log_end_msg, stop_server,
+# start_server, "$0 stop") may end the script right there -- confirm against
+# /lib/lsb/init-functions before relying on the trailing "exit 0".
+case "$1" in
+ start)
+ log_daemon_msg "Starting $DESC " "$NAME"
+ # Check if it's running first
+ if running ; then
+ log_progress_msg "apparently already running"
+ log_end_msg 0
+ exit 0
+ fi
+ if start_server && running ; then
+ # It's ok, the server started and is running
+ log_end_msg 0
+ else
+ # Either we could not start it or it is not running
+ # after we did
+ # NOTE: Some servers might die some time after they start,
+ # this code does not try to detect this and might give
+ # a false positive (use 'status' for that)
+ log_end_msg 1
+ fi
+ ;;
+ stop)
+ log_daemon_msg "Stopping $DESC" "$NAME"
+ if running ; then
+ # Only stop the server if we see it running
+ stop_server
+ log_end_msg $?
+ else
+ # If it's not running don't do anything
+ log_progress_msg "apparently not running"
+ log_end_msg 0
+ exit 0
+ fi
+ ;;
+ force-stop)
+ # First try to stop gracefully the program
+ $0 stop
+ if running; then
+ # If it's still running try to kill it more forcefully
+ log_daemon_msg "Stopping (force) $DESC" "$NAME"
+ force_stop
+ log_end_msg $?
+ fi
+ ;;
+ restart|force-reload)
+ log_daemon_msg "Restarting $DESC" "$NAME"
+ stop_server
+ # Wait some sensible amount, some server need this
+ # (the sleep is skipped when DIETIME is empty or unset)
+ [ -n "$DIETIME" ] && sleep $DIETIME
+ start_server
+ running
+ log_end_msg $?
+ ;;
+ status)
+
+ log_daemon_msg "Checking status of $DESC" "$NAME"
+ if running ; then
+ log_progress_msg "running"
+ log_end_msg 0
+ else
+ log_progress_msg "apparently not running"
+ log_end_msg 1
+ exit 1
+ fi
+ ;;
+ # Use this if the daemon cannot reload
+ reload)
+ log_warning_msg "Reloading $NAME daemon: not implemented, as the daemon"
+ log_warning_msg "cannot re-read the config file (use restart)."
+ ;;
+ *)
+ # NOTE(review): $NAME is "ovs-controller", so this prints
+ # /etc/init.d/ovs-controller -- confirm that it matches the name the
+ # script is installed under.
+ N=/etc/init.d/$NAME
+ echo "Usage: $N {start|stop|force-stop|restart|force-reload|status}" >&2
+ exit 1
+ ;;
+esac
+
+exit 0
diff --git a/debian/openvswitch-controller.install b/debian/openvswitch-controller.install
new file mode 100644
index 000000000..7d0edbbed
--- /dev/null
+++ b/debian/openvswitch-controller.install
@@ -0,0 +1 @@
+_debian/utilities/ovs-controller usr/sbin
diff --git a/debian/openvswitch-controller.manpages b/debian/openvswitch-controller.manpages
new file mode 100644
index 000000000..6a9911e1e
--- /dev/null
+++ b/debian/openvswitch-controller.manpages
@@ -0,0 +1 @@
+_debian/utilities/ovs-controller.8
diff --git a/debian/openvswitch-controller.postinst b/debian/openvswitch-controller.postinst
new file mode 100755
index 000000000..51acfb1ab
--- /dev/null
+++ b/debian/openvswitch-controller.postinst
@@ -0,0 +1,52 @@
+#!/bin/sh
+# postinst script for openvswitch-controller
+#
+# see: dh_installdeb(1)
+
+set -e
+
+# summary of how this script can be called:
+# * <postinst> `configure' <most-recently-configured-version>
+# * <old-postinst> `abort-upgrade' <new version>
+# * <conflictor's-postinst> `abort-remove' `in-favour' <package>
+# <new-version>
+# * <postinst> `abort-remove'
+# * <deconfigured's-postinst> `abort-deconfigure' `in-favour'
+# <failed-install-package> <version> `removing'
+# <conflicting-package> <version>
+# for details, see http://www.debian.org/doc/debian-policy/ or
+# the debian-policy package
+
+
+case "$1" in
+    configure)
+        # Populate /etc/openvswitch-controller with whatever is missing.
+        cd /etc/openvswitch-controller
+
+        # The CA certificate is a symlink into the switch CA directory.
+        if [ ! -e cacert.pem ]; then
+            ln -s /usr/share/openvswitch/pki/switchca/cacert.pem cacert.pem
+        fi
+
+        # Create a key pair and certificate through ovs-pki under a
+        # restrictive umask, then make the certificate and request readable.
+        if [ ! -e privkey.pem ] || [ ! -e cert.pem ]; then
+            saved_umask=$(umask)
+            umask 077
+            ovs-pki req+sign tmp controller >/dev/null
+            mv tmp-privkey.pem privkey.pem
+            mv tmp-cert.pem cert.pem
+            mv tmp-req.pem req.pem
+            chmod go+r cert.pem req.pem
+            umask $saved_umask
+        fi
+        ;;
+
+    abort-upgrade|abort-remove|abort-deconfigure)
+        # Nothing to undo for these actions.
+        ;;
+
+    *)
+        echo "postinst called with unknown argument \`$1'" >&2
+        exit 1
+        ;;
+esac
+
+#DEBHELPER#
+
+exit 0
+
+
diff --git a/debian/openvswitch-datapath-module-_KVERS_.postinst.modules.in b/debian/openvswitch-datapath-module-_KVERS_.postinst.modules.in
new file mode 100755
index 000000000..026830082
--- /dev/null
+++ b/debian/openvswitch-datapath-module-_KVERS_.postinst.modules.in
@@ -0,0 +1,25 @@
+#!/bin/sh
+# postinst script for #PACKAGE#
+#
+# see: dh_installdeb(1)
+
+set -e
+
+depmod -a
+
+#DEBHELPER#
+
+# If the switch is running, restart it. This ensures that we are using the
+# latest kernel module, because the init script will unload and reload the
+# module.
+#
+# (Ideally we'd only want to do this if this package corresponds to the
+# running kernel, but I don't know a reliable way to check.)
+INIT=/etc/init.d/openvswitch-switch
+if test -x $INIT && $INIT status; then
+ $INIT restart || true
+fi
+
+exit 0
+
+
diff --git a/debian/openvswitch-datapath-source.README.Debian b/debian/openvswitch-datapath-source.README.Debian
new file mode 100644
index 000000000..73bba7a14
--- /dev/null
+++ b/debian/openvswitch-datapath-source.README.Debian
@@ -0,0 +1,31 @@
+Open vSwitch for Debian
+----------------------
+
+* How do I build this module the Debian way?
+
+ - Building with module-assistant:
+
+ $ module-assistant auto-install openvswitch
+ or
+ $ m-a a-i openvswitch
+
+ If kernel source or headers are in a non-standard directory, add
+ the option -k /path/to/kernel/source with the correct path.
+
+ - Building with make-kpkg
+
+ $ cd /usr/src/
+ $ tar jxvf openvswitch.tar.bz2
+ $ cd /usr/src/kernel-source-2.6.9
+ $ make-kpkg --added-modules=openvswitch modules
+
+ - Building without make-kpkg
+
+ $ cd /usr/src/
+ $ tar jxvf openvswitch.tar.bz2
+ $ cd modules/openvswitch
+ $ fakeroot debian/rules kdist_image
+
+ If you run this as root, fakeroot is not needed.
+
+ -- Ben Pfaff <blp@nicira.com>, Mon, 11 May 2009 13:27:50 -0700
diff --git a/debian/openvswitch-datapath-source.copyright b/debian/openvswitch-datapath-source.copyright
new file mode 100644
index 000000000..32cba237a
--- /dev/null
+++ b/debian/openvswitch-datapath-source.copyright
@@ -0,0 +1,15 @@
+Upstream Authors:
+
+ Nicira Networks
+
+Copyright:
+
+ Copyright (C) 2008 Nicira Networks
+
+License:
+
+ Files in the datapath/ directory and its sub-directories are covered under the GNU
+ General Public License Version 2.
+
+ On Debian systems, the complete text of the GNU General
+ Public License can be found in `/usr/share/common-licenses/GPL'.
diff --git a/debian/openvswitch-datapath-source.dirs b/debian/openvswitch-datapath-source.dirs
new file mode 100644
index 000000000..e5a7d6b07
--- /dev/null
+++ b/debian/openvswitch-datapath-source.dirs
@@ -0,0 +1 @@
+usr/src/modules/openvswitch-datapath/debian
diff --git a/debian/openvswitch-datapath-source.install b/debian/openvswitch-datapath-source.install
new file mode 100644
index 000000000..d1acc8947
--- /dev/null
+++ b/debian/openvswitch-datapath-source.install
@@ -0,0 +1,6 @@
+debian/changelog usr/src/modules/openvswitch-datapath/debian
+debian/control usr/src/modules/openvswitch-datapath/debian
+debian/compat usr/src/modules/openvswitch-datapath/debian
+debian/*.modules.in usr/src/modules/openvswitch-datapath/debian
+debian/rules usr/src/modules/openvswitch-datapath/debian
+_debian/openvswitch.tar.gz usr/src/modules/openvswitch-datapath
diff --git a/debian/openvswitch-monitor.default b/debian/openvswitch-monitor.default
new file mode 100644
index 000000000..f0c356e81
--- /dev/null
+++ b/debian/openvswitch-monitor.default
@@ -0,0 +1,27 @@
+# This is a POSIX shell fragment -*- sh -*-
+
+# To configure the Open vSwitch monitor package, modify the following.
+# Afterward, the monitor will be configured automatically at boot time.
+# It can be started immediately with
+# /etc/init.d/openvswitch-monitor start
+
+# Defaults for initscript
+# sourced by /etc/init.d/openvswitch-monitor
+# installed at /etc/default/openvswitch-monitor by the maintainer scripts
+
+# THRESHOLD: The number of failed attempts the monitor should make until
+# it reboots the system. A value of zero disables the monitor.
+THRESHOLD=3
+
+# INTERVAL: The number of seconds to wait between probing secchan and
+# the datapath.
+INTERVAL=1
+
+# LOG_FILE: File to log messages related to monitoring.
+LOG_FILE="/var/log/openvswitch/monitor"
+
+# SWITCH_VCONN: The vconn used to connect to the switch (secchan).
+# The secchan must be configured to listen to this vconn. The default
+# set here is also listened to by default by the openvswitch-switch
+# package, so ordinarily there is no need to modify this.
+SWITCH_VCONN="/var/run/secchan.mgmt"
diff --git a/debian/openvswitch-monitor.dirs b/debian/openvswitch-monitor.dirs
new file mode 100644
index 000000000..236670a2d
--- /dev/null
+++ b/debian/openvswitch-monitor.dirs
@@ -0,0 +1 @@
+usr/sbin
diff --git a/debian/openvswitch-monitor.init b/debian/openvswitch-monitor.init
new file mode 100755
index 000000000..8c7e1ad08
--- /dev/null
+++ b/debian/openvswitch-monitor.init
@@ -0,0 +1,174 @@
+#!/bin/sh
+#
+# Example init.d script with LSB support.
+#
+# Please read this init.d carefully and modify the sections to
+# adjust it to the program you want to run.
+#
+# Copyright (c) 2007, 2009 Javier Fernandez-Sanguino <jfs@debian.org>
+#
+# This is free software; you may redistribute it and/or modify
+# it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2,
+# or (at your option) any later version.
+#
+# This is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License with
+# the Debian operating system, in /usr/share/common-licenses/GPL; if
+# not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA 02111-1307 USA
+#
+### BEGIN INIT INFO
+# Provides: openvswitch-monitor
+# Required-Start: $network $local_fs
+# Required-Stop:
+# Should-Start: $named $syslog openvswitch-switch
+# Should-Stop:
+# Default-Start: 2 3 4 5
+# Default-Stop: 0 1 6
+# Short-Description: Open vSwitch switch monitor
+### END INIT INFO
+
+PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
+
+# Binary, service name and description used by the actions below.
+DAEMON=/usr/sbin/ovs-monitor
+NAME=openvswitch-monitor
+DESC="Open vSwitch switch monitor"
+
+PIDFILE=/var/run/$NAME.pid
+
+# Exit quietly when the monitor binary is not installed/executable.
+test -x $DAEMON || exit 0
+
+. /lib/lsb/init-functions
+
+# Default options, these can be overridden by the information
+# at /etc/default/$NAME
+DAEMON_OPTS="" # Additional options given to the daemon
+
+DODTIME=10 # Time to wait for the daemon to die, in seconds
+ # If this value is set too low you might not
+ # let some daemons to die gracefully and
+ # 'restart' will not work
+
+# Include defaults if available
+if [ -f /etc/default/$NAME ] ; then
+ . /etc/default/$NAME
+fi
+
+# From here on, any unchecked command failure aborts the script.
+set -e
+
+running_pid() {
+# Check if a given process pid's cmdline matches a given name
+ pid=$1
+ name=$2
+ [ -z "$pid" ] && return 1
+ [ ! -d /proc/$pid ] && return 1
+ return 0
+}
+
+running() {
+# Report whether the pid recorded in $PIDFILE is alive according to /proc
+# (works for all users).  Returns 0 if so, 1 otherwise.
+
+    # Without a pidfile there is presumably no daemon.
+    if [ ! -f "$PIDFILE" ]; then
+        return 1
+    fi
+    pid=$(cat $PIDFILE)
+    if running_pid $pid $DAEMON; then
+        return 0
+    fi
+    return 1
+}
+
+start_daemon() {
+# Start the process using the wrapper
+ if test $THRESHOLD != 0; then
+ start-stop-daemon --start --quiet -m --background --pidfile $PIDFILE \
+ --exec $DAEMON -- -c $THRESHOLD -i $INTERVAL -l $LOG_FILE \
+ -s $SWITCH_VCONN $DAEMON_OPTS
+ fi
+
+ # Wait up to 3 seconds for the daemon to start.
+ for i in 1 2 3; do
+ if running; then
+ break
+ fi
+ sleep 1
+ done
+}
+
+stop_daemon() {
+ start-stop-daemon -o --stop --pidfile $PIDFILE
+ rm $PIDFILE
+}
+
+# Dispatch on the requested init action.
+case "$1" in
+    start)
+        log_daemon_msg "Starting $DESC" "$NAME"
+        # Check if it's running first
+        if running ; then
+            log_progress_msg "apparently already running"
+            log_end_msg 0
+            exit 0
+        fi
+        if start_daemon && running ; then
+            # It's ok, the daemon started and is running
+            log_end_msg 0
+        else
+            # Either we could not start it or it is not running
+            # after we did
+            # NOTE: Some daemons might die some time after they start,
+            # this code does not try to detect this and might give
+            # a false positive (use 'status' for that)
+            log_end_msg 1
+        fi
+        ;;
+    stop)
+        log_daemon_msg "Stopping $DESC" "$NAME"
+        if running ; then
+            # Only stop the daemon if we see it running
+            stop_daemon
+            log_end_msg $?
+        else
+            # If it's not running don't do anything
+            log_progress_msg "apparently not running"
+            log_end_msg 0
+            exit 0
+        fi
+        ;;
+    restart|force-reload)
+        log_daemon_msg "Restarting $DESC" "$NAME"
+        if running ; then
+            stop_daemon
+            # Wait some sensible amount, some daemons need this.  The
+            # script defines DODTIME; DIETIME is still honoured first in
+            # case the defaults file sets it.
+            if [ -n "${DIETIME:-$DODTIME}" ]; then
+                sleep "${DIETIME:-$DODTIME}"
+            fi
+        fi
+        start_daemon
+        # Test "running" inside a condition: as a bare command its failure
+        # would abort the script (set -e) before the result is logged.
+        if running ; then
+            log_end_msg 0
+        else
+            log_end_msg 1
+        fi
+        ;;
+    status)
+        log_daemon_msg "Checking status of $DESC" "$NAME"
+        if running ; then
+            log_progress_msg "running"
+            log_end_msg 0
+        else
+            log_progress_msg "apparently not running"
+            log_end_msg 1
+            exit 1
+        fi
+        ;;
+    # Use this if the daemon cannot reload
+    reload)
+        log_warning_msg "Reloading $NAME daemon: not implemented, as the daemon"
+        log_warning_msg "cannot re-read the config file (use restart)."
+        ;;
+    *)
+        N=/etc/init.d/$NAME
+        echo "Usage: $N {start|stop|restart|force-reload|status}" >&2
+        exit 1
+        ;;
+esac
+
+exit 0
diff --git a/debian/openvswitch-monitor.install b/debian/openvswitch-monitor.install
new file mode 100644
index 000000000..9fc601a8a
--- /dev/null
+++ b/debian/openvswitch-monitor.install
@@ -0,0 +1 @@
+utilities/ovs-monitor usr/sbin
diff --git a/debian/openvswitch-pki-server.apache2 b/debian/openvswitch-pki-server.apache2
new file mode 100644
index 000000000..d0bc8ba9e
--- /dev/null
+++ b/debian/openvswitch-pki-server.apache2
@@ -0,0 +1 @@
+Alias /openvswitch/pki/ /usr/share/openvswitch/pki/
diff --git a/debian/openvswitch-pki-server.dirs b/debian/openvswitch-pki-server.dirs
new file mode 100644
index 000000000..7307777bc
--- /dev/null
+++ b/debian/openvswitch-pki-server.dirs
@@ -0,0 +1 @@
+etc/apache2/sites-available
diff --git a/debian/openvswitch-pki-server.install b/debian/openvswitch-pki-server.install
new file mode 100644
index 000000000..5af75da09
--- /dev/null
+++ b/debian/openvswitch-pki-server.install
@@ -0,0 +1 @@
+_debian/utilities/ovs-pki-cgi usr/lib/cgi-bin
diff --git a/debian/openvswitch-pki-server.postinst b/debian/openvswitch-pki-server.postinst
new file mode 100755
index 000000000..d161a98a9
--- /dev/null
+++ b/debian/openvswitch-pki-server.postinst
@@ -0,0 +1,44 @@
+#!/bin/sh
+# postinst script for openvswitch-pki-server
+#
+# see: dh_installdeb(1)
+
+set -e
+
+# summary of how this script can be called:
+# * <postinst> `configure' <most-recently-configured-version>
+# * <old-postinst> `abort-upgrade' <new version>
+# * <conflictor's-postinst> `abort-remove' `in-favour' <package>
+# <new-version>
+# * <postinst> `abort-remove'
+# * <deconfigured's-postinst> `abort-deconfigure' `in-favour'
+# <failed-install-package> <version> `removing'
+# <conflicting-package> <version>
+# for details, see http://www.debian.org/doc/debian-policy/ or
+# the debian-policy package
+
+case "$1" in
+    configure)
+        # Enable the PKI site under Apache, then ask Apache to reload.
+        # A failing reload is deliberately ignored.
+        # NOTE(review): the site is named "openflow-pki" although this
+        # package is openvswitch-pki-server -- confirm it matches the file
+        # installed under etc/apache2/sites-available.
+        a2ensite openflow-pki >/dev/null
+        if command -v invoke-rc.d >/dev/null 2>&1; then
+            invoke-rc.d apache2 force-reload || true
+        elif [ -x /etc/init.d/apache2 ]; then
+            /etc/init.d/apache2 force-reload || true
+        fi
+        ;;
+
+    abort-upgrade|abort-remove|abort-deconfigure)
+        # Nothing to undo for these.
+        ;;
+
+    *)
+        echo "postinst called with unknown argument \`$1'" >&2
+        exit 1
+        ;;
+esac
+
+#DEBHELPER#
+
+exit 0
+
+
diff --git a/debian/openvswitch-pki.postinst b/debian/openvswitch-pki.postinst
new file mode 100755
index 000000000..a75a314f4
--- /dev/null
+++ b/debian/openvswitch-pki.postinst
@@ -0,0 +1,41 @@
+#!/bin/sh
+# postinst script for openvswitch
+#
+# see: dh_installdeb(1)
+
+set -e
+
+# summary of how this script can be called:
+# * <postinst> `configure' <most-recently-configured-version>
+# * <old-postinst> `abort-upgrade' <new version>
+# * <conflictor's-postinst> `abort-remove' `in-favour' <package>
+# <new-version>
+# * <postinst> `abort-remove'
+# * <deconfigured's-postinst> `abort-deconfigure' `in-favour'
+# <failed-install-package> <version> `removing'
+# <conflicting-package> <version>
+# for details, see http://www.debian.org/doc/debian-policy/ or
+# the debian-policy package
+
+case "$1" in
+    configure)
+        # Create the certificate authorities unless the PKI directory
+        # is already present.
+        [ -d /usr/share/openvswitch/pki ] || ovs-pki init
+        ;;
+
+    abort-upgrade|abort-remove|abort-deconfigure)
+        # Nothing to undo for these.
+        ;;
+
+    *)
+        echo "postinst called with unknown argument \`$1'" >&2
+        exit 1
+        ;;
+esac
+
+#DEBHELPER#
+
+exit 0
+
+
diff --git a/debian/openvswitch-switch-config.dirs b/debian/openvswitch-switch-config.dirs
new file mode 100644
index 000000000..881ded8ae
--- /dev/null
+++ b/debian/openvswitch-switch-config.dirs
@@ -0,0 +1 @@
+/usr/share/lintian/overrides
diff --git a/debian/openvswitch-switch-config.install b/debian/openvswitch-switch-config.install
new file mode 100644
index 000000000..c8cbf17f1
--- /dev/null
+++ b/debian/openvswitch-switch-config.install
@@ -0,0 +1 @@
+debian/ovs-switch-setup usr/sbin
diff --git a/debian/openvswitch-switch-config.manpages b/debian/openvswitch-switch-config.manpages
new file mode 100644
index 000000000..0e1227932
--- /dev/null
+++ b/debian/openvswitch-switch-config.manpages
@@ -0,0 +1 @@
+debian/ovs-switch-setup.8
diff --git a/debian/openvswitch-switch-config.overrides b/debian/openvswitch-switch-config.overrides
new file mode 100644
index 000000000..4ac77abac
--- /dev/null
+++ b/debian/openvswitch-switch-config.overrides
@@ -0,0 +1 @@
+debconf-is-not-a-registry
diff --git a/debian/openvswitch-switch-config.templates b/debian/openvswitch-switch-config.templates
new file mode 100644
index 000000000..24bf0352a
--- /dev/null
+++ b/debian/openvswitch-switch-config.templates
@@ -0,0 +1,228 @@
+Template: openvswitch-switch/netdevs
+Type: multiselect
+_Choices: ${choices}
+_Description: OpenFlow switch network devices:
+ Choose the network devices that should become part of the OpenFlow
+ switch. At least two devices must be selected for this machine to be
+ a useful switch. Unselecting all network devices will disable the
+ OpenFlow switch entirely.
+ .
+ The network devices that you select should not be configured with IP
+ or IPv6 addresses, even if the switch contacts the controller over
+ one of the selected network devices. This is because a running
+ OpenFlow switch takes over network devices at a low level: they
+ become part of the switch and cannot be used for other purposes.
+
+Template: openvswitch-switch/no-netdevs
+Type: error
+_Description: No network devices were selected.
+ No network devices were selected for inclusion in the OpenFlow switch.
+ The switch will be disabled.
+
+Template: openvswitch-switch/configured-netdevs
+Type: note
+_Description: Some Network Devices Have IP or IPv6 Addresses
+ The following network devices selected to be part of the OpenFlow switch
+ have IP or IPv6 addresses configured:
+ .
+ ${configured-netdevs}
+ .
+ This is usually a mistake, even if the switch contacts the controller over
+ one of the selected network devices. This is because a running
+ OpenFlow switch takes over network devices at a low level: they
+ become part of the switch and cannot be used for other purposes.
+ .
+ If this is an unintentional mistake, move back and fix the selection,
+ or de-configure the IP or IPv6 from these network devices.
+
+Template: openvswitch-switch/mode
+Type: select
+_Choices: discovery, in-band, out-of-band
+Default: discovery
+_Description: Switch-to-controller access method:
+ The OpenFlow switch must be able to contact the OpenFlow controller over
+ the network. It can do so in one of three ways:
+ .
+ discovery: A single network is used for OpenFlow traffic and other
+ data traffic; that is, the switch contacts the controller over one of
+ the network devices selected as OpenFlow switch network devices in
+ the previous question. The switch automatically determines the
+ location of the controller using a DHCP request with an
+ OpenFlow-specific vendor option. This is the most common case.
+ .
+ in-band: As above, but the location of the controller is manually
+ configured.
+ .
+ out-of-band: OpenFlow traffic uses a network separate from the data traffic
+ that it controls. If this is the case, the control network must already
+ be configured on a network device other than one of those selected as
+ an OpenFlow switch netdev in the previous question.
+
+Template: openvswitch-switch/discover
+Type: note
+_Description: Preparing to discover controller.
+ The setup program will now attempt to discover the OpenFlow controller.
+ Controller discovery may take up to 30 seconds. Please be patient.
+ .
+ See secchan(8) for instructions on how to configure a DHCP server for
+ controller discovery.
+
+Template: openvswitch-switch/discovery-failure
+Type: error
+_Description: Controller discovery failed.
+ The controller's location could not be determined automatically.
+ .
+ Ensure that the OpenFlow DHCP server is properly configured. See
+ secchan(8) for instructions on how to configure a DHCP server for
+ controller discovery.
+
+Template: openvswitch-switch/discovery-success
+Type: boolean
+Default: true
+_Description: Use discovered settings?
+ Controller discovery obtained the following settings:
+ .
+ Controller location: ${controller-vconn}
+ .
+ PKI URL: ${pki-uri}
+ .
+ Please verify that these settings are correct.
+
+Template: openvswitch-switch/switch-ip
+Type: string
+Default: dhcp
+_Description: Switch IP address:
+ For in-band communication with the controller, the OpenFlow switch must
+ be able to determine its own IP address. Its IP address may be configured
+ statically or dynamically.
+ .
+ For static configuration, specify the switch's IP address as a string.
+ .
+ For dynamic configuration with DHCP (the most common case), specify "dhcp".
+ Configuration with DHCP will only work reliably if the network topology
+ allows the switch to contact the DHCP server before it connects to the
+ OpenFlow controller.
+
+Template: openvswitch-switch/switch-ip-error
+Type: error
+_Description: The switch IP address is invalid.
+ The switch IP address must be specified as "dhcp" or a valid IP address in
+ dotted-octet form (e.g. "1.2.3.4").
+
+Template: openvswitch-switch/controller-vconn
+Type: string
+_Description: Controller location:
+ Specify how the OpenFlow switch should connect to the OpenFlow controller.
+ The value should be in form "ssl:HOST[:PORT]" to connect to the controller
+ over SSL (recommended for security) or "tcp:HOST[:PORT]" to connect over
+ cleartext TCP.
+
+Template: openvswitch-switch/controller-vconn-error
+Type: error
+_Description: The controller location is invalid.
+ The controller location must be specified as "ssl:HOST[:PORT]" to
+ connect to the controller over SSL (recommended for security) or
+ "tcp:HOST[:PORT]" to connect over cleartext TCP.
+
+Template: openvswitch-switch/pki-uri
+Type: string
+_Description: OpenFlow PKI server host name or URL:
+ Specify a URL to the OpenFlow public key infrastructure (PKI). If a
+ host name or IP address is specified in place of a URL, then
+ http://<host>/openvswitch/pki/ will be used,
+ where <host> is the specified host name or IP address.
+ .
+ The OpenFlow PKI is usually on the same machine as the OpenFlow
+ controller.
+ .
+ The setup process will connect to the OpenFlow PKI server over
+ HTTP, using the system's configured default HTTP proxy (if any).
+
+Template: openvswitch-switch/fetch-cacert-failed
+Type: error
+_Description: The switch CA certificate could not be retrieved.
+ Retrieval of ${url} failed, with the following status: "${error}".
+ .
+ Ensure that the OpenFlow PKI server is correctly configured and
+ available at ${pki-uri}. If the system is configured to use an HTTP
+ proxy, also make sure that the HTTP proxy is available and that the
+ PKI server can be reached through it.
+
+Template: openvswitch-switch/verify-controller-ca
+Type: select
+_Choices: yes, no
+Default: yes
+_Description: Is ${fingerprint} the controller CA's fingerprint?
+ If a man-in-the-middle attack is possible in your network
+ environment, check that the controller CA's fingerprint is really
+ ${fingerprint}. Answer "yes" if it matches, "no" if
+ there is a discrepancy.
+ .
+ If a man-in-the-middle attack is not a concern, there is no need to
+ verify the fingerprint. Simply answer "yes".
+
+Template: openvswitch-switch/send-cert-req
+Type: select
+_Choices: yes, no
+Default: yes
+_Description: Send certificate request to switch CA?
+ Before it can connect to the controller over SSL, the OpenFlow
+ switch's key must be signed by the switch certificate authority (CA)
+ located on the OpenFlow PKI server, which is usually collocated with
+ the OpenFlow controller. A signing request can be sent to the PKI
+ server now.
+ .
+ Answer "yes" to send a signing request to the switch CA now. This is
+ ordinarily the correct choice. There is no harm in sending a given
+ signing request more than once.
+ .
+ Answer "no" to skip sending a signing request to the switch CA.
+ Unless the request has already been sent to the switch CA, manual
+ sending of the request and signing will be necessary.
+
+Template: openvswitch-switch/send-cert-req-failed
+Type: error
+_Description: The certificate request could not be sent.
+ Posting to ${url} failed, with the following status: "${error}".
+ .
+ Ensure that the OpenFlow PKI server is correctly configured and
+ available at ${pki-uri}.
+
+Template: openvswitch-switch/fetch-switch-cert
+Type: select
+_Choices: yes, no
+_Description: Fetch signed switch certificate from PKI server?
+ Before it can connect to the controller over SSL, the OpenFlow
+ switch's key must be signed by the switch certificate authority (CA)
+ located on the OpenFlow PKI server, which is usually collocated with
+ the OpenFlow controller.
+ .
+ At this point, a signing request has been sent to the switch CA (or
+ sending a request has been manually skipped), but the signed
+ certificate has not yet been retrieved. Manual action may need to be
+ taken at the PKI server to approve the signing request.
+ .
+ Answer "yes" to attempt to retrieve the signed switch certificate
+ from the switch CA. If the switch certificate request has been
+ signed at the PKI server, this is the correct choice.
+ .
+ Answer "no" to postpone switch configuration. The configuration
+ process must be restarted later, when the switch certificate request
+ has been signed.
+
+Template: openvswitch-switch/fetch-switch-cert-failed
+Type: error
+_Description: Signed switch certificate could not be retrieved.
+ The signed switch certificate could not be retrieved from the switch
+ CA: retrieval of ${url} failed, with the following status: "${error}".
+ .
+ This probably indicates that the switch's certificate request has not
+ yet been signed. If this is the problem, it may be fixed by signing
+ the certificate request at ${pki-uri}, then trying to fetch the
+ signed switch certificate again.
+
+Template: openvswitch-switch/complete
+Type: note
+_Description: OpenFlow Switch Setup Finished
+ Setup of this OpenFlow switch is finished. Complete the setup procedure
+ to enable the switch.
diff --git a/debian/openvswitch-switch.README.Debian b/debian/openvswitch-switch.README.Debian
new file mode 100644
index 000000000..eb504f655
--- /dev/null
+++ b/debian/openvswitch-switch.README.Debian
@@ -0,0 +1,18 @@
+README.Debian for openvswitch-switch
+---------------------------------
+
+* The switch must be configured before it can be used. To configure
+ it interactively, install the openvswitch-switch-config package and run
+ the ovs-switch-setup program. Alternatively, edit
+ /etc/default/openvswitch-switch by hand, then start the switch manually
+ with "/etc/init.d/openvswitch-switch start".
+
+* To use the Linux kernel-based switch implementation, you will need
+ to build and install the Open vSwitch kernel module. To do so, install
+ the openvswitch-datapath-source package, then follow the instructions
+ given in /usr/share/doc/openvswitch-datapath-source/README.Debian
+
+* This package does not yet support the userspace datapath-based
+ switch implementation.
+
+ -- Ben Pfaff <blp@nicira.com>, Mon, 11 May 2009 13:29:43 -0700
diff --git a/debian/openvswitch-switch.dirs b/debian/openvswitch-switch.dirs
new file mode 100644
index 000000000..b4a528732
--- /dev/null
+++ b/debian/openvswitch-switch.dirs
@@ -0,0 +1,2 @@
+/etc/openvswitch-switch
+/usr/share/openvswitch/switch
diff --git a/debian/openvswitch-switch.init b/debian/openvswitch-switch.init
new file mode 100755
index 000000000..b238f72e1
--- /dev/null
+++ b/debian/openvswitch-switch.init
@@ -0,0 +1,428 @@
+#! /bin/sh
+#
+# /etc/init.d/openvswitch-switch
+#
+# Written by Miquel van Smoorenburg <miquels@cistron.nl>.
+# Modified for Debian by Ian Murdock <imurdock@gnu.ai.mit.edu>.
+# Further changes by Javier Fernandez-Sanguino <jfs@debian.org>
+# Modified for openvswitch-switch.
+#
+# Version: @(#)skeleton 1.9 26-Feb-2001 miquels@cistron.nl
+#
+### BEGIN INIT INFO
+# Provides: openvswitch-switch
+# Required-Start: $network $named $remote_fs $syslog
+# Required-Stop:
+# Default-Start: 2 3 4 5
+# Default-Stop: 0 1 6
+# Short-Description: Open vSwitch switch
+### END INIT INFO
+
+PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
+DAEMON=/usr/sbin/secchan
+NAME=secchan
+DESC=secchan
+
+test -x $DAEMON || exit 0
+
+NICIRA_OUI="002320"
+
+LOGDIR=/var/log/openvswitch
+PIDFILE=/var/run/$NAME.pid
+DHCLIENT_PIDFILE=/var/run/dhclient.of0.pid
+DODTIME=1 # Time to wait for the server to die, in seconds
+ # If this value is set too low you might not
+ # let some servers to die gracefully and
+ # 'restart' will not work
+
+# Include secchan defaults if available
+unset NETDEVS
+unset MODE
+unset SWITCH_IP
+unset CONTROLLER
+unset PRIVKEY
+unset CERT
+unset CACERT
+unset CACERT_MODE
+unset MGMT_VCONNS
+unset COMMANDS
+unset DAEMON_OPTS
+unset CORE_LIMIT
+unset DATAPATH_ID
+default=/etc/default/openvswitch-switch
+if [ -f $default ] ; then
+ . $default
+fi
+
+set -e
+
+running_pid()
+{
+    # Succeed when pid $1 exists under /proc and the first word of its
+    # command line (truncated at the first ':') equals $2 or ends in /$2.
+    pid=$1
+    name=$2
+    if [ -z "$pid" ] || [ ! -d /proc/$pid ]; then
+        return 1
+    fi
+    cmd=$(cat /proc/$pid/cmdline | tr "\000" "\n" | head -n 1 | cut -d : -f 1)
+    # Is this the expected child?
+    case $cmd in
+        $name|*/$name) return 0 ;;
+    esac
+    return 1
+}
+
+running()
+{
+# Report whether the pid stored in $PIDFILE belongs to a live $NAME
+# process, judging by /proc (works for all users).
+
+    # Without a pidfile there is presumably no daemon.
+    if [ ! -f "$PIDFILE" ]; then
+        return 1
+    fi
+    # Read the pid and compare it against the binary name.
+    pid=$(cat $PIDFILE)
+    if running_pid $pid $NAME; then
+        return 0
+    fi
+    return 1
+}
+
+force_stop() {
+# Forcefully kill the process
+ [ ! -f "$PIDFILE" ] && return
+ if running ; then
+ kill -15 $pid
+ # Is it really dead?
+ [ -n "$DODTIME" ] && sleep "$DODTIME"s
+ if running ; then
+ kill -9 $pid
+ [ -n "$DODTIME" ] && sleep "$DODTIME"s
+ if running ; then
+ echo "Cannot kill $NAME (pid=$pid)!"
+ exit 1
+ fi
+ fi
+ fi
+ rm -f $PIDFILE
+ return 0
+}
+
+must_succeed() {
+ echo -n "$1: "
+ shift
+ if "$@"; then
+ echo "success."
+ else
+ echo " ERROR."
+ exit 1
+ fi
+}
+
+check_op() {
+    # Print "$1: ", run the remaining arguments as a command and report
+    # the outcome.  Unlike must_succeed, a failure is only reported; the
+    # function still returns 0.
+    echo -n "$1: "
+    shift
+    if ! "$@"; then
+        echo " ERROR."
+        return 0
+    fi
+    echo "success."
+}
+
+configure_ssl() {
+ if (test "$CACERT_MODE" != secure && test "$CACERT_MODE" != bootstrap) \
+ || test ! -e "$PRIVKEY" || test ! -e "$CERT" \
+ || (test ! -e "$CACERT" && test "$CACERT_MODE" != bootstrap); then
+ if test "$CACERT_MODE" != secure && test "$CACERT_MODE" != bootstrap
+ then
+ echo "CACERT_MODE is not set to 'secure' or 'bootstrap'"
+ fi
+ if test ! -e "$PRIVKEY"; then
+ echo "$PRIVKEY: private key missing" >&2
+ fi
+ if test ! -e "$CERT"; then
+ echo "$CERT: certificate for private key missing" >&2
+ fi
+ if test ! -e "$CACERT" && test "$CACERT_MODE" != bootstrap; then
+ echo "$CACERT: CA certificate missing (and CA certificate bootstrapping not enabled)" >&2
+ fi
+ echo "Run ovs-switch-setup (in the openvswitch-switch-config package) or edit /etc/default/openvswitch-switch to configure" >&2
+ if test "$MODE" = discovery; then
+ echo "You may also delete or rename $PRIVKEY to disable SSL requirement" >&2
+ fi
+ exit 1
+ fi
+
+ SSL_OPTS="--private-key=$PRIVKEY --certificate=$CERT"
+ if test ! -e "$CACERT" && test "$CACERT_MODE" = bootstrap; then
+ SSL_OPTS="$SSL_OPTS --bootstrap-ca-cert=$CACERT"
+ else
+ SSL_OPTS="$SSL_OPTS --ca-cert=$CACERT"
+ fi
+}
+
+check_int_var() {
+    # Sanity-check the integer setting whose variable NAME is $1 against
+    # the minimum $2.  An empty/unset variable is left alone, a non-numeric
+    # value is unset with a warning, and a value below the minimum is
+    # raised to the minimum with a warning.  Uses the global $value as
+    # scratch space.
+    eval value=\$$1
+    test -n "$value" || return 0
+    if ! expr "X$value" : 'X[0-9][0-9]*$' > /dev/null 2>&1; then
+        echo "warning: The $1 option must be set to a number, ignoring" >&2
+        unset $1
+        return 0
+    fi
+    if test $value -lt $2; then
+        echo "warning: The $1 option may not be set to a value below $2, treating as $2" >&2
+        eval $1=$2
+    fi
+}
+
+check_new_option() {
+    # Warn on stderr when DAEMON_OPTS contains option $1, which now has a
+    # dedicated variable ($2) in the defaults file.
+    case "$DAEMON_OPTS" in
+        *$1*) ;;
+        *) return 0 ;;
+    esac
+    echo "warning: The $1 option in DAEMON_OPTS may now be set with the $2 variable in $default. The setting in DAEMON_OPTS will override the $2 variable, which will prevent the switch UI from configuring $1." >&2
+}
+
+# Dispatch on the requested init action.
+case "$1" in
+    start)
+        # --- Validate the configuration read from $default. ---
+        if test -z "$NETDEVS"; then
+            echo "$default: No network devices configured, switch disabled" >&2
+            echo "Run ovs-switch-setup (in the openvswitch-switch-config package) or edit /etc/default/openvswitch-switch to configure" >&2
+            exit 0
+        fi
+        if test "$MODE" = discovery; then
+            unset CONTROLLER
+        elif test "$MODE" = in-band || test "$MODE" = out-of-band; then
+            if test -z "$CONTROLLER"; then
+                echo "$default: No controller configured and not configured for discovery, switch disabled" >&2
+                echo "Run ovs-switch-setup (in the openvswitch-switch-config package) or edit /etc/default/openvswitch-switch to configure" >&2
+                exit 0
+            fi
+        else
+            echo "$default: MODE must be set to 'discovery', 'in-band', or 'out-of-band'" >&2
+            echo "Run ovs-switch-setup (in the openvswitch-switch-config package) or edit /etc/default/openvswitch-switch to configure" >&2
+            exit 1
+        fi
+        : ${PRIVKEY:=/etc/openvswitch-switch/of0-privkey.pem}
+        : ${CERT:=/etc/openvswitch-switch/of0-cert.pem}
+        : ${CACERT:=/etc/openvswitch-switch/cacert.pem}
+        case $CONTROLLER in
+            '')
+                # Discovery mode.
+                if test -e "$PRIVKEY"; then
+                    configure_ssl
+                fi
+                ;;
+            tcp:*)
+                ;;
+            ssl:*)
+                configure_ssl
+                ;;
+            *)
+                echo "$default: CONTROLLER must be in the form 'ssl:HOST[:PORT]' or 'tcp:HOST[:PORT]' when not in discovery mode" >&2
+                echo "Run ovs-switch-setup (in the openvswitch-switch-config package) or edit /etc/default/openvswitch-switch to configure" >&2
+                exit 1
+        esac
+        case $DISCONNECTED_MODE in
+            ''|switch|drop) ;;
+            *) echo "$default: warning: DISCONNECTED_MODE is not 'switch' or 'drop'" >&2 ;;
+        esac
+
+        check_int_var RATE_LIMIT 100
+        check_int_var INACTIVITY_PROBE 5
+        check_int_var MAX_BACKOFF 1
+
+        check_new_option --fail DISCONNECTED_MODE
+        check_new_option --stp STP
+        check_new_option --rate-limit RATE_LIMIT
+        check_new_option --inactivity INACTIVITY_PROBE
+        check_new_option --max-backoff MAX_BACKOFF
+        case $DAEMON_OPTS in
+            *--rate-limit*)
+                echo "$default: --rate-limit may now be set with RATE_LIMIT" >&2
+        esac
+
+        # --- Load the kernel module and create the datapath. ---
+        echo -n "Loading openvswitch_mod: "
+        # Each /proc/modules line starts with the module name followed by
+        # a space and further fields, so anchor on "name<space>".
+        if grep -q '^openvswitch_mod ' /proc/modules; then
+            echo "already loaded, nothing to do."
+        elif modprobe openvswitch_mod; then
+            echo "success."
+        else
+            echo "ERROR."
+            echo "openvswitch_mod has probably not been built for this kernel."
+            if ! test -d /usr/share/doc/openvswitch-datapath-source; then
+                echo "Install the openvswitch-datapath-source package, then read"
+                echo "/usr/share/doc/openvswitch-datapath-source/README.Debian"
+            else
+                echo "For instructions, read"
+                echo "/usr/share/doc/openvswitch-datapath-source/README.Debian"
+            fi
+            exit 1
+        fi
+
+        for netdev in $NETDEVS; do
+            check_op "Removing IP address from $netdev" ifconfig $netdev 0.0.0.0
+        done
+
+        must_succeed "Creating datapath" ovs-dpctl add-dp of0 $NETDEVS
+
+        # --- Choose the MAC address of of0. ---
+        xx='[0-9abcdefABCDEF][0-9abcdefABCDEF]'
+        case $DATAPATH_ID in
+            '')
+                # Check if the DMI System UUID contains a Nicira mac address
+                # that should be used for this datapath.  The UUID is assumed
+                # to be RFC 4122 compliant.
+                #
+                # dmidecode is optional: "|| true" keeps a missing binary
+                # from aborting the script under "set -e", and the quoted
+                # test really skips the lookup when nothing was found.
+                DMIDECODE=$(command -v dmidecode 2>/dev/null || true)
+                if [ -n "$DMIDECODE" ]; then
+                    UUID_MAC=$($DMIDECODE -s system-uuid | cut -d'-' -f 5)
+                    case $UUID_MAC in
+                        $NICIRA_OUI*)
+                            ifconfig of0 down
+                            must_succeed "Setting of0 MAC address to $UUID_MAC" ifconfig of0 hw ether $UUID_MAC
+                            ifconfig of0 up
+                            ;;
+                    esac
+                fi
+                ;;
+            $xx:$xx:$xx:$xx:$xx:$xx)
+                ifconfig of0 down
+                must_succeed "Setting of0 MAC address to $DATAPATH_ID" ifconfig of0 hw ether $DATAPATH_ID
+                ifconfig of0 up
+                ;;
+            *)
+                echo "DATAPATH_ID is not a valid MAC address in the form XX:XX:XX:XX:XX:XX, ignoring" >&2
+                ;;
+        esac
+
+        # --- Configure of0 according to MODE. ---
+        if test "$MODE" = in-band; then
+            if test "$SWITCH_IP" = dhcp; then
+                must_succeed "Temporarily disabling of0" ifconfig of0 down
+            else
+                COMMAND="ifconfig of0 $SWITCH_IP"
+                if test -n "$SWITCH_NETMASK"; then
+                    COMMAND="$COMMAND netmask $SWITCH_NETMASK"
+                fi
+                must_succeed "Configuring of0: $COMMAND" $COMMAND
+                if test -n "$SWITCH_GATEWAY"; then
+                    # This can fail because the route already exists,
+                    # so we don't insist that it succeed.
+                    COMMAND="route add default gw $SWITCH_GATEWAY"
+                    check_op "Adding default route: $COMMAND" $COMMAND
+                fi
+            fi
+        else
+            must_succeed "Disabling of0" ifconfig of0 down
+        fi
+
+        if test -n "$CORE_LIMIT"; then
+            check_op "Setting core limit to $CORE_LIMIT" ulimit -c "$CORE_LIMIT"
+        fi
+
+        # Compose secchan options.
+        set --
+        set -- "$@" --verbose=ANY:console:emer --verbose=ANY:syslog:err
+        set -- "$@" --log-file
+        set -- "$@" --detach --pidfile=$PIDFILE
+        for vconn in $MGMT_VCONNS; do
+            set -- "$@" --listen="$vconn"
+        done
+        if test -n "$COMMANDS"; then
+            set -- "$@" --command-acl="$COMMANDS"
+        fi
+        case $STP in
+            yes) set -- "$@" --stp ;;
+            no) set -- "$@" --no-stp ;;
+        esac
+        case $DISCONNECTED_MODE in
+            switch) set -- "$@" --fail=open ;;
+            drop) set -- "$@" --fail=closed ;;
+        esac
+        if test -n "$RATE_LIMIT"; then
+            set -- "$@" --rate-limit=$RATE_LIMIT
+        fi
+        if test -n "$INACTIVITY_PROBE"; then
+            set -- "$@" --inactivity-probe=$INACTIVITY_PROBE
+        fi
+        if test -n "$MAX_BACKOFF"; then
+            set -- "$@" --max-backoff=$MAX_BACKOFF
+        fi
+        # SSL_OPTS and DAEMON_OPTS are left unquoted on purpose: each may
+        # hold several whitespace-separated options.
+        set -- "$@" $SSL_OPTS $DAEMON_OPTS
+        if test "$MODE" = out-of-band; then
+            set -- "$@" --out-of-band
+        fi
+        set -- "$@" of0 "$CONTROLLER"
+        echo -n "Starting $DESC: "
+        start-stop-daemon --start --quiet --pidfile $PIDFILE \
+            --exec $DAEMON -- "$@"
+        if running; then
+            echo "$NAME."
+        else
+            echo " ERROR."
+        fi
+
+        if test "$MODE" = in-band && test "$SWITCH_IP" = dhcp; then
+            echo -n "Starting dhclient on of0: "
+            start-stop-daemon --start --quiet --pidfile $DHCLIENT_PIDFILE \
+                --exec /sbin/dhclient -- -q -pf $DHCLIENT_PIDFILE of0
+            # Report on dhclient itself: "running" only inspects secchan's
+            # pidfile and would say nothing about dhclient.
+            if test -e "$DHCLIENT_PIDFILE" \
+                && running_pid "$(cat $DHCLIENT_PIDFILE)" dhclient; then
+                echo "dhclient."
+            else
+                echo " ERROR."
+            fi
+        fi
+        ;;
+    stop)
+        if test -e "$DHCLIENT_PIDFILE"; then
+            echo -n "Stopping dhclient on of0: "
+            start-stop-daemon --stop --quiet --oknodo \
+                --pidfile $DHCLIENT_PIDFILE --exec /sbin/dhclient
+            echo "dhclient."
+        fi
+
+        echo -n "Stopping $DESC: "
+        start-stop-daemon --stop --quiet --oknodo --pidfile $PIDFILE \
+            --exec $DAEMON
+        echo "$NAME."
+
+        check_op "Deleting datapath" ovs-dpctl del-dp of0
+        check_op "Unloading kernel module" modprobe -r openvswitch_mod
+        ;;
+    force-stop)
+        echo -n "Forcefully stopping $DESC: "
+        force_stop
+        if ! running; then
+            echo "$NAME."
+        else
+            echo " ERROR."
+        fi
+        ;;
+    reload)
+        # Accepted but does nothing.
+        ;;
+    force-reload)
+        # Restart only if the daemon is currently running (--test makes
+        # start-stop-daemon check without stopping anything); any failure
+        # ends in "exit 0".
+        start-stop-daemon --stop --test --quiet --pidfile \
+            $PIDFILE --exec $DAEMON \
+            && $0 restart \
+            || exit 0
+        ;;
+    restart)
+        $0 stop || true
+        $0 start
+        ;;
+    status)
+        echo -n "$NAME is "
+        if running ; then
+            echo "running"
+        else
+            echo " not running."
+            exit 1
+        fi
+        ;;
+    *)
+        # $NAME is the daemon (secchan), not this script, so spell out the
+        # init script's own path in the usage text.
+        N=/etc/init.d/openvswitch-switch
+        echo "Usage: $N {start|stop|restart|force-reload|status|force-stop}" >&2
+        exit 1
+        ;;
+esac
+
+exit 0
diff --git a/debian/openvswitch-switch.install b/debian/openvswitch-switch.install
new file mode 100644
index 000000000..9fddacf01
--- /dev/null
+++ b/debian/openvswitch-switch.install
@@ -0,0 +1,7 @@
+_debian/secchan/secchan usr/sbin
+_debian/utilities/ovs-dpctl usr/sbin
+_debian/utilities/ovs-discover usr/sbin
+_debian/utilities/ovs-kill usr/sbin
+_debian/utilities/ovs-ofctl usr/sbin
+debian/openvswitch/usr/share/openvswitch/commands/* usr/share/openvswitch/commands
+debian/commands/* usr/share/openvswitch/commands
diff --git a/debian/openvswitch-switch.logrotate b/debian/openvswitch-switch.logrotate
new file mode 100644
index 000000000..41394e862
--- /dev/null
+++ b/debian/openvswitch-switch.logrotate
@@ -0,0 +1,11 @@
+/var/log/openvswitch/secchan.log {
+ daily
+ compress
+ create 640 root adm
+ delaycompress
+ missingok
+ rotate 30
+ postrotate
+ ovs-appctl --target /var/run/secchan.pid --reopen
+ endscript
+}
diff --git a/debian/openvswitch-switch.manpages b/debian/openvswitch-switch.manpages
new file mode 100644
index 000000000..f789eba9f
--- /dev/null
+++ b/debian/openvswitch-switch.manpages
@@ -0,0 +1,5 @@
+_debian/secchan/secchan.8
+_debian/utilities/ovs-discover.8
+_debian/utilities/ovs-dpctl.8
+_debian/utilities/ovs-kill.8
+_debian/utilities/ovs-ofctl.8
diff --git a/debian/openvswitch-switch.postinst b/debian/openvswitch-switch.postinst
new file mode 100755
index 000000000..74b52ba90
--- /dev/null
+++ b/debian/openvswitch-switch.postinst
@@ -0,0 +1,51 @@
+#!/bin/sh
+# postinst script for openvswitch-switch
+#
+# see: dh_installdeb(1)
+#
+# On `configure', make sure /etc/default/openvswitch-switch exists and
+# mentions every setting that the shipped template documents:
+#   - if the file is missing, the template is copied into place;
+#   - otherwise, for each setting the existing file never mentions, that
+#     setting's section of the template is appended to the file.
+
+set -e
+
+# summary of how this script can be called:
+# * <postinst> `configure' <most-recently-configured-version>
+# * <old-postinst> `abort-upgrade' <new version>
+# * <conflictor's-postinst> `abort-remove' `in-favour' <package>
+# <new-version>
+# * <postinst> `abort-remove'
+# * <deconfigured's-postinst> `abort-deconfigure' `in-favour'
+# <failed-install-package> <version> `removing'
+# <conflicting-package> <version>
+# for details, see http://www.debian.org/doc/debian-policy/ or
+# the debian-policy package
+
+
+case "$1" in
+    configure)
+        DEFAULT=/etc/default/openvswitch-switch
+        TEMPLATE=/usr/share/openvswitch/switch/default.template
+        if ! test -e "$DEFAULT"; then
+            cp "$TEMPLATE" "$DEFAULT"
+        else
+            # Every documented setting starts with a "# NAME: ..." comment
+            # line in the template; awk extracts those NAMEs.
+            for var in $(awk -F'[ :]' '/^# [_A-Z0-9]+:/{print $2}' "$TEMPLATE")
+            do
+                # Match NAME as a whole word only (-w treats letters, digits
+                # and "_" as word characters), so that e.g. CACERT_MODE in
+                # the existing file does not count as a mention of MODE.
+                if ! grep -w -- "$var" "$DEFAULT" >/dev/null 2>&1; then
+                    echo >> "$DEFAULT"
+                    # Copy from the "# NAME:" header through the "NAME=" or
+                    # "#NAME=" assignment.  Both patterns are anchored so
+                    # that settings whose names merely end in NAME (CACERT
+                    # for CERT, DISCONNECTED_MODE for MODE) are not copied
+                    # along with it.
+                    sed -n "/^# $var:/,/^#\{0,1\}$var=/p" "$TEMPLATE" >> "$DEFAULT"
+                fi
+            done
+        fi
+        ;;
+
+    abort-upgrade|abort-remove|abort-deconfigure)
+        ;;
+
+    *)
+        echo "postinst called with unknown argument \`$1'" >&2
+        exit 1
+        ;;
+esac
+
+#DEBHELPER#
+
+exit 0
+
+
diff --git a/debian/openvswitch-switch.postrm b/debian/openvswitch-switch.postrm
new file mode 100755
index 000000000..19e8ebe4c
--- /dev/null
+++ b/debian/openvswitch-switch.postrm
@@ -0,0 +1,43 @@
+#!/bin/sh
+# postrm script for openvswitch-switch
+#
+# see: dh_installdeb(1)
+
+set -e
+
+# summary of how this script can be called:
+# * <postrm> `remove'
+# * <postrm> `purge'
+# * <old-postrm> `upgrade' <new-version>
+# * <new-postrm> `failed-upgrade' <old-version>
+# * <new-postrm> `abort-install'
+# * <new-postrm> `abort-install' <old-version>
+# * <new-postrm> `abort-upgrade' <old-version>
+# * <disappearer's-postrm> `disappear' <overwriter>
+# <overwriter-version>
+# for details, see http://www.debian.org/doc/debian-policy/ or
+# the debian-policy package
+
+
+case "$1" in
+    # None of these actions leaves anything for this script to clean up.
+    remove|upgrade|failed-upgrade|abort-install|abort-upgrade|disappear)
+        ;;
+
+    # A purge additionally deletes the settings file in /etc/default.
+    purge)
+        rm -f /etc/default/openvswitch-switch
+        ;;
+
+    # Anything else is a caller error.
+    *)
+        echo "postrm called with unknown argument \`$1'" >&2
+        exit 1
+        ;;
+esac
+
+# dh_installdeb will replace this with shell code automatically
+# generated by other debhelper scripts.
+
+#DEBHELPER#
+
+exit 0
+
+
diff --git a/debian/openvswitch-switch.template b/debian/openvswitch-switch.template
new file mode 100644
index 000000000..7fe0e15c6
--- /dev/null
+++ b/debian/openvswitch-switch.template
@@ -0,0 +1,165 @@
+# This is a POSIX shell fragment -*- sh -*-
+
+# To configure the secure channel, fill in the following properly and
+# uncomment them. Afterward, the secure channel will come up
+# automatically at boot time. It can be started immediately with
+# /etc/init.d/openvswitch-switch start
+# Alternatively, use the ovs-switch-setup program (from the
+# openvswitch-switch-config package) to do everything automatically.
+
+# NETDEVS: Which network devices should the OpenFlow switch include?
+#
+# List the network devices that should become part of the OpenFlow
+# switch, separated by spaces. At least two devices must be selected
+# for this machine to be a useful switch. Unselecting all network
+# devices will disable the OpenFlow switch entirely.
+#
+# The network devices that you select should not be configured with IP
+# or IPv6 addresses, even if the switch contacts the controller over
+# one of the selected network devices. This is because a running
+# Open vSwitch switch takes over network devices at a low level: they
+# become part of the switch and cannot be used for other purposes.
+#NETDEVS=""
+
+# MODE: The OpenFlow switch has three modes that determine how it
+# reaches the controller:
+#
+# * in-band with discovery: A single network is used for OpenFlow
+# traffic and other data traffic; that is, the switch contacts the
+# controller over one of the network devices selected as OpenFlow
+# switch ports. The switch automatically determines the location of
+# the controller using a DHCP request with an OpenFlow-specific
+# vendor option. This is the most common case.
+#
+# * in-band: As above, but the location of the controller is manually
+# configured.
+#
+# * out-of-band: OpenFlow traffic uses a network separate from the
+# data traffic that it controls. If this is the case, the control
+# network must already be configured on a network device other than
+# one of those selected as an Open vSwitch switch port in NETDEVS
+# above.
+#
+# Set MODE to 'discovery', 'in-band', or 'out-of-band' for these
+# respective cases.
+MODE=discovery
+
+# SWITCH_IP: In 'in-band' mode, the switch's IP address may be
+# configured statically or dynamically:
+#
+# * For static configuration, specify the switch's IP address as a
+# string. In this case you may also set SWITCH_NETMASK and
+# SWITCH_GATEWAY appropriately (see below).
+#
+# * For dynamic configuration with DHCP (the most common case),
+# specify "dhcp". Configuration with DHCP will only work reliably
+# if the network topology allows the switch to contact the DHCP
+# server before it connects to the OpenFlow controller.
+#
+# This setting has no effect unless MODE is set to 'in-band'.
+SWITCH_IP=dhcp
+
+# SWITCH_NETMASK: IP netmask to use in 'in-band' mode when the switch
+# IP address is not 'dhcp'.
+#SWITCH_NETMASK=255.255.255.0
+
+# SWITCH_GATEWAY: IP gateway to use in 'in-band' mode when the switch
+# IP address is not 'dhcp'.
+#SWITCH_GATEWAY=192.168.1.1
+
+# CONTROLLER: Location of controller.
+# One of the following formats:
+# tcp:HOST[:PORT] via TCP to PORT (default: 6633) on HOST
+# ssl:HOST[:PORT] via SSL to PORT (default: 6633) on HOST
+# The default below assumes that the controller is running locally.
+# This setting has no effect when MODE is set to 'discovery'.
+#CONTROLLER="tcp:127.0.0.1"
+
+# PRIVKEY: Name of file containing switch's private key.
+# Required if SSL enabled.
+#PRIVKEY=/etc/openvswitch-switch/of0-privkey.pem
+
+# CERT: Name of file containing certificate for private key.
+# Required if SSL enabled.
+#CERT=/etc/openvswitch-switch/of0-cert.pem
+
+# CACERT: Name of file containing controller CA certificate.
+# Required if SSL enabled.
+#CACERT=/etc/openvswitch-switch/cacert.pem
+
+# CACERT_MODE: Two modes are available:
+#
+# * secure: The controller CA certificate named in CACERT above must exist.
+# (You must copy it manually from the PKI server or another trusted source.)
+#
+# * bootstrap: If the controller CA certificate named in CACERT above does
+# not exist, the switch will obtain it from the controller the first time
+# it connects and save a copy to the file named in CACERT. This is insecure,
+# in the same way that initial connections with ssh are insecure, but
+# it is convenient.
+#
+# Set CACERT_MODE to 'secure' or 'bootstrap' for these respective cases.
+#CACERT_MODE=secure
+
+# MGMT_VCONNS: List of vconns (space-separated) on which secchan
+# should listen for management connections from ovs-ofctl, etc.
+# openvswitch-switchui by default connects to
+# unix:/var/run/secchan.mgmt, so do not disable this if you want to
+# use openvswitch-switchui.
+MGMT_VCONNS="punix:/var/run/secchan.mgmt"
+
+# COMMANDS: Access control list for the commands that can be executed
+# remotely over the OpenFlow protocol, as a comma-separated list of
+# shell glob patterns. Negative patterns (beginning with !) act as a
+# blacklist. To be executable, a command name must match one positive
+# pattern and not match any negative patterns.
+#COMMANDS="reboot,update"
+
+# DISCONNECTED_MODE: Switch behavior when attempts to connect to the
+# controller repeatedly fail, either 'switch', to act as an L2 switch
+# in this case, or 'drop', to drop all packets (except those necessary
+# to connect to the controller). If unset, the default is 'drop'.
+#DISCONNECTED_MODE=switch
+
+# STP: Enable or disable 802.1D-1998 Spanning Tree Protocol. Set to
+# 'yes' to enable STP, 'no' to disable it. If unset, secchan's
+# current default is 'no' (but this may change in the future).
+#STP=no
+
+# RATE_LIMIT: Maximum number of received frames, that do not match any
+# existing switch flow, to forward up to the controller per second.
+# The valid range is 100 and up. If unset, this rate will not be
+# limited.
+#RATE_LIMIT=1000
+
+# INACTIVITY_PROBE: The maximum number of seconds of inactivity on the
+# controller connection before secchan sends an inactivity probe
+# message to the controller. The valid range is 5 and up. If unset,
+# secchan defaults to 15 seconds.
+#INACTIVITY_PROBE=5
+
+# MAX_BACKOFF: The maximum time that secchan will wait between
+# attempts to connect to the controller. The valid range is 1 and up.
+# If unset, secchan defaults to 15 seconds.
+#MAX_BACKOFF=15
+
+# DAEMON_OPTS: Additional options to pass to secchan, e.g. "--fail=open"
+DAEMON_OPTS=""
+
+# CORE_LIMIT: Maximum size for core dumps.
+#
+# Leaving this unset will use the system default. Setting it to 0
+# will disable core dumps. Setting it to "unlimited" will dump all
+# core files regardless of size.
+#CORE_LIMIT=unlimited
+
+# DATAPATH_ID: Identifier for this switch.
+#
+# By default, the switch checks if the DMI System UUID contains a Nicira
+# MAC address to use as a datapath ID. If not, then the switch generates
+# a new, random datapath ID every time it starts up. By setting this
+# value, the supplied datapath ID will always be used.
+#
+# Set DATAPATH_ID to a MAC address in the form XX:XX:XX:XX:XX:XX where each
+# X is a hexadecimal digit (0-9 or a-f).
+#DATAPATH_ID=XX:XX:XX:XX:XX:XX
diff --git a/debian/openvswitch-switchui.copyright b/debian/openvswitch-switchui.copyright
new file mode 100644
index 000000000..ab7cac594
--- /dev/null
+++ b/debian/openvswitch-switchui.copyright
@@ -0,0 +1,33 @@
+Upstream Authors:
+
+ Nicira Networks, Inc.
+
+Copyright:
+
+ Copyright (c) 2008, 2009 Nicira Networks, Inc.
+
+License:
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+ In addition, as a special exception, Nicira Networks gives
+ permission to link the code of its release of ovs-vswitchd with
+ the OpenSSL project's "OpenSSL" library (or with modified versions
+ of it that use the same license as the "OpenSSL" library), and
+ distribute the linked executables. You must obey the GNU General
+ Public License in all respects for all of the code used other than
+ "OpenSSL". If you modify this file, you may extend this exception
+ to your version of the file, but you are not obligated to do so.
+ If you do not wish to do so, delete this exception statement from
+ your version.
diff --git a/debian/openvswitch-switchui.default b/debian/openvswitch-switchui.default
new file mode 100644
index 000000000..6cdbf7a5e
--- /dev/null
+++ b/debian/openvswitch-switchui.default
@@ -0,0 +1,35 @@
+# This is a POSIX shell fragment -*- sh -*-
+
+# To configure the switch monitor, modify the following. Afterward,
+# the switch monitor will come up automatically at boot time. It can
+# be started immediately with
+# /etc/init.d/openvswitch-switchui start
+
+# Defaults for initscript
+# sourced by /etc/init.d/openvswitch-switchui
+# installed at /etc/default/openvswitch-switchui by the maintainer scripts
+
+# SWITCH_VCONN: The vconn used to connect to the switch (secchan).
+# The secchan must be configured to listen on this vconn. The default
+# set here is also what the openvswitch-switch package listens on by
+# default, so ordinarily there is no need to modify this.
+SWITCH_VCONN="unix:/var/run/secchan.mgmt"
+
+# EZIO3_DEVICE: To display the switch monitor on an EZIO3 (aka
+# MTB-134) 16x2 LCD display found on server appliances made by
+# Portwell, set this to the EZIO3 serial device and uncomment it.
+#EZIO3_DEVICE="/dev/ttyS1"
+
+# OPENVT: When EZIO3_DEVICE is unset, this specifies the command under
+# which to run ovs-switchui. The default value of "/usr/bin/openvt"
+# causes ovs-switchui to run on a new, otherwise empty virtual
+# console.
+#
+# The value must be a command name without arguments. Use a wrapper
+# script to provide arguments if you need them.
+#
+# When EZIO3_DEVICE is set, this variable has no effect.
+OPENVT="/usr/bin/openvt"
+
+# DAEMON_OPTS: Additional options to pass to ovs-switchui.
+DAEMON_OPTS=""
diff --git a/debian/openvswitch-switchui.dirs b/debian/openvswitch-switchui.dirs
new file mode 100644
index 000000000..4dced02c5
--- /dev/null
+++ b/debian/openvswitch-switchui.dirs
@@ -0,0 +1,3 @@
+usr/bin
+usr/sbin
+usr/share/terminfo
diff --git a/debian/openvswitch-switchui.init b/debian/openvswitch-switchui.init
new file mode 100755
index 000000000..7a02c5eac
--- /dev/null
+++ b/debian/openvswitch-switchui.init
@@ -0,0 +1,210 @@
+#!/bin/sh
+#
+# Example init.d script with LSB support.
+#
+# Please read this init.d carefully and modify the sections to
+# adjust it to the program you want to run.
+#
+# Copyright (c) 2007, 2009 Javier Fernandez-Sanguino <jfs@debian.org>
+#
+# This is free software; you may redistribute it and/or modify
+# it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2,
+# or (at your option) any later version.
+#
+# This is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License with
+# the Debian operating system, in /usr/share/common-licenses/GPL; if
+# not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA 02111-1307 USA
+#
+### BEGIN INIT INFO
+# Provides: openvswitch-switchui
+# Required-Start: $network $local_fs
+# Required-Stop:
+# Should-Start: $named $syslog openvswitch-switch
+# Should-Stop:
+# Default-Start: 2 3 4 5
+# Default-Stop: 0 1 6
+# Short-Description: Open vSwitch switch monitor
+### END INIT INFO
+
+PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
+
+DAEMON=/usr/bin/ovs-switchui
+NAME=openvswitch-switchui
+DESC="Open vSwitch switch monitor"
+
+PIDFILE=/var/run/$NAME.pid
+
+test -x $DAEMON || exit 0
+
+. /lib/lsb/init-functions
+
+# Default options, these can be overriden by the information
+# at /etc/default/$NAME
+DAEMON_OPTS="" # Additional options given to the server
+
+DODTIME=10 # Time to wait for the server to die, in seconds
+ # If this value is set too low you might not
+ # let some servers to die gracefully and
+ # 'restart' will not work
+
+# Include defaults if available
+if [ -f /etc/default/$NAME ] ; then
+ . /etc/default/$NAME
+fi
+
+set -e
+
+running_pid() {
+# Check if a given process pid's cmdline matches a given name
+ pid=$1
+ name=$2
+ [ -z "$pid" ] && return 1
+ [ ! -d /proc/$pid ] && return 1
+ return 0
+}
+
+running() {
+# Check if the process is running looking at /proc
+# (works for all users)
+
+ # No pidfile, probably no daemon present
+ [ ! -f "$PIDFILE" ] && return 1
+ pid=`cat $PIDFILE`
+ running_pid $pid $DAEMON || return 1
+ return 0
+}
+
+start_server() {
+# Start the process using the wrapper
+ if test -n "$EZIO3_DEVICE"; then
+ # Make ezio-term detach and create the pidfile.
+ WRAPPER="/usr/sbin/ezio-term"
+ WRAPPER_OPTS="--detach --pidfile=$PIDFILE --ezio=$EZIO3_DEVICE --input=vt"
+ else
+ # openvt will detach, so instead make ovs-switchui make the pidfile.
+ WRAPPER=$OPENVT
+ WRAPPER_OPTS=""
+ DAEMON_OPTS="--pidfile=$PIDFILE"
+ fi
+ start-stop-daemon --start --quiet --pidfile $PIDFILE \
+ --exec $WRAPPER -- $WRAPPER_OPTS -- $DAEMON $DAEMON_OPTS \
+ --log-file $SWITCH_VCONN
+
+ # Wait up to 3 seconds for the daemon to start.
+ for i in 1 2 3; do
+ if running; then
+ break
+ fi
+ sleep 1
+ done
+}
+
+stop_server() {
+ # Stop the daemon by passing its pidfile to ovs-kill; the function's
+ # exit status is that of ovs-kill.
+ # NOTE(review): ovs-kill presumably signals the process recorded in
+ # $PIDFILE -- its exact behavior is not visible from this script.
+ ovs-kill $PIDFILE
+}
+
+force_stop() {
+# Force the process to die killing it manually: send SIGTERM, wait, and
+# if it is still alive send SIGKILL and wait again.  Exits the script
+# with status 1 if the process survives both; otherwise removes the
+# pidfile.  Does nothing when there is no pidfile.
+    [ ! -e "$PIDFILE" ] && return
+    # Seconds to wait after each signal.  The script's built-in default
+    # has historically been stored in DODTIME while DIETIME is what gets
+    # read, so fall back through both rather than calling sleep with an
+    # empty duration.
+    grace=${DIETIME:-${DODTIME:-10}}
+    if running ; then
+        kill -15 $pid
+        # Is it really dead?
+        sleep "$grace"s
+        if running ; then
+            kill -9 $pid
+            sleep "$grace"s
+            if running ; then
+                echo "Cannot kill $NAME (pid=$pid)!"
+                exit 1
+            fi
+        fi
+    fi
+    rm -f $PIDFILE
+}
+
+
+case "$1" in
+ start)
+ log_daemon_msg "Starting $DESC " "$NAME"
+ # Check if it's running first
+ if running ; then
+ log_progress_msg "apparently already running"
+ log_end_msg 0
+ exit 0
+ fi
+ if start_server && running ; then
+ # It's ok, the server started and is running
+ log_end_msg 0
+ else
+ # Either we could not start it or it is not running
+ # after we did
+ # NOTE: Some servers might die some time after they start,
+ # this code does not try to detect this and might give
+ # a false positive (use 'status' for that)
+ log_end_msg 1
+ fi
+ ;;
+ stop)
+ log_daemon_msg "Stopping $DESC" "$NAME"
+ if running ; then
+ # Only stop the server if we see it running
+ stop_server
+ log_end_msg $?
+ else
+ # If it's not running don't do anything
+ log_progress_msg "apparently not running"
+ log_end_msg 0
+ exit 0
+ fi
+ ;;
+ force-stop)
+ # First try to stop gracefully the program
+ $0 stop
+ if running; then
+ # If it's still running try to kill it more forcefully
+ log_daemon_msg "Stopping (force) $DESC" "$NAME"
+ force_stop
+ log_end_msg $?
+ fi
+ ;;
+ restart|force-reload)
+ log_daemon_msg "Restarting $DESC" "$NAME"
+ stop_server
+ # Wait some sensible amount, some server need this
+ [ -n "$DIETIME" ] && sleep $DIETIME
+ start_server
+ running
+ log_end_msg $?
+ ;;
+ status)
+
+ log_daemon_msg "Checking status of $DESC" "$NAME"
+ if running ; then
+ log_progress_msg "running"
+ log_end_msg 0
+ else
+ log_progress_msg "apparently not running"
+ log_end_msg 1
+ exit 1
+ fi
+ ;;
+ # Use this if the daemon cannot reload
+ reload)
+ log_warning_msg "Reloading $NAME daemon: not implemented, as the daemon"
+ log_warning_msg "cannot re-read the config file (use restart)."
+ ;;
+ *)
+ N=/etc/init.d/$NAME
+ echo "Usage: $N {start|stop|force-stop|restart|force-reload|status}" >&2
+ exit 1
+ ;;
+esac
+
+exit 0
diff --git a/debian/openvswitch-switchui.install b/debian/openvswitch-switchui.install
new file mode 100644
index 000000000..f2872c83b
--- /dev/null
+++ b/debian/openvswitch-switchui.install
@@ -0,0 +1,2 @@
+_debian/extras/ezio/ezio-term usr/sbin
+_debian/extras/ezio/ovs-switchui usr/bin
diff --git a/debian/openvswitch-wdt.default b/debian/openvswitch-wdt.default
new file mode 100644
index 000000000..35625d45d
--- /dev/null
+++ b/debian/openvswitch-wdt.default
@@ -0,0 +1,24 @@
+# This is a POSIX shell fragment -*- sh -*-
+
+# To configure the Open vSwitch reliability packages, modify the following.
+# Afterward, the watchdog timer and oops handling will be configured
+# automatically at boot time. It can be started immediately with
+# /etc/init.d/openvswitch-wdt start
+
+# Defaults for initscript
+# sourced by /etc/init.d/openvswitch-wdt
+# installed at /etc/default/openvswitch-wdt by the maintainer scripts
+
+# OOPS_REBOOT_TIME: The number of seconds the system should wait before
+# it reboots after a kernel oops. A value of zero causes the system to
+# wait forever.
+OOPS_REBOOT_TIME=1
+
+# WDT_TIMEOUT: The number of seconds the watchdog timer should wait until
+# it reboots the system when it hasn't received a keep-alive. A value
+# of zero disables the watchdog timer.
+WDT_TIMEOUT=30
+
+# WDT_INTERVAL: The number of seconds to wait between sending keep-alive
+# messages to the watchdog timer.
+WDT_INTERVAL=1
diff --git a/debian/openvswitch-wdt.dirs b/debian/openvswitch-wdt.dirs
new file mode 100644
index 000000000..ca882bbb7
--- /dev/null
+++ b/debian/openvswitch-wdt.dirs
@@ -0,0 +1,2 @@
+usr/bin
+usr/sbin
diff --git a/debian/openvswitch-wdt.init b/debian/openvswitch-wdt.init
new file mode 100755
index 000000000..b1c0ec5e7
--- /dev/null
+++ b/debian/openvswitch-wdt.init
@@ -0,0 +1,176 @@
+#!/bin/sh
+#
+# Example init.d script with LSB support.
+#
+# Please read this init.d carefully and modify the sections to
+# adjust it to the program you want to run.
+#
+# Copyright (c) 2007, 2009 Javier Fernandez-Sanguino <jfs@debian.org>
+#
+# This is free software; you may redistribute it and/or modify
+# it under the terms of the GNU General Public License as
+# published by the Free Software Foundation; either version 2,
+# or (at your option) any later version.
+#
+# This is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License with
+# the Debian operating system, in /usr/share/common-licenses/GPL; if
+# not, write to the Free Software Foundation, Inc., 59 Temple Place,
+# Suite 330, Boston, MA 02111-1307 USA
+#
+### BEGIN INIT INFO
+# Provides: openvswitch-wdt
+# Required-Start: $network $local_fs
+# Required-Stop:
+# Should-Start: $named $syslog openvswitch-switch
+# Should-Stop:
+# Default-Start: 2 3 4 5
+# Default-Stop: 0 1 6
+# Short-Description: Open vSwitch switch watchdog
+### END INIT INFO
+
+PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
+
+DAEMON=/usr/sbin/ovs-wdt
+NAME=openvswitch-wdt
+DESC="Open vSwitch switch watchdog"
+
+PIDFILE=/var/run/$NAME.pid
+
+# Exit quietly when the daemon binary is missing or not executable.
+test -x $DAEMON || exit 0
+
+. /lib/lsb/init-functions
+
+# Default options, these can be overridden by the information
+# at /etc/default/$NAME
+DAEMON_OPTS=""          # Additional options given to the daemon
+
+# The restart action reads DIETIME, so the default has to be stored
+# under that name for the pause between stop and start to happen.
+DIETIME=10              # Time to wait for the daemon to die, in seconds
+                        # If this value is set too low you might not
+                        # let some daemons to die gracefully and
+                        # 'restart' will not work
+
+# Include defaults if available
+if [ -f /etc/default/$NAME ] ; then
+    . /etc/default/$NAME
+fi
+
+# From here on, any unchecked command failure aborts the script.
+set -e
+
+running_pid() {
+# Check if a given process pid's cmdline matches a given name
+ pid=$1
+ name=$2
+ [ -z "$pid" ] && return 1
+ [ ! -d /proc/$pid ] && return 1
+ return 0
+}
+
+running() {
+# Check if the process is running looking at /proc
+# (works for all users)
+
+ # No pidfile, probably no daemon present
+ [ ! -f "$PIDFILE" ] && return 1
+ pid=`cat $PIDFILE`
+ running_pid $pid $DAEMON || return 1
+ return 0
+}
+
+start_daemon() {
+# Start the process using the wrapper
+ if test $WDT_TIMEOUT != 0; then
+ start-stop-daemon --start --quiet -m --background --pidfile $PIDFILE \
+ --exec $DAEMON -- --timeout=$WDT_TIMEOUT --interval=$WDT_INTERVAL $DAEMON_OPTS
+ fi
+
+ # Wait up to 3 seconds for the daemon to start.
+ for i in 1 2 3; do
+ if running; then
+ break
+ fi
+ sleep 1
+ done
+
+ echo $OOPS_REBOOT_TIME > /proc/sys/kernel/panic
+ echo 1 > /proc/sys/kernel/panic_on_oops
+}
+
+stop_daemon() {
+# Stop the process recorded in $PIDFILE and remove the pidfile.
+# start-stop-daemon is run with -o, so "nothing to stop" is not an error.
+    start-stop-daemon -o --stop --pidfile $PIDFILE
+    # -f: a pidfile that has already disappeared must not abort the
+    # script, which runs under `set -e'.
+    rm -f $PIDFILE
+}
+
+case "$1" in
+ start)
+ log_daemon_msg "Starting $DESC " "$NAME"
+ # Check if it's running first
+ if running ; then
+ log_progress_msg "apparently already running"
+ log_end_msg 0
+ exit 0
+ fi
+ if start_daemon && running ; then
+ # It's ok, the daemon started and is running
+ log_end_msg 0
+ else
+ # Either we could not start it or it is not running
+ # after we did
+ # NOTE: Some daemons might die some time after they start,
+ # this code does not try to detect this and might give
+ # a false positive (use 'status' for that)
+ log_end_msg 1
+ fi
+ ;;
+ stop)
+ log_daemon_msg "Stopping $DESC" "$NAME"
+ if running ; then
+ # Only stop the daemon if we see it running
+ stop_daemon
+ log_end_msg $?
+ else
+ # If it's not running don't do anything
+ log_progress_msg "apparently not running"
+ log_end_msg 0
+ exit 0
+ fi
+ ;;
+ restart|force-reload)
+ log_daemon_msg "Restarting $DESC" "$NAME"
+ if running ; then
+ stop_daemon
+ # Wait some sensible amount, some daemons need this
+ [ -n "$DIETIME" ] && sleep $DIETIME
+ fi
+ start_daemon
+ running
+ log_end_msg $?
+ ;;
+ status)
+ log_daemon_msg "Checking status of $DESC" "$NAME"
+ if running ; then
+ log_progress_msg "running"
+ log_end_msg 0
+ else
+ log_progress_msg "apparently not running"
+ log_end_msg 1
+ exit 1
+ fi
+ ;;
+ # Use this if the daemon cannot reload
+ reload)
+ log_warning_msg "Reloading $NAME daemon: not implemented, as the daemon"
+ log_warning_msg "cannot re-read the config file (use restart)."
+ ;;
+ *)
+ N=/etc/init.d/$NAME
+ echo "Usage: $N {start|stop|restart|force-reload|status}" >&2
+ exit 1
+ ;;
+esac
+
+exit 0
diff --git a/debian/openvswitch-wdt.install b/debian/openvswitch-wdt.install
new file mode 100644
index 000000000..80a04e131
--- /dev/null
+++ b/debian/openvswitch-wdt.install
@@ -0,0 +1 @@
+_debian/utilities/ovs-wdt usr/sbin
diff --git a/debian/ovs-switch-setup b/debian/ovs-switch-setup
new file mode 100755
index 000000000..7a720727c
--- /dev/null
+++ b/debian/ovs-switch-setup
@@ -0,0 +1,615 @@
+#! /usr/bin/perl
+
+use POSIX;
+use Debconf::Client::ConfModule ':all';
+use HTTP::Request;
+use LWP::UserAgent;
+use Digest::SHA1 'sha1_hex';
+use strict;
+use warnings;
+
+# XXX should support configuring SWITCH_NETMASK and SWITCH_GATEWAY
+# when the mode is in-band.
+
+# Prefix applied to every debconf question name by the db_* helpers.
+my $debconf_owner = 'openvswitch-switch';
+
+# Configuration file written by this script, the template it may be seeded
+# from, and the key/certificate files kept under $etc.
+my $default = '/etc/default/openvswitch-switch';
+my $template = '/usr/share/openvswitch/switch/default.template';
+my $etc = '/etc/openvswitch-switch';
+my $rundir = '/var/run';
+my $privkey_file = "$etc/of0-privkey.pem";
+my $req_file = "$etc/of0-req.pem";
+my $cert_file = "$etc/of0-cert.pem";
+my $cacert_file = "$etc/cacert.pem";
+my $ovs_discover_pidfile = "$rundir/ovs-discover.pid";
+
+# HTTP client used for all PKI requests: 10 second timeout, proxy settings
+# taken from the environment.
+my $ua = LWP::UserAgent->new;
+$ua->timeout(10);
+$ua->env_proxy;
+
+# Stop the switch (and any leftover discovery process) while reconfiguring.
+# The init script's stdout is sent to stderr; presumably stdout is reserved
+# for the debconf protocol at this point (see the dup2 near the end).
+system("/etc/init.d/openvswitch-switch stop 1>&2");
+kill_ovs_discover();
+
+version('2.0');
+capb('backup');
+title('Open vSwitch Switch Setup');
+
+# Offer every detected network device as a choice; preselect those whose
+# description does not mention an IP address.
+my (%netdevs) = find_netdevs();
+db_subst('netdevs', 'choices',
+ join(', ', map($netdevs{$_}, sort(keys(%netdevs)))));
+db_set('netdevs', join(', ', grep(!/IP/, values(%netdevs))));
+
+# Seed the dialog answers from an existing configuration, if there is one.
+my %oldconfig;
+if (-e $default) {
+ %oldconfig = load_config($default);
+
+ # Handlers keyed by configuration variable name.  Each handler finds the
+ # variable's old value in $_.
+ my (%map) =
+ (NETDEVS => sub {
+ # Translate device names still present on the system into their
+ # descriptive choice strings.
+ db_set('netdevs', join(', ', map($netdevs{$_},
+ grep(exists $netdevs{$_}, split))))
+ },
+ MODE => sub {
+ # Any value other than in-band/out-of-band falls back to discovery.
+ db_set('mode',
+ $_ eq 'in-band' || $_ eq 'out-of-band' ? $_ : 'discovery')
+ },
+ SWITCH_IP => sub { db_set('switch-ip', $_) },
+ CONTROLLER => sub { db_set('controller-vconn', $_) },
+ PRIVKEY => sub { $privkey_file = $_ },
+ CERT => sub { $cert_file = $_ },
+ CACERT => sub { $cacert_file = $_ },
+ );
+
+ # Run a handler only for variables that are defined and not blank.
+ for my $key (keys(%map)) {
+ local $_ = $oldconfig{$key};
+ &{$map{$key}}() if defined && !/^\s*$/;
+ }
+} elsif (-e $template) {
+ %oldconfig = load_config($template);
+}
+
+# True if a CA certificate file already existed when the script started; the
+# fingerprint verification state is skipped in that case.
+my $cacert_preverified = -e $cacert_file;
+# Certificate request text and its SHA-1 hex digest, filled in by a later
+# state when no switch certificate exists yet.
+my ($req, $req_fingerprint);
+
+# name => value pairs parsed from ovs-discover output.
+my %options;
+
+# The dialog is a list of states run by the driver loop below the list.
+# Each state returns:
+#   'next' - go forward and keep going forward,
+#   'prev' - go backward and keep going backward,
+#   'skip' - continue in the current direction,
+#   'done' - leave the dialog,
+#   undef  - let the driver call db_go() and pick 'prev' or 'next' from it.
+my (@states) =
+ (sub {
+ # User backed up from first dialog box.
+ exit(10);
+ },
+ sub {
+ # Prompt for ports to include in switch.
+ db_input('netdevs');
+ return;
+ },
+ sub {
+ # Validate the chosen ports.
+ my (@netdevs) = split(', ', db_get('netdevs'));
+ if (!@netdevs) {
+ # No ports chosen. Disable switch.
+ db_input('no-netdevs');
+ return 'prev' if db_go();
+ return 'done';
+ } elsif (my (@conf_netdevs) = grep(/IP/, @netdevs)) {
+ # Point out that some ports have configured IP addresses.
+ db_subst('configured-netdevs', 'configured-netdevs',
+ join(', ', @conf_netdevs));
+ db_input('configured-netdevs');
+ return;
+ } else {
+ # Otherwise proceed.
+ return 'skip';
+ }
+ },
+ sub {
+ # Discovery or in-band or out-of-band controller?
+ db_input('mode');
+ return;
+ },
+ sub {
+ # Discovery mode only: run ovs-discover and parse its name=value
+ # output into %options, repeating until a usable controller location
+ # is found or the user backs up.
+ return 'skip' if db_get('mode') ne 'discovery';
+ for (;;) {
+ # Notify user that we are going to do discovery.
+ db_input('discover');
+ return 'prev' if db_go();
+ print STDERR "Please wait up to 30 seconds for discovery...\n";
+
+ # Make sure that there's no running discovery process.
+ kill_ovs_discover();
+
+ # Do discovery.
+ %options = ();
+ open(DISCOVER, '-|', 'ovs-discover --timeout=30 --pidfile '
+ . join(' ', netdev_names()));
+ while (<DISCOVER>) {
+ chomp;
+ if (my ($name, $value) = /^([^=]+)=(.*)$/) {
+ if ($value =~ /^"(.*)"$/) {
+ # Quoted string: strip the quotes and decode \ooo
+ # escapes.  The three digits are octal, so they must
+ # go through oct() before chr().
+ $value = $1;
+ $value =~ s/\\([0-7][0-7][0-7])/chr(oct($1))/ge;
+ } else {
+ # Unquoted value: convert hex numbers to decimal and
+ # map the keyword 'empty' to the empty string.
+ $value =~ s/^(0x[[:xdigit:]]+)$/hex($1)/e;
+ $value = '' if $value eq 'empty';
+ next if $value eq 'null'; # Shouldn't happen.
+ }
+ $options{$name} = $value;
+ }
+ # A blank line ends the block of options.
+ last if /^$/;
+ }
+
+ # Check results.  An SSL controller is only usable if a PKI URI
+ # was discovered along with it.
+ my $vconn = $options{'ovs-controller-vconn'};
+ my $pki_uri = $options{'ovs-pki-uri'};
+ return 'next'
+ if (defined($vconn)
+ && is_valid_vconn($vconn)
+ && (!is_ssl_vconn($vconn) || defined($pki_uri)));
+
+ # Try again?
+ kill_ovs_discover();
+ db_input('discovery-failure');
+ db_go();
+ }
+ },
+ sub {
+ # Discovery mode only: show the discovered settings and, once the
+ # dialog is accepted, store them as the debconf answers.
+ return 'skip' if db_get('mode') ne 'discovery';
+
+ my $vconn = $options{'ovs-controller-vconn'};
+ my $pki_uri = $options{'ovs-pki-uri'};
+ db_subst('discovery-success', 'controller-vconn', $vconn);
+ db_subst('discovery-success',
+ 'pki-uri', is_ssl_vconn($vconn) ? $pki_uri : "no PKI in use");
+ db_input('discovery-success');
+ return 'prev' if db_go();
+ db_set('controller-vconn', $vconn);
+ # NOTE(review): $pki_uri can be undefined here when the discovered
+ # controller is not an SSL one.
+ db_set('pki-uri', $pki_uri);
+ return 'next';
+ },
+ sub {
+ # In-band mode only: prompt for the switch IP address until the
+ # answer validates or the user backs up.
+ return 'skip' if db_get('mode') ne 'in-band';
+ for (;;) {
+ db_input('switch-ip');
+ return 'prev' if db_go();
+
+ # Accept "dhcp" (any case) or a dotted quad.  The alternation is
+ # grouped so that both anchors apply to both alternatives, and
+ # every dot is escaped so that it only matches a literal '.'.
+ my $ip = db_get('switch-ip');
+ return 'next' if $ip =~ /^(?:dhcp|\d+\.\d+\.\d+\.\d+)$/i;
+
+ db_input('switch-ip-error');
+ db_go();
+ }
+ },
+ sub {
+ # Manual modes only: prompt for the controller location until it
+ # validates or the user backs up.
+ return 'skip' if db_get('mode') eq 'discovery';
+ for (;;) {
+ my $old_vconn = db_get('controller-vconn');
+ db_input('controller-vconn');
+ return 'prev' if db_go();
+
+ my $vconn = db_get('controller-vconn');
+ if (is_valid_vconn($vconn)) {
+ # Default the PKI URI to the controller host when the
+ # controller changed or no PKI URI is set yet.
+ if ($old_vconn ne $vconn || db_get('pki-uri') eq '') {
+ # Extract the host here: capture variables set inside
+ # is_valid_vconn() are not visible after it returns.
+ my ($host) = $vconn =~ /^(?:tcp|ssl):([^:]+)/;
+ db_set('pki-uri', pki_host_to_uri($host));
+ }
+ return 'next';
+ }
+
+ db_input('controller-vconn-error');
+ db_go();
+ }
+ },
+ sub {
+ # SSL only: make sure a private key and certificate request exist,
+ # and load the request if no certificate has been obtained yet.
+ # Never shows a dialog, so it always returns 'skip'.
+ return 'skip' if !ssl_enabled();
+
+ if (! -e $privkey_file) {
+ # Generate key and request under a restrictive umask, then make
+ # only the request file world-readable.
+ my $old_umask = umask(077);
+ run_cmd("ovs-pki req $etc/of0 >&2 2>/dev/null");
+ chmod(0644, $req_file) or die "$req_file: chmod: $!\n";
+ umask($old_umask);
+ }
+
+ if (! -e $cert_file) {
+ open(REQ, '<', $req_file) or die "$req_file: open: $!\n";
+ $req = join('', <REQ>);
+ close(REQ);
+ $req_fingerprint = sha1_hex($req);
+ }
+ return 'skip';
+ },
+ sub {
+ # SSL only: ask for the PKI location unless both the CA certificate
+ # and the switch certificate are already present.
+ return 'skip' if !ssl_enabled();
+ return 'skip' if -e $cacert_file && -e $cert_file;
+
+ db_input('pki-uri');
+ return 'prev' if db_go();
+ # Returning undef makes the driver call db_go() once more to choose
+ # the direction.
+ return;
+ },
+ sub {
+ # SSL only: download the controller CA certificate if it is missing.
+ return 'skip' if !ssl_enabled();
+ return 'skip' if -e $cacert_file;
+
+ # Normalize the answer: a value without ':' is treated as a host
+ # name, anything else as a URI.
+ my $pki_uri = db_get('pki-uri');
+ if ($pki_uri !~ /:/) {
+ $pki_uri = pki_host_to_uri($pki_uri);
+ } else {
+ # Trim trailing slashes.
+ $pki_uri =~ s%/+$%%;
+ }
+ db_set('pki-uri', $pki_uri);
+
+ my $url = "$pki_uri/controllerca/cacert.pem";
+ my $response = $ua->get($url, ':content_file' => $cacert_file);
+ if ($response->is_success) {
+ return 'next';
+ }
+
+ # Report the failure and go back so the PKI location can be changed.
+ db_subst('fetch-cacert-failed', 'url', $url);
+ db_subst('fetch-cacert-failed', 'error', $response->status_line);
+ db_subst('fetch-cacert-failed', 'pki-uri', $pki_uri);
+ db_input('fetch-cacert-failed');
+ db_go();
+ return 'prev';
+ },
+ sub {
+ # SSL only: offer to post the certificate request to the PKI server
+ # unless a switch certificate already exists.
+ return 'skip' if !ssl_enabled();
+ return 'skip' if -e $cert_file;
+
+ for (;;) {
+ db_set('send-cert-req', 'yes');
+ db_input('send-cert-req');
+ return 'prev' if db_go();
+ return 'next' if db_get('send-cert-req') eq 'no';
+
+ # The CGI lives at the server root, so keep only scheme://host.
+ # NOTE(review): the pattern needs a '/' after the host part;
+ # $pki_base_uri is undefined for a URI without a path.
+ my $pki_uri = db_get('pki-uri');
+ my ($pki_base_uri) = $pki_uri =~ m%^([^/]+://[^/]+)/%;
+ my $url = "$pki_base_uri/cgi-bin/ovs-pki-cgi";
+ my $response = $ua->post($url, {'type' => 'switch',
+ 'req' => $req});
+ return 'next' if $response->is_success;
+
+ db_subst('send-cert-req-failed', 'url', $url);
+ db_subst('send-cert-req-failed', 'error',
+ $response->status_line);
+ db_subst('send-cert-req-failed', 'pki-uri', $pki_uri);
+ db_input('send-cert-req-failed');
+ db_go();
+ }
+ },
+ sub {
+ # SSL only: have the user confirm the fingerprint of a CA
+ # certificate that did not exist before this run.  A rejected
+ # certificate is deleted before going back.
+ return 'skip' if !ssl_enabled();
+ return 'skip' if $cacert_preverified;
+
+ my ($cacert_fingerprint) = x509_fingerprint($cacert_file);
+ db_subst('verify-controller-ca', 'fingerprint', $cacert_fingerprint);
+ db_input('verify-controller-ca');
+ return 'prev' if db_go();
+ return 'next' if db_get('verify-controller-ca') eq 'yes';
+ unlink($cacert_file);
+ return 'prev';
+ },
+ sub {
+ # SSL only: download the signed switch certificate, retrying until
+ # it succeeds, the user backs up, or the user declines (which ends
+ # the whole program with exit status 1).
+ return 'skip' if !ssl_enabled();
+ return 'skip' if -e $cert_file;
+
+ for (;;) {
+ db_set('fetch-switch-cert', 'yes');
+ db_input('fetch-switch-cert');
+ return 'prev' if db_go();
+ exit(1) if db_get('fetch-switch-cert') eq 'no';
+
+ # The certificate is looked up by the SHA-1 digest of the request.
+ my $pki_uri = db_get('pki-uri');
+ my $url = "$pki_uri/switchca/certs/$req_fingerprint-cert.pem";
+ my $response = $ua->get($url, ':content_file' => $cert_file);
+ if ($response->is_success) {
+ return 'next';
+ }
+
+ db_subst('fetch-switch-cert-failed', 'url', $url);
+ db_subst('fetch-switch-cert-failed', 'error',
+ $response->status_line);
+ db_subst('fetch-switch-cert-failed', 'pki-uri', $pki_uri);
+ db_input('fetch-switch-cert-failed');
+ db_go();
+ }
+ },
+ sub {
+ # Final note.  db_go() is called here and, because undef is
+ # returned, once more by the driver.
+ db_input('complete');
+ db_go();
+ return;
+ },
+ sub {
+ # Terminal state: ends the driver loop.
+ return 'done';
+ },
+);
+
+# Run the dialog.  Start at state 1; state 0 is only reached by backing up
+# from the first dialog.  $direction is +1 (forward) or -1 (backward) and is
+# kept across 'skip' results so that skipped states are passed over in the
+# direction the user was moving.
+my $state = 1;
+my $direction = 1;
+for (;;) {
+ my $ret = &{$states[$state]}();
+ # A state that returns undef has queued a question but not shown it:
+ # display it now and derive the direction from db_go()'s result.
+ $ret = db_go() ? 'prev' : 'next' if !defined $ret;
+ if ($ret eq 'next') {
+ $direction = 1;
+ } elsif ($ret eq 'prev') {
+ $direction = -1;
+ } elsif ($ret eq 'skip') {
+ # Nothing to do.
+ } elsif ($ret eq 'done') {
+ last;
+ } else {
+ die "unknown ret $ret";
+ }
+ $state += $direction;
+}
+
+# Build the new configuration on top of the old one and write it out.
+# SWITCH_IP and CONTROLLER are only updated for the modes that use them.
+my %config = %oldconfig;
+$config{NETDEVS} = join(' ', netdev_names());
+$config{MODE} = db_get('mode');
+if (db_get('mode') eq 'in-band') {
+ $config{SWITCH_IP} = db_get('switch-ip');
+}
+if (db_get('mode') ne 'discovery') {
+ $config{CONTROLLER} = db_get('controller-vconn');
+}
+$config{PRIVKEY} = $privkey_file;
+$config{CERT} = $cert_file;
+$config{CACERT} = $cacert_file;
+save_config($default, %config);
+
+# Point stdout at stderr again, clean up any discovery process, and start
+# the switch with the new configuration.
+dup2(2, 1); # Get stdout back.
+kill_ovs_discover();
+system("/etc/init.d/openvswitch-switch start");
+
+sub ssl_enabled {
+ # True when the configured controller connection is an SSL one.
+ my $vconn = db_get('controller-vconn');
+ return is_ssl_vconn($vconn);
+}
+
+sub db_subst {
+ # Set substitution variable $key to $value in one of this package's
+ # debconf questions.  Dies on any status other than 0 or 30.
+ my ($question, $key, $value) = @_;
+ my $full_name = "$debconf_owner/$question";
+ my ($code, $text) = subst($full_name, $key, $value);
+ die "Error substituting $value for $key in debconf question "
+ . "$full_name: $text" if $code && $code != 30;
+}
+
+sub db_set {
+ # Store $value as the answer to one of this package's debconf questions.
+ # Dies on any status other than 0 or 30.
+ my ($question, $value) = @_;
+ my $full_name = "$debconf_owner/$question";
+ my ($code, $text) = set($full_name, $value);
+ die "Error setting debconf question $full_name to $value: $text"
+ if $code && $code != 30;
+}
+
+sub db_get {
+ # Return the current answer to one of this package's debconf questions.
+ # Dies on any nonzero status.
+ my ($question) = @_;
+ my $full_name = "$debconf_owner/$question";
+ my ($code, $answer) = get($full_name);
+ die "Error getting debconf question $full_name answer: $answer"
+ if $code;
+ return $answer;
+}
+
+sub db_fset {
+ # Set flag $flag of one of this package's debconf questions to $value.
+ # Dies on any status other than 0 or 30.
+ my ($question, $flag, $value) = @_;
+ my $full_name = "$debconf_owner/$question";
+ my ($code, $text) = fset($full_name, $flag, $value);
+ die "Error setting debconf question $full_name flag $flag to $value: "
+ . "$text" if $code && $code != 30;
+}
+
+sub db_fget {
+ # Return the value of flag $flag of one of this package's debconf
+ # questions.  Dies on any nonzero status.
+ my ($question, $flag) = @_;
+ my $full_name = "$debconf_owner/$question";
+ my ($code, $flag_value) = fget($full_name, $flag);
+ die "Error getting debconf question $full_name flag $flag: $flag_value"
+ if $code;
+ return $flag_value;
+}
+
+sub db_input {
+ # Queue one of this package's debconf questions at priority 'high' and
+ # return the status code (0 or 30); dies on any other status.
+ my ($question) = @_;
+
+ # Reset the "seen" flag before asking.
+ db_fset($question, "seen", "false");
+
+ my $full_name = "$debconf_owner/$question";
+ my ($code, $text) = input('high', $full_name);
+ die "Error requesting debconf question $full_name: $text"
+ if $code && $code != 30;
+ return $code;
+}
+
+sub db_go {
+ # Display the queued questions.  Returns the status code, which is 0 or
+ # 30 (callers treat a true value as "go back").  Exits with status 1 if
+ # no status comes back at all and dies on any other status.
+ my ($code, $text) = go();
+ exit(1) if !defined($code); # Cancel button was pushed.
+ die "Error asking debconf questions: $text" if $code && $code != 30;
+ return $code;
+}
+
+sub run_cmd {
+ # Run $cmd with system() and die with a descriptive message unless it
+ # exits with status 0.
+ my ($cmd) = @_;
+ return if system($cmd) == 0;
+
+ # $cmd is passed to sprintf as an argument, never as part of the format,
+ # so a '%' in the command cannot corrupt the message.
+ if ($? == -1) {
+ die "$cmd: failed to execute: $!\n";
+ } elsif ($? & 127) {
+ die sprintf("%s: child died with signal %d, %s coredump\n",
+ $cmd, ($? & 127), ($? & 128) ? 'with' : 'without');
+ } else {
+ die sprintf("%s: child exited with value %d\n", $cmd, $? >> 8);
+ }
+}
+
+sub x509_fingerprint {
+ # Return the SHA1 fingerprint of the certificate in $file as printed by
+ # "openssl x509 -fingerprint", with the colons removed.  If the output
+ # has no fingerprint, return its first line unchanged (empty string if
+ # there was no output at all).
+ my ($file) = @_;
+ # List-form open runs openssl directly, so $file is never parsed by a
+ # shell.
+ my @argv = ('openssl', 'x509', '-noout', '-in', $file, '-fingerprint');
+ my $cmd = join(' ', @argv);
+ open(OPENSSL, '-|', @argv) or die "$cmd: failed to execute: $!\n";
+ my $line = <OPENSSL>;
+ close(OPENSSL);
+ # openssl may print nothing on stdout when it fails.
+ $line = '' if !defined $line;
+ my ($fingerprint) = $line =~ /SHA1 Fingerprint=(.*)/;
+ return $line if !defined $fingerprint;
+ $fingerprint =~ s/://g;
+ return $fingerprint;
+}
+
+# Parse "/sbin/ifconfig -a" and return a hash that maps each network device
+# name to a description string: the name followed by "(MAC: ...)",
+# "(IP: ...)" and "(IPv6: ...)" annotations for whatever ifconfig reported.
+# The device 'lo' and devices whose names start with 'wmaster' are left out.
+sub find_netdevs {
+ my ($netdev, %netdevs);
+ open(IFCONFIG, "/sbin/ifconfig -a|") or die "ifconfig failed: $!";
+ while (<IFCONFIG>) {
+ if (my ($nd) = /^([^\s]+)/) {
+ # A line that starts in column 0 begins a new device.
+ $netdev = $nd;
+ $netdevs{$netdev} = "$netdev";
+ if (my ($hwaddr) = /HWaddr (\S+)/) {
+ $netdevs{$netdev} .= " (MAC: $hwaddr)";
+ }
+ } elsif (my ($ip4) = /^\s*inet addr:(\S+)/) {
+ # Indented address lines belong to the most recent device.
+ $netdevs{$netdev} .= " (IP: $ip4)";
+ } elsif (my ($ip6) = /^\s*inet6 addr:(\S+)/) {
+ $netdevs{$netdev} .= " (IPv6: $ip6)";
+ }
+ }
+ foreach my $nd (keys(%netdevs)) {
+ delete $netdevs{$nd} if $nd eq 'lo' || $nd =~ /^wmaster/;
+ }
+ close(IFCONFIG);
+ return %netdevs;
+}
+
+# Return the shell variables assigned by the shell script $file as a hash.
+# The file is sourced by a shell with "set -a" and the resulting environment
+# is compared with that of a shell that sourced nothing, so only variables
+# the file itself introduces are returned.
+sub load_config {
+ my ($file) = @_;
+
+ # Get the list of the variables that the shell sets automatically.
+ my (%auto_vars) = read_vars("set -a && env");
+
+ # Get the variables from $file (the argument, which may be the template
+ # rather than the default configuration file).
+ my (%config) = read_vars("set -a && . '$file' && env");
+
+ # Subtract.
+ delete @config{keys %auto_vars};
+
+ return %config;
+}
+
+# Run shell command $cmd and parse its output as "NAME=value" lines,
+# returning them as a hash.  Lines without '=' are ignored.  If the command
+# cannot be started, a message is printed on stderr and an empty list is
+# returned.
+sub read_vars {
+ my ($cmd) = @_;
+ # NOTE(review): this localizes the array @ENV, not the %ENV hash, so the
+ # environment passed to the child is unchanged -- confirm whether
+ # clearing %ENV was intended.
+ local @ENV;
+ if (!open(VARS, '-|', $cmd)) {
+ print STDERR "$cmd: failed to execute: $!\n";
+ return ();
+ }
+ my (%config);
+ while (<VARS>) {
+ # Each line is matched on its own, so a value containing a newline is
+ # cut off at the end of its first line.
+ my ($var, $value) = /^([^=]+)=(.*)$/ or next;
+ $config{$var} = $value;
+ }
+ close(VARS);
+ return %config;
+}
+
+# Return $_[0] quoted so that a POSIX shell reads it back as a single word
+# with the same value.  The empty string becomes "", strings made only of
+# safe characters are returned unchanged, and anything else is wrapped in
+# single quotes.
+sub shell_escape {
+ local $_ = $_[0];
+ if ($_ eq '') {
+ return '""';
+ } elsif (m&^[-a-zA-Z0-9:./%^_+,]*$&) {
+ return $_;
+ } else {
+ # Replace every embedded single quote (hence /g) with '\'' : close
+ # the quoted string, add an escaped quote, and reopen it.
+ s/'/'\\''/g;
+ return "'$_'";
+ }
+}
+
+sub shell_assign {
+ # Build the shell assignment "VAR=value" with the value shell-quoted.
+ my ($var, $value) = @_;
+ my $quoted = shell_escape($value);
+ return "$var=$quoted";
+}
+
+# Write the variables in %config to the shell-style file $file, keeping the
+# file's existing lines where possible:
+#   - an existing assignment to a variable in %config is rewritten in place;
+#   - an existing assignment to any other variable is commented out;
+#   - a variable not yet assigned replaces its last commented-out
+#     assignment, or else is inserted after the comment block that follows
+#     a "# VAR:" line, or else is appended.
+# The result is written to "$file.tmp" and then renamed over $file.
+sub save_config {
+ my ($file, %config) = @_;
+ my (@lines);
+ # A missing or unreadable file is treated as empty.
+ if (open(FILE, '<', $file)) {
+ @lines = <FILE>;
+ chomp @lines;
+ close(FILE);
+ }
+
+ # Replace all existing variable assignments.
+ for (my ($i) = 0; $i <= $#lines; $i++) {
+ local $_ = $lines[$i];
+ my ($var, $value) = /^\s*([^=#]+)=(.*)$/ or next;
+ if (exists($config{$var})) {
+ $lines[$i] = shell_assign($var, $config{$var});
+ # Handled; what remains in %config afterwards still needs a home.
+ delete $config{$var};
+ } else {
+ $lines[$i] = "#$lines[$i]";
+ }
+ }
+
+ # Find a place to put any remaining variable assignments.
+ VAR:
+ for my $var (keys(%config)) {
+ my $assign = shell_assign($var, $config{$var});
+
+ # Replace the last commented-out variable assignment to $var, if any.
+ for (my ($i) = $#lines; $i >= 0; $i--) {
+ local $_ = $lines[$i];
+ if (/^\s*#\s*$var=/) {
+ $lines[$i] = $assign;
+ next VAR;
+ }
+ }
+
+ # Find a place to add the var: after the final commented line
+ # just after a line that contains "$var:".
+ for (my ($i) = 0; $i <= $#lines; $i++) {
+ if ($lines[$i] =~ /^\s*#\s*$var:/) {
+ for (my ($j) = $i + 1; $j <= $#lines; $j++) {
+ if ($lines[$j] !~ /^\s*#/) {
+ splice(@lines, $j, 0, $assign);
+ next VAR;
+ }
+ }
+ }
+ }
+
+ # Just append it.
+ push(@lines, $assign);
+ }
+
+ # Write to a temporary file first and rename it into place.
+ open(NEWFILE, '>', "$file.tmp") or die "$file.tmp: create: $!\n";
+ print NEWFILE join('', map("$_\n", @lines));
+ close(NEWFILE);
+ rename("$file.tmp", $file) or die "$file.tmp: rename to $file: $!\n";
+}
+
+sub pki_host_to_uri {
+ # Build the default PKI URI for a bare host name or address.
+ my $host = shift;
+ return 'http://' . $host . '/openvswitch/pki';
+}
+
+sub kill_ovs_discover {
+ # This is handed to the external ovs-kill tool because Perl offers no
+ # portable way to call fcntl(F_GETLK).
+ my $cmd = "ovs-kill --force $ovs_discover_pidfile";
+ system($cmd);
+}
+
+sub netdev_names {
+ # The 'netdevs' answer is a ', '-separated list of descriptions; the
+ # device name is the first whitespace-delimited word of each.
+ my @descriptions = split(', ', db_get('netdevs'));
+ return map { /^(\S+)/ } @descriptions;
+}
+
+sub is_valid_vconn {
+ # True if $vconn starts with "tcp:" or "ssl:" followed by a non-empty
+ # host part.
+ my $vconn = shift;
+ my $matched = $vconn =~ /^(tcp|ssl):([^:]+)(:.*)?/;
+ return $matched;
+}
+
+sub is_ssl_vconn {
+ # True if $vconn names an SSL connection, i.e. starts with "ssl:".
+ my $vconn = shift;
+ my $matched = $vconn =~ /^ssl:/;
+ return $matched;
+}
diff --git a/debian/ovs-switch-setup.8 b/debian/ovs-switch-setup.8
new file mode 100644
index 000000000..696ad3655
--- /dev/null
+++ b/debian/ovs-switch-setup.8
@@ -0,0 +1,41 @@
+.TH ovs-switch-setup 8 "June 2008" "Open vSwitch" "Open vSwitch Manual"
+
+.SH NAME
+ovs\-switch\-setup \- interactive setup for Open vSwitch switch
+
+.SH SYNOPSIS
+.B ovs\-switch\-setup
+
+.SH DESCRIPTION
+The \fBovs\-switch\-setup\fR program is an interactive program that
+assists the system administrator in configuring an Open vSwitch switch,
+including the underlying public key infrastructure (PKI).
+
+.SH OPTIONS
+ovs\-switch\-setup does not accept any command-line options.
+
+.SH FILES
+.IP /etc/default/openvswitch-switch
+Main configuration file for Open vSwitch switch.
+
+.IP /etc/openvswitch-switch/cacert.pem
+Default location of CA certificate for OpenFlow controllers.
+
+.IP /etc/openvswitch-switch/of0-cert.pem
+Default location of certificate for the Open vSwitch switch's private key.
+
+.IP /etc/openvswitch-switch/of0-privkey.pem
+Default location of the Open vSwitch switch's private key. This file
+should be readable only by \fBroot\fR.
+
+.IP /etc/openvswitch-switch/of0-req.pem
+Default location of certificate request for the Open vSwitch switch's
+certificate. This file is not used after the signed certificate
+(typically \fB/etc/openvswitch-switch/of0-cert.pem\fR, above) has been
+obtained from the OpenFlow PKI server.
+
+.SH "SEE ALSO"
+
+.BR ovs\-dpctl (8),
+.BR ovs\-pki (8),
+.BR secchan (8)
diff --git a/debian/po/POTFILES.in b/debian/po/POTFILES.in
new file mode 100644
index 000000000..865bf94c0
--- /dev/null
+++ b/debian/po/POTFILES.in
@@ -0,0 +1 @@
+[type: gettext/rfc822deb] openvswitch-switch-config.templates
diff --git a/debian/po/templates.pot b/debian/po/templates.pot
new file mode 100644
index 000000000..119e55871
--- /dev/null
+++ b/debian/po/templates.pot
@@ -0,0 +1,522 @@
+# SOME DESCRIPTIVE TITLE.
+# Copyright (C) YEAR THE PACKAGE'S COPYRIGHT HOLDER
+# This file is distributed under the same license as the PACKAGE package.
+# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
+#
+#, fuzzy
+msgid ""
+msgstr ""
+"Project-Id-Version: PACKAGE VERSION\n"
+"Report-Msgid-Bugs-To: ovs-dev@openvswitch.org\n"
+"POT-Creation-Date: 2009-05-11 13:38-0700\n"
+"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
+"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
+"Language-Team: LANGUAGE <LL@li.org>\n"
+"MIME-Version: 1.0\n"
+"Content-Type: text/plain; charset=CHARSET\n"
+"Content-Transfer-Encoding: 8bit\n"
+
+#. Type: multiselect
+#. Choices
+#: ../openvswitch-switch-config.templates:1001
+msgid "${choices}"
+msgstr ""
+
+#. Type: multiselect
+#. Description
+#: ../openvswitch-switch-config.templates:1002
+msgid "OpenFlow switch network devices:"
+msgstr ""
+
+#. Type: multiselect
+#. Description
+#: ../openvswitch-switch-config.templates:1002
+msgid ""
+"Choose the network devices that should become part of the OpenFlow switch. "
+"At least two devices must be selected for this machine to be a useful "
+"switch. Unselecting all network devices will disable the OpenFlow switch "
+"entirely."
+msgstr ""
+
+#. Type: multiselect
+#. Description
+#: ../openvswitch-switch-config.templates:1002
+msgid ""
+"The network devices that you select should not be configured with IP or IPv6 "
+"addresses, even if the switch contacts the controller over one of the "
+"selected network devices. This is because a running OpenFlow switch takes "
+"over network devices at a low level: they become part of the switch and "
+"cannot be used for other purposes."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:2001
+msgid "No network devices were selected."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:2001
+msgid ""
+"No network devices were selected for inclusion in the OpenFlow switch. The "
+"switch will be disabled."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../openvswitch-switch-config.templates:3001
+msgid "Some Network Devices Have IP or IPv6 Addresses"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../openvswitch-switch-config.templates:3001
+msgid ""
+"The following network devices selected to be part of the OpenFlow switch "
+"have IP or IPv6 addresses configured:"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../openvswitch-switch-config.templates:3001
+msgid "${configured-netdevs}"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../openvswitch-switch-config.templates:3001
+msgid ""
+"This is usually a mistake, even if the switch contacts the controller over "
+"one of the selected network devices. This is because a running OpenFlow "
+"switch takes over network devices at a low level: they become part of the "
+"switch and cannot be used for other purposes."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../openvswitch-switch-config.templates:3001
+msgid ""
+"If this is an unintentional mistake, move back and fix the selection, or de-"
+"configure the IP or IPv6 from these network devices."
+msgstr ""
+
+#. Type: select
+#. Choices
+#: ../openvswitch-switch-config.templates:4001
+msgid "discovery, in-band, out-of-band"
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:4002
+msgid "Switch-to-controller access method:"
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:4002
+msgid ""
+"The OpenFlow switch must be able to contact the OpenFlow controller over the "
+"network. It can do so in one of three ways:"
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:4002
+msgid ""
+"discovery: A single network is used for OpenFlow traffic and other data "
+"traffic; that is, the switch contacts the controller over one of the network "
+"devices selected as OpenFlow switch network devices in the previous "
+"question. The switch automatically determines the location of the "
+"controller using a DHCP request with an OpenFlow-specific vendor option. "
+"This is the most common case."
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:4002
+msgid ""
+"in-band: As above, but the location of the controller is manually configured."
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:4002
+msgid ""
+"out-of-band: OpenFlow traffic uses a network separate from the data traffic "
+"that it controls. If this is the case, the control network must already be "
+"configured on a network device other than one of those selected as an "
+"OpenFlow switch netdev in the previous question."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../openvswitch-switch-config.templates:5001
+msgid "Preparing to discover controller."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../openvswitch-switch-config.templates:5001
+msgid ""
+"The setup program will now attempt to discover the OpenFlow controller. "
+"Controller discovery may take up to 30 seconds. Please be patient."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../openvswitch-switch-config.templates:5001
+msgid ""
+"See secchan(8) for instructions on how to configure a DHCP server for "
+"controller discovery."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:6001
+msgid "Controller discovery failed."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:6001
+msgid "The controller's location could not be determined automatically."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:6001
+msgid ""
+"Ensure that the OpenFlow DHCP server is properly configured. See secchan(8) "
+"for instructions on how to configure a DHCP server for controller discovery."
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../openvswitch-switch-config.templates:7001
+msgid "Use discovered settings?"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../openvswitch-switch-config.templates:7001
+msgid "Controller discovery obtained the following settings:"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../openvswitch-switch-config.templates:7001
+msgid "Controller location: ${controller-vconn}"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../openvswitch-switch-config.templates:7001
+msgid "PKI URL: ${pki-uri}"
+msgstr ""
+
+#. Type: boolean
+#. Description
+#: ../openvswitch-switch-config.templates:7001
+msgid "Please verify that these settings are correct."
+msgstr ""
+
+#. Type: string
+#. Description
+#: ../openvswitch-switch-config.templates:8001
+msgid "Switch IP address:"
+msgstr ""
+
+#. Type: string
+#. Description
+#: ../openvswitch-switch-config.templates:8001
+msgid ""
+"For in-band communication with the controller, the OpenFlow switch must be "
+"able to determine its own IP address. Its IP address may be configured "
+"statically or dynamically."
+msgstr ""
+
+#. Type: string
+#. Description
+#: ../openvswitch-switch-config.templates:8001
+msgid "For static configuration, specify the switch's IP address as a string."
+msgstr ""
+
+#. Type: string
+#. Description
+#: ../openvswitch-switch-config.templates:8001
+msgid ""
+"For dynamic configuration with DHCP (the most common case), specify \"dhcp"
+"\". Configuration with DHCP will only work reliably if the network topology "
+"allows the switch to contact the DHCP server before it connects to the "
+"OpenFlow controller."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:9001
+msgid "The switch IP address is invalid."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:9001
+msgid ""
+"The switch IP address must specified as \"dhcp\" or a valid IP address in "
+"dotted-octet form (e.g. \"1.2.3.4\")."
+msgstr ""
+
+#. Type: string
+#. Description
+#: ../openvswitch-switch-config.templates:10001
+msgid "Controller location:"
+msgstr ""
+
+#. Type: string
+#. Description
+#: ../openvswitch-switch-config.templates:10001
+msgid ""
+"Specify how the OpenFlow switch should connect to the OpenFlow controller. "
+"The value should be in form \"ssl:HOST[:PORT]\" to connect to the controller "
+"over SSL (recommended for security) or \"tcp:HOST[:PORT]\" to connect over "
+"cleartext TCP."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:11001
+msgid "The controller location is invalid."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:11001
+msgid ""
+"The controller location must be specifed as \"ssl:HOST[:PORT]\" to connect "
+"to the controller over SSL (recommended for security) or \"tcp:HOST[:PORT]\" "
+"to connect over cleartext TCP."
+msgstr ""
+
+#. Type: string
+#. Description
+#: ../openvswitch-switch-config.templates:12001
+msgid "OpenFlow PKI server host name or URL:"
+msgstr ""
+
+#. Type: string
+#. Description
+#: ../openvswitch-switch-config.templates:12001
+msgid ""
+"Specify a URL to the OpenFlow public key infrastructure (PKI). If a host "
+"name or IP address is specified in place of a URL, then http://<host>/"
+"openvswitch/pki/ will be used, where <host> is the specified host name or IP "
+"address."
+msgstr ""
+
+#. Type: string
+#. Description
+#: ../openvswitch-switch-config.templates:12001
+msgid ""
+"The OpenFlow PKI is usually on the same machine as the OpenFlow controller."
+msgstr ""
+
+#. Type: string
+#. Description
+#: ../openvswitch-switch-config.templates:12001
+msgid ""
+"The setup process will connect to the OpenFlow PKI server over HTTP, using "
+"the system's configured default HTTP proxy (if any)."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:13001
+msgid "The switch CA certificate could not be retrieved."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:13001
+msgid "Retrieval of ${url} failed, with the following status: \"${error}\"."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:13001
+msgid ""
+"Ensure that the OpenFlow PKI server is correctly configured and available at "
+"${pki-uri}. If the system is configured to use an HTTP proxy, also make "
+"sure that the HTTP proxy is available and that the PKI server can be reached "
+"through it."
+msgstr ""
+
+#. Type: select
+#. Choices
+#. Type: select
+#. Choices
+#. Type: select
+#. Choices
+#: ../openvswitch-switch-config.templates:14001
+#: ../openvswitch-switch-config.templates:15001
+#: ../openvswitch-switch-config.templates:17001
+msgid "yes, no"
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:14002
+msgid "Is ${fingerprint} the controller CA's fingerprint?"
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:14002
+msgid ""
+"If a man-in-the-middle attack is possible in your network environment, check "
+"that the controller CA's fingerprint is really ${fingerprint}. Answer \"yes"
+"\" if it matches, \"no\" if there is a discrepancy."
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:14002
+msgid ""
+"If a man-in-the-middle attack is not a concern, there is no need to verify "
+"the fingerprint. Simply answer \"yes\"."
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:15002
+msgid "Send certificate request to switch CA?"
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:15002
+msgid ""
+"Before it can connect to the controller over SSL, the OpenFlow switch's key "
+"must be signed by the switch certificate authority (CA) located on the "
+"OpenFlow PKI server, which is usually collocated with the OpenFlow "
+"controller. A signing request can be sent to the PKI server now."
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:15002
+msgid ""
+"Answer \"yes\" to send a signing request to the switch CA now. This is "
+"ordinarily the correct choice. There is no harm in sending a given signing "
+"request more than once."
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:15002
+msgid ""
+"Answer \"no\" to skip sending a signing request to the switch CA. Unless the "
+"request has already been sent to the switch CA, manual sending of the "
+"request and signing will be necessary."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:16001
+msgid "The certificate request could not be sent."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:16001
+msgid "Posting to ${url} failed, with the following status: \"${error}\"."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:16001
+msgid ""
+"Ensure that the OpenFlow PKI server is correctly configured and available at "
+"${pki-uri}."
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:17002
+msgid "Fetch signed switch certificate from PKI server?"
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:17002
+msgid ""
+"Before it can connect to the controller over SSL, the OpenFlow switch's key "
+"must be signed by the switch certificate authority (CA) located on the "
+"OpenFlow PKI server, which is usually collocated with the OpenFlow "
+"controller."
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:17002
+msgid ""
+"At this point, a signing request has been sent to the switch CA (or sending "
+"a request has been manually skipped), but the signed certificate has not yet "
+"been retrieved. Manual action may need to be taken at the PKI server to "
+"approve the signing request."
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:17002
+msgid ""
+"Answer \"yes\" to attempt to retrieve the signed switch certificate from the "
+"switch CA. If the switch certificate request has been signed at the PKI "
+"server, this is the correct choice."
+msgstr ""
+
+#. Type: select
+#. Description
+#: ../openvswitch-switch-config.templates:17002
+msgid ""
+"Answer \"no\" to postpone switch configuration. The configuration process "
+"must be restarted later, when the switch certificate request has been signed."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:18001
+msgid "Signed switch certificate could not be retrieved."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:18001
+msgid ""
+"The signed switch certificate could not be retrieved from the switch CA: "
+"retrieval of ${url} failed, with the following status: \"${error}\"."
+msgstr ""
+
+#. Type: error
+#. Description
+#: ../openvswitch-switch-config.templates:18001
+msgid ""
+"This probably indicates that the switch's certificate request has not yet "
+"been signed. If this is the problem, it may be fixed by signing the "
+"certificate request at ${pki-uri}, then trying to fetch the signed switch "
+"certificate again."
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../openvswitch-switch-config.templates:19001
+msgid "OpenFlow Switch Setup Finished"
+msgstr ""
+
+#. Type: note
+#. Description
+#: ../openvswitch-switch-config.templates:19001
+msgid ""
+"Setup of this OpenFlow switch is finished. Complete the setup procedure to "
+"enable the switch."
+msgstr ""
diff --git a/debian/rules b/debian/rules
new file mode 100755
index 000000000..707fe8b7d
--- /dev/null
+++ b/debian/rules
@@ -0,0 +1,145 @@
+#!/usr/bin/make -f
+# -*- makefile -*-
+# Sample debian/rules that uses debhelper.
+#
+# This file was originally written by Joey Hess and Craig Small.
+# As a special exception, when this file is copied by dh-make into a
+# dh-make output file, you may use that output file without restriction.
+# This special exception was added by Craig Small in version 0.37 of dh-make.
+#
+# Modified to make a template file for a multi-binary package with separated
+# build-arch and build-indep targets by Bill Allombert 2001
+
+# Uncomment this to turn on verbose mode.
+#export DH_VERBOSE=1
+
+# This has to be exported to make some magic below work: binary-indep and
+# binary-arch re-invoke this file with DH_OPTIONS set on the command line, and
+# exporting it makes the value visible to the commands run by binary-common.
+export DH_OPTIONS
+
+# prefix of the target package name
+PACKAGE=openvswitch-datapath-module
+# modifiable for experiments or debugging m-a
+MA_DIR ?= /usr/share/modass
+# load generic variable handling
+# (the leading '-' makes a missing include file a non-error)
+-include $(MA_DIR)/include/generic.make
+# load default rules
+-include $(MA_DIR)/include/common-rules.make
+
+# Extra options appended to both ./configure invocations in this file (the
+# userspace build in configure-stamp and the kernel module build in
+# binary-modules).  Empty by default.
+DATAPATH_CONFIGURE_OPTS =
+
+# Official build number. Leave set to 0 if not an official build.
+# Passed to configure as --with-build-number.
+BUILD_NUMBER = 0
+
+# Configures the userspace build in the _debian subdirectory.  Runs boot.sh
+# first if no configure script exists yet, and skips ../configure when
+# _debian/Makefile is already present.  configure-stamp records completion.
+configure: configure-stamp
+configure-stamp:
+ dh_testdir
+ test -e configure || ./boot.sh
+ test -d _debian || mkdir _debian
+ cd _debian && ( \
+ test -e Makefile || \
+ ../configure --prefix=/usr --localstatedir=/var --enable-ssl \
+ --with-build-number=$(BUILD_NUMBER) \
+ $(DATAPATH_CONFIGURE_OPTS))
+ touch configure-stamp
+
+#Architecture
+# 'build' is the union of the architecture-dependent and
+# architecture-independent builds; each is guarded by its own stamp file.
+build: build-arch build-indep
+
+# Architecture-dependent part: compile everything in _debian.
+build-arch: build-arch-stamp
+build-arch-stamp: configure-stamp
+ $(MAKE) -C _debian
+ touch $@
+
+# Architecture-independent part: run "make dist" with distdir=openvswitch.
+# NOTE(review): presumably this produces the openvswitch.tar.gz that
+# binary-modules later unpacks -- confirm where the tarball ends up.
+build-indep: build-indep-stamp
+build-indep-stamp: configure-stamp
+ $(MAKE) -C _debian dist distdir=openvswitch
+ touch $@
+
+# Removes the stamp files and the _debian build directory, runs "make
+# distclean" only if a top-level Makefile exists, then lets debhelper clean
+# up and refreshes the debconf translation templates.
+clean:
+ dh_testdir
+ dh_testroot
+ rm -f build-arch-stamp build-indep-stamp configure-stamp
+ rm -rf _debian
+ [ ! -f Makefile ] || $(MAKE) distclean
+ dh_clean
+ debconf-updatepo
+
+# Cleans up after a kernel module build: debhelper state plus the unpacked
+# "openvswitch" source tree created by binary-modules.
+kdist_clean:
+ dh_clean
+ rm -rf openvswitch
+
+# prep-deb-files is not defined in this file.
+# NOTE(review): presumably it comes from the makefiles included via MA_DIR.
+kdist_config: prep-deb-files
+
+# Builds and packages the datapath kernel module: unpacks openvswitch.tar.gz,
+# configures it against the kernel source in $(KSRC), builds
+# datapath/linux-2.6, and installs the resulting *_mod.ko files under
+# lib/modules/$(KVERS) in the package tree.
+# KSRC, KVERS, PKGNAME, DEB_DESTDIR and prep-deb-files are not defined in this
+# file.  NOTE(review): presumably supplied by the makefiles included via
+# MA_DIR -- confirm.
+binary-modules: DSTDIR = $(CURDIR)/debian/$(PKGNAME)/lib/modules/$(KVERS)
+binary-modules: prep-deb-files
+ dh_testdir
+ dh_testroot
+ dh_clean -k
+ tar xzf openvswitch.tar.gz
+ cd openvswitch && ./configure --with-l26=$(KSRC) $(DATAPATH_CONFIGURE_OPTS) --with-build-number=$(BUILD_NUMBER)
+ cd openvswitch && $(MAKE) -C datapath/linux-2.6
+ install -d -m755 $(DSTDIR)
+ install -m644 openvswitch/datapath/linux-2.6/*_mod.ko $(DSTDIR)/
+ dh_installdocs
+ dh_installchangelogs
+ dh_compress
+ dh_fixperms
+ dh_installdeb
+ dh_gencontrol
+ dh_md5sums
+ dh_builddeb --destdir=$(DEB_DESTDIR)
+
+install: install-indep install-arch
+# Stages the architecture-independent packages (debhelper -i).  After
+# dh_install, the "modules" directory under
+# debian/openvswitch-datapath-source/usr/src is replaced by a bzip2-compressed
+# tarball of itself.  Also installs the Apache site file for the PKI server
+# and creates the mode-1777 core dump directory for corekeeper.
+install-indep: build-indep
+ dh_testdir
+ dh_testroot
+ dh_clean -k -i
+ dh_installdirs -i
+ dh_install -i
+ cd debian/openvswitch-datapath-source/usr/src && tar -c modules | bzip2 -9 > openvswitch-datapath.tar.bz2 && rm -rf modules
+ install -m644 debian/openvswitch-pki-server.apache2 debian/openvswitch-pki-server/etc/apache2/sites-available/openvswitch-pki
+ install -m1777 -d debian/corekeeper/var/log/core
+
+# Stages the architecture-dependent packages (debhelper -s): runs "make
+# install" from _debian into debian/openvswitch, copies the lintian overrides
+# and the default switch template into their packages, lets dh_install
+# distribute files, and compiles the ezio3 terminfo entry into the
+# openvswitch-switchui package tree (TERMINFO selects tic's output directory).
+install-arch: build-arch
+ dh_testdir
+ dh_testroot
+ dh_clean -k -s
+ dh_installdirs -s
+ $(MAKE) -C _debian DESTDIR=$(CURDIR)/debian/openvswitch install
+ cp debian/openvswitch-switch-config.overrides debian/openvswitch-switch-config/usr/share/lintian/overrides/openvswitch-switch-config
+ cp debian/openvswitch-switch.template debian/openvswitch-switch/usr/share/openvswitch/switch/default.template
+ dh_install -s
+ env TERMINFO=debian/openvswitch-switchui/usr/share/terminfo tic -x extras/ezio/ezio3.ti
+
+# Must not depend on anything. This is to be called by
+# binary-arch/binary-indep
+# in another 'make' thread.
+# The set of packages acted on is selected by DH_OPTIONS, which the callers
+# below pass on the sub-make command line (-i or -s).
+binary-common:
+ dh_testdir
+ dh_testroot
+ dh_installchangelogs
+ dh_installdocs
+ dh_installexamples
+ dh_installdebconf
+ dh_installlogrotate
+ dh_installinit
+ dh_installcron
+ dh_installman
+ dh_link
+ dh_strip --dbg-package=openvswitch-dbg
+ dh_compress
+ dh_fixperms -X var/log/core
+ dh_perl
+ dh_makeshlibs
+ dh_installdeb
+ dh_shlibdeps
+ dh_gencontrol
+ dh_md5sums
+ dh_builddeb
+# Build the architecture-independent packages: stage them, then run
+# binary-common in a sub-make with DH_OPTIONS=-i.
+binary-indep: install-indep
+ $(MAKE) -f debian/rules DH_OPTIONS=-i binary-common
+# Build the architecture-dependent packages: stage them, then run
+# binary-common in a sub-make with DH_OPTIONS=-s.
+binary-arch: install-arch
+ $(MAKE) -f debian/rules DH_OPTIONS=-s binary-common
+
+# Build every binary package.
+binary: binary-arch binary-indep
+# All of these targets name actions, not files they create.  Declaring each of
+# them phony keeps a stray file or directory of the same name from making the
+# target look up to date.  The original list omitted build-arch, build-indep,
+# binary-common, binary-modules, kdist_clean and kdist_config.
+.PHONY: build build-arch build-indep clean configure \
+        install install-indep install-arch \
+        binary binary-indep binary-arch binary-common \
+        binary-modules kdist_clean kdist_config
diff --git a/extras/ezio/automake.mk b/extras/ezio/automake.mk
new file mode 100644
index 000000000..2aeaa6440
--- /dev/null
+++ b/extras/ezio/automake.mk
@@ -0,0 +1,49 @@
+# Copyright (C) 2008, 2009 Nicira Networks, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved. This file is offered as-is,
+# without warranty of any kind.
+
+# Ship the ezio3 terminfo source in the distribution tarball.
+EXTRA_DIST += extras/ezio/ezio3.ti
+# Compiles the ezio3 terminfo entry with "tic -x" at install time.  If tic
+# fails, prints a hint about installing it and fails the install (exit 1).
+# NOTE(review): the recipe passes neither -o nor TERMINFO to tic and does not
+# reference DESTDIR, so the entry goes wherever tic writes by default --
+# confirm this is intended for staged installs.
+install-data-hook:
+ @echo tic -x $(srcdir)/extras/ezio/ezio3.ti
+ @if ! tic -x $(srcdir)/extras/ezio/ezio3.ti; then \
+ echo "-----------------------------------------------------------"; \
+ echo "Failed to install ezio3 terminfo file. The ezio-term"; \
+ echo "program will not work until it has been installed."; \
+ echo "Probably, you need to install the 'tic' program from"; \
+ echo "ncurses, e.g. using a command like:"; \
+ echo " apt-get install ncurses-bin"; \
+ echo "and then re-run \"make install\""; \
+ echo "-----------------------------------------------------------"; \
+ exit 1; \
+ fi
+
+# ezio-term program.  The virtual-terminal backend is chosen at configure
+# time: vt-linux.c when HAVE_LINUX_VT_H is true, vt-dummy.c otherwise.
+bin_PROGRAMS += extras/ezio/ezio-term
+extras_ezio_ezio_term_SOURCES = \
+ extras/ezio/byteq.c \
+ extras/ezio/byteq.h \
+ extras/ezio/ezio-term.c \
+ extras/ezio/ezio.c \
+ extras/ezio/ezio.h \
+ extras/ezio/terminal.c \
+ extras/ezio/terminal.h \
+ extras/ezio/tty.c \
+ extras/ezio/tty.h \
+ extras/ezio/vt.h
+if HAVE_LINUX_VT_H
+extras_ezio_ezio_term_SOURCES += extras/ezio/vt-linux.c
+else
+extras_ezio_ezio_term_SOURCES += extras/ezio/vt-dummy.c
+endif
+extras_ezio_ezio_term_LDADD = lib/libopenvswitch.a $(NCURSES_LIBS)
+
+# ovs-switchui program: a single source file linked against the Open vSwitch
+# library plus ncurses, PCRE, SSL and the math library.
+bin_PROGRAMS += extras/ezio/ovs-switchui
+extras_ezio_ovs_switchui_SOURCES = extras/ezio/ovs-switchui.c
+extras_ezio_ovs_switchui_LDADD = \
+ lib/libopenvswitch.a \
+ $(NCURSES_LIBS) \
+ $(PCRE_LIBS) \
+ $(SSL_LIBS) \
+ -lm
diff --git a/extras/ezio/byteq.c b/extras/ezio/byteq.c
new file mode 100644
index 000000000..31d48aad2
--- /dev/null
+++ b/extras/ezio/byteq.c
@@ -0,0 +1,216 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+#include "extras/ezio/byteq.h"
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include "util.h"
+
+/* The queue size must be a power of 2.  The code below reduces 'head' and
+ * 'tail' to buffer offsets by masking with (BYTEQ_SIZE - 1), which is only
+ * a valid modulo operation for a power of 2. */
+BUILD_ASSERT_DECL(!(BYTEQ_SIZE & (BYTEQ_SIZE - 1)));
+
+/* Internal helpers.  The head*() functions deal with the end of the queue
+ * where data is added, the tail*() functions with the end where data is
+ * removed. */
+static uint8_t *head(struct byteq *);
+static int headroom(const struct byteq *);
+static void advance_head(struct byteq *, unsigned int n);
+static int tailroom(const struct byteq *);
+static const uint8_t *tail(const struct byteq *);
+static void advance_tail(struct byteq *, unsigned int n);
+
+/* Resets 'q' to the empty state by zeroing both of its counters.  The
+ * contents of the buffer are left untouched. */
+void
+byteq_init(struct byteq *q)
+{
+    q->tail = 0;
+    q->head = 0;
+}
+
+/* Returns the number of bytes currently queued in 'q', computed as the
+ * distance from 'tail' to 'head'. */
+int
+byteq_used(const struct byteq *q)
+{
+    unsigned int n_used = q->head - q->tail;
+
+    return n_used;
+}
+
+/* Returns the number of bytes of free space in 'q', that is, how many bytes
+ * can still be added before it overflows. */
+int
+byteq_avail(const struct byteq *q)
+{
+    int n_used = byteq_used(q);
+
+    return BYTEQ_SIZE - n_used;
+}
+
+/* Returns true if 'q' holds no bytes at all, false if it holds at least
+ * one. */
+bool
+byteq_is_empty(const struct byteq *q)
+{
+    return byteq_used(q) == 0;
+}
+
+/* Returns true if 'q' cannot accept even one more byte, false if there is
+ * room for at least one. */
+bool
+byteq_is_full(const struct byteq *q)
+{
+    return byteq_avail(q) == 0;
+}
+
+/* Appends the single byte 'c' at the head of 'q'.  The caller must ensure
+ * that 'q' is not full. */
+void
+byteq_put(struct byteq *q, uint8_t c)
+{
+    uint8_t *slot;
+
+    assert(!byteq_is_full(q));
+    slot = head(q);
+    *slot = c;
+    q->head += 1;
+}
+
+/* Appends the 'n' bytes starting at 'p_' to the head of 'q'.  The caller must
+ * ensure that 'q' has at least 'n' bytes of free space.
+ *
+ * The copy is done in pieces because the free space may wrap around the end
+ * of the circular buffer: each piece is limited to the contiguous headroom. */
+void
+byteq_putn(struct byteq *q, const void *p_, size_t n)
+{
+    const uint8_t *src = p_;
+    size_t left;
+
+    assert(byteq_avail(q) >= n);
+    for (left = n; left > 0; ) {
+        size_t room = headroom(q);
+        size_t piece = left < room ? left : room;
+
+        memcpy(head(q), src, piece);
+        advance_head(q, piece);
+        src += piece;
+        left -= piece;
+    }
+}
+
+/* Appends the characters of null-terminated string 's' to the head of 'q',
+ * not including the null terminator.  'q' must have enough free space for
+ * all of them. */
+void
+byteq_put_string(struct byteq *q, const char *s)
+{
+    size_t length = strlen(s);
+
+    byteq_putn(q, s, length);
+}
+
+/* Removes the byte at the tail of 'q' and returns it.  The caller must
+ * ensure that 'q' is not empty. */
+uint8_t
+byteq_get(struct byteq *q)
+{
+    const uint8_t *oldest;
+
+    assert(!byteq_is_empty(q));
+    oldest = tail(q);
+    q->tail++;
+    return *oldest;
+}
+
+/* Writes as much of the contents of 'q' as possible to 'fd', removing from
+ * 'q' whatever was written.  Returns 0 if this drains 'q' completely,
+ * otherwise the positive errno value from the write() that failed (e.g.
+ * EAGAIN if a socket or tty buffer filled up).
+ *
+ * Each write() covers only the contiguous in-use bytes at the tail, so data
+ * that wraps around the end of the buffer takes more than one call. */
+int
+byteq_write(struct byteq *q, int fd)
+{
+    for (;;) {
+        ssize_t n_written;
+
+        if (byteq_is_empty(q)) {
+            return 0;
+        }
+
+        n_written = write(fd, tail(q), tailroom(q));
+        if (n_written <= 0) {
+            assert(n_written < 0);
+            return errno;
+        }
+        advance_tail(q, n_written);
+    }
+}
+
+/* Reads as much as possible from 'fd' into 'q'.  Returns 0 if the read fills
+ * 'q' completely, EOF if end-of-file was reached before 'q' was filled, and
+ * otherwise the positive errno value from the read() that failed (e.g. EAGAIN
+ * if a socket or tty buffer was drained).
+ *
+ * Each read() targets only the contiguous free space at the head, so filling
+ * space that wraps around the end of the buffer takes more than one call. */
+int
+byteq_read(struct byteq *q, int fd)
+{
+    for (;;) {
+        ssize_t n_read;
+
+        if (byteq_is_full(q)) {
+            return 0;
+        }
+
+        n_read = read(fd, head(q), headroom(q));
+        if (n_read == 0) {
+            return EOF;
+        } else if (n_read < 0) {
+            return errno;
+        }
+        advance_head(q, n_read);
+    }
+}
+
+/* Returns the number of in-use bytes that can be read contiguously starting
+ * at the tail of 'q': the smaller of the number of bytes queued and the
+ * distance from the tail to the end of the buffer. */
+static int
+tailroom(const struct byteq *q)
+{
+    int tail_ofs = q->tail & (BYTEQ_SIZE - 1);
+    int until_wrap = BYTEQ_SIZE - tail_ofs;
+    int queued = byteq_used(q);
+
+    return queued < until_wrap ? queued : until_wrap;
+}
+
+/* Returns a pointer to the oldest in-use byte of 'q', which is the position
+ * from which data is removed. */
+static const uint8_t *
+tail(const struct byteq *q)
+{
+    unsigned int ofs = q->tail & (BYTEQ_SIZE - 1);
+
+    return q->buffer + ofs;
+}
+
+/* Discards 'n' bytes from the tail of 'q'.  'n' must not exceed the current
+ * tailroom of 'q'. */
+static void
+advance_tail(struct byteq *q, unsigned int n)
+{
+    assert(tailroom(q) >= n);
+    q->tail = q->tail + n;
+}
+
+/* Returns a pointer to the byte just past the newest in-use byte of 'q',
+ * which is the position at which new data is added. */
+static uint8_t *
+head(struct byteq *q)
+{
+    unsigned int ofs = q->head & (BYTEQ_SIZE - 1);
+
+    return q->buffer + ofs;
+}
+
+/* Returns the number of free bytes that can be written contiguously starting
+ * at the head of 'q': the smaller of the total free space and the distance
+ * from the head to the end of the buffer. */
+static int
+headroom(const struct byteq *q)
+{
+    int head_ofs = q->head & (BYTEQ_SIZE - 1);
+    int until_wrap = BYTEQ_SIZE - head_ofs;
+    int unused = byteq_avail(q);
+
+    return unused < until_wrap ? unused : until_wrap;
+}
+
+/* Marks as in-use the 'n' bytes that follow the newest in-use byte of 'q'.
+ * The caller must already have stored data there, and 'n' must not exceed
+ * the current headroom of 'q'. */
+static void
+advance_head(struct byteq *q, unsigned int n)
+{
+    assert(headroom(q) >= n);
+    q->head = q->head + n;
+}
diff --git a/extras/ezio/byteq.h b/extras/ezio/byteq.h
new file mode 100644
index 000000000..4397f6aaa
--- /dev/null
+++ b/extras/ezio/byteq.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#ifndef BYTEQ_H
+#define BYTEQ_H 1
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Maximum number of bytes in a byteq.  byteq.c asserts at build time that
+ * this is a power of 2. */
+#define BYTEQ_SIZE 512
+
+/* General-purpose circular queue of bytes.
+ *
+ * 'head' and 'tail' are running counters, not buffer offsets: byteq.c
+ * increments them as bytes are added and removed and masks them with
+ * (BYTEQ_SIZE - 1) to index 'buffer'.  The number of queued bytes is
+ * 'head - tail'. */
+struct byteq {
+ uint8_t buffer[BYTEQ_SIZE]; /* Circular queue. */
+ unsigned int head; /* Head of queue. */
+ unsigned int tail; /* Chases the head. */
+};
+
+/* Initialization and size queries. */
+void byteq_init(struct byteq *);
+int byteq_used(const struct byteq *);
+int byteq_avail(const struct byteq *);
+bool byteq_is_empty(const struct byteq *);
+bool byteq_is_full(const struct byteq *);
+
+/* Adding data at the head.  The caller must ensure there is enough room. */
+void byteq_put(struct byteq *, uint8_t c);
+void byteq_putn(struct byteq *, const void *, size_t n);
+void byteq_put_string(struct byteq *, const char *);
+
+/* Removing one byte from the tail.  The queue must not be empty. */
+uint8_t byteq_get(struct byteq *);
+
+/* Transferring data between the queue and a file descriptor.  Both return 0
+ * or a positive errno value; byteq_read() may also return EOF. */
+int byteq_write(struct byteq *, int fd);
+int byteq_read(struct byteq *, int fd);
+
+#endif /* byteq.h */
diff --git a/extras/ezio/ezio-term.c b/extras/ezio/ezio-term.c
new file mode 100644
index 000000000..c2177addf
--- /dev/null
+++ b/extras/ezio/ezio-term.c
@@ -0,0 +1,1060 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+
+#include <config.h>
+#include <assert.h>
+#include <curses.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <string.h>
+#include <stdlib.h>
+#include <term.h>
+#include <unistd.h>
+#include "command-line.h"
+#include "extras/ezio/byteq.h"
+#include "extras/ezio/tty.h"
+#include "extras/ezio/vt.h"
+#include "daemon.h"
+#include "ezio.h"
+#include "poll-loop.h"
+#include "socket-util.h"
+#include "terminal.h"
+#include "timeval.h"
+#include "util.h"
+
+#define THIS_MODULE VLM_ezio_term
+#include "vlog.h"
+
+/* EZIO button status.
+ *
+ * These are bit flags, so a value may have several of them set when more
+ * than one button is involved.  handle_buttons() maps the individual flags
+ * and their combinations to keystrokes and scrolling actions. */
+enum btn_status {
+ BTN_UP = 1 << 0,
+ BTN_DOWN = 1 << 1,
+ BTN_ENTER = 1 << 2,
+ BTN_ESC = 1 << 3
+};
+
+/* -e, --ezio: EZIO3 serial device file. */
+static char *ezio_dev = "/dev/ttyS1";
+
+/* -i, --input: Terminal from which to accept additional keyboard input. */
+static char *input_dev = NULL;
+
+/* Optional extra input device, used by main() only when 'input_dev' is
+ * set. */
+struct inputdev;
+static int inputdev_open(const char *name, struct inputdev **);
+static void inputdev_close(struct inputdev *);
+static int inputdev_run(struct inputdev *, struct byteq *);
+static void inputdev_update(struct inputdev *, const struct ezio *);
+static void inputdev_wait(struct inputdev *);
+
+/* Scanner: scrolls the display left and right through text. */
+static struct scanner *scanner_create(void);
+static void scanner_destroy(struct scanner *);
+static void scanner_run(struct scanner *, struct ezio *);
+static void scanner_wait(struct scanner *);
+static void scanner_left(struct scanner *, struct ezio *);
+static void scanner_right(struct scanner *, struct ezio *);
+
+/* Updater: brings the EZIO device in line with the desired display state
+ * and collects button presses.  Note that updater_get_buttons() and
+ * updater_has_buttons() are declared without 'static', unlike the rest. */
+static struct updater *updater_create(void);
+static void updater_destroy(struct updater *);
+static int updater_run(struct updater *, const struct ezio *shadow,
+ int ezio_fd);
+static void updater_wait(struct updater *, int ezio_fd);
+enum btn_status updater_get_buttons(struct updater *);
+bool updater_has_buttons(const struct updater *);
+
+static void handle_buttons(struct updater *, struct scanner *,
+ struct byteq *, struct ezio *);
+
+static void usage(void) NO_RETURN;
+static void parse_options(int argc, char *argv[]);
+
+/* Program entry point.
+ *
+ * Checks that the "ezio3" terminfo entry is available, locks and opens the
+ * EZIO serial device, optionally opens an extra input device, opens a pty
+ * master and starts a child process on it.  The child is the command given
+ * by the non-option arguments or, if there are none, $SHELL with a fallback
+ * of /bin/sh.  Then runs the main loop until reading from the pty or
+ * updating the EZIO fails.
+ *
+ * Setup failures are reported through ovs_fatal().  Returns 0 when the main
+ * loop ends. */
+int
+main(int argc, char *argv[])
+{
+ struct terminal *terminal;
+ struct updater *updater;
+ struct scanner *scanner;
+ struct inputdev *inputdev;
+ struct byteq inputq;
+ struct ezio ezio;
+ int ezio_fd, pty_fd, dummy_fd;
+ int retval;
+ int i;
+
+ set_program_name(argv[0]);
+ time_init();
+ vlog_init();
+ parse_options(argc, argv);
+ signal(SIGPIPE, SIG_IGN);
+
+ /* Skip past the options, leaving only the child command line (if any). */
+ argc -= optind;
+ argv += optind;
+
+ /* Make sure that the ezio3 terminfo entry is available. */
+ dummy_fd = open("/dev/null", O_RDWR);
+ if (dummy_fd >= 0) {
+ if (setupterm("ezio3", dummy_fd, &retval) == ERR) {
+ if (retval == 0) {
+ ovs_fatal(0, "Missing terminfo entry for ezio3. "
+ "Did you run \"make install\"?");
+ } else {
+ ovs_fatal(0, "Missing terminfo database. Is ncurses "
+ "properly installed?");
+ }
+ }
+ del_curterm(cur_term);
+ close(dummy_fd);
+ } else {
+ /* Not fatal: the check is simply skipped. */
+ ovs_error(errno, "failed to open /dev/null");
+ }
+
+ /* Lock serial port. */
+ retval = tty_lock(ezio_dev);
+ if (retval) {
+ ovs_fatal(retval, "%s: lock failed", ezio_dev);
+ }
+
+ /* Open EZIO and configure as 2400 bps, N-8-1, in raw mode. */
+ ezio_fd = open(ezio_dev, O_RDWR | O_NOCTTY);
+ if (ezio_fd < 0) {
+ ovs_fatal(errno, "%s: open", ezio_dev);
+ }
+ retval = tty_set_raw_mode(ezio_fd, B2400);
+ if (retval) {
+ ovs_fatal(retval, "%s: failed to configure tty parameters", ezio_dev);
+ }
+
+ /* Open keyboard device for input. */
+ if (input_dev) {
+ retval = inputdev_open(input_dev, &inputdev);
+ if (retval) {
+ ovs_fatal(retval, "%s: failed to open input device", input_dev);
+ }
+ } else {
+ inputdev = NULL;
+ }
+
+ /* Open pty master. */
+ pty_fd = tty_open_master_pty();
+ if (pty_fd < 0) {
+ ovs_fatal(-pty_fd, "failed to open master pty");
+ }
+ /* NOTE(review): presumably 2 rows by 40 columns -- confirm the argument
+ * order of tty_set_window_size(). */
+ tty_set_window_size(pty_fd, 2, 40);
+
+ /* Start child process. */
+ if (argc < 1) {
+ char *child_argv[2];
+
+ child_argv[0] = getenv("SHELL");
+ if (!child_argv[0]) {
+ child_argv[0] = "/bin/sh";
+ }
+ child_argv[1] = NULL;
+ retval = tty_fork_child(pty_fd, child_argv);
+ } else {
+ retval = tty_fork_child(pty_fd, argv);
+ }
+ if (retval) {
+ ovs_fatal(retval, "failed to fork child process");
+ }
+
+ die_if_already_running();
+ daemonize();
+
+ terminal = terminal_create();
+ updater = updater_create();
+ scanner = scanner_create();
+ ezio_init(&ezio);
+ for (i = 0; i < 8; i++) {
+ ezio_set_default_icon(&ezio, i);
+ }
+ byteq_init(&inputq);
+ for (;;) {
+ /* Get button presses and keyboard input into inputq, then push the
+ * inputq to the pty. */
+ handle_buttons(updater, scanner, &inputq, &ezio);
+ if (inputdev) {
+ retval = inputdev_run(inputdev, &inputq);
+ if (retval) {
+ /* Give up on the input device but keep running. */
+ VLOG_ERR("error reading from input device: %s",
+ strerror(retval));
+ inputdev_close(inputdev);
+ inputdev = NULL;
+ }
+ }
+ retval = byteq_write(&inputq, pty_fd);
+ if (retval && retval != EAGAIN) {
+ /* Logged only; the loop continues. */
+ VLOG_ERR("error passing through input: %s",
+ retval == EOF ? "end of file" : strerror(retval));
+ }
+
+ /* Process data from pty in terminal emulator. */
+ retval = terminal_run(terminal, &ezio, pty_fd);
+ if (retval) {
+ VLOG_ERR("error reading from terminal: %s",
+ retval == EOF ? "end of file" : strerror(retval));
+ break;
+ }
+
+ /* Scroll left and right through text. */
+ scanner_run(scanner, &ezio);
+
+ /* Update the display to match what should be shown. */
+ retval = updater_run(updater, &ezio, ezio_fd);
+ if (retval) {
+ VLOG_ERR("error writing to ezio: %s",
+ retval == EOF ? "end of file" : strerror(retval));
+ break;
+ }
+ if (inputdev) {
+ inputdev_update(inputdev, &ezio);
+ }
+
+ /* Wait for something to happen. */
+ terminal_wait(terminal, pty_fd);
+ scanner_wait(scanner);
+ if (updater_has_buttons(updater)) {
+ /* Queued button presses still need handling: don't sleep. */
+ poll_immediate_wake();
+ }
+ updater_wait(updater, ezio_fd);
+ if (!byteq_is_empty(&inputq)) {
+ /* Input left over from a partial write: wake when writable. */
+ poll_fd_wait(pty_fd, POLLOUT);
+ }
+ if (inputdev) {
+ inputdev_wait(inputdev);
+ }
+ poll_block();
+ }
+ terminal_destroy(terminal);
+ updater_destroy(updater);
+ scanner_destroy(scanner);
+
+ return 0;
+}
+
+/* Queues the characters of 's' on 'q' as simulated keyboard input.  If 'q'
+ * does not have room for all of 's', nothing at all is queued. */
+static void
+send_keys(struct byteq *q, const char *s)
+{
+    size_t length = strlen(s);
+
+    if (byteq_avail(q) < length) {
+        return;
+    }
+    byteq_putn(q, s, length);
+}
+
+/* Drains the queue of button presses collected by 'up' and acts on each one.
+ *
+ * Two combinations scroll the display through scanner 's' and 'ezio'.  The
+ * other recognized combinations are translated to keystrokes queued on 'q'.
+ * Unrecognized combinations are dropped. */
+static void
+handle_buttons(struct updater *up, struct scanner *s,
+               struct byteq *q, struct ezio *ezio)
+{
+    while (updater_has_buttons(up)) {
+        int btns = updater_get_buttons(up);
+        const char *keys = NULL;
+
+        if (btns == (BTN_UP | BTN_DOWN)) {
+            scanner_left(s, ezio);
+        } else if (btns == (BTN_ESC | BTN_ENTER)) {
+            scanner_right(s, ezio);
+        } else if (btns == BTN_UP) {
+            keys = "\x1b\x5b\x41";      /* Up arrow. */
+        } else if (btns == (BTN_UP | BTN_ESC)) {
+            keys = "\x1b[5~";           /* Page up. */
+        } else if (btns == BTN_DOWN) {
+            keys = "\x1b\x5b\x42";      /* Down arrow. */
+        } else if (btns == (BTN_DOWN | BTN_ESC)) {
+            keys = "\x1b[6~";           /* Page down. */
+        } else if (btns == BTN_ENTER) {
+            keys = "\r";
+        } else if (btns == BTN_ESC) {
+            keys = "\x7f";
+        } else if (btns == (BTN_UP | BTN_DOWN | BTN_ENTER | BTN_ESC)) {
+            keys = "\x04";              /* End of file. */
+        } else if (btns == (BTN_UP | BTN_ENTER | BTN_ESC)) {
+            keys = "y";
+        } else if (btns == (BTN_DOWN | BTN_ENTER | BTN_ESC)) {
+            keys = "n";
+        }
+
+        if (keys) {
+            send_keys(q, keys);
+        }
+    }
+}
+
+/* EZIO screen updater. */
+
+/* EZIO command codes.  Each command is sent as the EZIO_CMD prefix byte
+ * followed by one of the codes below (see send_command()). */
+#define EZIO_CMD 0xfe /* Command prefix byte. */
+#define EZIO_CLEAR 0x01 /* Clear screen. */
+#define EZIO_HOME 0x02 /* Move to (0, 0). */
+#define EZIO_READ 0x06 /* Poll keyboard. */
+
+#define EZIO_ENTRY_MODE 0x04 /* Set entry mode: */
+#define EZIO_LTOR_MODE 0x02 /* ...left-to-right (vs. r-to-l). */
+#define EZIO_SHIFT_MODE 0x01 /* ...scroll with output (vs. don't). */
+
+#define EZIO_DISPLAY_MODE 0x08 /* Set display mode: */
+#define EZIO_ENABLE_DISPLAY 0x04 /* ...turn on display (vs. blank). */
+#define EZIO_SHOW_CURSOR 0x02 /* ...show cursor (vs. hide). */
+#define EZIO_BLOCK_CURSOR 0x01 /* ...block cursor (vs. underline). */
+
+#define EZIO_INIT 0x28 /* Initialize EZIO. */
+
+#define EZIO_MOVE_CURSOR 0x80 /* Set cursor position. */
+#define EZIO_COL_SHIFT 0 /* Shift count for column (0-based). */
+#define EZIO_ROW_SHIFT 6 /* Shift count for row (0-based). */
+
+#define EZIO_DEFINE_ICON 0x40 /* Define icon. */
+#define EZIO_ICON_SHIFT 3 /* Shift count for icon number (0-7). */
+
+#define EZIO_SCROLL_LEFT 0x18 /* Scroll display left 1 position. */
+#define EZIO_SCROLL_RIGHT 0x1c /* Scroll display right 1 position. */
+#define EZIO_CURSOR_LEFT 0x10 /* Move cursor left 1 position. */
+#define EZIO_CURSOR_RIGHT 0x14 /* Move cursor right 1 position. */
+
+/* Rate limiting: the EZIO runs at 2400 bps, which is 240 bytes per second.
+ * Kernel tty buffers, on the other hand, tend to be at least 4 kB. That
+ * means that, if we keep the kernel buffer filled, then the queued data will
+ * be 4,096 bytes / 240 bytes/s ~= 17 seconds ahead of what is actually
+ * displayed. This is not a happy situation. So we rate-limit with a token
+ * bucket.
+ *
+ * The parameters below work out as: (6 tokens/ms * 1000 ms) / (25
+ * tokens/byte) = 240 bytes/s. */
+#define UP_TOKENS_PER_MS 6 /* Tokens acquired per millisecond. */
+#define UP_BUCKET_SIZE (6 * 100) /* Capacity of the token bucket. */
+#define UP_TOKENS_PER_BYTE 25 /* Tokens required to output a byte. */
+
+/* State for keeping the physical EZIO device in sync with the desired
+ * display contents and for turning its button status reports into a queue
+ * of button presses. */
+struct updater {
+ /* Current state of EZIO device. */
+ struct ezio visible;
+
+ /* Output state. */
+ struct byteq obuf; /* Output being sent to serial port. */
+ int tokens; /* Token bucket content. */
+ long long int last_fill; /* Last time we increased 'tokens'.*/
+ bool up_to_date; /* Does visible state match shadow state? */
+
+ /* Input state. */
+ struct byteq ibuf; /* Queued button pushes. */
+ long long int last_poll; /* Last time we sent a button poll request. */
+ enum btn_status last_status; /* Last received button status. */
+ long long int last_change; /* Time when status most recently changed. */
+ int repeat_count; /* Autorepeat count. */
+ bool releasing; /* Waiting for button release? */
+};
+
+/* Button input helper. */
+static void recv_button_state(struct updater *, enum btn_status status);
+
+/* Output helpers. */
+static int range(int value, int min, int max);
+static void send_command(struct updater *, uint8_t command);
+static void set_cursor_position(struct updater *, int x, int y);
+static void update_char(struct updater *, const struct ezio *, int x, int y);
+static void update_cursor_status(struct updater *, const struct ezio *);
+
+/* Comparison helper. */
+static bool icons_differ(const struct ezio *, const struct ezio *, int *idx);
+
+/* Creates and returns a new updater.
+ *
+ * The new updater starts with a full token bucket, empty input and output
+ * queues apart from the initialization commands queued below, and no button
+ * activity recorded.  The caller owns the result and must eventually pass it
+ * to updater_destroy(). */
+static struct updater *
+updater_create(void)
+{
+    struct updater *up = xmalloc(sizeof *up);
+
+    ezio_init(&up->visible);
+
+    /* Output state. */
+    byteq_init(&up->obuf);
+    up->tokens = UP_BUCKET_SIZE;
+    up->last_fill = time_msec();
+    /* Not yet synchronized with any shadow state.  Set explicitly because
+     * 'up' is not zero-initialized and updater_wait() reads this member. */
+    up->up_to_date = false;
+
+    /* Input state. */
+    byteq_init(&up->ibuf);
+    up->last_poll = LLONG_MIN;  /* Makes the first poll due immediately. */
+    up->last_status = 0;
+    up->last_change = time_msec();
+    /* recv_button_state() reads 'repeat_count' as soon as a button is seen
+     * held, and only resets it once all buttons are released, so it must
+     * start out as zero rather than as whatever the allocation contained. */
+    up->repeat_count = 0;
+    up->releasing = false;
+
+    /* Queue the commands that put the device into a known state.
+     * NOTE(review): EZIO_INIT is sent twice, as before; presumably this makes
+     * sure the device sees it -- confirm against the device documentation. */
+    send_command(up, EZIO_INIT);
+    send_command(up, EZIO_INIT);
+    send_command(up, EZIO_CLEAR);
+    send_command(up, EZIO_HOME);
+    return up;
+}
+
+/* Destroys updater 'up', freeing its memory.  Output still queued in 'up' is
+ * discarded, not sent. */
+static void
+updater_destroy(struct updater *up)
+{
+ free(up);
+}
+
+/* Sends EZIO commands over file descriptor 'ezio_fd' to the EZIO represented
+ * by updater 'up', to make the EZIO display the contents of 'shadow'.
+ * Rate-limiting can cause the update to be only partial, but the next call to
+ * updater_run() will resume the update.
+ *
+ * Before sending anything, reads whatever input is available on 'ezio_fd'
+ * and passes button status bytes to recv_button_state().
+ *
+ * Returns 0 if successful, otherwise a positive errno value. */
+static int
+updater_run(struct updater *up, const struct ezio *shadow, int ezio_fd)
+{
+ uint8_t c;
+ /* Bytes whose high nibble is 0xb carry the button status in the low nibble.
+ * The nibble is inverted before use.
+ * NOTE(review): presumably the device reports a pressed button as a 0
+ * bit -- confirm against the device documentation. */
+ while (read(ezio_fd, &c, 1) > 0) {
+ if ((c & 0xf0) == 0xb0) {
+ recv_button_state(up, ~c & 0x0f);
+ }
+ }
+
+ up->up_to_date = false;
+ /* Each iteration flushes 'obuf', then queues at most one further update
+ * step, chosen by the priority order of the if-chain below. */
+ for (;;) {
+ struct ezio *visible = &up->visible;
+ int idx, x, y;
+ int retval;
+
+ /* Flush the buffer out to the EZIO device. */
+ retval = byteq_write(&up->obuf, ezio_fd);
+ if (retval == EAGAIN) {
+ return 0;
+ } else if (retval) {
+ VLOG_WARN("error writing ezio: %s", strerror(retval));
+ return retval;
+ }
+
+ /* Make sure we have some tokens before we write anything more. */
+ if (up->tokens <= 0) {
+ long long int now = time_msec();
+ if (now > up->last_fill) {
+ up->tokens += (now - up->last_fill) * UP_TOKENS_PER_MS;
+ up->last_fill = now;
+ if (up->tokens > UP_BUCKET_SIZE) {
+ up->tokens = UP_BUCKET_SIZE;
+ }
+ }
+ if (up->tokens <= 0) {
+ /* Still out of tokens. */
+ return 0;
+ }
+ }
+
+ /* Consider what else we might want to send. */
+ if (time_msec() >= up->last_poll + 100) {
+ /* Send a button-read command. */
+ send_command(up, EZIO_READ);
+ up->last_poll = time_msec();
+ } else if (visible->show_cursor && !shadow->show_cursor) {
+ /* Turn off the cursor. */
+ update_cursor_status(up, shadow);
+ } else if (icons_differ(shadow, visible, &idx)) {
+ /* Update the icons. */
+ send_command(up, EZIO_DEFINE_ICON + (idx << EZIO_ICON_SHIFT));
+ byteq_putn(&up->obuf, &shadow->icons[idx][0], 8);
+ set_cursor_position(up, shadow->x, shadow->y);
+ memcpy(visible->icons[idx], shadow->icons[idx], 8);
+ } else if (visible->x_ofs != shadow->x_ofs) {
+ /* Scroll to the correct horizontal position. */
+ if (visible->x_ofs < shadow->x_ofs) {
+ send_command(up, EZIO_SCROLL_LEFT);
+ visible->x_ofs++;
+ } else {
+ send_command(up, EZIO_SCROLL_RIGHT);
+ visible->x_ofs--;
+ }
+ } else if (ezio_chars_differ(shadow, visible, shadow->x_ofs,
+ shadow->x_ofs + 16, &x, &y)) {
+ /* Update the visible region. */
+ update_char(up, shadow, x, y);
+ } else if (ezio_chars_differ(shadow, visible, 0, 40, &x, &y)) {
+ /* Update the off-screen region. */
+ update_char(up, shadow, x, y);
+ } else if ((visible->x != shadow->x || visible->y != shadow->y)
+ && shadow->show_cursor) {
+ /* Update the cursor position. (This has to follow updating the
+ * display content, because updating display content changes the
+ * cursor position.) */
+ set_cursor_position(up, shadow->x, shadow->y);
+ } else if (visible->show_cursor != shadow->show_cursor
+ || visible->blink_cursor != shadow->blink_cursor) {
+ /* Update the cursor type. */
+ update_cursor_status(up, shadow);
+ } else {
+ /* We're fully up-to-date. */
+ up->up_to_date = true;
+ return 0;
+ }
+ /* Charge tokens for what was just queued.  'obuf' was completely drained
+ * by the flush at the top of this iteration, so everything in it now was
+ * added by the step above.  'tokens' may go negative here. */
+ up->tokens -= UP_TOKENS_PER_BYTE * byteq_used(&up->obuf);
+ }
+}
+
+/* Registers with the poll loop the events that should wake up poll_block()
+ * so that updater_run() can make progress on 'up': 'ezio_fd' becoming
+ * writable when output is queued, a timer when the token bucket is empty, or
+ * an immediate wakeup when the display is known to be out of date.
+ *
+ * A second timer is always registered as well; it is longer when no button
+ * is down and the button status has been unchanged for a while. */
+static void
+updater_wait(struct updater *up, int ezio_fd)
+{
+    bool buttons_idle;
+
+    if (!byteq_is_empty(&up->obuf)) {
+        poll_fd_wait(ezio_fd, POLLOUT);
+    } else if (up->tokens <= 0) {
+        poll_timer_wait((-up->tokens / UP_TOKENS_PER_MS) + 1);
+    } else if (!up->up_to_date) {
+        poll_immediate_wake();
+    }
+
+    /* No button presses in a while?  Then sleep longer. */
+    buttons_idle = !up->last_status && time_msec() - up->last_change > 100;
+    poll_timer_wait(buttons_idle ? 100 : 50);
+}
+
+/* Removes the oldest entry from the queue of pushed buttons in 'up' and
+ * returns it.  The return value has one or more BTN_* flags set.  The caller
+ * must first check that updater_has_buttons() returns true. */
+enum btn_status
+updater_get_buttons(struct updater *up)
+{
+    uint8_t btns = byteq_get(&up->ibuf);
+
+    return btns;
+}
+
+/* Returns true if at least one button push is queued in 'up', false if the
+ * queue is empty. */
+bool
+updater_has_buttons(const struct updater *up)
+{
+    bool none_queued = byteq_is_empty(&up->ibuf);
+
+    return !none_queued;
+}
+
+/* Appends 'btns' to the queue of pushed buttons in 'up'.  If the queue is
+ * already full, 'btns' is silently dropped. */
+static void
+buttons_pushed(struct updater *up, enum btn_status btns)
+{
+    if (byteq_is_full(&up->ibuf)) {
+        return;
+    }
+    byteq_put(&up->ibuf, btns);
+}
+
+/* Updates the buttons-pushed queue based on the current button 'status'.
+ *
+ * 'status' is the set of BTN_* flags reported for this poll; it is compared
+ * against 'up->last_status' from the previous poll.  A push is queued when
+ * buttons are released after a short press, once when they have been held
+ * steadily for 150 ms, and then repeatedly after they have been held for one
+ * second. */
+static void
+recv_button_state(struct updater *up, enum btn_status status)
+{
+ /* Calculate milliseconds since button status last changed. */
+ long long int stable_msec;
+ if (status != up->last_status) {
+ up->last_change = time_msec();
+ stable_msec = 0;
+ } else {
+ stable_msec = time_msec() - up->last_change;
+ }
+
+ if (up->releasing) {
+ /* Ignoring input until every button has been let go. */
+ if (!status) {
+ up->releasing = false;
+ }
+ } else if (up->last_status) {
+ if (!(status & up->last_status)) {
+ /* Button(s) were pushed and released. */
+ if (!up->repeat_count) {
+ /* Not yet reported by the hold logic below, so report it now. */
+ buttons_pushed(up, up->last_status);
+ }
+ } else if (stable_msec >= 150 && !up->repeat_count) {
+ /* Buttons have been stable for a while, so push them once. */
+ buttons_pushed(up, status);
+ up->repeat_count++;
+ } else if (stable_msec >= 1000) {
+ /* Autorepeat 10/second after 1 second hold time. */
+ int n = (stable_msec - 1000) / 100 + 1;
+ while (up->repeat_count < n) {
+ buttons_pushed(up, status);
+ up->repeat_count++;
+ }
+ } else if ((status & up->last_status) == up->last_status) {
+ /* More buttons pushed than at last poll. */
+ } else {
+ /* Some, but not all, buttons were released. Ignore the buttons
+ * until all are released. */
+ up->releasing = true;
+ }
+ }
+ if (!status) {
+ /* Nothing is down any longer: the next press starts a fresh count. */
+ up->repeat_count = 0;
+ }
+ up->last_status = status;
+}
+
+/* Returns 'value' clamped to the closed interval ['min', 'max']. */
+static int
+range(int value, int min, int max)
+{
+    if (value < min) {
+        return min;
+    }
+    if (value > max) {
+        return max;
+    }
+    return value;
+}
+
+/* Queues a command for the EZIO in 'up''s output buffer: the EZIO_CMD byte
+ * followed by the single byte 'command'. Nothing is written to the device
+ * here. */
+static void
+send_command(struct updater *up, uint8_t command)
+{
+ byteq_put(&up->obuf, EZIO_CMD);
+ byteq_put(&up->obuf, command);
+}
+
+/* Queues a command that moves the cursor to 0-based position (x, y) and
+ * records the new position in 'up->visible'.
+ *
+ * The coordinates sent to the device are clamped to columns 0...39 and rows
+ * 0...1; 'up->visible' receives 'x' and 'y' exactly as passed in. */
+static void
+set_cursor_position(struct updater *up, int x, int y)
+{
+    int col = range(x, 0, 39);
+    int row = range(y, 0, 1);
+
+    send_command(up, EZIO_MOVE_CURSOR
+                     | (col << EZIO_COL_SHIFT)
+                     | (row << EZIO_ROW_SHIFT));
+
+    up->visible.y = y;
+    up->visible.x = x;
+}
+
+/* Compares the icon bitmaps of 'a' and 'b'. If they all match, returns false
+ * and leaves '*idx' alone. Otherwise returns true and stores in '*idx' the
+ * index of the first icon whose bitmap differs. */
+static bool
+icons_differ(const struct ezio *a, const struct ezio *b, int *idx)
+{
+    int i = 0;
+
+    /* Skip over the leading run of identical icons. */
+    while (i < ARRAY_SIZE(a->icons)
+           && !memcmp(&a->icons[i], &b->icons[i], sizeof a->icons[i])) {
+        i++;
+    }
+
+    if (i < ARRAY_SIZE(a->icons)) {
+        *idx = i;
+        return true;
+    }
+    return false;
+}
+
+/* Makes the character at 0-based position (x,y) match 'shadow' by queueing
+ * the necessary bytes in 'up''s output buffer, and mirrors the change in
+ * 'up->visible'.
+ *
+ * A cursor-move command is queued first unless the cursor is already at
+ * (x,y). Afterwards the recorded cursor column is advanced by one. */
+static void
+update_char(struct updater *up, const struct ezio *shadow, int x, int y)
+{
+    const uint8_t wanted = shadow->chars[y][x];
+
+    if (up->visible.x != x || up->visible.y != y) {
+        set_cursor_position(up, x, y);
+    }
+
+    byteq_put(&up->obuf, wanted);
+    up->visible.chars[y][x] = wanted;
+    up->visible.x++;
+}
+
+/* Queues a display-mode command in 'up''s output buffer so that the EZIO's
+ * cursor appearance matches 'shadow', then copies the cursor flags into
+ * 'up->visible'.
+ *
+ * The display is always left enabled. EZIO_BLOCK_CURSOR is requested only
+ * when the cursor is shown and 'shadow->blink_cursor' is set. */
+static void
+update_cursor_status(struct updater *up, const struct ezio *shadow)
+{
+    uint8_t command = EZIO_DISPLAY_MODE | EZIO_ENABLE_DISPLAY;
+
+    if (shadow->show_cursor) {
+        command |= (shadow->blink_cursor
+                    ? EZIO_SHOW_CURSOR | EZIO_BLOCK_CURSOR
+                    : EZIO_SHOW_CURSOR);
+    }
+    send_command(up, command);
+
+    up->visible.blink_cursor = shadow->blink_cursor;
+    up->visible.show_cursor = shadow->show_cursor;
+}
+
+/* An input device, such as a tty.
+ *
+ * Besides supplying input, a tty input device also receives a text rendering
+ * of the EZIO display contents (see inputdev_update()). */
+
+struct inputdev {
+ /* Input. */
+ int fd; /* File descriptor. */
+
+ /* State for mirroring the EZIO display to the device. */
+ bool is_tty; /* We only attempt to mirror to ttys. */
+ struct byteq outq; /* Output queue. */
+ struct ezio visible; /* Data that we have displayed. */
+};
+
+/* Opens 'name' as an input device. If successful, returns 0 and stores a
+ * pointer to the input device in '*devp'. On failure, returns a positive
+ * errno value and stores NULL in '*devp'.
+ *
+ * 'name' may be "vt", in which case the descriptor comes from vt_open(),
+ * "-", in which case standard input is duplicated, or any other string,
+ * which is passed to open() as a file name. */
+static int
+inputdev_open(const char *name, struct inputdev **devp)
+{
+ struct inputdev *dev;
+ int retval;
+ int fd;
+
+ *devp = NULL;
+ if (!strcmp(name, "vt")) {
+ fd = vt_open(O_RDWR | O_NOCTTY);
+ if (fd < 0) {
+ /* A negative result is treated as a negated errno value. */
+ return -fd;
+ }
+ } else if (!strcmp(name, "-")) {
+ /* Duplicate stdin so that closing the device later does not close
+ * the original descriptor. */
+ fd = dup(STDIN_FILENO);
+ if (fd < 0) {
+ return errno;
+ }
+ } else {
+ fd = open(name, O_RDWR | O_NOCTTY);
+ if (fd < 0) {
+ return errno;
+ }
+ }
+
+ /* NOTE(review): a failure here aborts the open, although 'is_tty' below
+ * suggests that non-tty devices are expected; whether tty_set_raw_mode()
+ * tolerates non-ttys is not visible from here. */
+ retval = tty_set_raw_mode(fd, B0);
+ if (retval) {
+ close(fd);
+ VLOG_WARN("%s: failed to configure tty parameters: %s",
+ name, strerror(retval));
+ return retval;
+ }
+
+ dev = xmalloc(sizeof *dev);
+ dev->fd = fd;
+ dev->is_tty = isatty(fd);
+ byteq_init(&dev->outq);
+ ezio_init(&dev->visible);
+ *devp = dev;
+ return 0;
+}
+
+/* Closes the file descriptor owned by 'dev' and frees 'dev' itself. A null
+ * 'dev' is accepted and ignored. */
+static void
+inputdev_close(struct inputdev *dev)
+{
+    if (!dev) {
+        return;
+    }
+    close(dev->fd);
+    free(dev);
+}
+
+/* Reads whatever input is available on 'dev' into 'q'. Returns 0 on
+ * success, including when the read reports EAGAIN; otherwise returns the
+ * positive errno value from the read. */
+static int
+inputdev_run(struct inputdev *dev, struct byteq *q)
+{
+    int error = byteq_read(q, dev->fd);
+
+    if (error == EAGAIN) {
+        return 0;
+    }
+    return error;
+}
+
+/* Writes as much of 'dev''s output queue as possible to its file descriptor,
+ * which updates the tty screen display.
+ *
+ * EAGAIN is not an error. Any other write error is logged and turns off
+ * further mirroring by clearing 'dev->is_tty'. */
+static void
+flush_inputdev(struct inputdev *dev)
+{
+    int error = byteq_write(&dev->outq, dev->fd);
+
+    if (error == 0 || error == EAGAIN) {
+        return;
+    }
+
+    VLOG_WARN("error writing input device, "
+              "disabling further output");
+    dev->is_tty = false;
+}
+
+/* Updates the tty screen display on 'dev' to match 'e'.
+ *
+ * Does nothing if 'dev' is not a tty, if output from an earlier update is
+ * still waiting to be written, or if the characters, cursor position, scroll
+ * offset and cursor visibility in 'e' all match what was drawn last time.
+ *
+ * Otherwise the whole frame is redrawn: the screen is cleared and the two
+ * 40-column rows are printed inside a box, with '[' and ']' marking the 16
+ * columns that start at 'e->x_ofs'. Bytes 6 and 7 are drawn as '\' and '~',
+ * and other bytes outside 0x20...0x7d are drawn as '?'. */
+static void
+inputdev_update(struct inputdev *dev, const struct ezio *e)
+{
+    struct byteq *q = &dev->outq;
+    int x, y;
+
+    if (!dev->is_tty) {
+        return;
+    }
+
+    /* Finish sending the previous frame before composing a new one. */
+    flush_inputdev(dev);
+    if (!byteq_is_empty(q)) {
+        return;
+    }
+
+    if (!ezio_chars_differ(e, &dev->visible, 0, 40, &x, &y)
+        && e->x == dev->visible.x
+        && e->y == dev->visible.y
+        && e->x_ofs == dev->visible.x_ofs
+        && e->show_cursor == dev->visible.show_cursor) {
+        return;
+    }
+    dev->visible = *e;
+
+    byteq_put_string(q, "\033[H\033[2J"); /* Clear screen. */
+    for (y = 0; y < 4; y++) {
+        /* Rows 0 and 3 are the box's top and bottom borders; rows 1 and 2
+         * hold display rows 0 and 1. */
+        byteq_put(q, "+||+"[y]);
+        for (x = 0; x < 40; x++) {
+            int c;
+            if (x == e->x_ofs) {
+                byteq_put(q, '[');
+            }
+            c = y == 0 || y == 3 ? '-' : e->chars[y - 1][x];
+            if (c == 6) {
+                c = '\\';
+            } else if (c == 7) {
+                c = '~';
+            } else if (c < 0x20 || c > 0x7d) {
+                c = '?';
+            }
+            byteq_put(q, c);
+            if (x == e->x_ofs + 15) {
+                byteq_put(q, ']');
+            }
+        }
+        byteq_put(q, "+||+"[y]);
+        byteq_put(q, '\r');
+        byteq_put(q, '\n');
+    }
+    if (e->show_cursor) {
+        /* Terminal coordinates are 1-based and the box adds one border
+         * column and one border row, hence "+ 2". The column also has to
+         * skip '[' and ']' when they are printed to the left of the
+         * cursor. */
+        int col = range(e->x, 0, 39) + 2
+                  + (e->x >= e->x_ofs) + (e->x > e->x_ofs + 15);
+        int row = range(e->y, 0, 1) + 2;
+        char cup[16];
+
+        snprintf(cup, sizeof cup, "\033[%d;%dH", row, col); /* Position cursor. */
+        byteq_put_string(q, cup);
+    }
+    flush_inputdev(dev);
+}
+
+/* Arranges for poll_block() to wake up when 'dev' has input to read and,
+ * if 'dev' is a tty with unsent mirror output, when it can be written. */
+static void
+inputdev_wait(struct inputdev *dev)
+{
+    bool output_pending = dev->is_tty && !byteq_is_empty(&dev->outq);
+
+    poll_fd_wait(dev->fd, output_pending ? POLLIN | POLLOUT : POLLIN);
+}
+
+/* Scrolls the display left and right automatically to display all the
+ * content. */
+
+/* Direction in which the scanner is currently moving the displayed window;
+ * scanner_run() flips it when the window reaches the end of the content. */
+enum scanner_state {
+ SCANNER_LEFT, /* Moving left. */
+ SCANNER_RIGHT /* Moving right. */
+};
+
+/* State of the automatic scroller. scanner_run() takes one step at most
+ * every 750 ms, measured from 'last_move'. */
+struct scanner {
+ enum scanner_state state; /* Current state. */
+ int wait; /* No. of cycles to pause before continuing. */
+ long long int last_move; /* Last time the state machine ran. */
+};
+
+static void find_min_max(struct ezio *, int *min, int *max);
+
+/* Allocates and returns a new scanner that starts out moving right with no
+ * pause pending. 'last_move' starts at LLONG_MIN, so the first call to
+ * scanner_run() always finds the step interval already elapsed. The caller
+ * should eventually pass the result to scanner_destroy(). */
+static struct scanner *
+scanner_create(void)
+{
+    struct scanner *scanner = xmalloc(sizeof *scanner);
+
+    scanner->last_move = LLONG_MIN;
+    scanner->wait = 0;
+    scanner->state = SCANNER_RIGHT;
+    return scanner;
+}
+
+/* Frees scanner 's', which should have been obtained from scanner_create().
+ * A null 's' is harmless, because free() accepts a null pointer. */
+static void
+scanner_destroy(struct scanner *s)
+{
+ free(s);
+}
+
+/* Advances the automatic scroller by at most one step.
+ *
+ * Nothing happens unless 750 ms have passed since the previous step. A step
+ * either consumes one pending pause cycle or adjusts 'ezio->x_ofs':
+ *
+ *   - If the content (as reported by find_min_max()) is at most 16 columns
+ *     wide, the window is simply placed at its left edge.
+ *
+ *   - Otherwise the window moves one column in the current direction, or,
+ *     at the end of the content, reverses direction and pauses one cycle. */
+static void
+scanner_run(struct scanner *s, struct ezio *ezio)
+{
+    long long int now = time_msec();
+    int min, max;
+
+    if (now < s->last_move + 750) {
+        return;
+    }
+    s->last_move = now;
+
+    if (s->wait) {
+        s->wait--;
+        return;
+    }
+
+    find_min_max(ezio, &min, &max);
+    if (max - min + 1 <= 16) {
+        ezio->x_ofs = min;
+        return;
+    }
+
+    if (s->state == SCANNER_RIGHT) {
+        if (ezio->x_ofs + 15 < max) {
+            ezio->x_ofs++;
+        } else {
+            s->state = SCANNER_LEFT;
+            s->wait = 1;
+        }
+    } else if (s->state == SCANNER_LEFT) {
+        if (ezio->x_ofs > min) {
+            ezio->x_ofs--;
+        } else {
+            s->state = SCANNER_RIGHT;
+            s->wait = 1;
+        }
+    }
+}
+
+/* Arranges for poll_block() to wake up when scanner_run() is next due to
+ * take a step, that is, 750 ms after 's->last_move', or immediately if that
+ * time has already passed. */
+static void
+scanner_wait(struct scanner *s)
+{
+    long long int deadline = s->last_move + 750;
+    long long int now = time_msec();
+
+    if (deadline > now) {
+        poll_timer_wait(deadline - now);
+    } else {
+        poll_immediate_wake();
+    }
+}
+
+/* Manually scrolls the window one column to the left, stopping at column 0,
+ * and holds off automatic scrolling for 7 scanner cycles. */
+static void
+scanner_left(struct scanner *s, struct ezio *ezio)
+{
+    if (ezio->x_ofs > 0) {
+        ezio->x_ofs--;
+    }
+    s->wait = 7;
+}
+
+/* Manually scrolls the window one column to the right, stopping once the
+ * offset reaches 40 - 16, and holds off automatic scrolling for 7 scanner
+ * cycles. */
+static void
+scanner_right(struct scanner *s, struct ezio *ezio)
+{
+    if (ezio->x_ofs < 40 - 16) {
+        ezio->x_ofs++;
+    }
+    s->wait = 7;
+}
+
+/* Finds the horizontal extent of the content of 'ezio'.
+ *
+ * '*min' receives the leftmost column in which either row holds something
+ * other than a space, or 0 if both rows are blank. '*max' receives the
+ * rightmost such column, or 15 if both rows are blank. If the cursor is
+ * shown, the range is widened as necessary to include the cursor column. */
+static void
+find_min_max(struct ezio *ezio, int *min, int *max)
+{
+    int first = 0;
+    int last = 15;
+    int x;
+
+    /* Leftmost non-blank column. */
+    x = 0;
+    while (x < 40 && ezio->chars[0][x] == ' ' && ezio->chars[1][x] == ' ') {
+        x++;
+    }
+    if (x < 40) {
+        first = x;
+    }
+
+    /* Rightmost non-blank column. */
+    x = 39;
+    while (x >= 0 && ezio->chars[0][x] == ' ' && ezio->chars[1][x] == ' ') {
+        x--;
+    }
+    if (x >= 0) {
+        last = x;
+    }
+
+    if (ezio->show_cursor) {
+        if (ezio->x < first) {
+            first = ezio->x;
+        }
+        if (ezio->x > last) {
+            last = ezio->x;
+        }
+    }
+
+    *min = first;
+    *max = last;
+}
+
+/* Parses the command-line options in 'argv'.
+ *
+ * -e/--ezio3 stores its argument in 'ezio_dev' and -i/--input stores its
+ * argument in 'input_dev'. -h prints the usage message and -V prints the
+ * version; both exit successfully. An option that getopt_long() rejects
+ * terminates the program with EXIT_FAILURE. The remaining options are
+ * handled by the DAEMON_OPTION_HANDLERS and VLOG_OPTION_HANDLERS macros. */
+static void
+parse_options(int argc, char *argv[])
+{
+ enum {
+ OPT_DUMMY = UCHAR_MAX + 1,
+ VLOG_OPTION_ENUMS
+ };
+ static struct option long_options[] = {
+ {"ezio3", required_argument, 0, 'e'},
+ {"input", required_argument, 0, 'i'},
+ {"verbose", optional_argument, 0, 'v'},
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ DAEMON_LONG_OPTIONS,
+ VLOG_LONG_OPTIONS,
+ {0, 0, 0, 0},
+ };
+ /* The short-option string is derived from the table above and freed once
+ * parsing is done. */
+ char *short_options = long_options_to_short_options(long_options);
+
+ for (;;) {
+ int c;
+
+ c = getopt_long(argc, argv, short_options, long_options, NULL);
+ if (c == -1) {
+ break;
+ }
+
+ switch (c) {
+ case 'e':
+ ezio_dev = optarg;
+ break;
+
+ case 'i':
+ input_dev = optarg ? optarg : "-";
+ break;
+
+ case 'h':
+ /* No "break" needed: usage() exits the program. */
+ usage();
+
+ case 'V':
+ OVS_PRINT_VERSION(0, 0);
+ exit(EXIT_SUCCESS);
+
+ DAEMON_OPTION_HANDLERS
+ VLOG_OPTION_HANDLERS
+
+ case '?':
+ exit(EXIT_FAILURE);
+
+ default:
+ abort();
+ }
+ }
+ free(short_options);
+}
+
+/* Prints a help message describing the command-line syntax to stdout, then
+ * exits with EXIT_SUCCESS. Does not return.
+ *
+ * The long option names shown here must match the table in
+ * parse_options(); in particular the serial device option is "--ezio3". */
+static void
+usage(void)
+{
+    printf("%s: EZIO3 terminal front-end\n"
+           "Provides a front-end to a 16x2 EZIO3 LCD display that makes\n"
+           "it look more like a conventional terminal\n"
+           "usage: %s [OPTIONS] [-- COMMAND [ARG...]]\n"
+           "where COMMAND is a command to run with stdin, stdout, and\n"
+           "stderr directed to the EZIO3 display.\n"
+           "\nSettings (defaults in parentheses):\n"
+           " -e, --ezio3=TTY set EZIO3 serial device (/dev/ttyS1)\n"
+           " -i, --input=TERMINAL also read input from TERMINAL;\n"
+           " specify - for stdin, or vt to allocate\n"
+           " and switch to a free virtual terminal\n"
+           "\nOther options:\n"
+           " -v, --verbose=MODULE:FACILITY:LEVEL configure logging levels\n"
+           " -v, --verbose set maximum verbosity level\n"
+           " -h, --help display this help message\n"
+           " -V, --version display version information\n",
+           program_name, program_name);
+    exit(EXIT_SUCCESS);
+}
diff --git a/extras/ezio/ezio.c b/extras/ezio/ezio.c
new file mode 100644
index 000000000..6024766e2
--- /dev/null
+++ b/extras/ezio/ezio.c
@@ -0,0 +1,243 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+#include "ezio.h"
+#include <assert.h>
+#include <stddef.h>
+#include <string.h>
+#include "util.h"
+
+static void remove_elements(uint8_t *p, size_t n_elems, size_t elem_size,
+ int pos, int n_del);
+static void insert_elements(uint8_t *p, size_t n_elems, size_t elem_size,
+ int pos, int n_insert);
+static int range(int value, int min, int max);
+
+/* Initializes 'e' to an empty display: all characters are spaces, all icon
+ * bitmaps are zero, the cursor is at (0,0) and shown but not blinking, and
+ * the scroll offset is 0. */
+void
+ezio_init(struct ezio *e)
+{
+    ezio_clear(e);
+    memset(e->icons, 0, sizeof e->icons);
+    e->blink_cursor = false;
+    e->show_cursor = true;
+    e->x_ofs = 0;
+}
+
+/* Sets the bitmap of icon 'idx' in 'e'. 'row0' through 'row7' supply the
+ * eight rows in order; each is stored as a single byte. 'idx' is not range
+ * checked here. */
+void
+ezio_set_icon(struct ezio *e, int idx,
+              int row0, int row1, int row2, int row3,
+              int row4, int row5, int row6, int row7)
+{
+    const int rows[8] = { row0, row1, row2, row3, row4, row5, row6, row7 };
+    int i;
+
+    for (i = 0; i < 8; i++) {
+        e->icons[idx][i] = rows[i];
+    }
+}
+
+/* Resets icon 'idx' in 'e' to its default bitmap. 'idx' must be in the range
+ * 0...7 (checked with assert()).
+ *
+ * Icons 6 and 7 have built-in default shapes; every other icon defaults to
+ * blank. */
+void
+ezio_set_default_icon(struct ezio *e, int idx)
+{
+    assert(idx >= 0 && idx < 8);
+    if (idx == 6) {
+        ezio_set_icon(e, idx,
+                      e_____,
+                      eX____,
+                      e_X___,
+                      e__X__,
+                      e___X_,
+                      e____X,
+                      e_____,
+                      e_____);
+    } else if (idx == 7) {
+        ezio_set_icon(e, idx,
+                      e_____,
+                      e_____,
+                      e_X___,
+                      eX_X_X,
+                      eX_X_X,
+                      e___X_,
+                      e_____,
+                      e_____);
+    } else {
+        ezio_set_icon(e, idx,
+                      e_____,
+                      e_____,
+                      e_____,
+                      e_____,
+                      e_____,
+                      e_____,
+                      e_____,
+                      e_____);
+    }
+}
+
+/* Fills every character position of 'e' with a space and moves the cursor
+ * to (0,0). Icons, the scroll offset and the cursor flags are not changed. */
+void
+ezio_clear(struct ezio *e)
+{
+    e->y = 0;
+    e->x = 0;
+    memset(e->chars, ' ', sizeof e->chars);
+}
+
+/* Stores character 'c' at position (x, y) of 'e'. 'x' must be in 0...39 and
+ * 'y' in 0...1 (checked with assert()). The cursor is not moved.
+ *
+ * The byte 0xfe is never stored; 0xff is stored in its place. */
+void
+ezio_put_char(struct ezio *e, int x, int y, uint8_t c)
+{
+    assert(x >= 0 && x <= 39);
+    assert(y >= 0 && y <= 1);
+
+    if (c == 0xfe) {
+        c = 0xff;
+    }
+    e->chars[y][x] = c;
+}
+
+/* Moves the cursor of 'e' down one row without changing its column. If the
+ * cursor would leave the bottom row, it stays on row 1 and the display
+ * contents scroll up one line instead. */
+void
+ezio_line_feed(struct ezio *e)
+{
+    e->y++;
+    if (e->y < 2) {
+        return;
+    }
+    e->y = 1;
+    ezio_scroll_up(e, 1);
+}
+
+/* Moves the cursor of 'e' to column 0 and then performs a line feed (see
+ * ezio_line_feed()). */
+void
+ezio_newline(struct ezio *e)
+{
+ e->x = 0;
+ ezio_line_feed(e);
+}
+
+/* Deletes 'n' characters from row 'y' of 'e', starting at column 'x'. The
+ * characters to the right move left and the vacated columns at the end of
+ * the row are filled with spaces. 'y' is not range checked here. */
+void
+ezio_delete_char(struct ezio *e, int x, int y, int n)
+{
+ remove_elements(&e->chars[y][0], 40, 1, x, n);
+}
+
+/* Deletes 'n' lines from 'e', starting at row 'y'. Any rows below move up
+ * and the vacated rows at the bottom are filled with spaces. The character
+ * array is treated as 2 elements of 40 bytes each. */
+void
+ezio_delete_line(struct ezio *e, int y, int n)
+{
+ remove_elements(e->chars[0], 2, 40, y, n);
+}
+
+/* Inserts 'n' spaces into row 'y' of 'e' at column 'x'. The characters from
+ * column 'x' onward move right; those pushed past the end of the row are
+ * lost. 'y' is not range checked here. */
+void
+ezio_insert_char(struct ezio *e, int x, int y, int n)
+{
+ insert_elements(&e->chars[y][0], 40, 1, x, n);
+}
+
+/* Inserts 'n' blank lines into 'e' at row 'y'. Rows from 'y' onward move
+ * down; those pushed past the bottom are lost. The character array is
+ * treated as 2 elements of 40 bytes each. */
+void
+ezio_insert_line(struct ezio *e, int y, int n)
+{
+ insert_elements(&e->chars[0][0], 2, 40, y, n);
+}
+
+/* Scrolls the contents of both rows of 'e' left by 'n' columns, by deleting
+ * 'n' characters at the start of each row. */
+void
+ezio_scroll_left(struct ezio *e, int n)
+{
+    int row = 0;
+
+    while (row < 2) {
+        ezio_delete_char(e, 0, row, n);
+        row++;
+    }
+}
+
+/* Scrolls the contents of both rows of 'e' right by 'n' columns, by
+ * inserting 'n' spaces at the start of each row. */
+void
+ezio_scroll_right(struct ezio *e, int n)
+{
+    int row = 0;
+
+    while (row < 2) {
+        ezio_insert_char(e, 0, row, n);
+        row++;
+    }
+}
+
+/* Scrolls the contents of 'e' up by 'n' lines, by deleting 'n' lines at the
+ * top (see ezio_delete_line()). */
+void
+ezio_scroll_up(struct ezio *e, int n)
+{
+ ezio_delete_line(e, 0, n);
+}
+
+/* Scrolls the contents of 'e' down by 'n' lines, by inserting 'n' blank
+ * lines at the top (see ezio_insert_line()). */
+void
+ezio_scroll_down(struct ezio *e, int n)
+{
+ ezio_insert_line(e, 0, n);
+}
+
+/* Compares the characters of 'a' and 'b' in columns 'x0' (inclusive) through
+ * 'x1' (exclusive) of both rows. 'x0' is clamped to 0...39 and 'x1' to
+ * 1...40 first.
+ *
+ * If a difference is found, returns true and stores the position of the
+ * first one, scanning row 0 before row 1 and left to right within a row, in
+ * '*xp' and '*yp'. Otherwise returns false and leaves '*xp' and '*yp'
+ * untouched. */
+bool
+ezio_chars_differ(const struct ezio *a, const struct ezio *b, int x0, int x1,
+                  int *xp, int *yp)
+{
+    const int col_begin = range(x0, 0, 39);
+    const int col_end = range(x1, 1, 40);
+    int row, col;
+
+    for (row = 0; row < 2; row++) {
+        for (col = col_begin; col < col_end; col++) {
+            if (a->chars[row][col] == b->chars[row][col]) {
+                continue;
+            }
+            *xp = col;
+            *yp = row;
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Deletes 'n_del' elements starting at index 'pos' from the array at 'p',
+ * which holds 'n_elems' elements of 'elem_size' bytes each. The elements
+ * after the deleted ones move down, and the space vacated at the end of the
+ * array is filled with space characters.
+ *
+ * 'n_del' is limited to the number of elements from 'pos' to the end of the
+ * array. Nothing happens if 'pos' is outside the array or if 'n_del' is zero
+ * or negative. (The sign is checked explicitly because comparing a negative
+ * count against an unsigned size would treat it as a huge positive one.) */
+static void
+remove_elements(uint8_t *p, size_t n_elems, size_t elem_size,
+                int pos, int n_del)
+{
+    size_t start, count;
+
+    if (pos < 0 || (size_t) pos >= n_elems || n_del <= 0) {
+        return;
+    }
+
+    start = pos;
+    count = n_del;
+    if (count > n_elems - start) {
+        count = n_elems - start;
+    }
+
+    memmove(p + elem_size * start,
+            p + elem_size * (start + count),
+            elem_size * (n_elems - start - count));
+    memset(p + elem_size * (n_elems - count), ' ', count * elem_size);
+}
+
+/* Inserts 'n_insert' blank elements at index 'pos' in the array at 'p',
+ * which holds 'n_elems' elements of 'elem_size' bytes each. The elements
+ * from 'pos' onward move up, those pushed past the end of the array are
+ * lost, and the inserted elements are filled with space characters.
+ *
+ * 'n_insert' is limited to the number of elements from 'pos' to the end of
+ * the array. Nothing happens if 'pos' is outside the array or if 'n_insert'
+ * is zero or negative. (The sign is checked explicitly because comparing a
+ * negative count against an unsigned size would treat it as a huge positive
+ * one.) */
+static void
+insert_elements(uint8_t *p, size_t n_elems, size_t elem_size,
+                int pos, int n_insert)
+{
+    size_t start, count;
+
+    if (pos < 0 || (size_t) pos >= n_elems || n_insert <= 0) {
+        return;
+    }
+
+    start = pos;
+    count = n_insert;
+    if (count > n_elems - start) {
+        count = n_elems - start;
+    }
+
+    memmove(p + elem_size * (start + count),
+            p + elem_size * start,
+            elem_size * (n_elems - start - count));
+    memset(p + elem_size * start, ' ', count * elem_size);
+}
+
+/* Returns 'value' limited to the closed interval ['min', 'max']. */
+static int
+range(int value, int min, int max)
+{
+    int clamped = value;
+
+    if (value < min) {
+        clamped = min;
+    } else if (value > max) {
+        clamped = max;
+    }
+    return clamped;
+}
+
diff --git a/extras/ezio/ezio.h b/extras/ezio/ezio.h
new file mode 100644
index 000000000..1308ec30a
--- /dev/null
+++ b/extras/ezio/ezio.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#ifndef EZIO_H
+#define EZIO_H 1
+
+#include <stdbool.h>
+#include <stdint.h>
+
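+/* Constants for visual representation of a row in an EZIO icon. Each name
+ * is 'e' followed by five pixels, leftmost pixel first, with 'X' for a set
+ * bit and '_' for a clear one; the leftmost pixel is bit 4 (0x10). */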
+#define e_____ 0x00
+#define e____X 0x01
+#define e___X_ 0x02
+#define e___XX 0x03
+#define e__X__ 0x04
+#define e__X_X 0x05
+#define e__XX_ 0x06
+#define e__XXX 0x07
+#define e_X___ 0x08
+#define e_X__X 0x09
+#define e_X_X_ 0x0a
+#define e_X_XX 0x0b
+#define e_XX__ 0x0c
+#define e_XX_X 0x0d
+#define e_XXX_ 0x0e
+#define e_XXXX 0x0f
+#define eX____ 0x10
+#define eX___X 0x11
+#define eX__X_ 0x12
+#define eX__XX 0x13
+#define eX_X__ 0x14
+#define eX_X_X 0x15
+#define eX_XX_ 0x16
+#define eX_XXX 0x17
+#define eXX___ 0x18
+#define eXX__X 0x19
+#define eXX_X_ 0x1a
+#define eXX_XX 0x1b
+#define eXXX__ 0x1c
+#define eXXX_X 0x1d
+#define eXXXX_ 0x1e
+#define eXXXXX 0x1f
+
+/* In-memory image of an EZIO display: icon bitmaps, text, cursor state and
+ * scroll offset. */
+struct ezio {
+ uint8_t icons[8][8]; /* [icon][row] bitmaps; see ezio_set_icon(). */
+ uint8_t chars[2][40]; /* [y][x] character cells, 2 rows of 40 columns. */
+ int x, y, x_ofs; /* Cursor column and row; 'x_ofs' is a horizontal
+ * scroll offset (presumably the first displayed
+ * column -- confirm against the callers). */
+ bool show_cursor; /* Is the cursor shown? */
+ bool blink_cursor; /* Cursor blink flag. */
+};
+
+void ezio_init(struct ezio *);
+void ezio_set_icon(struct ezio *, int idx,
+ int row0, int row1, int row2, int row3,
+ int row4, int row5, int row6, int row7);
+void ezio_set_default_icon(struct ezio *, int idx);
+void ezio_clear(struct ezio *);
+void ezio_put_char(struct ezio *, int x, int y, uint8_t c);
+void ezio_line_feed(struct ezio *);
+void ezio_newline(struct ezio *);
+void ezio_delete_char(struct ezio *, int x, int y, int n);
+void ezio_delete_line(struct ezio *, int y, int n);
+void ezio_insert_char(struct ezio *, int x, int y, int n);
+void ezio_insert_line(struct ezio *, int y, int n);
+void ezio_scroll_left(struct ezio *, int n);
+void ezio_scroll_right(struct ezio *, int n);
+void ezio_scroll_up(struct ezio *, int n);
+void ezio_scroll_down(struct ezio *, int n);
+bool ezio_chars_differ(const struct ezio *, const struct ezio *,
+ int x0, int x1, int *xp, int *yp);
+
+#endif /* ezio.h */
diff --git a/extras/ezio/ezio3.ti b/extras/ezio/ezio3.ti
new file mode 100644
index 000000000..0bbcb3985
--- /dev/null
+++ b/extras/ezio/ezio3.ti
@@ -0,0 +1,21 @@
+# Copyright (C) 2008, 2009 Nicira Networks, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty. This file is offered
+# as-is, without warranty of any kind.
+
+# cols#40 and lines#2 match the 2x40 character buffer declared in ezio.h.
+# "dico" and "cico" do not appear to be standard terminfo capability names.
+# ovs-switchui.c looks up "dico" with tigetstr() and expands it with an icon
+# number followed by eight row values, which matches the nine parameters
+# used below.
+ezio3|16x2 EZIO3 LCD display,
+ cols#40, lines#2, it#8, am, xenl, npc,
+ bel=, clear=\E[H\E[J, cr=^M,
+ cub=\E[%p1%dD, cub1=^H, cud=\E[%p1%dB, cud1=^J,
+ cuf=\E[%p1%dC, cuf1=\E[C$<2>,
+ cup=\E[%i%p1%d;%p2%dH, cuu=\E[%p1%dA,
+ cuu1=\E[A, ed=\E[J, el=\E[K, el1=\E[1K,
+ home=\E[H, ht=^I, ind=^J, kbs=^H,
+ kcub1=\E[D, kcud1=\E[B, kcuf1=\E[C, kcuu1=\E[A,
+ civis=\E[1r, cnorm=\E[2r, cvvis=\E[3r,
+ ri=\EM, rs2=\Ec, rmacs=^O, smacs=^N,
+ dico=\E[%p1%d;%p2%d;%p3%d;%p4%d;%p5%d;%p6%d;%p7%d;%p8%d;%p9%dp,
+ cico=\E[%p1%dq,
+ acsc=}\355\,\177+\176~\245f\337{\367,
+
diff --git a/extras/ezio/ovs-switchui.c b/extras/ezio/ovs-switchui.c
new file mode 100644
index 000000000..6fbf25238
--- /dev/null
+++ b/extras/ezio/ovs-switchui.c
@@ -0,0 +1,3026 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+#include <arpa/inet.h>
+#include <assert.h>
+#include <ctype.h>
+#include <curses.h>
+#include <errno.h>
+#include <getopt.h>
+#include <inttypes.h>
+#include <math.h>
+#include <pcre.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <term.h>
+#include <unistd.h>
+#include "command-line.h"
+#include "daemon.h"
+#include "dynamic-string.h"
+#include "ezio.h"
+#include "fatal-signal.h"
+#include "netdev.h"
+#include "ofpbuf.h"
+#include "openflow/nicira-ext.h"
+#include "openflow/openflow.h"
+#include "packets.h"
+#include "poll-loop.h"
+#include "process.h"
+#include "random.h"
+#include "rconn.h"
+#include "socket-util.h"
+#include "svec.h"
+#include "timeval.h"
+#include "util.h"
+#include "vconn.h"
+#include "xtoxll.h"
+
+#define THIS_MODULE VLM_switchui
+#include "vlog.h"
+
+static void parse_options(int argc, char *argv[]);
+static void usage(void);
+
+static void initialize_terminal(void);
+static void restore_terminal(void *aux);
+
+/* Priority of a message, as passed to emit() and emit_function().
+ * NOTE(review): presumably a larger value makes best_message() prefer the
+ * message; best_message() is defined elsewhere, so confirm there. */
+enum priority {
+ P_STATUS = 5,
+ P_PROGRESS = 10,
+ P_WARNING = 15,
+ P_ERROR = 20,
+ P_FATAL = 25
+};
+
+struct message;
+static void emit(struct message **, enum priority, const char *, ...)
+ PRINTF_FORMAT(3, 4);
+static void emit_function(struct message **, enum priority,
+ void (*function)(void *aux), void *aux);
+static int shown(struct message **);
+static void clear_messages(void);
+static bool empty_message(const struct message *);
+static struct message *best_message(void);
+static struct message *next_message(struct message *);
+static struct message *prev_message(struct message *);
+static void put_message(const struct message *);
+static void message_shown(struct message *);
+static void age_messages(void);
+
+/* One name/value entry of a 'struct dict'. */
+struct pair {
+ char *name;
+ char *value;
+};
+
+/* A collection of name/value pairs, manipulated with the dict_*() functions
+ * declared below.
+ * NOTE(review): 'n' and 'max' look like the number of pairs in use and the
+ * number allocated; confirm in dict_add_nocopy(). */
+struct dict {
+ struct pair *pairs;
+ size_t n, max;
+};
+
+static void dict_init(struct dict *);
+static void dict_add(struct dict *, const char *name, const char *value);
+static void dict_add_nocopy(struct dict *, char *name, char *value);
+static void dict_delete(struct dict *, const char *name);
+static void dict_parse(struct dict *, const char *data, size_t nbytes);
+static void dict_free(struct dict *);
+static bool dict_lookup(const struct dict *,
+ const char *name, const char **value);
+static int dict_get_int(const struct dict *, const char *name, int def);
+static bool dict_get_bool(const struct dict *, const char *name, bool def);
+static const char *dict_get_string(const struct dict *,
+ const char *name, const char *def);
+static uint32_t dict_get_ip(const struct dict *, const char *name);
+
+static void addf(const char *format, ...) PRINTF_FORMAT(1, 2);
+
+static void fetch_status(struct rconn *, struct dict *, long long int timeout);
+static bool parse_reply(void *, struct dict *, uint32_t xid);
+static void compose_messages(const struct dict *, struct rconn *rconn);
+
+static void show_flows(struct rconn *);
+static void show_dpid_ip(struct rconn *, const struct dict *);
+static void show_secchan_state(const struct dict *);
+static void show_fail_open_state(const struct dict *);
+static void show_discovery_state(const struct dict *);
+static void show_remote_state(const struct dict *);
+static void show_data_rates(struct rconn *, const struct dict *);
+
+static void init_reboot_notifier(void);
+static bool show_reboot_state(void);
+
+static void show_string(const char *string);
+static void block_until(long long timeout);
+static void menu(const struct dict *);
+static void drain_keyboard_buffer(void);
+
+static const char *progress(void);
+
+/* Program entry point.
+ *
+ * Parses the command line, which must leave exactly one non-option argument
+ * (passed to rconn_new()), daemonizes, sets up the terminal, and then loops
+ * forever. Each iteration of the outer loop lasts about one second: it
+ * fetches fresh status, rebuilds the set of messages, picks the message to
+ * show, and then handles keystrokes and redraws until the second is up. */
+int
+main(int argc, char *argv[])
+{
+ struct rconn *rconn;
+ struct message *msg;
+ int countdown = 5;
+ bool user_selected;
+ bool debug_mode;
+
+ /* Tracking keystroke repeat counts. */
+ int last_key = 0;
+ long long int last_key_time = 0;
+ int repeat_count = 0;
+
+ set_program_name(argv[0]);
+ time_init();
+ vlog_init();
+ parse_options(argc, argv);
+ signal(SIGPIPE, SIG_IGN);
+ vlog_set_levels(VLM_ANY_MODULE, VLF_CONSOLE, VLL_EMER);
+ init_reboot_notifier();
+
+ argc -= optind;
+ argv += optind;
+ if (argc != 1) {
+ ovs_fatal(0, "exactly one non-option argument required; "
+ "use --help for help");
+ }
+
+ rconn = rconn_new(argv[0], 5, 5);
+
+ die_if_already_running();
+ daemonize();
+
+ initialize_terminal();
+ fatal_signal_add_hook(restore_terminal, NULL, true);
+
+ msg = NULL;
+ countdown = 0;
+ user_selected = false;
+ debug_mode = false;
+ for (;;) {
+ struct dict dict;
+ /* End of this one-second display cycle. */
+ long long timeout = time_msec() + 1000;
+
+ clear_messages();
+
+ dict_init(&dict);
+ fetch_status(rconn, &dict, timeout);
+ dict_add(&dict, "debug", debug_mode ? "true" : "false");
+ compose_messages(&dict, rconn);
+
+ /* Choose the message to show during this cycle. While 'countdown' is
+ * nonzero the current message is kept as long as it is not empty. */
+ if (countdown) {
+ if (!empty_message(msg)) {
+ countdown--;
+ } else {
+ msg = user_selected ? next_message(msg) : best_message();
+ countdown = 5;
+ }
+ } else {
+ msg = best_message();
+ countdown = 5;
+ user_selected = false;
+ }
+ if (!user_selected) {
+ message_shown(msg);
+ }
+
+ do {
+ /* Consume every keystroke that is already available. */
+ for (;;) {
+ int c = getch();
+ if (c == ERR) {
+ break;
+ }
+
+ /* A different key, or a gap of more than 250 ms, starts a new
+ * run of repeats. */
+ if (c != last_key || time_msec() > last_key_time + 250) {
+ repeat_count = 0;
+ }
+ last_key = c;
+ last_key_time = time_msec();
+ repeat_count++;
+
+ if (c == KEY_DOWN || c == KEY_UP) {
+ msg = (c == KEY_DOWN ? next_message(msg)
+ : prev_message(msg));
+ countdown = 5;
+ user_selected = true;
+ } else if (c == '\r' || c == '\n') {
+ countdown = 60;
+ user_selected = true;
+ /* 20 or more repeats in one run toggle debug mode. */
+ if (repeat_count >= 20) {
+ debug_mode = !debug_mode;
+ show_string(debug_mode
+ ? "Debug Mode\nEnabled"
+ : "Debug Mode\nDisabled");
+ }
+ } else if (c == '\b' || c == '\x7f' ||
+ c == '\x1b' || c == KEY_BACKSPACE || c == KEY_DC) {
+ menu(&dict);
+ drain_keyboard_buffer();
+ /* Leaves only the keystroke loop, not the display cycle. */
+ break;
+ }
+ }
+
+ erase();
+ curs_set(0);
+ move(0, 0);
+ put_message(msg);
+ refresh();
+
+ /* Sleep until a key arrives or the cycle ends. */
+ poll_fd_wait(STDIN_FILENO, POLLIN);
+ poll_timer_wait(timeout - time_msec());
+ poll_block();
+ } while (time_msec() < timeout);
+ age_messages();
+ dict_free(&dict);
+ }
+
+ return 0;
+}
+
+/* Emits the messages for the current display cycle. If show_reboot_state()
+ * returns true, nothing else is emitted; otherwise each of the regular
+ * status messages is composed from 'dict' and 'rconn'. */
+static void
+compose_messages(const struct dict *dict, struct rconn *rconn)
+{
+    if (show_reboot_state()) {
+        return;
+    }
+
+    show_flows(rconn);
+    show_dpid_ip(rconn, dict);
+    show_secchan_state(dict);
+    show_fail_open_state(dict);
+    show_discovery_state(dict);
+    show_remote_state(dict);
+    show_data_rates(rconn, dict);
+}
+
+/* State shared between show_flows() and put_flows(). */
+struct put_flows_data {
+ struct rconn *rconn; /* Connection used for the stats request. */
+ uint32_t xid; /* Transaction ID of the request; 0 if not yet sent. */
+ uint32_t flow_count; /* Flow count from the reply, in host byte order. */
+ bool got_reply; /* True once 'flow_count' is valid. */
+};
+
+/* Examines the OpenFlow message at 'data' and, if it is the aggregate stats
+ * reply that 'pfd' is waiting for, stores its flow count in
+ * 'pfd->flow_count' and sets 'pfd->got_reply'.
+ *
+ * A message that is too short, has a different transaction ID, or is not an
+ * aggregate stats reply is logged and otherwise ignored. The length check
+ * relies on the length field in the message header. */
+static void
+parse_flow_reply(void *data, struct put_flows_data *pfd)
+{
+    struct ofp_header *oh;
+    struct ofp_stats_reply *rpy;
+    struct ofp_aggregate_stats_reply *asr;
+    const size_t min_size = sizeof *rpy + sizeof *asr;
+
+    oh = data;
+    if (ntohs(oh->length) < min_size) {
+        VLOG_WARN("reply is too short (%"PRIu16")", ntohs(oh->length));
+        return;
+    }
+    if (oh->xid != pfd->xid) {
+        VLOG_WARN("xid 0x%08"PRIx32" != expected 0x%08"PRIx32,
+                  oh->xid, pfd->xid);
+        return;
+    }
+    if (oh->type != OFPT_STATS_REPLY) {
+        VLOG_WARN("reply is wrong type %"PRIu8, oh->type);
+        return;
+    }
+
+    rpy = data;
+    if (rpy->type != htons(OFPST_AGGREGATE)) {
+        /* Log the type in host byte order, so that the value reads the same
+         * on every platform. */
+        VLOG_WARN("reply has wrong stat type ID %04"PRIx16,
+                  ntohs(rpy->type));
+        return;
+    }
+
+    asr = (struct ofp_aggregate_stats_reply *) rpy->body;
+    pfd->flow_count = ntohl(asr->flow_count);
+    pfd->got_reply = true;
+}
+
+/* Returns true if the terminal description provides a "dico" string
+ * capability, that is, if tigetstr() returns neither a null pointer nor
+ * (char *) -1 for it. */
+static bool
+have_icons(void)
+{
+    const char *dico = tigetstr("dico");
+
+    if (!dico) {
+        return false;
+    }
+    return dico != (const char *) -1;
+}
+
+/* Defines icon 'num' on the terminal with rows 'r0' through 'r7', by
+ * sending the expanded "dico" capability. Does nothing if the terminal has
+ * no icon support (see have_icons()). */
+static void
+set_icon(int num, int r0, int r1, int r2, int r3, int r4, int r5, int r6,
+         int r7)
+{
+    if (!have_icons()) {
+        return;
+    }
+    putp(tparm(tigetstr("dico"), num, r0, r1, r2, r3, r4, r5, r6, r7));
+}
+
+/* Defines icon 'num' so that all eight of its rows have the value 'row'. */
+static void
+set_repeated_icon(int num, int row)
+{
+ set_icon(num, row, row, row, row, row, row, row, row);
+}
+
+#if 0
+/* Disabled code: defines icon 'num' as a block whose leftmost 'n_solid'
+ * columns (clamped to 0...5) are filled.
+ * NOTE(review): the row constants used here lack the 'e' prefix of the
+ * macros in ezio.h, so this would need updating before being re-enabled. */
+static void
+set_brick_icon(int num, int n_solid)
+{
+ const static int rows[6] = {_____, X____, XX___, XXX__, XXXX_, XXXXX};
+ set_repeated_icon(num, rows[n_solid < 0 ? 0
+ : n_solid > 5 ? 5
+ : n_solid]);
+}
+#endif
+
+/* Returns the character to add to the screen for icon 'num': the icon's
+ * code in the alternate character set if the terminal supports icons,
+ * otherwise the fallback character 'alternate'. */
+static int
+icon_char(int num, int alternate)
+{
+    if (have_icons()) {
+        return 0x80 | num | A_ALTCHARSET;
+    }
+    return alternate;
+}
+
+/* Adds icon 'num' to the screen at the current position, or the character
+ * 'alternate' if the terminal does not support icons (see icon_char()). */
+static void
+put_icon(int num, char alternate)
+{
+ addch(icon_char(num, alternate));
+}
+
+#if 0
+/* Disabled code: draws a horizontal bar 'n_pixels' pixels long in at most
+ * 'n_chars' character cells of 5 pixels each, using icon 0 for full cells
+ * and icon 1 for the partial cell at the end.
+ * NOTE(review): put_icon() takes a 'char' fallback but is passed the string
+ * "#" here, and with 'n_pixels' equal to exactly 5 neither branch draws
+ * anything; both need attention before this is re-enabled. */
+static void
+bar_graph(int n_chars, int n_pixels)
+{
+ int i;
+
+ if (n_pixels < 0) {
+ n_pixels = 0;
+ } else if (n_pixels > n_chars * 5) {
+ n_pixels = n_chars * 5;
+ }
+
+ if (n_pixels > 5) {
+ set_brick_icon(0, 5);
+ for (i = 0; i < n_pixels / 5; i++) {
+ put_icon(0, "#");
+ }
+ }
+ if (n_pixels % 5) {
+ set_brick_icon(1, n_pixels % 5);
+ put_icon(1, "#");
+ }
+}
+#endif
+
+/* Message callback registered by show_flows(). 'pfd_' points to a 'struct
+ * put_flows_data'.
+ *
+ * On the first call for a given 'pfd' (while its 'xid' is still 0) this
+ * sends an aggregate stats request that matches every flow in every table.
+ * Until a matching reply has been seen, each call processes up to 50
+ * received messages looking for it.
+ *
+ * Output: the host name on the first line (prefixed with "Host: " when the
+ * line still fits in 16 columns) and, once the reply has arrived, the flow
+ * count on the second. While the reply is outstanding, poll-loop wakeups
+ * are registered for the connection. */
+static void
+put_flows(void *pfd_)
+{
+    struct put_flows_data *pfd = pfd_;
+    static struct rconn_packet_counter *counter;
+    char host[64];
+
+    if (!counter) {
+        counter = rconn_packet_counter_create();
+    }
+
+    if (!pfd->xid) {
+        struct ofp_stats_request *rq;
+        struct ofp_aggregate_stats_request *asr;
+        struct ofpbuf *b;
+
+        pfd->xid = random_uint32();
+        rq = make_openflow_xid(sizeof *rq, OFPT_STATS_REQUEST,
+                               pfd->xid, &b);
+        rq->type = htons(OFPST_AGGREGATE);
+        rq->flags = htons(0);
+        asr = ofpbuf_put_uninit(b, sizeof *asr);
+        memset(asr, 0, sizeof *asr);
+        asr->match.wildcards = htonl(OFPFW_ALL);
+        asr->table_id = 0xff;
+        asr->out_port = htons(OFPP_NONE);
+        update_openflow_length(b);
+        rconn_send_with_limit(pfd->rconn, b, counter, 10);
+    }
+
+    if (!pfd->got_reply) {
+        int i;
+
+        rconn_run(pfd->rconn);
+        for (i = 0; i < 50; i++) {
+            struct ofpbuf *b;
+
+            b = rconn_recv(pfd->rconn);
+            if (!b) {
+                break;
+            }
+
+            parse_flow_reply(b->data, pfd);
+            ofpbuf_delete(b);
+            if (pfd->got_reply) {
+                break;
+            }
+        }
+    }
+
+    if (gethostname(host, sizeof host)) {
+        /* The buffer contents are unspecified after a failure, so show an
+         * empty name instead of whatever happens to be on the stack. */
+        host[0] = '\0';
+    }
+    /* A truncated name is not necessarily null-terminated. */
+    host[sizeof host - 1] = '\0';
+    if (strlen(host) + 6 <= 16) {
+        addf("Host: %s\n", host);
+    } else {
+        addf("%s\n", host);
+    }
+    if (pfd->got_reply) {
+        addf("Flows: %"PRIu32, pfd->flow_count);
+    }
+
+    if (!pfd->got_reply) {
+        rconn_run_wait(pfd->rconn);
+        rconn_recv_wait(pfd->rconn);
+    }
+}
+
+/* Registers put_flows() as a P_STATUS message that queries the switch over
+ * 'rconn'.  The request state is reset on every call, so a new request is
+ * issued the next time the message is drawn. */
+static void
+show_flows(struct rconn *rconn)
+{
+    static struct put_flows_data flows_state;
+    static struct message *flows_msg;
+
+    memset(&flows_state, 0, sizeof flows_state);
+    flows_state.rconn = rconn;
+
+    emit_function(&flows_msg, P_STATUS, put_flows, &flows_state);
+}
+
+/* State shared between show_dpid_ip() and its put_dpid_id() callback. */
+struct put_dpid_ip_data {
+ struct rconn *rconn; /* Connection the features request is sent on. */
+ uint32_t xid; /* Transaction ID of the request; 0 means not yet sent. */
+ uint64_t dpid; /* Datapath ID from the reply, in host byte order. */
+ char ip[16]; /* Local IP address string, or "" if none is known. */
+ bool got_reply; /* Set once a valid features reply has been parsed. */
+};
+
+/* Checks whether the OpenFlow message at 'data' is the OFPT_FEATURES_REPLY
+ * that answers the request recorded in 'pdid'.  If so, stores the datapath ID
+ * in 'pdid' and sets 'pdid->got_reply'; otherwise logs a warning and leaves
+ * 'pdid' unchanged. */
+static void
+parse_dp_reply(void *data, struct put_dpid_ip_data *pdid)
+{
+    struct ofp_header *hdr = data;
+
+    if (ntohs(hdr->length) < sizeof(struct ofp_switch_features)) {
+        VLOG_WARN("reply is too short (%"PRIu16")", ntohs(hdr->length));
+    } else if (hdr->xid != pdid->xid) {
+        VLOG_WARN("xid 0x%08"PRIx32" != expected 0x%08"PRIx32,
+                  hdr->xid, pdid->xid);
+    } else if (hdr->type != OFPT_FEATURES_REPLY) {
+        VLOG_WARN("reply is wrong type %"PRIu8, hdr->type);
+    } else {
+        struct ofp_switch_features *features = data;
+
+        pdid->dpid = ntohll(features->datapath_id);
+        pdid->got_reply = true;
+    }
+}
+
+/* Message callback that draws the datapath ID (once known) and local IP.
+ *
+ * 'pdid_' points to a struct put_dpid_ip_data.  While 'pdid->xid' is still 0
+ * an OFPT_FEATURES_REQUEST is sent.  Until a reply has been parsed, each call
+ * drains up to 50 received messages through parse_dp_reply() and, if still
+ * waiting, registers poll-loop wakeups on the connection. */
+static void
+put_dpid_id(void *pdid_)
+{
+    static struct rconn_packet_counter *counter;
+    struct put_dpid_ip_data *pdid = pdid_;
+
+    if (counter == NULL) {
+        counter = rconn_packet_counter_create();
+    }
+
+    if (pdid->xid == 0) {
+        struct ofpbuf *request;
+
+        pdid->xid = random_uint32();
+        /* The request consists of the bare header, so the returned pointer
+         * is not needed. */
+        make_openflow_xid(sizeof(struct ofp_header), OFPT_FEATURES_REQUEST,
+                          pdid->xid, &request);
+        rconn_send_with_limit(pdid->rconn, request, counter, 10);
+    }
+
+    if (!pdid->got_reply) {
+        int n_polled = 0;
+
+        rconn_run(pdid->rconn);
+        while (n_polled++ < 50 && !pdid->got_reply) {
+            struct ofpbuf *reply = rconn_recv(pdid->rconn);
+
+            if (reply == NULL) {
+                break;
+            }
+            parse_dp_reply(reply->data, pdid);
+            ofpbuf_delete(reply);
+        }
+    }
+
+    addf("DP: ");
+    if (pdid->got_reply) {
+        addf("%012"PRIx64, pdid->dpid);
+    }
+    addf("\nIP: %s", pdid->ip);
+
+    if (!pdid->got_reply) {
+        rconn_run_wait(pdid->rconn);
+        rconn_recv_wait(pdid->rconn);
+    }
+}
+
+/* Registers put_dpid_id() as a P_STATUS message showing the datapath ID and
+ * the "in-band.local-ip" value from 'dict'.  Emits nothing when 'dict' has
+ * neither "local.is-connected" nor "in-band.local-ip". */
+static void
+show_dpid_ip(struct rconn *rconn, const struct dict *dict)
+{
+    static struct put_dpid_ip_data state;
+    static struct message *msg;
+    const char *connected;
+    const char *ip;
+
+    dict_lookup(dict, "local.is-connected", &connected);
+    dict_lookup(dict, "in-band.local-ip", &ip);
+
+    /* Without a datapath connection and without a local IP there is nothing
+     * useful to display. */
+    if (connected || ip) {
+        memset(&state, 0, sizeof state);
+        state.rconn = rconn;
+        ovs_strlcpy(state.ip, ip ? ip : "", sizeof state.ip);
+        emit_function(&msg, P_STATUS, put_dpid_id, &state);
+    }
+}
+
+/* Returns the index within 'dict' of the first pair named 'name', or SIZE_MAX
+ * if there is none. */
+static size_t
+dict_find(const struct dict *dict, const char *name)
+{
+    size_t idx = 0;
+
+    while (idx < dict->n) {
+        if (strcmp(dict->pairs[idx].name, name) == 0) {
+            return idx;
+        }
+        idx++;
+    }
+    return SIZE_MAX;
+}
+
+/* Looks up 'name' in 'dict'.  If found, points '*value' at the stored value
+ * (still owned by 'dict') and returns true; otherwise sets '*value' to NULL
+ * and returns false. */
+static bool
+dict_lookup(const struct dict *dict, const char *name, const char **value)
+{
+    size_t idx = dict_find(dict, name);
+    bool found = idx != SIZE_MAX;
+
+    *value = found ? dict->pairs[idx].value : NULL;
+    return found;
+}
+
+/* Returns the value stored under 'name' in 'dict', or NULL if 'name' is not
+ * present. */
+static const char *
+dict_get(const struct dict *dict, const char *name)
+{
+    const char *value = NULL;
+
+    /* dict_lookup() itself stores NULL when the name is missing. */
+    dict_lookup(dict, name, &value);
+    return value;
+}
+
+/* Returns the value of 'name' in 'dict' converted with atoi(), or 'def' if
+ * 'name' is not present. */
+static int
+dict_get_int(const struct dict *dict, const char *name, int def)
+{
+    const char *text;
+
+    if (!dict_lookup(dict, name, &text)) {
+        return def;
+    }
+    return atoi(text);
+}
+
+/* Returns true or false if 'name' in 'dict' is exactly "true" or "false",
+ * respectively.  Returns 'def' if 'name' is absent or has any other value. */
+static bool
+dict_get_bool(const struct dict *dict, const char *name, bool def)
+{
+    const char *text;
+
+    if (!dict_lookup(dict, name, &text)) {
+        return def;
+    }
+    if (strcmp(text, "true") == 0) {
+        return true;
+    }
+    return strcmp(text, "false") == 0 ? false : def;
+}
+
+/* Returns the value stored under 'name' in 'dict', or 'def' if 'name' is not
+ * present. */
+static const char *
+dict_get_string(const struct dict *dict, const char *name, const char *def)
+{
+    const char *found;
+
+    if (dict_lookup(dict, name, &found)) {
+        return found;
+    }
+    return def;
+}
+
+/* Parses the value of 'name' in 'dict' with inet_aton() and returns the
+ * resulting address as stored in 'struct in_addr'.  Returns htonl(0) if
+ * 'name' is absent or its value does not parse. */
+static uint32_t
+dict_get_ip(const struct dict *dict, const char *name)
+{
+    const char *text = dict_get_string(dict, name, "");
+    struct in_addr addr;
+
+    if (!inet_aton(text, &addr)) {
+        return htonl(0);
+    }
+    return addr.s_addr;
+}
+
+/* printf()-style wrapper around curses addstr().  The formatted text is built
+ * in a 128-byte buffer, so longer output is truncated. */
+static void
+addf(const char *format, ...)
+{
+    char text[128];
+    va_list ap;
+
+    va_start(ap, format);
+    vsnprintf(text, sizeof text, format, ap);
+    va_end(ap);
+
+    addstr(text);
+}
+
+static void
+show_secchan_state(const struct dict *dict)
+{
+ static struct message *msg;
+ const char *is_connected;
+
+ if (!dict_lookup(dict, "remote.is-connected", &is_connected)) {
+ /* Secchan not running or not responding. */
+ emit(&msg, P_ERROR, "Switch disabled");
+ }
+}
+
+/* Maps discovery state 'name' to the short label shown on the display, or
+ * "Error" if 'name' is not a known state.  The lookup dict is built on the
+ * first call and kept for the life of the process. */
+static const char *
+discovery_state_label(const char *name)
+{
+    static const char *const labels[][2] = {
+        {"INIT", "Init"},
+        {"INIT_REBOOT", "Init"},
+        {"REBOOTING", "Init"},
+        {"SELECTING", "Searching"},
+        {"REQUESTING", "Requesting"},
+        {"BOUND", "Got"},
+        {"RENEWING", "Renewing"},
+        {"REBINDING", "Rebinding"},
+        {"RELEASED", "Released"},
+    };
+    static struct dict *states;
+
+    if (states == NULL) {
+        size_t i;
+
+        states = xmalloc(sizeof *states);
+        dict_init(states);
+        for (i = 0; i < sizeof labels / sizeof labels[0]; i++) {
+            dict_add(states, labels[i][0], labels[i][1]);
+        }
+    }
+    return dict_get_string(states, name, "Error");
+}
+
+/* Emits a message describing controller discovery, based on the
+ * "discovery.state" and "discovery.ip" keys in 'dict'.  The BOUND state is
+ * reported as a P_STATUS message; every other state as a P_PROGRESS message.
+ * Does nothing if "discovery.state" is absent. */
+static void
+show_discovery_state(const struct dict *dict)
+{
+    static struct message *m_bound, *m_other;
+    const char *state = dict_get_string(dict, "discovery.state", NULL);
+    struct message **slot;
+    enum priority priority;
+    const char *ip;
+    bool bound;
+
+    if (state == NULL) {
+        return;
+    }
+    ip = dict_get_string(dict, "discovery.ip", NULL);
+
+    bound = strcmp(state, "BOUND") == 0;
+    slot = bound ? &m_bound : &m_other;
+    priority = bound ? P_STATUS : P_PROGRESS;
+
+    emit(slot, priority, "Discovery %s\n%s",
+         progress(), discovery_state_label(state));
+    if (ip != NULL) {
+        /* emit() appends, so this extends the second line. */
+        emit(slot, priority, " %s", ip);
+    }
+}
+
+/* Formats the duration 'seconds' into 'buf' (of 'size' bytes) using a single
+ * unit: seconds up to one minute, minutes up to one hour, hours up to two
+ * days, and days beyond that.  Negative durations get a leading "-". */
+static void
+human_time(int seconds, char *buf, size_t size)
+{
+    const int minute = 60;
+    const int hour = 60 * 60;
+    const int two_days = 60 * 60 * 24 * 2;
+    const char *sign = seconds < 0 ? "-" : "";
+    int magnitude;
+
+    if (seconds >= 0) {
+        magnitude = seconds;
+    } else if (seconds == INT_MIN) {
+        /* -INT_MIN is not representable, so saturate. */
+        magnitude = INT_MAX;
+    } else {
+        magnitude = -seconds;
+    }
+
+    if (magnitude > two_days) {
+        snprintf(buf, size, "%s%d days", sign, magnitude / 60 / 60 / 24);
+    } else if (magnitude > hour) {
+        snprintf(buf, size, "%s%d h", sign, magnitude / 60 / 60);
+    } else if (magnitude > minute) {
+        snprintf(buf, size, "%s%d min", sign, magnitude / 60);
+    } else {
+        snprintf(buf, size, "%s%d s", sign, magnitude);
+    }
+}
+
+/* Emits a P_WARNING message while 'dict' reports "fail-open.triggered" as
+ * true.  The first few displays report the trigger duration; after the
+ * message has been shown 5 times it reports how long fail-open has lasted. */
+static void
+show_fail_open_state(const struct dict *dict)
+{
+    static struct message *m;
+    int trigger, current;
+
+    if (!dict_get_bool(dict, "fail-open.triggered", false)) {
+        return;
+    }
+    trigger = dict_get_int(dict, "fail-open.trigger-duration", 0);
+    current = dict_get_int(dict, "fail-open.current-duration", 0);
+
+    if (shown(&m) >= 5) {
+        char elapsed[16];
+
+        human_time(current - trigger, elapsed, sizeof elapsed);
+        emit(&m, P_WARNING, "In fail open for\n%s now %s",
+             elapsed, progress());
+    } else {
+        emit(&m, P_WARNING, "Failed open %s\nafter %d secs",
+             progress(), trigger);
+    }
+}
+
+/* Returns a string of 0 to 3 dots whose length follows the current time in
+ * seconds modulo 4, giving a simple animated progress indicator. */
+static const char *
+progress(void)
+{
+    static const char dots[] = "...";
+    unsigned int phase = (unsigned int) time_now() % 4;
+
+    /* Phase 0 yields "", phase 3 yields "...". */
+    return &dots[3 - phase];
+}
+
+/* Emits messages describing the connection to the remote controller, based
+ * on the "remote.*" keys in 'dict'.  Does nothing if "remote.state" is
+ * absent.
+ *
+ * When connected: reports an idle probe in the IDLE state and, if the "debug"
+ * key is true, the "remote.last-connection" duration and the controller
+ * name.  When not connected: reports the VOID, BACKOFF and CONNECTING states,
+ * including the last connect error and the time left before reconnecting. */
+static void
+show_remote_state(const struct dict *dict)
+{
+    bool debug_mode = dict_get_bool(dict, "debug", false);
+    const char *state, *is_connected;
+
+    state = dict_get_string(dict, "remote.state", NULL);
+    if (!state) {
+        return;
+    }
+    is_connected = dict_get_string(dict, "remote.is-connected", "false");
+    if (!strcmp(is_connected, "true")) {
+        if (debug_mode) {
+            static struct message *m_connected;
+            char buf[16];
+            human_time(dict_get_int(dict, "remote.last-connection", 0),
+                       buf, sizeof buf);
+            emit(&m_connected, P_STATUS,
+                 "Connected for\nlast %s %s", buf, progress());
+        }
+
+        if (!strcmp(state, "IDLE")) {
+            static struct message *m_idle;
+            emit(&m_idle, P_PROGRESS, "Sent idle probe");
+        }
+
+        if (debug_mode) {
+            const char *name = dict_get_string(dict, "remote.name", NULL);
+            if (name) {
+                static struct message *m_name;
+                emit(&m_name, P_STATUS, "Connected to\n%s", name);
+            }
+        }
+    } else {
+        int elapsed, backoff;
+        const char *name, *error;
+
+        elapsed = dict_get_int(dict, "remote.state-elapsed", 0);
+        backoff = dict_get_int(dict, "remote.backoff", 0);
+        name = dict_get_string(dict, "remote.name", "unknown");
+        state = dict_get_string(dict, "remote.state", "VOID");
+        error = dict_get_string(dict, "remote.last-connect-error", NULL);
+        if (!strcmp(state, "VOID")) {
+            static struct message *m;
+            emit(&m, P_PROGRESS, "Controller not\nfound");
+        } else if (!strcmp(state, "BACKOFF")) {
+            static struct message *m[3];
+            char buf[16];
+
+            /* 'buf' must be formatted before the "Last connected" message
+             * below reads it. */
+            human_time(dict_get_int(dict, "remote.last-connection", 0),
+                       buf, sizeof buf);
+
+            if (error) {
+                emit(&m[0], P_PROGRESS, "Connect failed:\n%s", error);
+            }
+            emit(&m[2], P_STATUS, "Last connected\n%s ago", buf);
+            emit(&m[1], P_PROGRESS,
+                 "Disconnected\nReconnect in %d", backoff - elapsed);
+        } else if (!strcmp(state, "CONNECTING")) {
+            static struct message *m;
+            emit(&m, P_PROGRESS, "Connecting %s\n%s", progress(), name);
+        }
+    }
+}
+
+/* Sends an NXT_STATUS_REQUEST over 'rconn' and waits for the matching reply,
+ * whose key-value pairs are added to 'dict' by parse_reply().  Returns as
+ * soon as a valid reply arrives, or once time_msec() reaches 'timeout'.
+ *
+ * Each call uses a new transaction ID: a random starting value that is
+ * incremented per request. */
+static void
+fetch_status(struct rconn *rconn, struct dict *dict, long long timeout)
+{
+    static struct rconn_packet_counter *counter;
+    static uint32_t xid;
+    struct nicira_header *request;
+    struct ofpbuf *request_buf;
+
+    if (counter == NULL) {
+        counter = rconn_packet_counter_create();
+    }
+    if (xid == 0) {
+        xid = random_uint32();
+    }
+
+    request = make_openflow_xid(sizeof *request, OFPT_VENDOR, ++xid,
+                                &request_buf);
+    request->vendor = htonl(NX_VENDOR_ID);
+    request->subtype = htonl(NXT_STATUS_REQUEST);
+
+    /* A send failure is deliberately not acted on: falling into the loop
+     * below still pauses until the timeout. */
+    rconn_send_with_limit(rconn, request_buf, counter, 10);
+
+    for (;;) {
+        int n_polled;
+
+        if (time_msec() >= timeout) {
+            return;
+        }
+
+        rconn_run(rconn);
+        for (n_polled = 0; n_polled < 50; n_polled++) {
+            struct ofpbuf *reply = rconn_recv(rconn);
+            bool done;
+
+            if (reply == NULL) {
+                break;
+            }
+            done = parse_reply(reply->data, dict, xid);
+            ofpbuf_delete(reply);
+            if (done) {
+                return;
+            }
+        }
+
+        rconn_run_wait(rconn);
+        rconn_recv_wait(rconn);
+        poll_timer_wait(timeout - time_msec());
+        poll_block();
+    }
+}
+
+/* Checks whether the OpenFlow message at 'data' is the NXT_STATUS_REPLY with
+ * transaction ID 'xid'.  If so, parses the text after the Nicira header into
+ * 'dict' and returns true.  Otherwise logs a warning and returns false. */
+static bool
+parse_reply(void *data, struct dict *dict, uint32_t xid)
+{
+    struct nicira_header *vendor_msg = data;
+    struct ofp_header *hdr = data;
+
+    if (ntohs(hdr->length) < sizeof *vendor_msg) {
+        VLOG_WARN("reply is too short (%"PRIu16")", ntohs(hdr->length));
+    } else if (hdr->xid != xid) {
+        VLOG_WARN("xid 0x%08"PRIx32" != expected 0x%08"PRIx32, hdr->xid, xid);
+    } else if (hdr->type != OFPT_VENDOR) {
+        VLOG_WARN("reply is wrong type %"PRIu8, hdr->type);
+    } else if (vendor_msg->vendor != htonl(NX_VENDOR_ID)) {
+        VLOG_WARN("reply has wrong vendor ID %08"PRIx32, vendor_msg->vendor);
+    } else if (vendor_msg->subtype != htonl(NXT_STATUS_REPLY)) {
+        VLOG_WARN("reply has wrong subtype %08"PRIx32, vendor_msg->subtype);
+    } else {
+        /* The payload is everything that follows the Nicira header. */
+        dict_parse(dict, (const char *) (vendor_msg + 1),
+                   ntohs(hdr->length) - sizeof *vendor_msg);
+        return true;
+    }
+    return false;
+}
+
+/* Parses the 'nbytes' bytes at 'data' as "name=value" lines separated by
+ * new-lines and adds each pair to 'dict'.  Parsing stops at the first name
+ * that has no value.  'data' need not be null-terminated. */
+static void
+dict_parse(struct dict *dict, const char *data, size_t nbytes)
+{
+    char *save_ptr = NULL;
+    char *copy = xmemdup0(data, nbytes);
+    char *name = strtok_r(copy, "=", &save_ptr);
+
+    while (name != NULL) {
+        char *value = strtok_r(NULL, "\n", &save_ptr);
+
+        if (value == NULL) {
+            break;
+        }
+        dict_add(dict, name, value);
+        name = strtok_r(NULL, "=", &save_ptr);
+    }
+    free(copy);
+}
+
+/* Initializes 'dict' as an empty dictionary with room for 16 pairs. */
+static void
+dict_init(struct dict *dict)
+{
+    dict->max = 16;
+    dict->pairs = xmalloc(dict->max * sizeof *dict->pairs);
+    dict->n = 0;
+}
+
+/* Appends a pair to 'dict' holding private copies of 'name' and 'value'. */
+static void
+dict_add(struct dict *dict, const char *name, const char *value)
+{
+    char *name_copy = xstrdup(name);
+    char *value_copy = xstrdup(value);
+
+    dict_add_nocopy(dict, name_copy, value_copy);
+}
+
+/* Appends the pair ('name', 'value') to 'dict', storing the pointers
+ * themselves rather than copies.  dict_delete() and dict_free() later pass
+ * them to free().  The pair array doubles in size when it is full. */
+static void
+dict_add_nocopy(struct dict *dict, char *name, char *value)
+{
+    struct pair *slot;
+
+    if (dict->n >= dict->max) {
+        dict->max *= 2;
+        dict->pairs = xrealloc(dict->pairs, sizeof *dict->pairs * dict->max);
+    }
+
+    slot = &dict->pairs[dict->n];
+    slot->name = name;
+    slot->value = value;
+    dict->n++;
+}
+
+/* Removes every pair named 'name' from 'dict', freeing its strings.  Each
+ * vacated slot is refilled with the last pair, so the order of the remaining
+ * pairs is not preserved. */
+static void
+dict_delete(struct dict *dict, const char *name)
+{
+    for (;;) {
+        size_t idx = dict_find(dict, name);
+
+        if (idx == SIZE_MAX) {
+            return;
+        }
+        free(dict->pairs[idx].name);
+        free(dict->pairs[idx].value);
+        dict->n--;
+        dict->pairs[idx] = dict->pairs[dict->n];
+    }
+}
+
+/* Frees all the names and values in 'dict' and its pair array.  'dict' itself
+ * is not freed.  Does nothing if 'dict' is null. */
+static void
+dict_free(struct dict *dict)
+{
+    size_t i = 0;
+
+    if (dict == NULL) {
+        return;
+    }
+    while (i < dict->n) {
+        struct pair *p = &dict->pairs[i++];
+
+        free(p->name);
+        free(p->value);
+    }
+    free(dict->pairs);
+}
+
+/* Starts curses and configures the screen for the UI.  Also called by
+ * cmd_shell() to re-enter curses after the subshell exits. */
+static void
+initialize_terminal(void)
+{
+ initscr();
+ /* Deliver keystrokes immediately and do not echo them. */
+ cbreak();
+ noecho();
+ nonl();
+ intrflush(stdscr, FALSE);
+ /* Enable keypad translation; the menus test for KEY_UP, KEY_DOWN, etc. */
+ keypad(stdscr, TRUE);
+ /* Make getch() return ERR when no key is pending; the input loops rely on
+ * this to drain the keyboard without blocking. */
+ nodelay(stdscr, TRUE);
+ typeahead(-1);
+ scrollok(stdscr, TRUE);
+}
+
+/* Leaves curses mode by calling endwin().  'aux' is unused; the void-pointer
+ * parameter makes the function usable as a callback. */
+static void
+restore_terminal(void *aux UNUSED)
+{
+    (void) endwin();
+}
+
+/* One sample of the total transmit byte counter. */
+struct byte_count {
+ long long int when; /* time_msec() value when the sample was taken. */
+ uint64_t tx_bytes; /* Sum of port tx_bytes; UINT64_MAX if unavailable. */
+};
+
+/* State shared between show_data_rates() and do_show_data_rates(). */
+struct show_rates_data {
+ struct rconn *rconn; /* Connection the port stats request is sent on. */
+ uint32_t xid; /* Transaction ID of the request; 0 means not yet sent. */
+ struct byte_count prev, now; /* Previous and most recent samples. */
+ bool got_reply; /* Set once a valid port stats reply was parsed. */
+};
+
+/* Checks whether the OpenFlow message at 'data' is the OFPST_PORT stats reply
+ * that answers the request recorded in 'rates'.  If so, moves the current
+ * sample to 'rates->prev' and stores a new sample in 'rates->now' holding the
+ * sum of tx_bytes over all ports that report it, or UINT64_MAX if none do.
+ * Otherwise logs a warning and leaves 'rates' unchanged. */
+static void
+parse_port_reply(void *data, struct show_rates_data *rates)
+{
+    struct ofp_header *hdr = data;
+    struct ofp_stats_reply *stats = data;
+    struct ofp_port_stats *port;
+    size_t n_ports;
+    uint64_t total;
+
+    if (ntohs(hdr->length) < sizeof *stats) {
+        VLOG_WARN("reply is too short (%"PRIu16")", ntohs(hdr->length));
+        return;
+    } else if (hdr->xid != rates->xid) {
+        VLOG_WARN("xid 0x%08"PRIx32" != expected 0x%08"PRIx32,
+                  hdr->xid, rates->xid);
+        return;
+    } else if (hdr->type != OFPT_STATS_REPLY) {
+        VLOG_WARN("reply is wrong type %"PRIu8, hdr->type);
+        return;
+    } else if (stats->type != htons(OFPST_PORT)) {
+        VLOG_WARN("reply has wrong stat type ID %08"PRIx16, stats->type);
+        return;
+    }
+
+    n_ports = ((ntohs(hdr->length) - offsetof(struct ofp_stats_reply, body))
+               / sizeof *port);
+
+    /* UINT64_MAX doubles as "no port reported a counter yet". */
+    total = UINT64_MAX;
+    for (port = (struct ofp_port_stats *) stats->body; n_ports > 0;
+         n_ports--, port++) {
+        if (port->tx_bytes == htonll(UINT64_MAX)) {
+            continue;
+        }
+        if (total == UINT64_MAX) {
+            total = 0;
+        }
+        total += ntohll(port->tx_bytes);
+    }
+
+    rates->prev = rates->now;
+    rates->now.when = time_msec();
+    rates->now.tx_bytes = total;
+    rates->got_reply = true;
+}
+
+/* Draws the 80-pixel-wide bitmap 'graph' as 16 character cells of 5 pixel
+ * columns each.  An all-clear cell is drawn as a space.  Every other distinct
+ * pattern is assigned a custom icon, starting at icon 3 and sharing icons
+ * between equal patterns.  Once icons 3 through 7 are taken, further new
+ * patterns are drawn as 'X'. */
+static void
+dump_graph(const bool graph[80])
+{
+    signed char icon_for_row[32];
+    int next_icon = 3;
+    int cell;
+
+    /* -1 marks a pattern that has no icon assigned yet. */
+    memset(icon_for_row, -1, sizeof icon_for_row);
+
+    for (cell = 0; cell < 16; cell++) {
+        const bool *pixels = &graph[cell * 5];
+        uint8_t row = 0;
+        int bit;
+
+        for (bit = 0; bit < 5; bit++) {
+            row = (row << 1) | pixels[bit];
+        }
+
+        if (row == 0) {
+            addch(' ');
+        } else if (icon_for_row[row] >= 0 || next_icon < 8) {
+            if (icon_for_row[row] < 0) {
+                set_repeated_icon(next_icon, row);
+                icon_for_row[row] = next_icon++;
+            }
+            put_icon(icon_for_row[row], row == 0x1f ? '#' : ' ');
+        } else {
+            /* No icon slot left for a new pattern. */
+            addch('X');
+        }
+    }
+}
+
+/* Message callback that draws the transmit data rate as a logarithmic bar.
+ *
+ * 'rates_' points to a struct show_rates_data.  While 'rates->xid' is still 0
+ * an OFPST_PORT stats request is sent.  Until a reply has been parsed, each
+ * call drains up to 50 received messages through parse_port_reply() and, if
+ * still waiting, registers poll-loop wakeups on the connection. */
+static void
+do_show_data_rates(void *rates_)
+{
+ struct show_rates_data *rates = rates_;
+ static struct rconn_packet_counter *counter;
+ bool graph[80];
+
+ if (!counter) {
+ counter = rconn_packet_counter_create();
+ }
+ if (!rates->xid) {
+ struct ofp_stats_request *rq;
+ struct ofpbuf *b;
+
+ rates->xid = random_uint32();
+ rq = make_openflow_xid(sizeof *rq, OFPT_STATS_REQUEST,
+ rates->xid, &b);
+ rq->type = htons(OFPST_PORT);
+ rq->flags = htons(0);
+ rconn_send_with_limit(rates->rconn, b, counter, 10);
+ }
+
+ if (!rates->got_reply) {
+ int i;
+
+ rconn_run(rates->rconn);
+ for (i = 0; i < 50; i++) {
+ struct ofpbuf *b;
+
+ b = rconn_recv(rates->rconn);
+ if (!b) {
+ break;
+ }
+
+ parse_port_reply(b->data, rates);
+ ofpbuf_delete(b);
+ if (rates->got_reply) {
+ break;
+ }
+ }
+ }
+
+ /* Icons 0, 1 and 2 are the scale labels; their plain-text fallbacks below
+ * are 'k', 'M' and 'G'. */
+ set_icon(0,
+ e_____,
+ e_____,
+ e_____,
+ e__X__,
+ e__X__,
+ e__X_X,
+ e__XX_,
+ e__X_X);
+ set_icon(1,
+ e_____,
+ e_____,
+ e_____,
+ eX___X,
+ eXX_XX,
+ eX_X_X,
+ eX___X,
+ eX___X);
+ set_icon(2,
+ e_____,
+ e_____,
+ e_____,
+ e_XXX_,
+ eX____,
+ eX_XXX,
+ eX___X,
+ e_XXX_);
+
+ /* Tick marks at pixels 24, 48 and 72.  At 8 pixels per decade (see the
+ * 'pixels' computation below) these correspond to 10**3, 10**6 and 10**9
+ * bits per second. */
+ memset(graph, 0, sizeof graph);
+ graph[24] = 1;
+ graph[48] = 1;
+ graph[72] = 1;
+
+ addstr("TX: ");
+ put_icon(0, 'k');
+ addstr(" ");
+ put_icon(1, 'M');
+ addstr(" ");
+ put_icon(2, 'G');
+ addch('\n');
+
+ /* Only draw a bar when both samples are valid, they are more than 500 ms
+ * apart, and the newer one is less than 2 seconds old. */
+ if (rates->now.tx_bytes != UINT64_MAX
+ && rates->prev.tx_bytes != UINT64_MAX
+ && rates->now.when - rates->prev.when > 500
+ && time_msec() - rates->now.when < 2000)
+ {
+ uint64_t bits = (rates->now.tx_bytes - rates->prev.tx_bytes) * 8;
+ uint64_t msecs = rates->now.when - rates->prev.when;
+ double bps = (double) bits * 1000.0 / msecs;
+ /* log10(bps) scaled to 8 pixels per decade, rounded to nearest. */
+ int pixels = bps > 0 ? log(bps) / log(10.0) * 8 + .5 : 0;
+ if (pixels < 0) {
+ pixels = 0;
+ } else if (pixels > 80) {
+ pixels = 80;
+ }
+ /* NOTE(review): uses 'pixels' as a byte count, which assumes that
+ * sizeof(bool) is 1 -- confirm for the target platforms. */
+ memset(graph, 1, pixels);
+ }
+
+ dump_graph(graph);
+
+ if (!rates->got_reply) {
+ rconn_run_wait(rates->rconn);
+ rconn_recv_wait(rates->rconn);
+ }
+}
+
+/* Registers do_show_data_rates() as a P_STATUS message that queries port
+ * statistics over 'rconn'.  Emits nothing when 'dict' has neither
+ * "local.is-connected" nor "in-band.local-ip".
+ *
+ * The byte-count samples persist across calls so that a rate can be computed
+ * from two successive replies; only the request state is reset each time. */
+static void
+show_data_rates(struct rconn *rconn, const struct dict *dict)
+{
+    static struct show_rates_data rates;
+    static struct message *msg;
+    static bool inited = false;
+    const char *connected;
+    const char *ip;
+
+    dict_lookup(dict, "local.is-connected", &connected);
+    dict_lookup(dict, "in-band.local-ip", &ip);
+    if (connected == NULL && ip == NULL) {
+        /* Neither connected to the datapath nor configured with a local IP:
+         * nothing useful could be shown. */
+        return;
+    }
+
+    if (!inited) {
+        /* Mark both samples as "no data" before the first reply. */
+        rates.prev.tx_bytes = UINT64_MAX;
+        rates.now.tx_bytes = UINT64_MAX;
+        inited = true;
+    }
+    rates.got_reply = false;
+    rates.xid = 0;
+    rates.rconn = rconn;
+
+    emit_function(&msg, P_STATUS, do_show_data_rates, &rates);
+}
+
+/* A message that competes for space on the display.  Its content is either
+ * 'string' or, when 'string' is empty, the output of 'function' (see
+ * put_message()). */
+struct message {
+ /* Content. */
+ void (*function)(void *aux); /* Draws the message, if nonnull. */
+ void *aux; /* Argument passed to 'function'. */
+ char string[128]; /* Text built up by emit(). */
+
+ size_t index; /* Position of this message in 'messages'. */
+ enum priority priority; /* Base score used by best_message(). */
+ int age; /* Maintained by age_messages(). */
+ int shown; /* Maintained by message_shown() and age_messages(). */
+};
+
+/* Every message ever allocated, in allocation order.  'n_messages' entries
+ * are in use out of 'allocated_messages' available. */
+static struct message **messages;
+static size_t n_messages, allocated_messages;
+
+/* Returns the message that '*msgp' points to.  If '*msgp' is null, first
+ * creates a zeroed message, appends it to the global 'messages' array
+ * (growing the array as needed) and stores it in '*msgp'. */
+static struct message *
+allocate_message(struct message **msgp)
+{
+    struct message *msg = *msgp;
+
+    if (msg != NULL) {
+        return msg;
+    }
+
+    msg = xcalloc(1, sizeof *msg);
+    msg->index = n_messages;
+
+    if (n_messages >= allocated_messages) {
+        allocated_messages = 2 * allocated_messages + 1;
+        messages = xrealloc(messages, sizeof *messages * allocated_messages);
+    }
+    messages[n_messages++] = msg;
+
+    *msgp = msg;
+    return msg;
+}
+
+/* Sets the priority of message '*msgp' (allocating it if necessary) to
+ * 'priority' and appends printf()-style formatted text to its string.
+ * Because the text is appended, several calls build up one message.  Text
+ * beyond the capacity of the message string is truncated. */
+static void
+emit(struct message **msgp, enum priority priority, const char *format, ...)
+{
+    struct message *msg = allocate_message(msgp);
+    size_t used = strlen(msg->string);
+    va_list ap;
+
+    msg->priority = priority;
+
+    va_start(ap, format);
+    vsnprintf(msg->string + used, sizeof msg->string - used, format, ap);
+    va_end(ap);
+}
+
+/* Makes message '*msgp' (allocating it if necessary) a callback message with
+ * the given 'priority': when displayed, 'function' is called with 'aux' to
+ * draw its content. */
+static void
+emit_function(struct message **msgp, enum priority priority,
+              void (*function)(void *aux), void *aux)
+{
+    struct message *msg = allocate_message(msgp);
+
+    msg->function = function;
+    msg->aux = aux;
+    msg->priority = priority;
+}
+
+/* Returns the 'shown' counter of message '*msgp', allocating the message
+ * first if it does not exist yet. */
+static int
+shown(struct message **msgp)
+{
+    return allocate_message(msgp)->shown;
+}
+
+/* Empties the content (string and callback) of every message.  The messages
+ * stay allocated and keep their age and shown counters. */
+static void
+clear_messages(void)
+{
+    size_t i = n_messages;
+
+    while (i-- > 0) {
+        messages[i]->function = NULL;
+        messages[i]->string[0] = '\0';
+    }
+}
+
+/* Returns the nonempty message with the highest score, or NULL if every
+ * message is empty.  The score is the message's priority, plus its age if it
+ * has never been shown, or minus its shown count otherwise.  On a tie the
+ * message allocated earliest wins. */
+static struct message *
+best_message(void)
+{
+    struct message *winner = NULL;
+    int winner_score = INT_MIN;
+    size_t i;
+
+    for (i = 0; i < n_messages; i++) {
+        struct message *candidate = messages[i];
+
+        if (!empty_message(candidate)) {
+            int score = candidate->priority;
+
+            score += candidate->shown ? -candidate->shown : candidate->age;
+            if (score > winner_score) {
+                winner_score = score;
+                winner = candidate;
+            }
+        }
+    }
+    return winner;
+}
+
+/* Records that 'msg' was displayed by incrementing its shown counter, which
+ * wraps back to 0 once it has exceeded 3600.  Does nothing if 'msg' is
+ * null. */
+static void
+message_shown(struct message *msg)
+{
+    if (msg == NULL) {
+        return;
+    }
+    if (msg->shown > 3600) {
+        msg->shown = 0;
+    } else {
+        msg->shown++;
+    }
+}
+
+/* Returns true if 'msg' is null or has neither text nor a callback, that is,
+ * if there is nothing to display for it. */
+static bool
+empty_message(const struct message *msg)
+{
+    if (msg == NULL) {
+        return true;
+    }
+    return msg->string[0] == '\0' && msg->function == NULL;
+}
+
+/* Returns the message at 'index' in 'messages', wrapping at both ends:
+ * 'n_messages' maps to the first message and SIZE_MAX (that is, index 0
+ * minus 1) maps to the last one. */
+static struct message *get_message(size_t index)
+{
+    assert(index <= n_messages || index == SIZE_MAX);
+
+    if (index < n_messages) {
+        return messages[index];
+    }
+    if (index == SIZE_MAX) {
+        return messages[n_messages - 1];
+    }
+    return messages[0];
+}
+
+/* Returns the first nonempty message after 'msg' in allocation order,
+ * wrapping around at the end.  Returns 'msg' itself if every other message is
+ * empty. */
+static struct message *
+next_message(struct message *msg)
+{
+    struct message *p = get_message(msg->index + 1);
+
+    while (p != msg && empty_message(p)) {
+        p = get_message(p->index + 1);
+    }
+    return p;
+}
+
+/* Returns the first nonempty message before 'msg' in allocation order,
+ * wrapping around at the beginning.  Returns 'msg' itself if every other
+ * message is empty. */
+static struct message *
+prev_message(struct message *msg)
+{
+    struct message *p = get_message(msg->index - 1);
+
+    while (p != msg && empty_message(p)) {
+        p = get_message(p->index - 1);
+    }
+    return p;
+}
+
+/* Draws message 'm' at the cursor: its text if it has any, otherwise the
+ * output of its callback, if it has one. */
+static void
+put_message(const struct message *m)
+{
+    if (m->string[0] != '\0') {
+        addstr(m->string);
+        return;
+    }
+    if (m->function != NULL) {
+        (*m->function)(m->aux);
+    }
+}
+
+/* Advances the age of every nonempty message by one step and resets the age
+ * and shown counters of empty ones.
+ *
+ * Whenever a message's age is a nonzero multiple of 60, its shown counter is
+ * reduced (not below 0) by an amount that shrinks as more messages are
+ * active.  Ages wrap back to 0 once they have exceeded 3600. */
+static void
+age_messages(void)
+{
+    int load = 0;
+    int decay;
+    size_t i;
+
+    /* 'load' is the number of messages that currently have content. */
+    for (i = 0; i < n_messages; i++) {
+        if (!empty_message(messages[i])) {
+            load++;
+        }
+    }
+    decay = MAX(0, 5 - (load + 6) / 12);
+
+    for (i = 0; i < n_messages; i++) {
+        struct message *msg = messages[i];
+
+        if (empty_message(msg)) {
+            msg->age = msg->shown = 0;
+            continue;
+        }
+
+        if (msg->age && msg->age % 60 == 0) {
+            msg->shown -= decay;
+            if (msg->shown < 0) {
+                msg->shown = 0;
+            }
+        }
+        if (msg->age++ > 3600) {
+            msg->age = 0;
+        }
+    }
+}
+
+/* Set by SIGUSR1 handler.  Read and cleared by show_reboot_state(). */
+static volatile sig_atomic_t sigusr1_triggered;
+
+/* The time after which we stop indicating that the switch is rebooting.
+ * (This is just in case the reboot fails.) */
+static time_t reboot_deadline = TIME_MIN;
+
+/* Declared ahead of init_reboot_notifier(), which installs it. */
+static void sigusr1_handler(int);
+
+/* Installs sigusr1_handler() as the SIGUSR1 handler, so that the signal makes
+ * show_reboot_state() display a reboot notice. */
+static void
+init_reboot_notifier(void)
+{
+    void (*handler)(int) = sigusr1_handler;
+
+    signal(SIGUSR1, handler);
+}
+
+/* SIGUSR1 handler.  Only raises a flag; show_reboot_state() acts on it. */
+static void
+sigusr1_handler(int signr UNUSED)
+{
+    sigusr1_triggered = 1;
+}
+
+/* Emits a P_FATAL "Rebooting" message and returns true while a reboot notice
+ * is in effect, which lasts for 30 seconds after the most recently noticed
+ * SIGUSR1.  Otherwise returns false. */
+static bool
+show_reboot_state(void)
+{
+    static struct message *msg;
+    bool rebooting;
+
+    if (sigusr1_triggered) {
+        reboot_deadline = time_now() + 30;
+        sigusr1_triggered = false;
+    }
+
+    rebooting = time_now() < reboot_deadline;
+    if (rebooting) {
+        emit(&msg, P_FATAL, "Rebooting");
+    }
+    return rebooting;
+}
+
+/* One entry in a struct menu. */
+struct menu_item {
+ char *text; /* Label; allocated by menu_add_item(), freed by menu_free(). */
+ void (*f)(const struct dict *); /* Action run by menu(), or NULL. */
+ int id; /* Initialized to -1 by menu_add_item(). */
+ bool enabled; /* If false, menu_show() refuses to select the item. */
+ int toggle; /* -1 if not a toggle, otherwise its 0 or 1 state. */
+};
+
+/* A growable list of menu items. */
+struct menu {
+ struct menu_item **items; /* Array of individually allocated items. */
+ size_t n_items, allocated_items; /* Entries in use and array capacity. */
+};
+
+/* Menu construction and display. */
+static void menu_init(struct menu *);
+static void menu_free(struct menu *);
+static struct menu_item *menu_add_item(struct menu *, const char *text, ...)
+ PRINTF_FORMAT(2, 3);
+static int menu_show(const struct menu *, int start, bool select);
+
+/* Actions attached to the main menu's items by menu(). */
+static void cmd_shell(const struct dict *);
+static void cmd_show_version(const struct dict *);
+static void cmd_configure(const struct dict *);
+static void cmd_setup_pki(const struct dict *);
+static void cmd_browse_status(const struct dict *);
+static void cmd_show_motto(const struct dict *);
+
+/* Initializes 'menu' as an empty menu by zeroing it. */
+static void
+menu_init(struct menu *menu)
+{
+    memset(menu, 0, sizeof(struct menu));
+}
+
+/* Frees every item in 'menu', along with the item's text, and the item
+ * array.  'menu' itself is not freed. */
+static void
+menu_free(struct menu *menu)
+{
+    size_t i = 0;
+
+    while (i < menu->n_items) {
+        struct menu_item *item = menu->items[i++];
+
+        free(item->text);
+        free(item);
+    }
+    free(menu->items);
+}
+
+/* Appends a new item to 'menu' whose label is formatted printf()-style from
+ * 'text' and the following arguments.  The item starts out enabled, with no
+ * action, an id of -1, and not a toggle.  Returns the item so that the
+ * caller can adjust it. */
+static struct menu_item *
+menu_add_item(struct menu *menu, const char *text, ...)
+{
+    struct menu_item *item = xmalloc(sizeof *item);
+    va_list ap;
+
+    va_start(ap, text);
+    item->text = xvasprintf(text, ap);
+    va_end(ap);
+    item->f = NULL;
+    item->id = -1;
+    item->enabled = true;
+    item->toggle = -1;
+
+    if (menu->n_items >= menu->allocated_items) {
+        menu->allocated_items = 2 * menu->allocated_items + 1;
+        menu->items = xrealloc(menu->items,
+                               sizeof *menu->items * menu->allocated_items);
+    }
+    menu->items[menu->n_items++] = item;
+    return item;
+}
+
+/* Shows the main menu and runs the action of the chosen item, passing it
+ * 'dict'.  The "Browse Status", "Shell" and "Show Motto" items are offered
+ * only when the "debug" key in 'dict' is true.  "Exit" has no action. */
+static void
+menu(const struct dict *dict)
+{
+    struct menu main_menu;
+    int choice;
+
+    menu_init(&main_menu);
+    menu_add_item(&main_menu, "Exit");
+    menu_add_item(&main_menu, "Show Version")->f = cmd_show_version;
+    menu_add_item(&main_menu, "Configure")->f = cmd_configure;
+    menu_add_item(&main_menu, "Setup PKI")->f = cmd_setup_pki;
+    if (dict_get_bool(dict, "debug", false)) {
+        menu_add_item(&main_menu, "Browse Status")->f = cmd_browse_status;
+        menu_add_item(&main_menu, "Shell")->f = cmd_shell;
+        menu_add_item(&main_menu, "Show Motto")->f = cmd_show_motto;
+    }
+
+    choice = menu_show(&main_menu, 0, true);
+    if (choice >= 0 && main_menu.items[choice]->f != NULL) {
+        main_menu.items[choice]->f(dict);
+    }
+
+    menu_free(&main_menu);
+}
+
+/* Displays 'menu' two items at a time, initially scrolled to item 'start',
+ * and processes keystrokes until the user accepts or cancels.
+ *
+ * If 'select' is true, a selection marker is drawn, the up and down keys move
+ * the selection, and Enter on a disabled item or a toggle item does not
+ * return (a toggle item flips its state instead).  If 'select' is false, the
+ * up and down keys only scroll.
+ *
+ * Returns the current selection when Enter is accepted (-1 for an empty
+ * menu), or -1 if the user presses backspace, delete or escape. */
+static int
+menu_show(const struct menu *menu, int start, bool select)
+{
+ /* Time at which an overscrolled view snaps back; see below. */
+ long long int adjust = LLONG_MAX;
+ /* NOTE(review): 'n_items' is a size_t, so 'menu->n_items - 2' is computed
+ * unsigned and wraps around when there are fewer than 2 items -- confirm
+ * that the resulting 'max' is what is intended in that case. */
+ int min = 0, max = MAX(menu->n_items - 2, 0);
+ int pos, selection;
+ /* Icon 0: selection marker.  Icons 1 and 2: toggle off and toggle on. */
+ set_icon(0,
+ eXX___,
+ eXXX__,
+ eXXXX_,
+ eXXXXX,
+ eXXXX_,
+ eXXX__,
+ eXX___,
+ e_____);
+ set_icon(1,
+ eXXXXX,
+ eX___X,
+ eX___X,
+ eX___X,
+ eX___X,
+ eX___X,
+ eXXXXX,
+ e_____);
+ set_icon(2,
+ eXXXXX,
+ eX___X,
+ eXX_XX,
+ eX_X_X,
+ eXX_XX,
+ eX___X,
+ eXXXXX,
+ e_____);
+ if (menu->n_items) {
+ pos = MIN(menu->n_items - 1, MAX(0, start));
+ selection = pos;
+ } else {
+ pos = 0;
+ selection = -1;
+ }
+ for (;;) {
+ int key;
+
+ /* Process every pending keystroke before redrawing. */
+ while ((key = getch()) != ERR) {
+ switch (key) {
+ case KEY_UP:
+ if (select && selection > 0) {
+ selection--;
+ if (selection >= pos) {
+ /* Selection is still visible: no need to scroll. */
+ break;
+ }
+ }
+ /* 'pos' may go one step below 'min', which displays "[Top]". */
+ if (pos >= min) {
+ pos--;
+ }
+ break;
+
+ case KEY_DOWN:
+ if (select && selection < menu->n_items - 1) {
+ selection++;
+ if (selection <= pos + 1) {
+ /* Selection is still visible: no need to scroll. */
+ break;
+ }
+ }
+ /* 'pos' may go one step above 'max', which displays "[Bottom]". */
+ if (pos <= max) {
+ pos++;
+ }
+ break;
+
+ case '\r': case '\n':
+ if (select && selection >= 0 && selection < menu->n_items) {
+ struct menu_item *item = menu->items[selection];
+ if (!item->enabled) {
+ show_string("Item disabled");
+ break;
+ } else if (item->toggle >= 0) {
+ item->toggle = !item->toggle;
+ break;
+ }
+ }
+ return selection;
+
+ case '\b': case '\x7f': case '\x1b':
+ case KEY_BACKSPACE: case KEY_DC:
+ return -1;
+ }
+ /* Restart the one-second snap-back timer on every keystroke. */
+ adjust = time_msec() + 1000;
+ }
+ /* Once the timer expires, pull an overscrolled view back in range. */
+ if (time_msec() >= adjust && menu->n_items > 1) {
+ if (pos < min) {
+ pos = min;
+ } else if (pos > max) {
+ pos = max;
+ }
+ }
+
+ erase();
+ curs_set(0);
+ move(0, 0);
+ if (!menu->n_items) {
+ addstr("[Empty]");
+ } else {
+ int idx;
+ /* Draw the two visible rows, 'pos' and 'pos' + 1. */
+ for (idx = pos; idx < pos + 2; idx++) {
+ /* Character columns still available for the item text. */
+ size_t width = 40;
+
+ if (select) {
+ width--;
+ if (selection == idx) {
+ put_icon(0, '>');
+ } else {
+ addch(' ');
+ }
+ }
+
+ if (idx < 0) {
+ addstr("[Top]");
+ } else if (idx >= menu->n_items) {
+ addstr("[Bottom]");
+ } else {
+ const struct menu_item *item = menu->items[idx];
+ size_t length = strlen(item->text);
+ /* Disabled items are shown in parentheses. */
+ if (!item->enabled) {
+ width -= 2;
+ addch('(');
+ }
+ if (item->toggle >= 0) {
+ if (have_icons()) {
+ addch(icon_char(item->toggle ? 2 : 1, 0));
+ width--;
+ } else {
+ addstr(item->toggle ? "[X]" : "[ ]");
+ width -= 3;
+ }
+ }
+ addnstr(item->text, MIN(width, length));
+ if (!item->enabled) {
+ addch(')');
+ }
+ }
+ if (idx == pos) {
+ addch('\n');
+ }
+ }
+ }
+ refresh();
+
+ /* Wake up for the snap-back only while the view is overscrolled. */
+ if (pos < min || pos > max) {
+ poll_timer_wait(adjust - time_msec());
+ }
+ poll_fd_wait(STDIN_FILENO, POLLIN);
+ poll_block();
+ }
+}
+
+/* Displays 'menu' one item at a time, starting at item 'start', and
+ * processes keystrokes until the user accepts or cancels.  An item's text
+ * may contain a new-line, in which case its first two lines fill the two
+ * display rows.  Arrow icons in the first column show whether there are
+ * items above or below the current one.
+ *
+ * If 'select' is true, Enter on a disabled item shows "Item disabled"
+ * instead of returning.
+ *
+ * Returns the index of the current item when Enter is accepted (-1 if the
+ * menu is empty), or -1 if the user presses backspace, delete or escape. */
+static int
+menu_show2(const struct menu *menu, int start, bool select)
+{
+    int pos;
+    if (menu->n_items) {
+        pos = MIN(menu->n_items - 1, MAX(0, start));
+    } else {
+        pos = -1;
+    }
+    /* Icon 0: up arrow.  Icon 1: down arrow. */
+    set_icon(0,
+             e__X__,
+             e_XXX_,
+             eXXXXX,
+             e__X__,
+             e__X__,
+             e__X__,
+             e__X__,
+             e__X__);
+    set_icon(1,
+             e__X__,
+             e__X__,
+             e__X__,
+             e__X__,
+             e__X__,
+             eXXXXX,
+             e_XXX_,
+             e__X__);
+    for (;;) {
+        int key;
+
+        /* Process every pending keystroke before redrawing. */
+        while ((key = getch()) != ERR) {
+            switch (key) {
+            case KEY_UP:
+                if (pos > 0) {
+                    pos--;
+                }
+                break;
+
+            case KEY_DOWN:
+                if (menu->n_items > 0 && pos < menu->n_items - 1) {
+                    pos++;
+                }
+                break;
+
+            case '\r': case '\n':
+                /* 'pos' is -1 for an empty menu, so it must be checked
+                 * before it is used to index 'menu->items'. */
+                if (select && pos >= 0 && !menu->items[pos]->enabled) {
+                    show_string("Item disabled");
+                    break;
+                }
+                return pos;
+
+            case '\b': case '\x7f': case '\x1b':
+            case KEY_BACKSPACE: case KEY_DC:
+                return -1;
+            }
+        }
+
+        erase();
+        curs_set(0);
+        move(0, 0);
+        if (pos == -1) {
+            addstr("[Empty]");
+        } else {
+            const struct menu_item *item = menu->items[pos];
+            const char *line1 = item->text;
+            size_t len1 = strcspn(line1, "\n");
+            const char *line2 = line1[len1] ? &line1[len1 + 1] : "";
+            size_t len2 = strcspn(line2, "\n");
+            /* One column goes to the arrow, and two more to the parentheses
+             * around a disabled item. */
+            size_t width = 39 - 2 * !item->enabled;
+
+            /* First line. */
+            addch(pos > 0 ? icon_char(0, '^') : ' ');
+            if (!item->enabled && len1) {
+                addch('(');
+            }
+            addnstr(line1, MIN(len1, width));
+            if (!item->enabled && len1) {
+                addch(')');
+            }
+            addch('\n');
+
+            /* Second line. */
+            addch(pos < menu->n_items - 1 ? icon_char(1, 'V') : ' ');
+            if (!item->enabled && len2) {
+                addch('(');
+            }
+            addnstr(line2, MIN(len2, width));
+            if (!item->enabled && len2) {
+                addch(')');
+            }
+        }
+        refresh();
+
+        poll_fd_wait(STDIN_FILENO, POLLIN);
+        poll_block();
+    }
+}
+
+/* Shows 'title' next to a Yes/No choice, with 'def' initially selected, and
+ * returns the user's answer once Enter is pressed.  Any arrow key flips the
+ * choice; 'y' and 'n' (in either case) select an answer directly. */
+static bool
+yesno(const char *title, bool def)
+{
+    bool answer = def;
+
+    /* Icon 0: selection marker. */
+    set_icon(0,
+             eXX___,
+             eXXX__,
+             eXXXX_,
+             eXXXXX,
+             eXXXX_,
+             eXXX__,
+             eXX___,
+             e_____);
+
+    for (;;) {
+        int key = getch();
+
+        /* Process every pending keystroke before redrawing. */
+        while (key != ERR) {
+            if (key == '\r' || key == '\n') {
+                return answer;
+            } else if (key == 'y' || key == 'Y') {
+                answer = true;
+            } else if (key == 'n' || key == 'N') {
+                answer = false;
+            } else if (key == KEY_UP || key == KEY_DOWN
+                       || key == KEY_LEFT || key == KEY_RIGHT) {
+                answer = !answer;
+            }
+            key = getch();
+        }
+
+        erase();
+        curs_set(0);
+        move(0, 0);
+        addstr(title);
+
+        move(0, 12);
+        addch(answer ? icon_char(0, '>') : ' ');
+        addstr("Yes");
+
+        move(1, 12);
+        addch(answer ? ' ' : icon_char(0, '>'));
+        addstr("No");
+
+        refresh();
+
+        poll_fd_wait(STDIN_FILENO, POLLIN);
+        poll_block();
+    }
+}
+
+/* Menu action: displays the version string (VERSION followed by BUILDNR). */
+static void
+cmd_show_version(const struct dict *dict UNUSED)
+{
+    static const char version[] = VERSION BUILDNR;
+
+    show_string(version);
+}
+
+/* Menu action: lets the user scroll through every pair in 'dict', shown as
+ * "name = value" lines in a menu without a selection marker. */
+static void
+cmd_browse_status(const struct dict *dict)
+{
+    struct menu status_menu;
+    size_t i;
+
+    menu_init(&status_menu);
+    for (i = 0; i < dict->n; i++) {
+        menu_add_item(&status_menu, "%s = %s",
+                      dict->pairs[i].name, dict->pairs[i].value);
+    }
+    menu_show(&status_menu, 0, false);
+    menu_free(&status_menu);
+}
+
+/* Menu action: leaves curses mode, runs an interactive /bin/sh with minimal
+ * prompts from the directory named by $HOME (if set), and re-enters curses
+ * mode once the shell exits.  Failure to change directory or to run the
+ * shell is logged instead of being silently ignored. */
+static void
+cmd_shell(const struct dict *dict UNUSED)
+{
+    const char *home;
+
+    erase();
+    refresh();
+    endwin();
+
+    printf("Type ^D to exit\n");
+    fflush(stdout);
+
+    putenv("PS1=#");
+    putenv("PS2=>");
+    putenv("PS3=?");
+    putenv("PS4=+");
+    home = getenv("HOME");
+    if (home && chdir(home)) {
+        /* Not fatal: the shell simply starts in the current directory. */
+        VLOG_WARN("chdir(\"%s\") failed: %s", home, strerror(errno));
+    }
+    if (system("/bin/sh") == -1) {
+        VLOG_WARN("could not run /bin/sh: %s", strerror(errno));
+    }
+    initialize_terminal();
+}
+
+/* Menu action: displays the motto. */
+static void
+cmd_show_motto(const struct dict *dict UNUSED)
+{
+    static const char motto[] = "\"Just Add Ice\"";
+
+    show_string(motto);
+}
+
+/* Logs 'string' at INFO level, displays it alone on a cleared screen, and
+ * then blocks for 5 seconds (see block_until()). */
+static void
+show_string(const char *string)
+{
+    const int display_ms = 5000;
+
+    VLOG_INFO("%s", string);
+
+    erase();
+    curs_set(0);
+    move(0, 0);
+    addstr(string);
+    refresh();
+
+    block_until(time_msec() + display_ms);
+}
+
+/* Blocks in the poll loop until time_msec() reaches 'timeout', then discards
+ * any keystrokes typed in the meantime. */
+static void
+block_until(long long timeout)
+{
+    while (time_msec() < timeout) {
+        poll_timer_wait(timeout - time_msec());
+        poll_block();
+    }
+
+    drain_keyboard_buffer();
+}
+
+/* Reads and discards keystrokes until getch() reports ERR, that is, until no
+ * more input is pending. */
+static void
+drain_keyboard_buffer(void)
+{
+    int key;
+
+    do {
+        key = getch();
+    } while (key != ERR);
+}
+
+/* Runs shell command 'cmd' with popen() and parses each line of its output
+ * of the form "name=value" into 'dict', which this function initializes.
+ * Lines that contain no '=' are ignored.
+ *
+ * Returns 0 if successful.  Returns a positive errno value if popen() fails,
+ * in which case 'dict' is left uninitialized, or ECHILD if the command exits
+ * with a nonzero status, in which case the contents of 'dict' have already
+ * been freed. */
+static int
+read_vars(const char *cmd, struct dict *dict)
+{
+    struct ds ds;
+    FILE *stream;
+    int status;
+
+    stream = popen(cmd, "r");
+    if (!stream) {
+        /* Save errno before logging can clobber it.  Never return 0 here,
+         * because the caller would then use an uninitialized 'dict'. */
+        int error = errno ? errno : EIO;
+        VLOG_ERR("popen(\"%s\") failed: %s", cmd, strerror(error));
+        return error;
+    }
+
+    dict_init(dict);
+    ds_init(&ds);
+    while (!ds_get_line(&ds, stream)) {
+        const char *s = ds_cstr(&ds);
+        const char *equals = strchr(s, '=');
+        if (equals) {
+            dict_add_nocopy(dict,
+                            xmemdup0(s, equals - s), xstrdup(equals + 1));
+        }
+    }
+    /* Release the line buffer, which would otherwise leak on every call. */
+    ds_destroy(&ds);
+
+    status = pclose(stream);
+    if (status) {
+        char *msg = process_status_msg(status);
+        VLOG_ERR("pclose(\"%s\") reported subprocess failure: %s",
+                 cmd, msg);
+        free(msg);
+        dict_free(dict);
+        return ECHILD;
+    }
+    return 0;
+}
+
+/* Runs the program given by the null-terminated argument vector 'argv' and
+ * waits for it to finish.  Returns true if it ran and exited with status 0.
+ * Otherwise shows the user "'title':" followed by the reason, and returns
+ * false. */
+static bool
+run_and_report_failure(char **argv, const char *title)
+{
+    int null_fds[3] = {0, 1, 2};
+    char *command;
+    char *report;
+    int status;
+    int error;
+
+    command = process_escape_args(argv);
+    VLOG_INFO("starting subprocess: %s", command);
+    free(command);
+
+    error = process_run(argv, NULL, 0, null_fds, 3, &status);
+    if (!error && !status) {
+        VLOG_INFO("subprocess exited with status 0");
+        return true;
+    }
+
+    if (error) {
+        /* The subprocess could not be run at all. */
+        report = xasprintf("%s:\n%s", title, strerror(error));
+    } else {
+        /* The subprocess ran but reported failure. */
+        char *msg = process_status_msg(status);
+
+        report = xasprintf("%s:\n%s", title, msg);
+        free(msg);
+    }
+    show_string(report);
+    free(report);
+    return false;
+}
+
+/* Loads into 'dict' the variables that shell script 'file_name' sets.
+ *
+ * This is done by dumping the environment of a shell that has sourced
+ * 'file_name' and then removing every variable that a plain shell exports
+ * anyway.  Returns 0 if successful, otherwise the error reported by
+ * read_vars(). */
+static int
+do_load_config(const char *file_name, struct dict *dict)
+{
+    struct dict auto_vars;
+    char *cmd;
+    int error;
+
+    /* Variables that the shell sets automatically. */
+    error = read_vars("set -a && env", &auto_vars);
+    if (error) {
+        return error;
+    }
+
+    /* Variables present after sourcing 'file_name'. */
+    cmd = xasprintf("set -a && . '%s' && env", file_name);
+    error = read_vars(cmd, dict);
+    free(cmd);
+
+    if (!error) {
+        size_t i;
+
+        /* Keep only what 'file_name' itself contributed. */
+        for (i = 0; i < auto_vars.n; i++) {
+            dict_delete(dict, auto_vars.pairs[i].name);
+        }
+    }
+    dict_free(&auto_vars);
+    return error;
+}
+
+/* Loads the switch configuration from /etc/default/openflow-switch into
+ * 'dict'.  Returns true on success.  On failure, shows an error message with
+ * show_string() and returns false. */
+static bool
+load_config(struct dict *dict)
+{
+    static const char default_file[] = "/etc/default/openflow-switch";
+    int error;
+
+    error = do_load_config(default_file, dict);
+    if (error) {
+        char *msg = xasprintf("Cfg load failed:\n%s", strerror(error));
+
+        show_string(msg);
+        free(msg);
+    }
+    return !error;
+}
+
+/* Saves the "NAME=VALUE" strings in 'settings' and restarts the switch.
+ *
+ * Logs every setting, then runs /usr/share/openvswitch/commands/reconfigure
+ * with the settings as its arguments.  If that succeeds, displays
+ * "Saved.  Restarting..." and runs "/etc/init.d/openflow-switch restart"
+ * through /bin/sh.  A failure of either subprocess is reported to the user by
+ * run_and_report_failure().
+ *
+ * Returns true only if both subprocesses succeeded. */
+static bool
+save_config(const struct svec *settings)
+{
+ struct svec argv;
+ size_t i;
+ bool ok;
+
+ VLOG_INFO("Saving configuration:");
+ for (i = 0; i < settings->n; i++) {
+ VLOG_INFO("%s", settings->names[i]);
+ }
+
+ /* Build "reconfigure NAME=VALUE..." and run it. */
+ svec_init(&argv);
+ svec_add(&argv, "/usr/share/openvswitch/commands/reconfigure");
+ svec_append(&argv, settings);
+ svec_terminate(&argv);
+ ok = run_and_report_failure(argv.names, "Save failed");
+ if (ok) {
+ /* The deadline is taken before the restart starts, so the message stays
+ * up for 5 seconds in total, including the time the restart takes. */
+ long long int timeout = time_msec() + 5000;
+
+ erase();
+ curs_set(0);
+ move(0, 0);
+ addstr("Saved.\nRestarting...");
+ refresh();
+
+ /* Reuse 'argv' for the restart command. */
+ svec_clear(&argv);
+ svec_add(&argv, "/bin/sh");
+ svec_add(&argv, "-c");
+ svec_add(&argv,
+ "/etc/init.d/openflow-switch restart >/dev/null 2>&1");
+ svec_terminate(&argv);
+
+ ok = run_and_report_failure(argv.names, "Restart failed");
+ if (ok) {
+ block_until(timeout);
+ }
+ }
+ svec_destroy(&argv);
+
+ if (ok) {
+ VLOG_INFO("Save completed successfully");
+ } else {
+ VLOG_WARN("Save failed");
+ }
+ return ok;
+}
+
+/* Matches the first 'length' bytes of 'string' against 're', with partial
+ * matching enabled.
+ *
+ * Returns 0 if 're' matched all 'length' bytes, PCRE_ERROR_NOMATCH if it
+ * matched only an initial part of them, and otherwise the negative value
+ * returned by pcre_exec() (callers in this file test it against
+ * PCRE_ERROR_PARTIAL). */
+static int
+match(pcre *re, const char *string, int length)
+{
+    int ovec[999];
+    int rc;
+
+    rc = pcre_exec(re, NULL, string, length, 0, PCRE_PARTIAL,
+                   ovec, ARRAY_SIZE(ovec));
+    if (rc < 0) {
+        return rc;
+    }
+
+    /* A match counts only if it extends to the end of the input. */
+    return ovec[0] >= 0 && ovec[1] >= length ? 0 : PCRE_ERROR_NOMATCH;
+}
+
+/* Determines which characters may appear at offset 'pos', given that the
+ * first 'pos' bytes of 's' have already been entered, and stores them in
+ * 'choices' (which is cleared first).
+ *
+ * A '\n' in 'choices' means "the input may end here": it is added when the
+ * first 'pos' bytes are a complete match for 're', and also as the sole entry
+ * when no character at all can follow.  The other candidates are the
+ * characters 0x20 through 0x7e; one is accepted if appending it to the prefix
+ * yields either a complete or a partial match. */
+static void
+figure_choices(pcre *re, struct ds *s, int pos, struct ds *choices)
+{
+ struct ds tmp;
+ int retval;
+ char c;
+
+ ds_clear(choices);
+
+ /* See whether the current string is a complete match. */
+ if (!match(re, s->string, pos)) {
+ ds_put_char(choices, '\n');
+ }
+
+ /* Then try all the other possibilities. */
+ ds_init(&tmp);
+ ds_put_buffer(&tmp, s->string, pos);
+ for (c = 0x20; c < 0x7f; c++) {
+ ds_put_char(&tmp, c);
+ retval = match(re, tmp.string, pos + 1);
+ if (retval == PCRE_ERROR_PARTIAL || !retval) {
+ ds_put_char(choices, c);
+ }
+ /* Drop the trial character again so the next one replaces it. */
+ tmp.length--;
+ }
+ ds_destroy(&tmp);
+
+ /* Never leave 'choices' empty. */
+ if (!choices->length) {
+ ds_put_char(choices, '\n');
+ }
+}
+
+/* Extends 's' in place for as long as exactly one character in the range
+ * 0x20 through 0x7e can follow it under 're'.
+ *
+ * Stops, leaving 's' as it is at that point, as soon as 's' is a complete
+ * match, more than one character could follow, or no character could
+ * follow. */
+static void
+figure_completion(pcre *re, struct ds *s)
+{
+ for (;;) {
+ /* The single character that can follow 's', or -1 if none found yet. */
+ int found = -1;
+ int c;
+
+ /* See whether the current string is a complete match. */
+ if (!match(re, s->string, s->length)) {
+ return;
+ }
+ for (c = 0x20; c < 0x7f; c++) {
+ int retval;
+
+ /* Try 'c' as the next character, then take it back off. */
+ ds_put_char(s, c);
+ retval = match(re, s->string, s->length);
+ s->length--;
+
+ if (retval == PCRE_ERROR_PARTIAL || !retval) {
+ if (found != -1) {
+ /* A second candidate: the continuation is ambiguous. */
+ return;
+ }
+ found = c;
+ }
+ }
+ if (found == -1) {
+ return;
+ }
+ ds_put_char(s, found);
+ }
+}
+
+/* Regular expression fragments used to validate prompt() input. */
+
+/* A decimal number in the range 0...255, without leading zeros. */
+#define OCTET_RE "([0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])"
+
+/* A dotted-quad IPv4 address. */
+#define IP_RE "("OCTET_RE"\\."OCTET_RE"\\."OCTET_RE"\\."OCTET_RE")"
+
+/* A decimal number in the range 0...65535, without leading zeros.  The
+ * five-digit alternatives starting with 6 must allow a 0 in every position
+ * after the fixed prefix, otherwise ports such as 60000, 65000, 65500 and
+ * 65530 are rejected. */
+#define PORT_RE \
+ "([0-9]|" \
+ "[1-9][0-9]|" \
+ "[1-9][0-9][0-9]|" \
+ "[1-9][0-9][0-9][0-9]|" \
+ "[1-5][0-9][0-9][0-9][0-9]|" \
+ "6[0-4][0-9][0-9][0-9]|" \
+ "65[0-4][0-9][0-9]|" \
+ "655[0-2][0-9]|" \
+ "6553[0-5])"
+
+/* Two uppercase hexadecimal digits. */
+#define XOCTET_RE "[0-9A-F][0-9A-F]"
+
+/* Six colon-separated XOCTET_RE pairs. */
+#define MAC_RE \
+ XOCTET_RE":"XOCTET_RE":"XOCTET_RE":"\
+ XOCTET_RE":"XOCTET_RE":"XOCTET_RE
+
+/* A decimal number in the range 100...99999, without leading zeros. */
+#define NUM100_TO_99999_RE \
+ "([1-9][0-9][0-9]|" \
+ "[1-9][0-9][0-9][0-9]|" \
+ "[1-9][0-9][0-9][0-9][0-9])"
+
+/* A decimal number in the range 5...99999, without leading zeros. */
+#define NUM5_TO_99999_RE \
+ "([5-9]|" \
+ "[1-9][0-9]|" \
+ "[1-9][0-9][0-9]|" \
+ "[1-9][0-9][0-9][0-9]|" \
+ "[1-9][0-9][0-9][0-9][0-9])"
+
+/* A decimal number in the range 1...99999, without leading zeros. */
+#define NUM1_TO_99999_RE \
+ "([1-9]|" \
+ "[1-9][0-9]|" \
+ "[1-9][0-9][0-9]|" \
+ "[1-9][0-9][0-9][0-9]|" \
+ "[1-9][0-9][0-9][0-9][0-9])"
+
+/* Lets the user edit a string that must match regular expression 'pattern'.
+ *
+ * 'prompt' is displayed on row 0 (at most 40 characters of it) and the value
+ * being edited on row 1, starting out as 'initial'.  At each cursor position
+ * the set of characters that 'pattern' allows there is computed with
+ * figure_choices(), and figure_completion() fills in any continuation that
+ * is forced by the pattern.
+ *
+ * Keys:
+ *   - Up/Down cycle through the characters allowed at the cursor.
+ *   - Enter accepts the selected character and advances, or ends the prompt
+ *     when the selected choice is "end of input" (represented as '\n').
+ *   - Form feed discards everything after the cursor position.
+ *   - Backspace, Delete and Escape step back one position, or cancel the
+ *     prompt when already at position 0.
+ *   - A printable character is inserted directly if allowed, with its case
+ *     toggled if only the other case is allowed.
+ *
+ * Returns a string that the caller must free(): the accepted value, or a copy
+ * of 'initial' if 'pattern' fails to compile or the user cancels. */
+static char *
+prompt(const char *prompt, const char *initial, const char *pattern)
+{
+    struct ds ds;
+    int pos, chidx;
+    struct ds choices;
+    const char *error;
+    int erroffset;
+    pcre *re;
+    int retval;
+    int okpartial;
+    char *p;
+
+    set_icon(0,
+             e____X,
+             e____X,
+             e__X_X,
+             e_X__X,
+             eXXXXX,
+             e_X___,
+             e__X__,
+             e_____);
+
+    re = pcre_compile(pattern, PCRE_ANCHORED, &error, &erroffset, NULL);
+    if (!re) {
+        VLOG_ERR("PCRE error for pattern \"%s\" at offset %d: %s",
+                 pattern, erroffset, error);
+        return xstrdup(initial);
+    }
+
+    /* Everything below relies on partial matching being supported for this
+     * pattern. */
+    retval = pcre_fullinfo(re, NULL, PCRE_INFO_OKPARTIAL, &okpartial);
+    assert(!retval);
+    assert(okpartial);
+
+    pos = 0;
+    ds_init(&ds);
+    ds_put_cstr(&ds, initial);
+    ds_init(&choices);
+    figure_choices(re, &ds, pos, &choices);
+    /* Start with the first character of 'initial' selected, if allowed. */
+    p = memchr(choices.string, initial[0], choices.length);
+    chidx = p ? p - choices.string : 0;
+    for (;;) {
+        int c, key;
+
+        /* Process all pending keystrokes before redrawing. */
+        while ((key = getch()) != ERR) {
+            switch (key) {
+            case KEY_UP:
+                if (choices.length > 1) {
+                    if (++chidx >= choices.length) {
+                        chidx = 0;
+                    }
+                    ds.string[pos] = choices.string[chidx];
+                    ds_truncate(&ds, pos + 1);
+                    figure_completion(re, &ds);
+                }
+                break;
+
+            case KEY_DOWN:
+                if (choices.length > 1) {
+                    if (--chidx < 0) {
+                        chidx = choices.length - 1;
+                    }
+                    ds.string[pos] = choices.string[chidx];
+                    ds_truncate(&ds, pos + 1);
+                    figure_completion(re, &ds);
+                }
+                break;
+
+            case '\r': case '\n':
+                if (choices.string[chidx] == '\n') {
+                    /* Input complete.  Ownership of the buffer inside 'ds'
+                     * passes to the caller; everything else is released
+                     * here. */
+                    ds_truncate(&ds, pos);
+                    ds_destroy(&choices);
+                    pcre_free(re);
+                    return ds_cstr(&ds);
+                } else {
+                    if (pos >= ds.length) {
+                        pos++;
+                        ds_put_char(&ds, choices.string[chidx]);
+                        figure_choices(re, &ds, pos, &choices);
+                        chidx = 0;
+                        figure_completion(re, &ds);
+                    } else {
+                        /* Accept the whole displayed value and move to its
+                         * end. */
+                        pos = ds.length;
+                        figure_choices(re, &ds, pos, &choices);
+                        chidx = 0;
+                        figure_completion(re, &ds);
+                    }
+                }
+                break;
+
+            case '\f':
+                ds_truncate(&ds, pos + 1);
+                figure_choices(re, &ds, pos, &choices);
+                chidx = 0;
+                break;
+
+            case '\b': case '\x7f': case '\x1b':
+            case KEY_BACKSPACE: case KEY_DC:
+                if (pos) {
+                    pos--;
+                } else {
+                    /* Cancelled: release everything and hand back a copy of
+                     * the original value. */
+                    ds_destroy(&ds);
+                    ds_destroy(&choices);
+                    pcre_free(re);
+                    return xstrdup(initial);
+                }
+                figure_choices(re, &ds, pos, &choices);
+                chidx = 0;
+                if (pos < ds.length) {
+                    /* Reselect the character already at this position. */
+                    p = memchr(choices.string, ds.string[pos],
+                               choices.length);
+                    if (p) {
+                        chidx = p - choices.string;
+                    }
+                }
+                break;
+
+            default:
+                if (key >= 0x20 && key < 0x7f) {
+                    /* Check whether 'key' is valid and toggle case if
+                     * necessary. */
+                    if (!memchr(choices.string, key, choices.length)) {
+                        if (memchr(choices.string, toupper(key),
+                                   choices.length)) {
+                            key = toupper(key);
+                        } else if (memchr(choices.string, tolower(key),
+                                          choices.length)) {
+                            key = tolower(key);
+                        } else {
+                            break;
+                        }
+                    }
+
+                    /* Insert 'key' and advance the position. */
+                    if (pos >= ds.length) {
+                        ds_put_char(&ds, key);
+                    } else {
+                        ds.string[pos] = key;
+                    }
+                    pos++;
+
+                    /* If 'key' differs from what was selected, the rest of
+                     * the old value no longer applies. */
+                    if (choices.string[chidx] != key) {
+                        ds_truncate(&ds, pos);
+                    }
+                    figure_choices(re, &ds, pos, &choices);
+                    chidx = 0;
+                    if (pos < ds.length) {
+                        p = memchr(choices.string, ds.string[pos],
+                                   choices.length);
+                        if (p) {
+                            chidx = p - choices.string;
+                        }
+                    }
+                    figure_completion(re, &ds);
+                }
+            }
+        }
+
+        /* Redraw: prompt on row 0, value on row 1, and the currently
+         * selected choice drawn at the cursor position. */
+        erase();
+        curs_set(1);
+        move(0, 0);
+        addnstr(prompt, MIN(40, strlen(prompt)));
+
+        c = choices.string[chidx];
+        move(1, 0);
+        addstr(ds_cstr(&ds));
+        move(1, pos);
+        if (c == '\n') {
+            /* "End of input" is shown as icon 0, which was set above. */
+            put_icon(0, '$');
+        } else {
+            addch(c);
+        }
+        move(1, pos);
+        refresh();
+
+        poll_fd_wait(STDIN_FILENO, POLLIN);
+        poll_block();
+    }
+}
+
+/* Prompts the user, under heading 'title', to edit the IPv4 address in
+ * '*ip', and stores the result of inet_addr() on the entered text back into
+ * '*ip'. */
+static void
+prompt_ip(const char *title, uint32_t *ip)
+{
+    char *before = xasprintf(IP_FMT, IP_ARGS(ip));
+    char *after = prompt(title, before, "^"IP_RE"$");
+
+    free(before);
+    *ip = inet_addr(after);
+    free(after);
+}
+
+/* Initializes 'abbrev' and fills it with a space-separated summary of the
+ * names in 'netdevs'.
+ *
+ * A run of consecutive names that all have the same length as the first name
+ * in the run, and agree with it in everything but the last character, is
+ * collapsed into the shared prefix followed by the last characters in square
+ * brackets, e.g. "eth0", "eth1", "eth2" become "eth[012]".  A name that
+ * starts no such run is emitted unchanged.
+ *
+ * The caller in this file releases 'abbrev' with ds_destroy(). */
+static void
+abbreviate_netdevs(const struct svec *netdevs, struct ds *abbrev)
+{
+ size_t i;
+
+ ds_init(abbrev);
+ for (i = 0; i < netdevs->n; ) {
+ size_t i_len = strlen(netdevs->names[i]);
+ size_t j;
+
+ /* Find the end 'j' (exclusive) of the run that starts at 'i'. */
+ for (j = i + 1; j < netdevs->n; j++) {
+ size_t j_len = strlen(netdevs->names[j]);
+ if (!i_len || !j_len || i_len != j_len
+ || memcmp(netdevs->names[i], netdevs->names[j], i_len - 1)) {
+ break;
+ }
+ }
+
+ if (abbrev->length) {
+ ds_put_char(abbrev, ' ');
+ }
+ if (j - i == 1) {
+ ds_put_cstr(abbrev, netdevs->names[i]);
+ } else {
+ size_t k;
+
+ /* Shared prefix, then each name's final character in brackets. */
+ ds_put_buffer(abbrev, netdevs->names[i], i_len - 1);
+ ds_put_char(abbrev, '[');
+ for (k = i; k < j; k++) {
+ ds_put_char(abbrev, netdevs->names[k][i_len - 1]);
+ }
+ ds_put_char(abbrev, ']');
+ }
+ i = j;
+ }
+}
+
+/* Lets the user choose, from a menu, which network devices are bridge ports.
+ *
+ * On entry 'choices' holds the names that are currently selected; they are
+ * shown toggled on.  On return 'choices' holds exactly the names the user
+ * left toggled on.
+ *
+ * Devices whose names start with "wmaster" or "of", the device "lo", and any
+ * device for which netdev_get_in4() returns true are not offered.  If nothing
+ * is left to offer, a message is shown instead of the menu. */
+static void
+choose_netdevs(struct svec *choices)
+{
+    struct svec netdevs;
+    struct menu menu;
+    size_t i;
+
+    netdev_enumerate(&netdevs);
+    svec_sort(&netdevs);
+
+    menu_init(&menu);
+    menu_add_item(&menu, "Exit");
+    for (i = 0; i < netdevs.n; i++) {
+        const char *name = netdevs.names[i];
+        struct menu_item *item;
+        struct netdev *netdev;
+        int retval;
+
+        if (!strncmp(name, "wmaster", strlen("wmaster"))
+            || !strncmp(name, "of", strlen("of"))
+            || !strcmp(name, "lo")) {
+            continue;
+        }
+
+        /* A device that cannot be opened is still offered. */
+        retval = netdev_open(name, NETDEV_ETH_TYPE_NONE, &netdev);
+        if (!retval) {
+            bool exclude = netdev_get_in4(netdev, NULL);
+            netdev_close(netdev);
+            if (exclude) {
+                continue;
+            }
+        }
+
+        item = menu_add_item(&menu, "%s", name);
+        item->toggle = svec_contains(choices, name);
+    }
+    if (menu.n_items > 1) {
+        menu_show(&menu, 0, true);
+    } else {
+        show_string("No available\nbridge ports");
+    }
+
+    /* Rebuild 'choices' from the items that are toggled on. */
+    svec_clear(choices);
+    for (i = 0; i < menu.n_items; i++) {
+        struct menu_item *item = menu.items[i];
+        if (item->toggle > 0) {
+            svec_add(choices, item->text);
+        }
+    }
+
+    menu_free(&menu);
+    /* Release the enumerated device names. */
+    svec_destroy(&netdevs);
+}
+
+/* Runs "dmidecode -s system-uuid" and returns true if the first line of its
+ * output contains "-002320".  Returns false if the command cannot be started
+ * or prints nothing. */
+static bool
+is_datapath_id_in_dmi(void)
+{
+    FILE *dmidecode;
+    char line[256];
+    bool is_in_dmi;
+
+    dmidecode = popen("dmidecode -s system-uuid", "r");
+    if (!dmidecode) {
+        return false;
+    }
+    is_in_dmi = fgets(line, sizeof line, dmidecode) && strstr(line, "-002320");
+    /* A stream obtained from popen() must be closed with pclose(), which
+     * also waits for the child process. */
+    pclose(dmidecode);
+    return is_in_dmi;
+}
+
+/* In-memory form of the switch settings that cmd_configure() edits. */
+struct switch_config {
+ struct svec netdevs; /* Bridge port names (from NETDEVS). */
+ enum { DISCOVERY, IN_BAND } mode; /* From MODE. */
+ uint32_t switch_ip; /* htonl(0) is presented as "DHCP". */
+ uint32_t switch_mask; /* From SWITCH_NETMASK. */
+ uint32_t switch_gw; /* From SWITCH_GATEWAY. */
+ enum { FAIL_DROP, FAIL_SWITCH } disconnected; /* From DISCONNECTED_MODE. */
+ bool stp; /* Shown as "802.1D-1998 STP". */
+ int rate_limit; /* Negative is presented as "Disabled". */
+ int inactivity_probe; /* Negative is presented as "Default". */
+ int max_backoff; /* Negative is presented as "Default". */
+ char *controller_vconn; /* Malloc'd; from CONTROLLER. */
+ char *datapath_id; /* Malloc'd; "DMI", "Random", or an ID string. */
+};
+
+/* Returns the user-visible label for disconnected-mode 'value': the "switch"
+ * label for FAIL_SWITCH and the "drop" label for anything else.  The two
+ * label macros defined here are also used by later code in this file. */
+static const char *
+disconnected_string(int value)
+{
+#define FAIL_SWITCH_STRING "Switch packets"
+#define FAIL_DROP_STRING "Drop packets"
+    if (value == FAIL_SWITCH) {
+        return FAIL_SWITCH_STRING;
+    }
+    return FAIL_DROP_STRING;
+}
+
+/* Interactive switch configuration.
+ *
+ * Loads the current settings with load_config(), converts them into a
+ * 'struct switch_config', and then repeatedly shows a menu of the settings.
+ * Selecting an item prompts for a new value; selecting "Exit" leaves the
+ * loop.  Finally asks whether to save, and if so passes the settings to
+ * save_config() as "NAME=VALUE" strings.
+ *
+ * 'dict' is consulted only for its "debug" entry, which makes the STP, rate
+ * limit, activity probe and max backoff items appear in the menu.
+ *
+ * NOTE(review): 'dict' is marked UNUSED although it is used below. */
+static void
+cmd_configure(const struct dict *dict UNUSED)
+{
+ bool debug_mode = dict_get_bool(dict, "debug", false);
+ struct dict config_dict;
+ struct switch_config config;
+ int start;
+
+ if (!load_config(&config_dict)) {
+ return;
+ }
+ /* Translate the loaded variables into 'config'. */
+ svec_init(&config.netdevs);
+ svec_parse_words(&config.netdevs,
+ dict_get_string(&config_dict, "NETDEVS", ""));
+ config.mode = (!strcmp(dict_get_string(&config_dict, "MODE", "discovery"),
+ "in-band") ? IN_BAND : DISCOVERY);
+ config.switch_ip = dict_get_ip(&config_dict, "SWITCH_IP");
+ config.switch_mask = dict_get_ip(&config_dict, "SWITCH_NETMASK");
+ config.switch_gw = dict_get_ip(&config_dict, "SWITCH_GATEWAY");
+ config.controller_vconn = xstrdup(dict_get_string(&config_dict,
+ "CONTROLLER", ""));
+ config.disconnected = (!strcmp(dict_get_string(&config_dict,
+ "DISCONNECTED_MODE", ""),
+ "switch")
+ ? FAIL_SWITCH : FAIL_DROP);
+ /* NOTE(review): this reads key "stp" while the save code below writes
+ * "STP=..."; if dict lookups are case-sensitive the saved value is never
+ * read back -- confirm. */
+ config.stp = !strcmp(dict_get_string(&config_dict, "stp", ""), "yes");
+ config.rate_limit = dict_get_int(&config_dict, "RATE_LIMIT", -1);
+ config.inactivity_probe = dict_get_int(&config_dict, "INACTIVITY_PROBE",
+ -1);
+ config.max_backoff = dict_get_int(&config_dict, "MAX_BACKOFF", -1);
+ if (is_datapath_id_in_dmi()) {
+ config.datapath_id = xstrdup("DMI");
+ } else {
+ const char *dpid = dict_get(&config_dict, "DATAPATH_ID");
+ if (dpid) {
+ /* Keep the configured ID with colons removed and letters
+ * uppercased. */
+ struct ds ds = DS_EMPTY_INITIALIZER;
+ const char *cp;
+ for (cp = dpid; *cp != '\0'; cp++) {
+ if (*cp != ':') {
+ ds_put_char(&ds, toupper((unsigned char) *cp));
+ }
+ }
+ config.datapath_id = ds_cstr(&ds);
+ } else {
+ config.datapath_id = xstrdup("Random");
+ }
+ }
+ dict_free(&config_dict);
+
+ /* Menu loop.  'start' is the item passed to menu_show2() as the starting
+ * point and receives the item it returns; -1 ends the loop. */
+ start = 0;
+ while (start != -1) {
+ enum {
+ MENU_EXIT,
+ MENU_NETDEVS,
+ MENU_MODE,
+ MENU_IP,
+ MENU_NETMASK,
+ MENU_GATEWAY,
+ MENU_CONTROLLER,
+ MENU_DISCONNECTED_MODE,
+ MENU_DATAPATH_ID,
+ MENU_STP,
+ MENU_RATE_LIMIT,
+ MENU_INACTIVITY_PROBE,
+ MENU_MAX_BACKOFF,
+ };
+
+ struct ds ports;
+ struct menu_item *item;
+ struct menu menu;
+ char *in, *out;
+ uint32_t ip;
+
+ /* The menu is rebuilt on every pass so that it shows current values. */
+ menu_init(&menu);
+
+ /* Exit. */
+ item = menu_add_item(&menu, "Exit");
+ item->id = MENU_EXIT;
+
+ /* Bridge Ports. */
+ abbreviate_netdevs(&config.netdevs, &ports);
+ item = menu_add_item(&menu, "Bridge Ports:\n%s", ds_cstr(&ports));
+ item->id = MENU_NETDEVS;
+ ds_destroy(&ports);
+
+ /* Mode. */
+ item = menu_add_item(&menu, "Mode:\n%s",
+ (config.mode == DISCOVERY
+ ? "Discovery" : "In-Band"));
+ item->id = MENU_MODE;
+
+ /* IP address. */
+ if (config.switch_ip == htonl(0)) {
+ item = menu_add_item(&menu, "Switch IP Addr:\nDHCP");
+ } else {
+ item = menu_add_item(&menu, "Switch IP Addr:\n"IP_FMT,
+ IP_ARGS(&config.switch_ip));
+ }
+ item->id = MENU_IP;
+ item->enabled = config.mode == IN_BAND;
+
+ /* Netmask. */
+ item = menu_add_item(&menu, "Switch Netmask:\n"IP_FMT,
+ IP_ARGS(&config.switch_mask));
+ item->id = MENU_NETMASK;
+ item->enabled = config.mode == IN_BAND && config.switch_ip != htonl(0);
+
+ /* Gateway. */
+ item = menu_add_item(&menu, "Switch Gateway:\n"IP_FMT,
+ IP_ARGS(&config.switch_gw));
+ item->id = MENU_GATEWAY;
+ item->enabled = config.mode == IN_BAND && config.switch_ip != htonl(0);
+
+ /* Controller. */
+ item = menu_add_item(&menu, "Controller:\n%s",
+ config.controller_vconn);
+ item->id = MENU_CONTROLLER;
+ item->enabled = config.mode == IN_BAND;
+
+ /* Disconnected mode. */
+ item = menu_add_item(&menu, "If disconnected:\n%s\n",
+ disconnected_string(config.disconnected));
+ item->id = MENU_DISCONNECTED_MODE;
+
+ /* Datapath ID. */
+ item = menu_add_item(&menu, "Datapath ID:\n%s", config.datapath_id);
+ item->id = MENU_DATAPATH_ID;
+ item->enabled = strcmp(config.datapath_id, "DMI");
+
+ /* Spanning tree protocol. */
+ if (debug_mode) {
+ item = menu_add_item(&menu, "802.1D-1998 STP:\n%s",
+ config.stp ? "Enabled" : "Disabled");
+ item->id = MENU_STP;
+ }
+
+ /* Rate-limiting. */
+ if (debug_mode) {
+ if (config.rate_limit < 0) {
+ item = menu_add_item(&menu, "Ctlr rate limit:\nDisabled");
+ } else {
+ item = menu_add_item(&menu, "Ctlr rate limit:\n%d/s",
+ config.rate_limit);
+ }
+ item->id = MENU_RATE_LIMIT;
+ }
+
+ /* Inactivity probe. */
+ if (debug_mode) {
+ if (config.inactivity_probe < 0) {
+ item = menu_add_item(&menu, "Activity probe:\nDefault");
+ } else {
+ item = menu_add_item(&menu, "Activity probe:\n%d s",
+ config.inactivity_probe);
+ }
+ item->id = MENU_INACTIVITY_PROBE;
+ }
+
+ /* Max backoff. */
+ if (debug_mode) {
+ if (config.max_backoff < 0) {
+ item = menu_add_item(&menu, "Max backoff:\nDefault");
+ } else {
+ item = menu_add_item(&menu, "Max backoff:\n%d s",
+ config.max_backoff);
+ }
+ item->id = MENU_MAX_BACKOFF;
+ }
+
+ start = menu_show2(&menu, start, true);
+ menu_free(&menu);
+
+ /* Act on the selected item. */
+ in = out = NULL;
+ switch (start) {
+ case MENU_EXIT:
+ start = -1;
+ break;
+
+ case MENU_NETDEVS:
+ choose_netdevs(&config.netdevs);
+ break;
+
+ case MENU_MODE:
+ out = prompt("Mode:",
+ config.mode == DISCOVERY ? "Discovery" : "In-Band",
+ "^(Discovery|In-Band)$");
+ config.mode = !strcmp(out, "Discovery") ? DISCOVERY : IN_BAND;
+ free(out);
+ break;
+
+ case MENU_IP:
+ in = (config.switch_ip == htonl(0) ? xstrdup("DHCP")
+ : xasprintf(IP_FMT, IP_ARGS(&config.switch_ip)));
+ out = prompt("Switch IP:", in, "^(DHCP|"IP_RE")$");
+ ip = strcmp(out, "DHCP") ? inet_addr(out) : htonl(0);
+ free(in);
+ free(out);
+ if (ip != config.switch_ip) {
+ config.switch_ip = ip;
+ if (ip != htonl(0)) {
+ /* For a new static address, also propose a netmask and a
+ * gateway (host number 1 on the same network). */
+ uint32_t mask = guess_netmask(ip);
+ if (mask) {
+ config.switch_mask = mask;
+ config.switch_gw = (ip & mask) | htonl(1);
+ }
+ }
+ }
+ break;
+
+ case MENU_NETMASK:
+ prompt_ip("Switch Netmask:", &config.switch_mask);
+ break;
+
+ case MENU_GATEWAY:
+ prompt_ip("Switch Gateway:", &config.switch_gw);
+ break;
+
+ case MENU_CONTROLLER:
+ out = prompt("Controller:", config.controller_vconn,
+ "^(tcp|ssl):"IP_RE"(:"PORT_RE")?$");
+ free(config.controller_vconn);
+ config.controller_vconn = out;
+ break;
+
+ case MENU_DISCONNECTED_MODE:
+ out = prompt("If disconnected",
+ disconnected_string(config.disconnected),
+ "^("FAIL_DROP_STRING"|"FAIL_SWITCH_STRING")$");
+ config.disconnected = (!strcmp(out, FAIL_DROP_STRING)
+ ? FAIL_DROP : FAIL_SWITCH);
+ free(out);
+ break;
+
+ case MENU_DATAPATH_ID:
+ /* NOTE(review): unlike the other patterns, the alternation here is
+ * not parenthesized, so '^' binds only to "Random" and '$' only to
+ * the MAC form. */
+ out = prompt("Datapath ID:", config.datapath_id,
+ "^Random|"MAC_RE"$");
+ free(config.datapath_id);
+ config.datapath_id = out;
+ break;
+
+ case MENU_STP:
+ out = prompt("802.1D-1998 STP:",
+ config.stp ? "Enabled" : "Disabled",
+ "^(Enabled|Disabled)$");
+ config.stp = !strcmp(out, "Enabled");
+ free(out);
+ break;
+
+ case MENU_RATE_LIMIT:
+ in = (config.rate_limit < 0
+ ? xstrdup("Disabled")
+ : xasprintf("%d/s", config.rate_limit));
+ out = prompt("Ctlr rate limit:", in,
+ "^(Disabled|("NUM100_TO_99999_RE")/s)$");
+ free(in);
+ /* A leading digit means a number was entered; otherwise "Disabled". */
+ config.rate_limit = isdigit(out[0]) ? atoi(out) : -1;
+ free(out);
+ break;
+
+ case MENU_INACTIVITY_PROBE:
+ in = (config.inactivity_probe < 0
+ ? xstrdup("Default")
+ : xasprintf("%d s", config.inactivity_probe));
+ out = prompt("Activity probe:", in,
+ "^(Default|("NUM5_TO_99999_RE") s)$");
+ free(in);
+ config.inactivity_probe = isdigit(out[0]) ? atoi(out) : -1;
+ free(out);
+ break;
+
+ case MENU_MAX_BACKOFF:
+ in = (config.max_backoff < 0
+ ? xstrdup("Default")
+ : xasprintf("%d s", config.max_backoff));
+ out = prompt("Max backoff:", in,
+ "^(Default|("NUM1_TO_99999_RE") s)$");
+ free(in);
+ config.max_backoff = isdigit(out[0]) ? atoi(out) : -1;
+ free(out);
+ break;
+ }
+ }
+
+ /* NOTE(review): no DATAPATH_ID setting is emitted below, so an edited
+ * datapath ID does not appear to be saved; and CONTROLLER is emitted only
+ * on the static-IP branch, not when SWITCH_IP=dhcp -- confirm both are
+ * intended. */
+ if (yesno("Save\nChanges?", false)) {
+ struct svec set;
+ char *netdevs;
+
+ svec_init(&set);
+ netdevs = svec_join(&config.netdevs, " ", "");
+ svec_add_nocopy(&set, xasprintf("NETDEVS=%s", netdevs));
+ free(netdevs);
+ svec_add(&set,
+ config.mode == IN_BAND ? "MODE=in-band" : "MODE=discovery");
+ if (config.mode == IN_BAND) {
+ if (config.switch_ip == htonl(0)) {
+ svec_add(&set, "SWITCH_IP=dhcp");
+ } else {
+ svec_add_nocopy(&set, xasprintf("SWITCH_IP="IP_FMT,
+ IP_ARGS(&config.switch_ip)));
+ svec_add_nocopy(&set,
+ xasprintf("SWITCH_NETMASK="IP_FMT,
+ IP_ARGS(&config.switch_mask)));
+ svec_add_nocopy(&set, xasprintf("SWITCH_GATEWAY="IP_FMT,
+ IP_ARGS(&config.switch_gw)));
+ svec_add_nocopy(&set, xasprintf("CONTROLLER=%s",
+ config.controller_vconn));
+ }
+ }
+ svec_add(&set, (config.disconnected == FAIL_DROP
+ ? "DISCONNECTED_MODE=drop"
+ : "DISCONNECTED_MODE=switch"));
+ svec_add_nocopy(&set, xasprintf("STP=%s", config.stp ? "yes" : "no"));
+ /* A negative value is written as an empty setting. */
+ if (config.rate_limit < 0) {
+ svec_add(&set, "RATE_LIMIT=");
+ } else {
+ svec_add_nocopy(&set,
+ xasprintf("RATE_LIMIT=%d", config.rate_limit));
+ }
+ if (config.inactivity_probe < 0) {
+ svec_add(&set, "INACTIVITY_PROBE=");
+ } else {
+ svec_add_nocopy(&set, xasprintf("INACTIVITY_PROBE=%d",
+ config.inactivity_probe));
+ }
+ if (config.max_backoff < 0) {
+ svec_add(&set, "MAX_BACKOFF=");
+ } else {
+ svec_add_nocopy(&set, xasprintf("MAX_BACKOFF=%d",
+ config.max_backoff));
+ }
+ save_config(&set);
+ svec_destroy(&set);
+ }
+
+ svec_destroy(&config.netdevs);
+ free(config.controller_vconn);
+ free(config.datapath_id);
+}
+
+/* Interactive PKI setup.
+ *
+ * Loads the current configuration, offers to generate a private key and a
+ * self-signed certificate with ovs-pki (the default answer is "yes" only if
+ * either file is missing), then asks how to handle the CA certificate,
+ * and finally saves PRIVKEY, CERT, CACERT and CACERT_MODE with
+ * save_config().
+ *
+ * Returns without saving if the configuration cannot be loaded, if key
+ * generation fails, or if there are no keys and the user declines to
+ * generate them. */
+static void
+cmd_setup_pki(const struct dict *dict UNUSED)
+{
+    static const char def_privkey_file[]
+        = "/etc/openflow-switch/of0-privkey.pem";
+    static const char def_cert_file[] = "/etc/openflow-switch/of0-cert.pem";
+    static const char def_cacert_file[] = "/etc/openflow-switch/cacert.pem";
+    struct dict config_dict;
+    const char *privkey_file, *cert_file, *cacert_file;
+    bool bootstrap;
+    struct stat s;
+    struct svec set;
+    bool has_keys;
+
+    if (!load_config(&config_dict)) {
+        return;
+    }
+    /* These strings may point into 'config_dict', so it must stay alive
+     * until they have been copied into 'set' below. */
+    privkey_file = dict_get_string(&config_dict, "PRIVKEY", def_privkey_file);
+    cert_file = dict_get_string(&config_dict, "CERT", def_cert_file);
+    cacert_file = dict_get_string(&config_dict, "CACERT", def_cacert_file);
+    bootstrap = !strcmp(dict_get_string(&config_dict, "CACERT_MODE", "secure"),
+                        "bootstrap");
+
+    has_keys = !stat(privkey_file, &s) && !stat(cert_file, &s);
+    if (!has_keys
+        ? yesno("Generate\nkeys?", true)
+        : yesno("Generate\nnew keys?", false)) {
+        struct svec argv;
+        bool ok;
+
+        /* Newly generated keys are referred to by the default paths. */
+        privkey_file = def_privkey_file;
+        cert_file = def_cert_file;
+
+        svec_init(&argv);
+        svec_parse_words(&argv, "sh -c 'cd /etc/openflow-switch "
+                         "&& ovs-pki --force req of0"
+                         "&& ovs-pki --force self-sign of0'");
+        svec_terminate(&argv);
+        ok = run_and_report_failure(argv.names, "Key gen failed");
+        svec_destroy(&argv);
+        if (!ok) {
+            dict_free(&config_dict);
+            return;
+        }
+        has_keys = true;
+    }
+    if (!has_keys) {
+        dict_free(&config_dict);
+        return;
+    }
+
+    if (stat(cacert_file, &s) && errno == ENOENT) {
+        bootstrap = yesno("Bootstrap\nCA cert?", bootstrap);
+    } else if (yesno("Replace\nCA cert?", false)) {
+        /* Remove the old CA certificate and switch to bootstrap mode. */
+        unlink(cacert_file);
+        bootstrap = true;
+    }
+
+    svec_init(&set);
+    svec_add_nocopy(&set, xasprintf("PRIVKEY=%s", privkey_file));
+    svec_add_nocopy(&set, xasprintf("CERT=%s", cert_file));
+    svec_add_nocopy(&set, xasprintf("CACERT=%s", cacert_file));
+    svec_add_nocopy(&set, xasprintf("CACERT_MODE=%s",
+                                    bootstrap ? "bootstrap" : "secure"));
+    save_config(&set);
+    svec_destroy(&set);
+    dict_free(&config_dict);
+}
+
+/* Parses the command-line options in 'argv' (with 'argc' elements) using
+ * getopt_long().
+ *
+ * -h/--help prints usage and exits, -V/--version prints version information
+ * and exits successfully, and an unrecognized option exits with
+ * EXIT_FAILURE.  The remaining cases are supplied by the VLOG_OPTION_HANDLERS
+ * and DAEMON_OPTION_HANDLERS macros. */
+static void
+parse_options(int argc, char *argv[])
+{
+ enum {
+ OPT_DUMMY = UCHAR_MAX + 1,
+ VLOG_OPTION_ENUMS
+ };
+ static struct option long_options[] = {
+ {"verbose", optional_argument, 0, 'v'},
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ DAEMON_LONG_OPTIONS,
+ VLOG_LONG_OPTIONS,
+ {0, 0, 0, 0},
+ };
+ /* Derive the short option string from the long option table. */
+ char *short_options = long_options_to_short_options(long_options);
+
+ for (;;) {
+ int c;
+
+ c = getopt_long(argc, argv, short_options, long_options, NULL);
+ if (c == -1) {
+ break;
+ }
+
+ switch (c) {
+ case 'h':
+ /* usage() calls exit(), so control does not fall through. */
+ usage();
+
+ case 'V':
+ OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION);
+ exit(EXIT_SUCCESS);
+
+ VLOG_OPTION_HANDLERS
+ DAEMON_OPTION_HANDLERS
+
+ case '?':
+ exit(EXIT_FAILURE);
+
+ default:
+ abort();
+ }
+ }
+ free(short_options);
+}
+
+/* Prints a usage message for the program to stdout, including the connection
+ * method help from vconn_usage(), and then exits with EXIT_SUCCESS. */
+static void
+usage(void)
+{
+ printf("%s: OpenFlow switch monitoring user interface\n"
+ "usage: %s [OPTIONS] SWITCH\n"
+ "where SWITCH is an active OpenFlow connection method.\n",
+ program_name, program_name);
+ vconn_usage(true, false, false);
+ printf("\nOptions:\n"
+ " -v, --verbose=MODULE:FACILITY:LEVEL configure logging levels\n"
+ " -v, --verbose set maximum verbosity level\n"
+ " -h, --help display this help message\n"
+ " -V, --version display version information\n");
+ exit(EXIT_SUCCESS);
+}
diff --git a/extras/ezio/terminal.c b/extras/ezio/terminal.c
new file mode 100644
index 000000000..eacf0af06
--- /dev/null
+++ b/extras/ezio/terminal.c
@@ -0,0 +1,833 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+#include "terminal.h"
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "dynamic-string.h"
+#include "ezio.h"
+#include "poll-loop.h"
+#include "util.h"
+
+#define THIS_MODULE VLM_terminal
+#include "vlog.h"
+
+/* UTF-8 decoding. */
+static struct utf8_reader *utf8_reader_create(void);
+static void utf8_reader_destroy(struct utf8_reader *);
+static int utf8_reader_read(struct utf8_reader *, uint8_t c);
+
+/* ANSI escape sequence decoding. */
+
+/* One decoded escape sequence, as consumed by recv_ansi_sequence(). */
+struct ansi_sequence {
+ int n_args; /* Number of entries of 'args' that are logged. */
+#define ANSI_MAX_ARGS 16
+ int args[ANSI_MAX_ARGS]; /* A negative value selects the default. */
+ int function; /* Code that identifies the sequence. */
+};
+
+static struct ansi_decoder *ansi_decoder_create(void);
+static void ansi_decoder_destroy(struct ansi_decoder *);
+static int ansi_decoder_put(struct ansi_decoder *, uint8_t c);
+static const struct ansi_sequence *ansi_decoder_get(struct ansi_decoder *);
+
+/* Terminal emulation. */
+
+/* State for interpreting one input byte stream. */
+struct terminal {
+ struct ansi_decoder *ansi; /* Escape sequence decoder. */
+ struct utf8_reader *utf8; /* UTF-8 decoder, used when 'encoding' is UTF8. */
+ enum { EZIO, UTF8 } encoding; /* Selected by bytes 0x0e (EZIO), 0x0f (UTF8). */
+};
+
+static void recv_byte(struct terminal *term, struct ezio *ezio, uint8_t c);
+
+/* Allocates and returns a new terminal with its own ANSI and UTF-8 decoders,
+ * initially interpreting input as UTF-8.  Release it with
+ * terminal_destroy(). */
+struct terminal *
+terminal_create(void)
+{
+    struct terminal *t;
+
+    t = xmalloc(sizeof *t);
+    t->encoding = UTF8;
+    t->ansi = ansi_decoder_create();
+    t->utf8 = utf8_reader_create();
+    return t;
+}
+
+/* Frees 'term' together with its decoders.  Does nothing if 'term' is
+ * null. */
+void
+terminal_destroy(struct terminal *term)
+{
+    if (!term) {
+        return;
+    }
+
+    utf8_reader_destroy(term->utf8);
+    ansi_decoder_destroy(term->ansi);
+    free(term);
+}
+
+/* Reads up to 512 bytes from 'input_fd' with a single read() call and feeds
+ * each byte to 'term', which updates 'ezio' accordingly.
+ *
+ * Returns 0 if bytes were processed or if read() failed with EAGAIN, EOF if
+ * read() returned 0, and otherwise the errno value from read(). */
+int
+terminal_run(struct terminal *term, struct ezio *ezio, int input_fd)
+{
+    char input[512];
+    const char *p;
+    int n;
+
+    n = read(input_fd, input, sizeof input);
+    if (n == 0) {
+        return EOF;
+    }
+    if (n < 0) {
+        return errno == EAGAIN ? 0 : errno;
+    }
+
+    for (p = input; p < input + n; p++) {
+        recv_byte(term, ezio, *p);
+    }
+    return 0;
+}
+
+/* Registers 'input_fd' with the poll loop so that the next poll_block()
+ * wakes up when it becomes readable.  'term' is not used. */
+void
+terminal_wait(struct terminal *term UNUSED, int input_fd)
+{
+ poll_fd_wait(input_fd, POLLIN);
+}
+
+/* Helpers for recv_byte() and recv_ansi_sequence(). */
+static void recv_ansi_sequence(const struct ansi_sequence *, struct ezio *);
+static void recv_control(uint8_t c, struct ezio *);
+static void recv_character(uint8_t byte, struct ezio *);
+static int unicode_to_ezio(uint16_t unicode);
+static int default_arg(int value, int default_value);
+static int range(int value, int min, int max);
+static void clear_elements(uint8_t *p, size_t size, int pos, int clear_type);
+static void define_icon(struct ezio *e, const int *args);
+static void clear_icon(struct ezio *e, int icon_nr);
+static void set_cursor(struct ezio *e, int visibility);
+
+/* Processes one input byte 'c' for 'term', applying its effect to 'ezio'.
+ *
+ * The byte is first offered to the ANSI decoder.  A negative result causes
+ * the decoded sequence to be fetched and executed, and a zero result ends
+ * processing of this byte; only a positive result lets the byte continue.
+ * Bytes 0x0e and 0x0f then select the EZIO and UTF-8 encodings respectively.
+ * Any other byte is interpreted according to the current encoding. */
+static void
+recv_byte(struct terminal *term, struct ezio *ezio, uint8_t c)
+{
+    int unicode, ezchar;
+    int retval;
+
+    /* Decode and interpret ANSI escape sequences. */
+    retval = ansi_decoder_put(term->ansi, c);
+    if (retval < 0) {
+        recv_ansi_sequence(ansi_decoder_get(term->ansi), ezio);
+        return;
+    }
+    if (retval == 0) {
+        return;
+    }
+
+    /* Encoding selection. */
+    switch (c) {
+    case 0x0e:
+        /* Shift Out. */
+        term->encoding = EZIO;
+        return;
+
+    case 0x0f:
+        /* Shift In. */
+        term->encoding = UTF8;
+        return;
+    }
+
+    if (term->encoding != UTF8) {
+        /* EZIO encoding: bytes 0x80...0x86 are folded onto 0x00...0x06,
+         * and 0xfe is never output. */
+        if (c >= 0x80 && c < 0x87) {
+            c &= 0x07;
+        }
+        if (c != 0xfe) {
+            recv_character(c, ezio);
+        }
+        return;
+    }
+
+    /* Convert UTF-8 input to Unicode code point. */
+    unicode = utf8_reader_read(term->utf8, c);
+    if (unicode < 0) {
+        return;
+    }
+
+    /* Convert Unicode code point to EZIO encoding. */
+    ezchar = unicode_to_ezio(unicode);
+    if (ezchar >= 0) {
+        /* A nonzero high byte is output as a character of its own first. */
+        if (ezchar & 0xff00) {
+            recv_character(ezchar >> 8, ezio);
+        }
+        recv_character(ezchar, ezio);
+    } else if (unicode < 0x100) {
+        recv_control(unicode, ezio);
+    }
+    /* Otherwise: unsupported Unicode code point, which is ignored. */
+}
+
+/* Logs, at debug level, a description of escape sequence 'seq' followed by
+ * its arguments and the current cursor position of 'e'.
+ *
+ * Known function codes are logged by name.  Unknown ones are logged in
+ * hexadecimal, and codes below 0x100 additionally in column/row ("xx/yy")
+ * notation. */
+static void
+log_ansi_sequence(const struct ansi_sequence *seq, struct ezio *e)
+{
+    struct sequence {
+        int function;
+        const char *name;
+    };
+    static const struct sequence sequences[] = {
+        {0x5a, "CBT: Cursor Backward Tabulation"},
+        {0x47, "CHA: Cursor Character Absolute"},
+        {0x49, "CHT: Cursor Forward Tabulation"},
+        {0x45, "CNL: Cursor Next Line"},
+        {0x46, "CPL: Cursor Preceding Line"},
+        {0x44, "CUB: Cursor Left"},
+        {0x42, "CUD: Cursor Down"},
+        {0x43, "CUF: Cursor Right"},
+        {0x48, "CUP: Cursor Position"},
+        {0x41, "CUU: Cursor Up"},
+        {0x50, "DCH: Delete Character"},
+        {0x4d, "DL: Delete Line"},
+        {0x58, "ECH: Erase Character"},
+        {0x4a, "ED: Erase in Page"},
+        {0x4b, "EL: Erase in Line"},
+        {0x40, "ICH: Insert Character"},
+        {0x4c, "IL: Insert Line"},
+        {0x4500, "NEL: Next Line"},
+        {0x4d00, "RI: Reverse Line Feed"},
+        {0x6300, "RIS: Reset to Initial State"},
+        {0x54, "SD: Scroll Down"},
+        {0x240, "SL: Scroll Left"},
+        {0x241, "SR: Scroll Right"},
+        {0x53, "SU: Scroll Up"},
+        {0x70, "DICO: Define Icon"},
+        {0x71, "CICO: Clear Icon"},
+        {0x72, "Set cursor visibility"},
+    };
+    const struct sequence *s;
+    struct ds ds;
+    int i;
+
+    ds_init(&ds);
+    for (s = sequences; s < &sequences[ARRAY_SIZE(sequences)]; s++) {
+        if (s->function == seq->function) {
+            ds_put_cstr(&ds, s->name);
+            goto found;
+        }
+    }
+    /* Not in the table.  At this point 's' points one past the end of
+     * 'sequences', so the code must be taken from 'seq', not from 's'. */
+    ds_put_format(&ds, "0x%02x", seq->function);
+    if (seq->function < 0x100) {
+        ds_put_format(&ds, "(%02d/%02d)",
+                      seq->function / 16, seq->function % 16);
+    }
+
+found:
+    for (i = 0; i < seq->n_args; i++) {
+        ds_put_format(&ds, ", %d", seq->args[i]);
+    }
+    VLOG_DBG("%s (cursor:%d,%d)", ds_cstr(&ds), e->x, e->y);
+    ds_destroy(&ds);
+}
+
+/* Executes decoded escape sequence 'seq' against display state 'e'.
+ *
+ * ARG1(DEFAULT) and ARG2(DEFAULT) yield the first and second argument of
+ * 'seq', or DEFAULT when the argument is negative.  Function codes that are
+ * not listed in the switch are ignored.  After any sequence, the cursor is
+ * clamped to columns 0...40 and rows 0...1; column 40 is one past the last
+ * column, where recv_character() wraps before output.
+ *
+ * NOTE(review): the arguments are read without consulting seq->n_args, so
+ * this presumably relies on the decoder filling unused slots with negative
+ * values -- confirm. */
+static void
+recv_ansi_sequence(const struct ansi_sequence *seq, struct ezio *e)
+{
+#define ARG1(DEFAULT) default_arg(seq->args[0], DEFAULT)
+#define ARG2(DEFAULT) default_arg(seq->args[1], DEFAULT)
+ if (VLOG_IS_DBG_ENABLED()) {
+ log_ansi_sequence(seq, e);
+ }
+ switch (seq->function) {
+ case 0x5a: /* CBT: Cursor Backward Tabulation. */
+ e->x = 8 * (e->x / 8 - ARG1(1));
+ break;
+ case 0x47: /* CHA: Cursor Character Absolute. */
+ e->x = ARG1(1) - 1;
+ break;
+ case 0x49: /* CHT: Cursor Forward Tabulation. */
+ e->x = 8 * (e->x / 8 + ARG1(1));
+ break;
+ case 0x45: /* CNL: Cursor Next Line. */
+ e->x = 0;
+ e->y += ARG1(1);
+ break;
+ case 0x46: /* CPL: Cursor Preceding Line. */
+ e->x = 0;
+ e->y -= ARG1(1);
+ break;
+ case 0x44: /* CUB: Cursor Left. */
+ e->x -= ARG1(1);
+ break;
+ case 0x42: /* CUD: Cursor Down. */
+ e->y += ARG1(1);
+ break;
+ case 0x43: /* CUF: Cursor Right. */
+ e->x += ARG1(1);
+ break;
+ case 0x48: /* CUP: Cursor Position. */
+ /* Arguments are 1-based row and column. */
+ e->y = ARG1(1) - 1;
+ e->x = ARG2(1) - 1;
+ break;
+ case 0x41: /* CUU: Cursor Up. */
+ e->y -= ARG1(1);
+ break;
+ case 0x50: /* DCH: Delete Character. */
+ ezio_delete_char(e, e->x, e->y, ARG1(1));
+ break;
+ case 0x4d: /* DL: Delete Line. */
+ ezio_delete_line(e, e->y, ARG1(1));
+ break;
+ case 0x58: /* ECH: Erase Character. */
+ /* Erase at most to the end of the current row. */
+ memset(&e->chars[e->y][e->x], ' ', MIN(ARG1(1), 40 - e->x));
+ break;
+ case 0x4a: /* ED: Erase in Page. */
+ clear_elements(&e->chars[0][0], 2 * 40, e->x + 40 * e->y, ARG1(0));
+ break;
+ case 0x4b: /* EL: Erase in Line. */
+ clear_elements(&e->chars[e->y][0], 40, e->x, ARG1(0));
+ break;
+ case 0x40: /* ICH: Insert Character. */
+ ezio_insert_char(e, e->x, e->y, ARG1(1));
+ break;
+ case 0x4c: /* IL: Insert Line. */
+ ezio_insert_line(e, e->y, ARG1(1));
+ break;
+ case 0x4500: /* NEL: Next Line. */
+ e->x = 0;
+ e->y++;
+ break;
+ case 0x4d00: /* RI: Reverse Line Feed. */
+ e->y--;
+ break;
+ case 0x6300: /* RIS: Reset to Initial State. */
+ ezio_init(e);
+ break;
+ case 0x54: /* SD: Scroll Down. */
+ ezio_scroll_down(e, ARG1(1));
+ break;
+ case 0x240: /* SL: Scroll Left. */
+ ezio_scroll_left(e, ARG1(1));
+ break;
+ case 0x241: /* SR: Scroll Right. */
+ ezio_scroll_right(e, ARG1(1));
+ break;
+ case 0x53: /* SU: Scroll Up. */
+ ezio_scroll_up(e, ARG1(1));
+ break;
+
+ /* Private sequences. */
+ case 0x70: /* DICO: Define Icon. */
+ define_icon(e, seq->args);
+ break;
+ case 0x71: /* CICO: Clear Icon. */
+ clear_icon(e, ARG1(0));
+ break;
+ case 0x72: /* Set cursor visibility. */
+ set_cursor(e, ARG1(1));
+ break;
+ }
+ /* Cursor movement above is unchecked; bring it back into range here. */
+ e->x = range(e->x, 0, 40);
+ e->y = range(e->y, 0, 1);
+ VLOG_DBG("cursor:%d,%d", e->x, e->y);
+}
+
+/* Applies the control character 'c' to the cursor/display state of 'e'.
+ * Control characters with no action here are only logged at debug level. */
+static void
+recv_control(uint8_t c, struct ezio *e)
+{
+    if (c == '\b') {
+        /* Backspace: step left unless already in the first column. */
+        if (e->x > 0) {
+            e->x -= 1;
+        }
+    } else if (c == '\t') {
+        /* Tab: advance to the next multiple of 8; start a new line if that
+         * lands beyond column 40. */
+        e->x = ROUND_UP(e->x + 1, 8);
+        if (e->x > 40) {
+            ezio_newline(e);
+        }
+    } else if (c == '\n') {
+        ezio_line_feed(e);
+    } else if (c == '\f') {
+        ezio_clear(e);
+    } else if (c == '\r') {
+        e->x = 0;
+    } else {
+        VLOG_DBG("Unhandled control character 0x%02"PRIx8, c);
+    }
+}
+
+/* Writes the display character 'byte' at the cursor position of 'e' and
+ * advances the cursor one column, first starting a new line if the cursor is
+ * already at or beyond column 40. */
+static void
+recv_character(uint8_t byte, struct ezio *e)
+{
+    int column;
+
+    if (e->x >= 40) {
+        ezio_newline(e);
+    }
+
+    /* The cursor is advanced before the character is stored, exactly as a
+     * post-increment in the argument list would do. */
+    column = e->x;
+    e->x = column + 1;
+    ezio_put_char(e, column, e->y, byte);
+}
+
+/* Returns 'value' if it is nonnegative, otherwise 'default_value'.  A negative
+ * 'value' is how an omitted escape-sequence argument is represented. */
+static int
+default_arg(int value, int default_value)
+{
+    if (value < 0) {
+        return default_value;
+    }
+    return value;
+}
+
+/* Clamps 'value' to the closed interval ['min', 'max'].  The lower bound is
+ * checked first, so 'min' wins if the bounds are inverted. */
+static int
+range(int value, int min, int max)
+{
+    if (value < min) {
+        return min;
+    }
+    if (value > max) {
+        return max;
+    }
+    return value;
+}
+
+/* Blanks part of the 'size'-byte array 'p' with spaces, relative to the
+ * element index 'pos':
+ *
+ *   - 'clear_type' 0: from 'pos' through the end.
+ *   - 'clear_type' 1: from the beginning through 'pos', inclusive.
+ *   - 'clear_type' 2: everything.
+ *
+ * Any other 'clear_type' does nothing.
+ *
+ * 'pos' may legitimately equal 'size' (the cursor column can sit one past the
+ * last cell), so it is clamped to the array bounds before use; no byte
+ * outside 'p[0]'...'p[size - 1]' is ever written. */
+static void
+clear_elements(uint8_t *p, size_t size, int pos, int clear_type)
+{
+    /* 'at' is 'pos' clamped to [0, size]. */
+    size_t at = pos < 0 ? 0 : (size_t) pos > size ? size : (size_t) pos;
+
+    switch (clear_type) {
+    case 0:
+        /* Clear from 'pos' to end. */
+        memset(p + at, ' ', size - at);
+        break;
+    case 1:
+        /* Clear from beginning to 'pos', but never past the last element. */
+        if (pos >= 0) {
+            memset(p, ' ', at < size ? at + 1 : size);
+        }
+        break;
+    case 2:
+        /* Clear all. */
+        memset(p, ' ', size);
+        break;
+    }
+}
+
+/* Handles the private DICO (Define Icon) sequence.  'args[0]' selects one of
+ * the icons 0...7; 'args[1]' through 'args[8]' supply its eight rows, of which
+ * only the low 5 bits are kept.  Omitted (negative) rows default to 0.  An
+ * out-of-range icon number is ignored. */
+static void
+define_icon(struct ezio *e, const int *args)
+{
+    const int icon_nr = args[0];
+    int i;
+
+    if (icon_nr >= 0 && icon_nr <= 7) {
+        for (i = 1; i <= 8; i++) {
+            e->icons[icon_nr][i - 1] = default_arg(args[i], 0) & 0x1f;
+        }
+    }
+}
+
+/* Handles the private CICO (Clear Icon) sequence: restores icon 'icon_nr' to
+ * its default via ezio_set_default_icon().  Icon numbers outside 0...7 are
+ * ignored. */
+static void
+clear_icon(struct ezio *e, int icon_nr)
+{
+    if (icon_nr < 0 || icon_nr > 7) {
+        return;
+    }
+    ezio_set_default_icon(e, icon_nr);
+}
+
+/* Sets the cursor appearance of 'e' from 'visibility':
+ *
+ *   1: cursor hidden.
+ *   2: cursor shown, not blinking.
+ *   3: cursor shown and blinking.
+ *
+ * Any other value leaves the cursor settings unchanged. */
+static void
+set_cursor(struct ezio *e, int visibility)
+{
+    if (visibility < 1 || visibility > 3) {
+        return;
+    }
+    e->blink_cursor = visibility == 3;
+    e->show_cursor = visibility >= 2;
+}
+
+/* Translates the 16-bit Unicode code point 'unicode' into the EZIO display's
+ * character set.
+ *
+ * Returns the EZIO character code, or -1 if there is no mapping.  Most results
+ * fit in one byte.  Voiced and semi-voiced katakana return a 16-bit value whose
+ * high byte is the code of the base katakana and whose low byte is 0xde or
+ * 0xdf, the same codes returned for the stand-alone voiced and semi-voiced
+ * sound marks.
+ * NOTE(review): presumably the caller emits such values as two display cells;
+ * confirm against the call site.
+ *
+ * The "case A ... B" ranges below are a GCC extension. */
+static int
+unicode_to_ezio(uint16_t unicode)
+{
+ switch (unicode) {
+ /* Most ASCII characters map one-to-one. */
+ case 0x0020 ... 0x005b:
+ case 0x005d ... 0x007d:
+ return unicode;
+
+ /* A few ASCII characters have to be simulated with icons. */
+ case 0x005c: return 0x06; /* BACKSLASH */
+ case 0x007e: return 0x07; /* TILDE */
+
+ /* EZIO extended characters equivalents in Unicode - Japanese. */
+ case 0x00a5: return '\\'; /* YEN SIGN */
+ case 0x3002: return 0xa1; /* IDEOGRAPHIC FULL STOP */
+ case 0x300c: return 0xa2; /* LEFT CORNER BRACKET */
+ case 0x300d: return 0xa3; /* RIGHT CORNER BRACKET */
+ case 0x3001: return 0xa4; /* IDEOGRAPHIC COMMA */
+ case 0x30fb: return 0xa5; /* KATAKANA MIDDLE DOT */
+ case 0x30f2: return 0xa6; /* KATAKANA LETTER WO */
+ case 0x30a1: return 0xa7; /* KATAKANA LETTER SMALL A */
+ case 0x30a3: return 0xa8; /* KATAKANA LETTER SMALL I */
+ case 0x30a5: return 0xa9; /* KATAKANA LETTER SMALL U */
+ case 0x30a7: return 0xaa; /* KATAKANA LETTER SMALL E */
+ case 0x30a9: return 0xab; /* KATAKANA LETTER SMALL O */
+ case 0x30e3: return 0xac; /* KATAKANA LETTER SMALL YA */
+ case 0x30e5: return 0xad; /* KATAKANA LETTER SMALL YU */
+ case 0x30e7: return 0xae; /* KATAKANA LETTER SMALL YO */
+ case 0x30c3: return 0xaf; /* KATAKANA LETTER SMALL TU = SMALL TSU */
+ case 0x30fc: return 0xb0; /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */
+ case 0x30a2: return 0xb1; /* KATAKANA LETTER A */
+ case 0x30a4: return 0xb2; /* KATAKANA LETTER I */
+ case 0x30a6: return 0xb3; /* KATAKANA LETTER U */
+ case 0x30a8: return 0xb4; /* KATAKANA LETTER E */
+ case 0x30aa: return 0xb5; /* KATAKANA LETTER O */
+ case 0x30ab: return 0xb6; /* KATAKANA LETTER KA */
+ case 0x30ac: return 0xb6de; /* KATAKANA LETTER GA */
+ case 0x30ad: return 0xb7; /* KATAKANA LETTER KI */
+ case 0x30ae: return 0xb7de; /* KATAKANA LETTER GI */
+ case 0x30af: return 0xb8; /* KATAKANA LETTER KU */
+ case 0x30b0: return 0xb8de; /* KATAKANA LETTER GU */
+ case 0x30b1: return 0xb9; /* KATAKANA LETTER KE */
+ case 0x30b2: return 0xb9de; /* KATAKANA LETTER GE */
+ case 0x30b3: return 0xba; /* KATAKANA LETTER KO */
+ case 0x30b4: return 0xbade; /* KATAKANA LETTER GO */
+ case 0x30b5: return 0xbb; /* KATAKANA LETTER SA */
+ case 0x30b6: return 0xbbde; /* KATAKANA LETTER ZA */
+ case 0x30b7: return 0xbc; /* KATAKANA LETTER SI = SHI */
+ case 0x30b8: return 0xbcde; /* KATAKANA LETTER ZI = JI */
+ case 0x30b9: return 0xbd; /* KATAKANA LETTER SU */
+ case 0x30ba: return 0xbdde; /* KATAKANA LETTER ZU */
+ case 0x30bb: return 0xbe; /* KATAKANA LETTER SE */
+ case 0x30bc: return 0xbede; /* KATAKANA LETTER ZE */
+ case 0x30bd: return 0xbf; /* KATAKANA LETTER SO */
+ case 0x30be: return 0xbfde; /* KATAKANA LETTER ZO */
+ case 0x30bf: return 0xc0; /* KATAKANA LETTER TA */
+ case 0x30c0: return 0xc0de; /* KATAKANA LETTER DA */
+ case 0x30c1: return 0xc1; /* KATAKANA LETTER TI = CHI */
+ case 0x30c2: return 0xc1de; /* KATAKANA LETTER DI = JI */
+ case 0x30c4: return 0xc2; /* KATAKANA LETTER TU = TSU */
+ case 0x30c5: return 0xc2de; /* KATAKANA LETTER DU = ZU */
+ case 0x30c6: return 0xc3; /* KATAKANA LETTER TE */
+ case 0x30c7: return 0xc3de; /* KATAKANA LETTER DE */
+ case 0x30c8: return 0xc4; /* KATAKANA LETTER TO */
+ case 0x30c9: return 0xc4de; /* KATAKANA LETTER DO */
+ case 0x30ca: return 0xc5; /* KATAKANA LETTER NA */
+ case 0x30cb: return 0xc6; /* KATAKANA LETTER NI */
+ case 0x30cc: return 0xc7; /* KATAKANA LETTER NU */
+ case 0x30cd: return 0xc8; /* KATAKANA LETTER NE */
+ case 0x30ce: return 0xc9; /* KATAKANA LETTER NO */
+ case 0x30cf: return 0xca; /* KATAKANA LETTER HA */
+ case 0x30d0: return 0xcade; /* KATAKANA LETTER BA */
+ case 0x30d1: return 0xcadf; /* KATAKANA LETTER PA */
+ case 0x30d2: return 0xcb; /* KATAKANA LETTER HI */
+ case 0x30d3: return 0xcbde; /* KATAKANA LETTER BI */
+ case 0x30d4: return 0xcbdf; /* KATAKANA LETTER PI */
+ case 0x30d5: return 0xcc; /* KATAKANA LETTER HU = FU */
+ case 0x30d6: return 0xccde; /* KATAKANA LETTER BU */
+ case 0x30d7: return 0xccdf; /* KATAKANA LETTER PU */
+ case 0x30d8: return 0xcd; /* KATAKANA LETTER HE */
+ case 0x30d9: return 0xcdde; /* KATAKANA LETTER BE */
+ case 0x30da: return 0xcddf; /* KATAKANA LETTER PE */
+ case 0x30db: return 0xce; /* KATAKANA LETTER HO */
+ case 0x30dc: return 0xcede; /* KATAKANA LETTER BO */
+ case 0x30dd: return 0xcedf; /* KATAKANA LETTER PO */
+ case 0x30de: return 0xcf; /* KATAKANA LETTER MA */
+ case 0x30df: return 0xd0; /* KATAKANA LETTER MI */
+ case 0x30e0: return 0xd1; /* KATAKANA LETTER MU */
+ case 0x30e1: return 0xd2; /* KATAKANA LETTER ME */
+ case 0x30e2: return 0xd3; /* KATAKANA LETTER MO */
+ case 0x30e4: return 0xd4; /* KATAKANA LETTER YA */
+ case 0x30e6: return 0xd5; /* KATAKANA LETTER YU */
+ case 0x30e8: return 0xd6; /* KATAKANA LETTER YO */
+ case 0x30e9: return 0xd7; /* KATAKANA LETTER RA */
+ case 0x30ea: return 0xd8; /* KATAKANA LETTER RI */
+ case 0x30eb: return 0xd9; /* KATAKANA LETTER RU */
+ case 0x30ec: return 0xda; /* KATAKANA LETTER RE */
+ case 0x30ed: return 0xdb; /* KATAKANA LETTER RO */
+ case 0x30ef: return 0xdc; /* KATAKANA LETTER WA */
+ case 0x30f3: return 0xdd; /* KATAKANA LETTER N */
+ case 0x30f4: return 0xb3de; /* KATAKANA LETTER VU */
+ case 0x30f7: return 0xdcde; /* KATAKANA LETTER VA */
+ case 0x3099: return 0xde; /* COMBINING KATAKANA-HIRAGANA VOICED SOUND
+ * MARK */
+ case 0x309a: return 0xdf; /* COMBINING KATAKANA-HIRAGANA SEMI-VOICED
+ * SOUND MARK */
+ case 0x309b: return 0xde; /* KATAKANA-HIRAGANA VOICED SOUND MARK */
+ case 0x309c: return 0xdf; /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
+
+ /* EZIO extended characters equivalents in Unicode - other. */
+ case 0x2192: return 0x7e; /* RIGHTWARDS ARROW */
+ case 0x2190: return 0x7f; /* LEFTWARDS ARROW */
+ case 0x03b1: return 0xe0; /* GREEK SMALL LETTER ALPHA */
+ case 0x00e4: return 0xe1; /* LATIN SMALL LETTER A WITH DIAERESIS */
+ case 0x03b2: return 0xe2; /* GREEK SMALL LETTER BETA */
+ case 0x03b5: return 0xe3; /* GREEK SMALL LETTER EPSILON */
+ case 0x03bc: return 0xe4; /* GREEK SMALL LETTER MU */
+ case 0x03c6: return 0xe5; /* GREEK SMALL LETTER PHI */
+ case 0x03c1: return 0xe6; /* GREEK SMALL LETTER RHO */
+ /* 0xe7 is 'g'. */
+ case 0x221a: return 0xe8; /* SQUARE ROOT = radical sign */
+ /* 0xe9 is an unrecognizable symbol. */
+ /* 0xea is 'j'. */
+ /* 0xeb is an unrecognizable symbol.*/
+ case 0x00a2: return 0xec; /* CENT SIGN */
+ case 0x00a3: return 0xed; /* POUND SIGN */
+ case 0x00f1: return 0xee; /* LATIN SMALL LETTER N WITH TILDE */
+ case 0x00f6: return 0xef; /* LATIN SMALL LETTER O WITH DIAERESIS */
+ /* 0xf0 is 'p'. */
+ /* 0xf1 is 'q'. */
+ case 0x03b8: return 0xf2; /* GREEK SMALL LETTER THETA */
+ case 0x221e: return 0xf3; /* INFINITY */
+ case 0x03a9: return 0xf4; /* GREEK CAPITAL LETTER OMEGA */
+ case 0x00fc: return 0xf5; /* LATIN SMALL LETTER U WITH DIAERESIS */
+ case 0x03a3: return 0xf6; /* GREEK CAPITAL LETTER SIGMA */
+ case 0x03c0: return 0xf7; /* GREEK SMALL LETTER PI */
+ /* 0xf8 is x-macron (the sample mean). */
+ /* 0xf9 is 'y'. */
+ case 0x5343: return 0xfa; /* thousand */
+ case 0x4e07: return 0xfb; /* ten thousand */
+ case 0x5186: return 0xfc; /* yen */
+ case 0x00f7: return 0xfd; /* DIVISION SIGN */
+ case 0x2588: return 0xff; /* FULL BLOCK = solid */
+
+ /* EZIO icons (from the Unicode Private Use corporate subarea). */
+ case 0xf8f8: return 0x00;
+ case 0xf8f9: return 0x01;
+ case 0xf8fa: return 0x02;
+ case 0xf8fb: return 0x03;
+ case 0xf8fc: return 0x04;
+ case 0xf8fd: return 0x05;
+ case 0xf8fe: return 0x06;
+ case 0xf8ff: return 0x07;
+
+ /* No mappings for anything else. */
+ default: return -1;
+ }
+}
+
+/* UTF-8 decoder. */
+
+/* Decoder state table.  Each UTF_STATE(NAME, MIN, MAX, NEXT) entry says that
+ * in state NAME the next input byte must lie in [MIN, MAX]; a continuation
+ * byte accepted in that state moves the decoder to NEXT.  (Lead bytes seen in
+ * UTF8_INIT pick their successor in utf8_reader_read() instead.)
+ *
+ * UTF8_3, UTF8_2 and UTF8_1 expect 3, 2 and 1 more ordinary continuation
+ * bytes.  UTF8_E0, UTF8_ED, UTF8_F0 and UTF8_F4 follow the lead byte of the
+ * same value and restrict the second byte, which rejects overlong forms,
+ * surrogates and code points above U+10FFFF.
+ *
+ * After the restricted second byte of a four-byte sequence (UTF8_F0,
+ * UTF8_F4) two continuation bytes are still outstanding, so those states
+ * continue in UTF8_2, just as UTF8_3 does. */
+#define UTF_STATES                              \
+    UTF_STATE(UTF8_INIT, 0x00, 0xf4, UTF8_INIT) \
+    UTF_STATE(UTF8_3, 0x80, 0xbf, UTF8_2)       \
+    UTF_STATE(UTF8_2, 0x80, 0xbf, UTF8_1)       \
+    UTF_STATE(UTF8_1, 0x80, 0xbf, UTF8_INIT)    \
+    UTF_STATE(UTF8_E0, 0xa0, 0xbf, UTF8_1)      \
+    UTF_STATE(UTF8_ED, 0x80, 0x9f, UTF8_1)      \
+    UTF_STATE(UTF8_F0, 0x90, 0xbf, UTF8_2)      \
+    UTF_STATE(UTF8_F4, 0x80, 0x8f, UTF8_2)
+
+/* One enumerator per UTF_STATES entry, in table order. */
+enum state {
+#define UTF_STATE(NAME, MIN, MAX, NEXT) NAME,
+    UTF_STATES
+#undef UTF_STATE
+};
+
+/* Acceptable byte range and successor for one decoder state. */
+struct state_info {
+    uint8_t min, max;           /* Inclusive range of acceptable bytes. */
+    enum state next;            /* State after an accepted continuation. */
+};
+
+/* Indexed by enum state. */
+static const struct state_info states[] = {
+#define UTF_STATE(NAME, MIN, MAX, NEXT) {MIN, MAX, NEXT},
+    UTF_STATES
+#undef UTF_STATE
+};
+
+/* Incremental UTF-8 decoder state, fed one byte at a time through
+ * utf8_reader_read(). */
+struct utf8_reader {
+ int cp; /* Code point accumulated so far for the sequence in progress. */
+ enum state state; /* What kind of byte is expected next. */
+};
+
+/* Allocates and returns a new UTF-8 reader positioned at the start of a
+ * character.  Release it with utf8_reader_destroy(). */
+struct utf8_reader *
+utf8_reader_create(void)
+{
+    struct utf8_reader *reader;
+
+    reader = xmalloc(sizeof *reader);
+    reader->state = UTF8_INIT;
+    return reader;
+}
+
+/* Frees 'r', which may be null. */
+void
+utf8_reader_destroy(struct utf8_reader *r)
+{
+    if (r != NULL) {
+        free(r);
+    }
+}
+
+/* Feeds the byte 'c' to UTF-8 reader 'r'.
+ *
+ * Returns the decoded code point when 'c' completes a character, -1 when more
+ * bytes are needed, or 0xfffd (REPLACEMENT CHARACTER) when 'c' is not valid at
+ * this point, in which case the reader is reset to the start-of-character
+ * state and 'c' itself is discarded. */
+int
+utf8_reader_read(struct utf8_reader *r, uint8_t c)
+{
+    const struct state_info *info = &states[r->state];
+    int acceptable = c >= info->min && c <= info->max;
+
+    if (acceptable && r->state != UTF8_INIT) {
+        /* Continuation byte: shift in its six payload bits. */
+        r->cp = (r->cp << 6) | (c & 0x3f);
+        r->state = info->next;
+        if (r->state == UTF8_INIT) {
+            return r->cp;
+        }
+        return -1;
+    }
+
+    if (acceptable) {
+        /* First byte of a character. */
+        if (c < 0x80) {
+            return c;
+        }
+        if (c >= 0xc2 && c <= 0xdf) {
+            r->cp = c & 0x1f;
+            r->state = UTF8_1;
+            return -1;
+        }
+        if (c >= 0xe0 && c <= 0xef) {
+            r->cp = c & 0x0f;
+            if (c == 0xe0) {
+                r->state = UTF8_E0;
+            } else if (c == 0xed) {
+                r->state = UTF8_ED;
+            } else {
+                r->state = UTF8_2;
+            }
+            return -1;
+        }
+        if (c >= 0xf0 && c <= 0xf4) {
+            r->cp = c & 0x07;
+            if (c == 0xf0) {
+                r->state = UTF8_F0;
+            } else if (c == 0xf4) {
+                r->state = UTF8_F4;
+            } else {
+                r->state = UTF8_3;
+            }
+            return -1;
+        }
+        /* 0x80...0xc1 cannot start a character: fall through. */
+    }
+
+    /* Invalid UTF-8 sequence.  Return the Unicode general substitute
+     * REPLACEMENT CHARACTER. */
+    r->state = UTF8_INIT;
+    return 0xfffd;
+}
+
+/* ANSI control sequence decoder. */
+
+/* States are named for what we are looking for in that state. */
+enum ansi_state {
+ ANSI_ESC, /* Looking for ESC. */
+ ANSI_CSI, /* Looking for [ (to complete CSI). */
+ ANSI_PARAMETER, /* Looking for parameter. */
+ ANSI_INTERMEDIATE, /* Looking for intermediate byte. */
+ ANSI_FINAL, /* Looking for final byte. */
+ ANSI_COMPLETE /* Got an entire escape sequence. */
+};
+
+/* Incremental decoder for ANSI escape sequences.  Bytes are fed in with
+ * ansi_decoder_put(); a finished sequence is fetched with
+ * ansi_decoder_get(). */
+struct ansi_decoder {
+ enum ansi_state state; /* Current position within a sequence. */
+ struct ansi_sequence seq; /* Sequence being assembled. */
+ int c; /* NOTE(review): not referenced by the decoder functions in this
+ * part of the file; confirm whether it is still needed. */
+};
+
+/* Allocates and returns a new ANSI escape sequence decoder that is waiting
+ * for an ESC byte.  Release it with ansi_decoder_destroy(). */
+struct ansi_decoder *
+ansi_decoder_create(void)
+{
+    struct ansi_decoder *decoder;
+
+    decoder = xmalloc(sizeof *decoder);
+    decoder->state = ANSI_ESC;
+    return decoder;
+}
+
+/* Frees 'd', which may be null. */
+void
+ansi_decoder_destroy(struct ansi_decoder *d)
+{
+    if (d != NULL) {
+        free(d);
+    }
+}
+
+/* Feeds the byte 'c' to escape sequence decoder 'd'.
+ *
+ * Returns:
+ *
+ *   1 if 'c' is not part of an escape sequence; the caller should treat it
+ *     as ordinary input.
+ *
+ *   0 if 'c' was consumed as part of an incomplete (or abandoned) escape
+ *     sequence.
+ *
+ *  -1 if 'c' completed an escape sequence.  The caller must then call
+ *     ansi_decoder_get() before feeding any byte other than ESC.
+ *
+ * Parameters are separated by ';'.  An empty parameter is stored as -1, and
+ * this includes an empty *first* parameter: "ESC [ ; 5 H" yields the two
+ * arguments (-1, 5), not the single argument (5).  Parameters beyond
+ * ANSI_MAX_ARGS are counted but not stored. */
+int
+ansi_decoder_put(struct ansi_decoder *d, uint8_t c)
+{
+    if (c == 27) {
+        /* Escape always starts a new escape sequence, aborting an incomplete
+         * one if necessary. */
+        if (d->state != ANSI_ESC) {
+            VLOG_DBG("Unexpected escape inside escape sequence");
+        }
+        d->state = ANSI_CSI;
+        return 0;
+    }
+
+    switch (d->state) {
+    case ANSI_ESC:
+        return 1;
+
+    case ANSI_CSI:
+        if (c == '[') {
+            d->state = ANSI_PARAMETER;
+            d->seq.n_args = 0;
+            d->seq.function = 0;
+        } else if (c >= 0x40 && c <= 0x5f) {
+            /* Two-byte sequence ESC Fe: the final byte goes in the high
+             * byte of the function code. */
+            d->state = ANSI_COMPLETE;
+            d->seq.n_args = 0;
+            d->seq.function = c << 8;
+            return -1;
+        } else {
+            d->state = ANSI_ESC;
+        }
+        break;
+
+    case ANSI_PARAMETER:
+        if (c >= '0' && c <= '9') {
+            int *arg;
+            if (d->seq.n_args == 0) {
+                d->seq.args[d->seq.n_args++] = 0;
+            } else if (d->seq.n_args > ANSI_MAX_ARGS) {
+                /* Too many parameters: ignore the excess. */
+                break;
+            }
+            arg = &d->seq.args[d->seq.n_args - 1];
+            if (*arg == -1) {
+                *arg = 0;
+            }
+            /* Saturate instead of overflowing on absurdly long digit runs;
+             * the input comes from an arbitrary child process. */
+            if (*arg < 100000000) {
+                *arg = *arg * 10 + (c - '0');
+            }
+            break;
+        } else if (c == ';') {
+            if (d->seq.n_args == 0) {
+                /* A leading ';' ends an empty first parameter. */
+                d->seq.args[d->seq.n_args++] = -1;
+            }
+            if (d->seq.n_args < ANSI_MAX_ARGS) {
+                d->seq.args[d->seq.n_args] = -1;
+            }
+            /* Counting one past ANSI_MAX_ARGS is enough to remember that
+             * there were too many; never let the counter wrap. */
+            if (d->seq.n_args <= ANSI_MAX_ARGS) {
+                d->seq.n_args++;
+            }
+            break;
+        }
+        d->state = ANSI_INTERMEDIATE;
+        /* Fall through. */
+
+    case ANSI_INTERMEDIATE:
+        if (c >= 0x20 && c <= 0x2f) {
+            d->seq.function = d->seq.function * 16 + (c - 0x20);
+            break;
+        }
+        d->state = ANSI_FINAL;
+        /* Fall through. */
+
+    case ANSI_FINAL:
+        if (c >= 0x40 && c <= 0x7e) {
+            d->seq.function = d->seq.function * 256 + c;
+            d->state = ANSI_COMPLETE;
+            return -1;
+        } else {
+            /* Invalid sequence. */
+            d->state = ANSI_ESC;
+        }
+        break;
+
+    case ANSI_COMPLETE:
+        /* The caller failed to call ansi_decoder_get(). */
+        NOT_REACHED();
+    }
+    return 0;
+}
+
+/* Returns the escape sequence that ansi_decoder_put() just completed (it must
+ * have returned -1) and resets 'd' to look for the next ESC.
+ *
+ * In the returned sequence every argument that was not supplied is -1 and
+ * 'n_args' is at most ANSI_MAX_ARGS.  The sequence is stored inside 'd'. */
+const struct ansi_sequence *
+ansi_decoder_get(struct ansi_decoder *d)
+{
+    int i;
+
+    assert(d->state == ANSI_COMPLETE);
+    d->state = ANSI_ESC;
+
+    /* Mark the unused tail of the argument array as "not supplied". */
+    for (i = d->seq.n_args; i < ANSI_MAX_ARGS; i++) {
+        d->seq.args[i] = -1;
+    }
+
+    /* Excess arguments were counted but never stored. */
+    if (d->seq.n_args >= ANSI_MAX_ARGS) {
+        d->seq.n_args = ANSI_MAX_ARGS;
+    }
+    return &d->seq;
+}
diff --git a/extras/ezio/terminal.h b/extras/ezio/terminal.h
new file mode 100644
index 000000000..1ae5c479d
--- /dev/null
+++ b/extras/ezio/terminal.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#ifndef TERMINAL_H
+#define TERMINAL_H 1
+
+#include <stdbool.h>
+#include <stdint.h>
+
+struct ezio;
+
+/* Terminal emulator interface.
+ * NOTE(review): the implementations are not visible from this header; the
+ * remarks below are inferred from names and signatures only and should be
+ * confirmed against terminal.c. */
+
+/* Presumably creates a terminal; pair with terminal_destroy(). */
+struct terminal *terminal_create(void);
+void terminal_destroy(struct terminal *);
+/* Presumably processes pending input from 'input_fd' into the ezio state. */
+int terminal_run(struct terminal *, struct ezio *, int input_fd);
+/* Presumably arranges to wake up when terminal_run() has work to do. */
+void terminal_wait(struct terminal *, int input_fd);
+
+#endif /* terminal.h */
diff --git a/extras/ezio/tty.c b/extras/ezio/tty.c
new file mode 100644
index 000000000..ce788f285
--- /dev/null
+++ b/extras/ezio/tty.c
@@ -0,0 +1,404 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+#include "extras/ezio/tty.h"
+#include <errno.h>
+#include <fcntl.h>
+#include <pwd.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stropts.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "fatal-signal.h"
+#include "socket-util.h"
+#include "util.h"
+
+#define THIS_MODULE VLM_tty
+#include "vlog.h"
+
+/* Get major() and minor() macros. */
+#if MAJOR_IN_MKDEV
+# include <sys/mkdev.h>
+#elif MAJOR_IN_SYSMACROS
+# include <sys/sysmacros.h>
+#else
+# include <sys/types.h>
+# ifndef major
+# define major(dev) (((dev) >> 8) & 0xff)
+# define minor(dev) ((dev) & 0xff)
+# endif
+#endif
+
+/* Tries, without blocking, to take an exclusive (write) fcntl lock on the
+ * whole of the file open as 'fd'.  Returns 0 on success, otherwise the errno
+ * value from fcntl(). */
+static int
+fcntl_lock(int fd)
+{
+    struct flock whole_file;
+    int status;
+
+    memset(&whole_file, 0, sizeof whole_file);
+    whole_file.l_whence = SEEK_SET;
+    whole_file.l_start = 0;
+    whole_file.l_len = 0;       /* Zero length means "to end of file". */
+    whole_file.l_type = F_WRLCK;
+
+    status = fcntl(fd, F_SETLK, &whole_file);
+    if (status == -1) {
+        return errno;
+    }
+    return 0;
+}
+
+/* Removes the lockfile 'name' if it exists and is stale.
+ *
+ * The lockfile's owner is read either as a 4-byte binary pid or as a decimal
+ * pid at the start of the file.  If the file contains the word "fcntl", it is
+ * considered stale when we can take an fcntl lock on it ourselves; otherwise
+ * it is considered stale when the owning process no longer exists.
+ *
+ * Returns 0 if there is no lockfile or a stale one was removed, otherwise a
+ * positive errno value (EACCES if the device is locked or the file's format
+ * is not recognized).  Each errno value is captured before logging, because
+ * logging may itself change errno. */
+static int
+remove_lockfile(const char *name)
+{
+    char buffer[BUFSIZ];
+    ssize_t n;
+    pid_t pid;
+    int fd;
+
+    /* Remove existing lockfile. */
+    fd = open(name, O_RDWR);
+    if (fd < 0) {
+        int error = errno;
+        if (error == ENOENT) {
+            return 0;
+        }
+        VLOG_ERR("%s: open: %s", name, strerror(error));
+        return error;
+    }
+
+    /* Read lockfile. */
+    n = read(fd, buffer, sizeof buffer - 1);
+    if (n < 0) {
+        int error = errno;
+        VLOG_ERR("%s: read: %s", name, strerror(error));
+        close(fd);
+        return error;
+    }
+    buffer[n] = '\0';
+    if (n == 4 && memchr(buffer, '\0', n)) {
+        /* Binary format: a raw 32-bit pid. */
+        int32_t x;
+        memcpy(&x, buffer, sizeof x);
+        pid = x;
+    } else {
+        /* Text format: decimal pid first. */
+        pid = strtol(buffer, NULL, 10);
+    }
+    if (pid <= 0) {
+        close(fd);
+        VLOG_WARN("%s: format not recognized, treating as locked.", name);
+        return EACCES;
+    }
+
+    /* Is lockfile fresh? */
+    if (strstr(buffer, "fcntl")) {
+        int retval = fcntl_lock(fd);
+        if (retval) {
+            close(fd);
+            VLOG_ERR("%s: device is locked (via fcntl): %s",
+                     name, strerror(retval));
+            return retval;
+        } else {
+            VLOG_WARN("%s: removing stale lockfile (checked via fcntl)", name);
+        }
+    } else {
+        /* Only ESRCH proves that the owner is gone. */
+        if (!(kill(pid, 0) < 0 && errno == ESRCH)) {
+            close(fd);
+            VLOG_ERR("%s: device is locked (without fcntl)", name);
+            return EACCES;
+        } else {
+            VLOG_WARN("%s: removing stale lockfile (without fcntl)", name);
+        }
+    }
+    close(fd);
+
+    /* Remove stale lockfile. */
+    if (unlink(name)) {
+        int error = errno;
+        VLOG_ERR("%s: unlink: %s", name, strerror(error));
+        return error;
+    }
+    return 0;
+}
+
+/* Creates the lockfile 'name', takes an fcntl lock on it, and records our
+ * pid, program name and user name in it, followed by the word "fcntl".
+ *
+ * Returns 0 on success, otherwise a positive errno value.  On any failure
+ * after the file has been created, the file is removed again so that no
+ * lockfile of ours is left behind.
+ *
+ * On success the file descriptor is deliberately kept open (see below) and
+ * the file is registered to be unlinked on fatal signals. */
+static int
+create_lockfile(const char *name)
+{
+    const char *username;
+    char buffer[BUFSIZ];
+    struct passwd *pwd;
+    mode_t old_umask;
+    size_t length;
+    ssize_t n;
+    uid_t uid;
+    int error;
+    int fd;
+
+    /* Create file.  O_EXCL makes this fail if somebody beat us to it. */
+    old_umask = umask(022);
+    fd = open(name, O_WRONLY | O_CREAT | O_EXCL, 0666);
+    error = errno;
+    umask(old_umask);
+    if (fd < 0) {
+        VLOG_ERR("%s: create: %s", name, strerror(error));
+        return error;
+    }
+
+    /* Lock file.  fcntl_lock() reports its error as its return value. */
+    error = fcntl_lock(fd);
+    if (error) {
+        VLOG_ERR("%s: cannot lock: %s", name, strerror(error));
+        close(fd);
+        unlink(name);
+        return error;
+    }
+
+    /* Write to file. */
+    uid = getuid();
+    pwd = getpwuid(uid);
+    username = pwd ? pwd->pw_name : "unknown";
+    snprintf(buffer, sizeof buffer, "%10ld %s %.20s fcntl\n",
+             (long int) getpid(), program_name, username);
+    length = strlen(buffer);
+    n = write(fd, buffer, length);
+    if (n < 0 || (size_t) n != length) {
+        /* A short write does not set errno, so report it as EIO. */
+        error = n < 0 ? errno : EIO;
+        VLOG_ERR("%s: write: %s", name, strerror(error));
+        close(fd);
+        unlink(name);
+        return error;
+    }
+
+    /* We intentionally do not close 'fd', to avoid releasing the fcntl lock.
+     * The assumption here is that we never unlock a tty. */
+    fatal_signal_add_file_to_unlink(name);
+
+    return 0;
+}
+
+/* Replaces any stale lockfile 'name' with a fresh one of our own.  Takes
+ * ownership of 'name', which must have been obtained from malloc(), and frees
+ * it.  Returns 0 on success, otherwise a positive errno value. */
+static int
+do_lock(char *name)
+{
+    int error;
+
+    error = remove_lockfile(name);
+    if (error == 0) {
+        error = create_lockfile(name);
+    }
+    free(name);
+    return error;
+}
+
+/* Locks the tty device 'dev_name' by creating lockfiles in TTY_LOCK_DIR.
+ *
+ * Two lockfiles are created: one named for the device numbers
+ * ("LK.<major of containing fs>.<major>.<minor>") and one named for the
+ * device itself.  For a device under /dev/ the latter is the path relative to
+ * /dev/ with each '/' replaced by '_'; otherwise it is the last component of
+ * 'dev_name'.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+tty_lock(const char *dev_name)
+{
+    struct stat s;
+    char *name;
+    int retval;
+
+    /* Check that the lockfile directory exists. */
+    if (stat(TTY_LOCK_DIR, &s)) {
+        int error = errno;
+        VLOG_ERR("%s: stat: %s", TTY_LOCK_DIR, strerror(error));
+        return error;
+    }
+
+    /* First lock by device number. */
+    if (stat(dev_name, &s)) {
+        int error = errno;
+        VLOG_ERR("%s: stat: %s", dev_name, strerror(error));
+        return error;
+    }
+    retval = do_lock(xasprintf("%s/LK.%03d.%03d.%03d", TTY_LOCK_DIR,
+                               major(s.st_dev),
+                               major(s.st_rdev), minor(s.st_rdev)));
+    if (retval) {
+        return retval;
+    }
+
+    /* Then lock by device name. */
+    if (!strncmp(dev_name, "/dev/", 5)) {
+        char *cp;
+
+        name = xasprintf("%s/%s", TTY_LOCK_DIR, dev_name + 5);
+
+        /* Flatten slashes in the device part only, which begins just after
+         * "TTY_LOCK_DIR/"; the directory's own separators must be kept. */
+        for (cp = name + strlen(TTY_LOCK_DIR) + 1; *cp; cp++) {
+            if (*cp == '/') {
+                *cp = '_';
+            }
+        }
+    } else {
+        char *slash = strrchr(dev_name, '/');
+        name = xasprintf ("%s/%s", TTY_LOCK_DIR, slash ? slash + 1 : dev_name);
+    }
+    return do_lock(name);
+}
+
+/* Terminal settings to reinstate on exit; see tty_set_raw_mode(). */
+struct saved_termios {
+    int fd;                     /* File descriptor for the terminal. */
+    struct termios tios;        /* Settings to restore on 'fd'. */
+};
+
+/* Hook callback: reapplies the terminal settings saved in 's_', which points
+ * to a struct saved_termios.  The result of tcsetattr() is ignored. */
+static void
+restore_termios(void *s_)
+{
+    const struct saved_termios *saved = s_;
+
+    tcsetattr(saved->fd, TCSAFLUSH, &saved->tios);
+}
+
+/* Puts 'fd' into nonblocking mode and, if it is a terminal, into raw mode
+ * (no input/output translation, no echo, no line editing, no signals, 8 data
+ * bits, no parity).  Unless 'speed' is B0, the input and output line speeds
+ * are also set to 'speed'.
+ *
+ * For a terminal, the original settings are saved and a hook is registered
+ * via fatal_signal_add_hook() to restore them.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+tty_set_raw_mode(int fd, speed_t speed)
+{
+    if (isatty(fd)) {
+        struct termios tios;
+        struct saved_termios *s;
+
+        if (tcgetattr(fd, &tios) < 0) {
+            return errno;
+        }
+
+        /* Keep a private duplicate of 'fd' for the restore hook. */
+        s = xmalloc(sizeof *s);
+        s->fd = dup(fd);
+        if (s->fd < 0) {
+            int error = errno;
+            VLOG_WARN("dup failed: %s", strerror(error));
+            free(s);
+            /* Return the saved value: logging and free() may have changed
+             * errno. */
+            return error;
+        }
+        s->tios = tios;
+        fatal_signal_add_hook(restore_termios, s, true);
+
+        tios.c_iflag &= ~(IGNBRK | BRKINT | PARMRK | ISTRIP
+                          | INLCR | IGNCR | ICRNL | IXON);
+        tios.c_oflag &= ~OPOST;
+        tios.c_lflag &= ~(ECHO | ECHONL | ICANON | ISIG | IEXTEN);
+        tios.c_cflag &= ~(CSIZE | PARENB);
+        tios.c_cflag |= CS8;
+        if (speed != B0) {
+            cfsetispeed(&tios, speed);
+            cfsetospeed(&tios, speed);
+        }
+        if (tcsetattr(fd, TCSAFLUSH, &tios) < 0) {
+            return errno;
+        }
+    }
+    return set_nonblocking(fd);
+}
+
+/* Opens a new pseudo-terminal master, grants access to and unlocks its slave
+ * side, and makes the master nonblocking.
+ *
+ * Returns a nonnegative file descriptor on success.  Every failure is
+ * reported as a negative errno value, so that a failure can never be mistaken
+ * for a file descriptor. */
+int
+tty_open_master_pty(void)
+{
+    int error;
+    int fd;
+
+    fd = posix_openpt(O_RDWR | O_NOCTTY);
+    if (fd < 0) {
+        /* Nothing was opened, so there is nothing to close. */
+        error = errno;
+        VLOG_WARN("posix_openpt failed: %s", strerror(error));
+        return -error;
+    }
+
+    if (grantpt(fd) < 0) {
+        error = errno;
+        VLOG_WARN("grantpt failed: %s", strerror(error));
+        close(fd);
+        return -error;
+    }
+
+    if (unlockpt(fd) < 0) {
+        error = errno;
+        VLOG_WARN("unlockpt failed: %s", strerror(error));
+        close(fd);
+        return -error;
+    }
+
+    /* set_nonblocking() returns a positive errno value; negate it like the
+     * other failures. */
+    error = set_nonblocking(fd);
+    if (error) {
+        VLOG_WARN("set_nonblocking failed: %s", strerror(error));
+        close(fd);
+        return -error;
+    }
+
+    return fd;
+}
+
+/* Forks a child process that runs the program 'argv' (found via execvp())
+ * with the slave side of the pty whose master is 'master_fd' as its
+ * controlling terminal, stdin, stdout and stderr, and with TERM set to
+ * "ezio3".
+ *
+ * In the parent, returns 0 if the fork succeeded, otherwise a positive errno
+ * value.  The child never returns: it either executes the program or exits
+ * through ovs_fatal(). */
+int
+tty_fork_child(int master_fd, char *argv[])
+{
+    int retval = fork();
+    if (!retval) {
+        char *slave_name;
+        int slave_fd;
+        int fd;
+
+        /* Running in child process. */
+        fatal_signal_fork();
+
+        /* Open pty slave as controlling terminal. */
+        setsid();
+        slave_name = ptsname(master_fd);
+        if (slave_name == NULL) {
+            ovs_fatal(errno, "ptsname");
+        }
+        slave_fd = open(slave_name, O_RDWR);
+        if (slave_fd < 0) {
+            /* Fail here, with the real cause, rather than later in dup2(). */
+            ovs_fatal(errno, "open pty slave");
+        }
+        if (isastream(slave_fd)
+            && (ioctl(slave_fd, I_PUSH, "ptem") < 0
+                || ioctl(slave_fd, I_PUSH, "ldterm") < 0)) {
+            ovs_fatal(errno, "STREAMS ioctl");
+        }
+
+        /* Make pty slave stdin, stdout. */
+        if (dup2(slave_fd, STDIN_FILENO) < 0
+            || dup2(slave_fd, STDOUT_FILENO) < 0
+            || dup2(slave_fd, STDERR_FILENO) < 0) {
+            ovs_fatal(errno, "dup2");
+        }
+
+        /* Close other file descriptors. */
+        for (fd = 3; fd < 20; fd++) {
+            close(fd);
+        }
+
+        /* Set terminal type. */
+        setenv("TERM", "ezio3", true);
+
+        /* Invoke subprocess. */
+        execvp(argv[0], argv);
+        ovs_fatal(errno, "execvp");
+    } else if (retval > 0) {
+        /* Running in parent process. */
+        return 0;
+    } else {
+        /* Fork failed.  Save errno before logging can change it. */
+        int error = errno;
+        VLOG_WARN("fork failed: %s", strerror(error));
+        return error;
+    }
+}
+
+/* Sets the window size of the terminal open as 'fd' to 'rows' rows by
+ * 'columns' columns, with the pixel dimensions reported as 0.
+ *
+ * Returns 0 on success, otherwise a positive errno value.
+ *
+ * The availability check tests TIOCSWINSZ, the request that is actually
+ * issued below. */
+int
+tty_set_window_size(int fd UNUSED, int rows UNUSED, int columns UNUSED)
+{
+#ifdef TIOCSWINSZ
+    struct winsize win;
+    win.ws_row = rows;
+    win.ws_col = columns;
+    win.ws_xpixel = 0;
+    win.ws_ypixel = 0;
+    if (ioctl(fd, TIOCSWINSZ, &win) == -1) {
+        return errno;
+    }
+#else
+#error "TIOCSWINSZ is required to set the terminal window size"
+#endif
+    return 0;
+}
diff --git a/extras/ezio/tty.h b/extras/ezio/tty.h
new file mode 100644
index 000000000..7500df558
--- /dev/null
+++ b/extras/ezio/tty.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#ifndef TTY_H
+#define TTY_H 1
+
+#include <termios.h>
+
+/* Locks tty device 'dev_name' with lockfiles.  Returns 0 on success or a
+ * positive errno value. */
+int tty_lock(const char *dev_name);
+/* Makes 'fd' nonblocking and, if it is a terminal, raw; a speed other than B0
+ * also sets the line speed.  Returns 0 on success or a positive errno
+ * value. */
+int tty_set_raw_mode(int fd, speed_t);
+/* Opens a pty master.  Returns its file descriptor on success; see tty.c for
+ * how failures are reported. */
+int tty_open_master_pty(void);
+/* Forks a child that runs 'argv' on the slave side of pty 'master_fd'.
+ * Returns 0 in the parent on success or a positive errno value. */
+int tty_fork_child(int master_fd, char *argv[]);
+/* Sets the terminal window size of 'fd'.  Returns 0 on success or a positive
+ * errno value. */
+int tty_set_window_size(int fd, int n_rows, int n_columns);
+
+#endif /* tty.h */
diff --git a/extras/ezio/vt-dummy.c b/extras/ezio/vt-dummy.c
new file mode 100644
index 000000000..f36d31140
--- /dev/null
+++ b/extras/ezio/vt-dummy.c
@@ -0,0 +1,40 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+#include "extras/ezio/vt.h"
+#include <errno.h>
+
+#define THIS_MODULE VLM_vt
+#include "vlog.h"
+
+/* Stand-in for platforms without virtual terminal support: logs an error and
+ * always fails with -ENOSYS.  'open_flags' is not used. */
+int
+vt_open(int open_flags)
+{
+    const int error = ENOSYS;
+
+    VLOG_ERR("no virtual terminal support on this platform");
+    return -error;
+}
diff --git a/extras/ezio/vt-linux.c b/extras/ezio/vt-linux.c
new file mode 100644
index 000000000..f502c9e14
--- /dev/null
+++ b/extras/ezio/vt-linux.c
@@ -0,0 +1,139 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+#include "extras/ezio/vt.h"
+#include <errno.h>
+#include <fcntl.h>
+#include <linux/kd.h>
+#include <linux/vt.h>
+#include <stdint.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include "util.h"
+
+#define THIS_MODULE VLM_vt
+#include "vlog.h"
+
+static bool get_console_fd(int *fd);
+
+/* Allocates an unused Linux virtual terminal, opens its device with
+ * open(2) flags 'open_flags', and switches the console to it.
+ *
+ * Returns a nonnegative file descriptor for the virtual terminal on success,
+ * otherwise a negative errno value (-EACCES if no console file descriptor
+ * could be obtained, -EBUSY if the kernel reports no free virtual
+ * terminal). */
+int
+vt_open(int open_flags)
+{
+    int console_fd, vt_fd;
+    char name[32];
+    int vt;
+
+    if (!get_console_fd(&console_fd)) {
+        return -EACCES;
+    }
+
+    /* Deallocate all unused virtual terminals, so that we don't proliferate an
+     * excess of empty ones over multiple runs. */
+    if (ioctl(console_fd, VT_DISALLOCATE, 0) < 0) {
+        VLOG_WARN("failed to deallocate empty virtual terminals: %s",
+                  strerror(errno));
+    }
+
+    /* Find an unused virtual terminal. */
+    if (ioctl(console_fd, VT_OPENQRY, &vt) < 0) {
+        int error = errno;
+        VLOG_ERR("failed to find a free virtual terminal: %s",
+                 strerror(error));
+        close(console_fd);
+        return -error;
+    }
+    if (vt <= 0) {
+        /* The ioctl can succeed yet report that every terminal is in use by
+         * storing a non-positive number. */
+        VLOG_ERR("failed to find a free virtual terminal: %s",
+                 strerror(EBUSY));
+        close(console_fd);
+        return -EBUSY;
+    }
+
+    /* Open virtual terminal. */
+    snprintf(name, sizeof name, "/dev/tty%d", vt);
+    vt_fd = open(name, open_flags);
+    if (vt_fd < 0) {
+        int error = errno;
+        VLOG_ERR("failed to open %s: %s", name, strerror(error));
+        close(console_fd);
+        return -error;
+    }
+
+    /* Activate virtual terminal. */
+    if (ioctl(console_fd, VT_ACTIVATE, vt) < 0
+        || ioctl(console_fd, VT_WAITACTIVE, vt) < 0) {
+        int error = errno;
+        VLOG_ERR("could not activate virtual terminal %d: %s",
+                 vt, strerror(error));
+        close(console_fd);
+        close(vt_fd);
+        return -error;
+    }
+
+    /* Success. */
+    VLOG_DBG("allocated virtual terminal %d (%s)", vt, name);
+    close(console_fd);
+    return vt_fd;
+}
+
+/* Returns true if 'fd' answers the KDGKBTYPE ioctl with keyboard type KB_101
+ * or KB_84, which is taken to mean that it refers to the console. */
+static bool
+is_console(int fd)
+{
+    uint8_t kb_type = 0;
+
+    if (ioctl(fd, KDGKBTYPE, &kb_type) != 0) {
+        return false;
+    }
+    return kb_type == KB_101 || kb_type == KB_84;
+}
+
+/* Opens 'name' read/write without making it the controlling terminal and
+ * stores the result of open() in '*fdp'.  Returns true, leaving the file
+ * open, only if that succeeded and the file is a console; otherwise returns
+ * false with nothing left open. */
+static bool
+open_console(const char *name, int *fdp)
+{
+    int fd = open(name, O_RDWR | O_NOCTTY);
+
+    *fdp = fd;
+    if (fd < 0) {
+        return false;
+    }
+    if (!is_console(fd)) {
+        close(fd);
+        return false;
+    }
+    return true;
+}
+
+/* Obtains a file descriptor for the console.  Tries, in order, opening
+ * /dev/tty, /dev/tty0 and /dev/console, and then duplicating whichever of
+ * file descriptors 0, 1 and 2 is a console.
+ *
+ * On success stores the descriptor, which the caller must close, in '*fdp'
+ * and returns true.  On failure logs an error and returns false. */
+static bool
+get_console_fd(int *fdp)
+{
+    static const char *const candidates[] = {
+        "/dev/tty",
+        "/dev/tty0",
+        "/dev/console",
+    };
+    unsigned int i;
+    int fd;
+
+    for (i = 0; i < sizeof candidates / sizeof *candidates; i++) {
+        if (open_console(candidates[i], fdp)) {
+            return true;
+        }
+    }
+
+    for (fd = 0; fd < 3; fd++) {
+        if (!is_console(fd)) {
+            continue;
+        }
+        *fdp = dup(fd);
+        if (*fdp >= 0) {
+            return true;
+        }
+    }
+
+    VLOG_ERR("unable to obtain a file descriptor for the console");
+    return false;
+}
diff --git a/extras/ezio/vt.h b/extras/ezio/vt.h
new file mode 100644
index 000000000..2aafe9419
--- /dev/null
+++ b/extras/ezio/vt.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#ifndef VT_H
+#define VT_H 1
+
+int vt_open(int open_flags);
+
+#endif /* vt.h */
diff --git a/include/.gitignore b/include/.gitignore
new file mode 100644
index 000000000..b336cc7ce
--- /dev/null
+++ b/include/.gitignore
@@ -0,0 +1,2 @@
+/Makefile
+/Makefile.in
diff --git a/include/automake.mk b/include/automake.mk
new file mode 100644
index 000000000..a1316c4ab
--- /dev/null
+++ b/include/automake.mk
@@ -0,0 +1,2 @@
+include include/openflow/automake.mk
+include include/openvswitch/automake.mk
diff --git a/include/openflow/automake.mk b/include/openflow/automake.mk
new file mode 100644
index 000000000..d4731550a
--- /dev/null
+++ b/include/openflow/automake.mk
@@ -0,0 +1,4 @@
+noinst_HEADERS += \
+ include/openflow/openflow-mgmt.h \
+ include/openflow/nicira-ext.h \
+ include/openflow/openflow.h
diff --git a/include/openflow/nicira-ext.h b/include/openflow/nicira-ext.h
new file mode 100644
index 000000000..176e0310e
--- /dev/null
+++ b/include/openflow/nicira-ext.h
@@ -0,0 +1,109 @@
+/*
+ * Distributed under the terms of the GNU GPL version 2.
+ * Copyright (c) 2008, 2009 Nicira Networks
+ */
+
+#ifndef OPENFLOW_NICIRA_EXT_H
+#define OPENFLOW_NICIRA_EXT_H 1
+
+#include "openflow/openflow.h"
+
+#define NICIRA_OUI_STR "002320"
+
+/* The following vendor extensions, proposed by Nicira Networks, are not yet
+ * ready for standardization (and may never be), so they are not included in
+ * openflow.h. */
+
+#define NX_VENDOR_ID 0x00002320
+
+enum nicira_type {
+ /* Switch status request. The request body is an ASCII string that
+ * specifies a prefix of the key names to include in the output; if it is
+ * the null string, then all key-value pairs are included. */
+ NXT_STATUS_REQUEST,
+
+ /* Switch status reply. The reply body is an ASCII string of key-value
+ * pairs in the form "key=value\n". */
+ NXT_STATUS_REPLY,
+
+ /* Configure an action. Most actions do not require configuration
+ * beyond that supplied in the actual action call. */
+ NXT_ACT_SET_CONFIG,
+
+ /* Get configuration of action. */
+ NXT_ACT_GET_CONFIG,
+
+ /* Remote command execution. The request body is a sequence of strings
+ * delimited by null bytes. The first string is a command name.
+ * Subsequent strings are command arguments. */
+ NXT_COMMAND_REQUEST,
+
+ /* Remote command execution reply, sent when the command's execution
+ * completes. The reply body is struct nx_command_reply. */
+ NXT_COMMAND_REPLY,
+
+ /* No longer used. */
+ NXT_FLOW_END_CONFIG__OBSOLETE,
+
+ /* No longer used. */
+ NXT_FLOW_END__OBSOLETE,
+
+ /* Management protocol. See "openflow-mgmt.h". */
+ NXT_MGMT,
+};
+
+struct nicira_header {
+ struct ofp_header header;
+ uint32_t vendor; /* NX_VENDOR_ID. */
+ uint32_t subtype; /* One of NXT_* above. */
+};
+OFP_ASSERT(sizeof(struct nicira_header) == sizeof(struct ofp_vendor_header) + 4);
+
+
+enum nx_action_subtype {
+ NXAST_SNAT__OBSOLETE, /* No longer used. */
+ NXAST_RESUBMIT /* Throw against flow table again. */
+};
+
+/* Action structure for NXAST_RESUBMIT. */
+struct nx_action_resubmit {
+ uint16_t type; /* OFPAT_VENDOR. */
+ uint16_t len; /* Length is 16. */
+ uint32_t vendor; /* NX_VENDOR_ID. */
+ uint16_t subtype; /* NXAST_RESUBMIT. */
+ uint16_t in_port; /* New in_port for checking flow table. */
+ uint8_t pad[4]; /* Pads the structure to 16 bytes. */
+};
+OFP_ASSERT(sizeof(struct nx_action_resubmit) == 16);
+
+/* Header for Nicira-defined actions. */
+struct nx_action_header {
+ uint16_t type; /* OFPAT_VENDOR. */
+ uint16_t len; /* Length of the whole action; this header alone is 16. */
+ uint32_t vendor; /* NX_VENDOR_ID. */
+ uint16_t subtype; /* NXAST_*. */
+ uint8_t pad[6]; /* Pads the structure to 16 bytes. */
+};
+OFP_ASSERT(sizeof(struct nx_action_header) == 16);
+
+/* Status bits for NXT_COMMAND_REPLY (the 'status' member of struct nx_command_reply). */
+enum {
+ NXT_STATUS_EXITED = 1 << 31, /* Exited normally. NOTE(review): 1 << 31 overflows a 32-bit signed int; confirm intent. */
+ NXT_STATUS_SIGNALED = 1 << 30, /* Exited due to signal. */
+ NXT_STATUS_UNKNOWN = 1 << 29, /* Exited for unknown reason. */
+ NXT_STATUS_COREDUMP = 1 << 28, /* Exited with core dump. */
+ NXT_STATUS_ERROR = 1 << 27, /* Command could not be executed. */
+ NXT_STATUS_STARTED = 1 << 26, /* Command was started. */
+ NXT_STATUS_EXITSTATUS = 0xff, /* Exit code mask if NXT_STATUS_EXITED. */
+ NXT_STATUS_TERMSIG = 0xff, /* Signal number mask if NXT_STATUS_SIGNALED. */
+};
+
+/* NXT_COMMAND_REPLY. */
+struct nx_command_reply {
+ struct nicira_header nxh;
+ uint32_t status; /* Status bits defined above. */
+ /* Followed by any number of bytes of process output. */
+};
+OFP_ASSERT(sizeof(struct nx_command_reply) == 20);
+
+#endif /* openflow/nicira-ext.h */
diff --git a/include/openflow/openflow-mgmt.h b/include/openflow/openflow-mgmt.h
new file mode 100644
index 000000000..1b4f1a0c9
--- /dev/null
+++ b/include/openflow/openflow-mgmt.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef OPENFLOW_OPENFLOW_MGMT_H
+#define OPENFLOW_OPENFLOW_MGMT_H 1
+
+#include "openflow/nicira-ext.h"
+
+enum ofmp_type {
+ OFMPT_CAPABILITY_REQUEST,
+ OFMPT_CAPABILITY_REPLY,
+ OFMPT_RESOURCES_REQUEST,
+ OFMPT_RESOURCES_UPDATE,
+ OFMPT_CONFIG_REQUEST,
+ OFMPT_CONFIG_UPDATE,
+ OFMPT_CONFIG_UPDATE_ACK,
+ OFMPT_ERROR
+};
+
+/* Header on all OpenFlow management packets. */
+struct ofmp_header {
+ struct nicira_header header;
+ uint16_t type; /* One of OFMPT_* above. */
+ uint8_t pad[2];
+};
+OFP_ASSERT(sizeof(struct ofmp_header) == sizeof(struct nicira_header) + 4);
+
+
+/* Generic TLV header. */
+struct ofmp_tlv {
+ uint16_t type; /* Type of value (one of OFMPTLV_*). */
+ uint16_t len; /* Length of TLV (includes this header). */
+ uint8_t data[0]; /* Value of data as defined by type and length. */
+};
+OFP_ASSERT(sizeof(struct ofmp_tlv) == 4);
+
+/* Universal TLV terminator. Used to indicate end of TLV list. */
+struct ofmp_tlv_end {
+ uint16_t type; /* Type is 0. */
+ uint16_t len; /* Length is 4. */
+};
+OFP_ASSERT(sizeof(struct ofmp_tlv_end) == 4);
+
+
+/* Bitmask of capability description styles. */
+enum ofmp_capability_format {
+ OFMPCAF_SIMPLE = 0 << 0, /* "ovs-vswitchd.conf" style. */
+};
+
+/* Body of capability request.
+ *
+ * OFMPT_CAPABILITY_REQUEST (controller -> switch) */
+struct ofmp_capability_request {
+ struct ofmp_header header;
+ uint32_t format; /* One of OFMPCAF_*. */
+};
+OFP_ASSERT(sizeof(struct ofmp_capability_request) == 24);
+
+/* Body of reply to capability request.
+ *
+ * OFMPT_CAPABILITY_REPLY (switch -> controller). */
+struct ofmp_capability_reply {
+ struct ofmp_header header;
+ uint32_t format; /* One of OFMPCAF_*. */
+ uint64_t mgmt_id; /* Management ID. */
+ uint8_t data[0];
+};
+OFP_ASSERT(sizeof(struct ofmp_capability_reply) == 32);
+
+
+/* Resource TLV for datapath description. */
+struct ofmptsr_dp {
+ uint16_t type; /* OFMPTSR_DP. */
+ uint16_t len; /* 32. */
+ uint8_t pad[4]; /* Aligns 'dp_id' to 64 bits. */
+ uint64_t dp_id; /* Datapath ID. */
+ uint8_t name[OFP_MAX_PORT_NAME_LEN]; /* Null-terminated name. */
+};
+OFP_ASSERT(sizeof(struct ofmptsr_dp) == 32);
+
+/* TLV types for switch resource descriptions. */
+enum ofmp_switch_resources {
+ OFMPTSR_END = 0, /* Terminator. */
+ OFMPTSR_DP, /* Datapath. */
+};
+
+/* Body of resources request.
+ *
+ * OFMPT_RESOURCES_REQUEST (controller -> switch) */
+struct ofmp_resources_request {
+ struct ofmp_header header;
+};
+
+/* Body of resources update. Sent in response to a resources request or
+ * sent asynchronously when resources change on the switch.
+ *
+ * OFMPT_RESOURCES_UPDATE (switch -> controller) */
+struct ofmp_resources_update {
+ struct ofmp_header header;
+ uint8_t data[0];
+};
+OFP_ASSERT(sizeof(struct ofmp_resources_update) == 20);
+
+
+/* Bitmask of configuration description styles. */
+enum ofmp_config_format {
+ OFMPCOF_SIMPLE = 0 << 0, /* "ovs-vswitchd.conf" style. */
+};
+
+#define CONFIG_COOKIE_LEN 20
+
+/* Body of configuration request.
+ *
+ * OFMPT_CONFIG_REQUEST (controller -> switch) */
+struct ofmp_config_request {
+ struct ofmp_header header;
+ uint32_t format; /* One of OFMPCOF_*. */
+};
+OFP_ASSERT(sizeof(struct ofmp_config_request) == 24);
+
+/* Body of configuration update. Sent in response to a configuration
+ * request from the controller. May be sent asynchronously by either
+ * the controller or switch to modify configuration or notify of
+ * changes, respectively. If sent by the controller, the switch must
+ * respond with a OFMPT_CONFIG_UPDATE_ACK.
+ *
+ * OFMPT_CONFIG_UPDATE (switch <-> controller) */
+struct ofmp_config_update {
+ struct ofmp_header header;
+ uint32_t format; /* One of OFMPCOF_*. */
+ uint8_t cookie[CONFIG_COOKIE_LEN]; /* Cookie of config attempting to be
+ * replaced by this update. */
+ uint8_t data[0];
+};
+OFP_ASSERT(sizeof(struct ofmp_config_update) == 44);
+
+/* Bitmask of configuration update ack flags. */
+enum ofmp_config_update_ack_flags {
+ OFMPCUAF_SUCCESS = 1 << 0, /* Config succeeded. */
+};
+
+/* Body of configuration update ack. Sent in response to a configuration
+ * update request.
+ *
+ * OFMPT_CONFIG_UPDATE_ACK (switch -> controller) */
+struct ofmp_config_update_ack {
+ struct ofmp_header header;
+ uint32_t format; /* One of OFMPCOF_*. */
+ uint32_t flags; /* One of OFMPCUAF_*. */
+ uint8_t cookie[CONFIG_COOKIE_LEN]; /* Cookie of current configuration
+ * being used in the switch. */
+};
+OFP_ASSERT(sizeof(struct ofmp_config_update_ack) == 48);
+
+/* Values for 'type' in ofmp_error_msg. */
+enum ofmp_error_type {
+ OFMPET_BAD_CONFIG /* Problem with configuration. */
+};
+
+/* ofmp_error_msg 'code' values for OFMPET_BAD_CONFIG. 'data' contains
+ * at least the first 64 bytes of the failed request. */
+enum ofmp_bad_config_code {
+ OFMPBCC_BUSY, /* Config updating, try again. */
+ OFMPBCC_OLD_COOKIE, /* Config has changed. */
+};
+
+/* Body of error message. May be sent by either the switch or the
+ * controller to indicate some error condition.
+ *
+ * OFMPT_ERROR (switch <-> controller) */
+struct ofmp_error_msg {
+ struct ofmp_header header;
+
+ uint16_t type; /* One of OFMPET_*. */
+ uint16_t code; /* Code depending on 'type'. */
+ uint8_t data[0]; /* Variable-length data. Interpreted based
+ on the type and code. */
+};
+OFP_ASSERT(sizeof(struct ofmp_error_msg) == 24);
+
+#endif /* openflow/openflow-mgmt.h */
diff --git a/include/openflow/openflow.h b/include/openflow/openflow.h
new file mode 100644
index 000000000..cd96c6ad0
--- /dev/null
+++ b/include/openflow/openflow.h
@@ -0,0 +1,796 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* OpenFlow: protocol between controller and datapath. */
+
+#ifndef OPENFLOW_OPENFLOW_H
+#define OPENFLOW_OPENFLOW_H 1
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <stdint.h>
+#endif
+
+#ifdef SWIG
+#define OFP_ASSERT(EXPR) /* SWIG can't handle OFP_ASSERT. */
+#elif !defined(__cplusplus)
+/* Build-time assertion for use in a declaration context. */
+#define OFP_ASSERT(EXPR) \
+ extern int (*build_assert(void))[ sizeof(struct { \
+ unsigned int build_assert_failed : (EXPR) ? 1 : -1; })]
+#else /* __cplusplus */
+#include <boost/static_assert.hpp>
+#define OFP_ASSERT BOOST_STATIC_ASSERT
+#endif /* __cplusplus */
+
+#ifndef SWIG
+#define OFP_PACKED __attribute__((packed))
+#else
+#define OFP_PACKED /* SWIG doesn't understand __attribute. */
+#endif
+
+/* The most significant bit being set in the version field indicates an
+ * experimental OpenFlow version.
+ */
+#define OFP_VERSION 0x97
+
+#define OFP_MAX_TABLE_NAME_LEN 32
+#define OFP_MAX_PORT_NAME_LEN 16
+
+#define OFP_TCP_PORT 6633
+#define OFP_SSL_PORT 6633
+
+#define OFP_ETH_ALEN 6 /* Bytes in an Ethernet address. */
+
+/* Port numbering. Physical ports are numbered starting from 0. */
+enum ofp_port {
+ /* Maximum number of physical switch ports. */
+ OFPP_MAX = 0xff00,
+
+ /* Fake output "ports". */
+ OFPP_IN_PORT = 0xfff8, /* Send the packet out the input port. This
+ virtual port must be explicitly used
+ in order to send back out of the input
+ port. */
+ OFPP_TABLE = 0xfff9, /* Perform actions in flow table.
+ NB: This can only be the destination
+ port for packet-out messages. */
+ OFPP_NORMAL = 0xfffa, /* Process with normal L2/L3 switching. */
+ OFPP_FLOOD = 0xfffb, /* All physical ports except input port and
+ those disabled by STP. */
+ OFPP_ALL = 0xfffc, /* All physical ports except input port. */
+ OFPP_CONTROLLER = 0xfffd, /* Send to controller. */
+ OFPP_LOCAL = 0xfffe, /* Local openflow "port". */
+ OFPP_NONE = 0xffff /* Not associated with a physical port. */
+};
+
+enum ofp_type {
+ /* Immutable messages. */
+ OFPT_HELLO, /* Symmetric message */
+ OFPT_ERROR, /* Symmetric message */
+ OFPT_ECHO_REQUEST, /* Symmetric message */
+ OFPT_ECHO_REPLY, /* Symmetric message */
+ OFPT_VENDOR, /* Symmetric message */
+
+ /* Switch configuration messages. */
+ OFPT_FEATURES_REQUEST, /* Controller/switch message */
+ OFPT_FEATURES_REPLY, /* Controller/switch message */
+ OFPT_GET_CONFIG_REQUEST, /* Controller/switch message */
+ OFPT_GET_CONFIG_REPLY, /* Controller/switch message */
+ OFPT_SET_CONFIG, /* Controller/switch message */
+
+ /* Asynchronous messages. */
+ OFPT_PACKET_IN, /* Async message */
+ OFPT_FLOW_EXPIRED, /* Async message */
+ OFPT_PORT_STATUS, /* Async message */
+
+ /* Controller command messages. */
+ OFPT_PACKET_OUT, /* Controller/switch message */
+ OFPT_FLOW_MOD, /* Controller/switch message */
+ OFPT_PORT_MOD, /* Controller/switch message */
+
+ /* Statistics messages. */
+ OFPT_STATS_REQUEST, /* Controller/switch message */
+ OFPT_STATS_REPLY /* Controller/switch message */
+};
+
+/* Header on all OpenFlow packets. */
+struct ofp_header {
+ uint8_t version; /* OFP_VERSION. */
+ uint8_t type; /* One of the OFPT_ constants. */
+ uint16_t length; /* Length including this ofp_header. */
+ uint32_t xid; /* Transaction id associated with this packet.
+ Replies use the same id as was in the request
+ to facilitate pairing. */
+};
+OFP_ASSERT(sizeof(struct ofp_header) == 8);
+
+/* OFPT_HELLO. This message has an empty body, but implementations must
+ * ignore any data included in the body, to allow for future extensions. */
+struct ofp_hello {
+ struct ofp_header header;
+};
+
+#define OFP_DEFAULT_MISS_SEND_LEN 128
+
+enum ofp_config_flags {
+ /* Tells datapath to notify the controller of expired flow entries. */
+ OFPC_SEND_FLOW_EXP = 1 << 0,
+
+ /* Handling of IP fragments. */
+ OFPC_FRAG_NORMAL = 0 << 1, /* No special handling for fragments. */
+ OFPC_FRAG_DROP = 1 << 1, /* Drop fragments. */
+ OFPC_FRAG_REASM = 2 << 1, /* Reassemble (only if OFPC_IP_REASM set). */
+ OFPC_FRAG_MASK = 3 << 1
+};
+
+/* Switch configuration. */
+struct ofp_switch_config {
+ struct ofp_header header;
+ uint16_t flags; /* OFPC_* flags. */
+ uint16_t miss_send_len; /* Max bytes of new flow that datapath should
+ send to the controller. */
+};
+OFP_ASSERT(sizeof(struct ofp_switch_config) == 12);
+
+/* Capabilities supported by the datapath. */
+enum ofp_capabilities {
+ OFPC_FLOW_STATS = 1 << 0, /* Flow statistics. */
+ OFPC_TABLE_STATS = 1 << 1, /* Table statistics. */
+ OFPC_PORT_STATS = 1 << 2, /* Port statistics. */
+ OFPC_STP = 1 << 3, /* 802.1d spanning tree. */
+ OFPC_MULTI_PHY_TX = 1 << 4, /* Supports transmitting through multiple
+ physical interfaces */
+ OFPC_IP_REASM = 1 << 5 /* Can reassemble IP fragments. */
+};
+
+/* Flags to indicate behavior of the physical port. These flags are
+ * used in ofp_phy_port to describe the current configuration. They are
+ * used in the ofp_port_mod message to configure the port's behavior.
+ */
+enum ofp_port_config {
+ OFPPC_PORT_DOWN = 1 << 0, /* Port is administratively down. */
+
+ OFPPC_NO_STP = 1 << 1, /* Disable 802.1D spanning tree on port. */
+ OFPPC_NO_RECV = 1 << 2, /* Drop most packets received on port. */
+ OFPPC_NO_RECV_STP = 1 << 3, /* Drop received 802.1D STP packets. */
+ OFPPC_NO_FLOOD = 1 << 4, /* Do not include this port when flooding. */
+ OFPPC_NO_FWD = 1 << 5, /* Drop packets forwarded to port. */
+ OFPPC_NO_PACKET_IN = 1 << 6 /* Do not send packet-in msgs for port. */
+};
+
+/* Current state of the physical port. These are not configurable from
+ * the controller.
+ */
+enum ofp_port_state {
+ OFPPS_LINK_DOWN = 1 << 0, /* No physical link present. */
+
+ /* The OFPPS_STP_* bits have no effect on switch operation. The
+ * controller must adjust OFPPC_NO_RECV, OFPPC_NO_FWD, and
+ * OFPPC_NO_PACKET_IN appropriately to fully implement an 802.1D spanning
+ * tree. */
+ OFPPS_STP_LISTEN = 0 << 8, /* Not learning or relaying frames. */
+ OFPPS_STP_LEARN = 1 << 8, /* Learning but not relaying frames. */
+ OFPPS_STP_FORWARD = 2 << 8, /* Learning and relaying frames. */
+ OFPPS_STP_BLOCK = 3 << 8, /* Not part of spanning tree. */
+ OFPPS_STP_MASK = 3 << 8 /* Bit mask for OFPPS_STP_* values. */
+};
+
+/* Features of physical ports available in a datapath. */
+enum ofp_port_features {
+ OFPPF_10MB_HD = 1 << 0, /* 10 Mb half-duplex rate support. */
+ OFPPF_10MB_FD = 1 << 1, /* 10 Mb full-duplex rate support. */
+ OFPPF_100MB_HD = 1 << 2, /* 100 Mb half-duplex rate support. */
+ OFPPF_100MB_FD = 1 << 3, /* 100 Mb full-duplex rate support. */
+ OFPPF_1GB_HD = 1 << 4, /* 1 Gb half-duplex rate support. */
+ OFPPF_1GB_FD = 1 << 5, /* 1 Gb full-duplex rate support. */
+ OFPPF_10GB_FD = 1 << 6, /* 10 Gb full-duplex rate support. */
+ OFPPF_COPPER = 1 << 7, /* Copper medium. */
+ OFPPF_FIBER = 1 << 8, /* Fiber medium. */
+ OFPPF_AUTONEG = 1 << 9, /* Auto-negotiation. */
+ OFPPF_PAUSE = 1 << 10, /* Pause. */
+ OFPPF_PAUSE_ASYM = 1 << 11 /* Asymmetric pause. */
+};
+
+/* Description of a physical port */
+struct ofp_phy_port {
+ uint16_t port_no;
+ uint8_t hw_addr[OFP_ETH_ALEN];
+ uint8_t name[OFP_MAX_PORT_NAME_LEN]; /* Null-terminated */
+
+ uint32_t config; /* Bitmap of OFPPC_* flags. */
+ uint32_t state; /* Bitmap of OFPPS_* flags. */
+
+ /* Bitmaps of OFPPF_* that describe features. All bits zeroed if
+ * unsupported or unavailable. */
+ uint32_t curr; /* Current features. */
+ uint32_t advertised; /* Features being advertised by the port. */
+ uint32_t supported; /* Features supported by the port. */
+ uint32_t peer; /* Features advertised by peer. */
+};
+OFP_ASSERT(sizeof(struct ofp_phy_port) == 48);
+
+/* Switch features. */
+struct ofp_switch_features {
+ struct ofp_header header;
+ uint64_t datapath_id; /* Datapath unique ID. Only the lower 48-bits
+ are meaningful. */
+
+ uint32_t n_buffers; /* Max packets buffered at once. */
+
+ uint8_t n_tables; /* Number of tables supported by datapath. */
+ uint8_t pad[3]; /* Align to 64-bits. */
+
+ /* Features. */
+ uint32_t capabilities; /* Bitmap of supported "ofp_capabilities". */
+ uint32_t actions; /* Bitmap of supported "ofp_action_type"s. */
+
+ /* Port info. Zero-length array: adds nothing to sizeof, see the assert below. */
+ struct ofp_phy_port ports[0]; /* Port definitions. The number of ports
+ is inferred from the length field in
+ the header. */
+};
+OFP_ASSERT(sizeof(struct ofp_switch_features) == 32);
+
+/* What changed about the physical port */
+enum ofp_port_reason {
+ OFPPR_ADD, /* The port was added. */
+ OFPPR_DELETE, /* The port was removed. */
+ OFPPR_MODIFY /* Some attribute of the port has changed. */
+};
+
+/* A physical port has changed in the datapath */
+struct ofp_port_status {
+ struct ofp_header header;
+ uint8_t reason; /* One of OFPPR_*. */
+ uint8_t pad[7]; /* Align to 64-bits. */
+ struct ofp_phy_port desc;
+};
+OFP_ASSERT(sizeof(struct ofp_port_status) == 64);
+
+/* Modify behavior of the physical port */
+struct ofp_port_mod {
+ struct ofp_header header;
+ uint16_t port_no;
+ uint8_t hw_addr[OFP_ETH_ALEN]; /* The hardware address is not
+ configurable. This is used to
+ sanity-check the request, so it must
+ be the same as returned in an
+ ofp_phy_port struct. */
+
+ uint32_t config; /* Bitmap of OFPPC_* flags. */
+ uint32_t mask; /* Bitmap of OFPPC_* flags to be changed. */
+
+ uint32_t advertise; /* Bitmap of "ofp_port_features"s. Zero all
+ bits to prevent any action taking place. */
+ uint8_t pad[4]; /* Pad to 64-bits. */
+};
+OFP_ASSERT(sizeof(struct ofp_port_mod) == 32);
+
+/* Why is this packet being sent to the controller? */
+enum ofp_packet_in_reason {
+ OFPR_NO_MATCH, /* No matching flow. */
+ OFPR_ACTION /* Action explicitly output to controller. */
+};
+
+/* Packet received on port (datapath -> controller). */
+struct ofp_packet_in {
+ struct ofp_header header;
+ uint32_t buffer_id; /* ID assigned by datapath. */
+ uint16_t total_len; /* Full length of frame. */
+ uint16_t in_port; /* Port on which frame was received. */
+ uint8_t reason; /* Reason packet is being sent (one of OFPR_*) */
+ uint8_t pad;
+ uint8_t data[0]; /* Ethernet frame, halfway through 32-bit word,
+ so the IP header is 32-bit aligned. The
+ amount of data is inferred from the length
+ field in the header. Because of padding,
+ offsetof(struct ofp_packet_in, data) ==
+ sizeof(struct ofp_packet_in) - 2. */
+};
+OFP_ASSERT(sizeof(struct ofp_packet_in) == 20);
+
+enum ofp_action_type {
+ OFPAT_OUTPUT, /* Output to switch port. */
+ OFPAT_SET_VLAN_VID, /* Set the 802.1q VLAN id. */
+ OFPAT_SET_VLAN_PCP, /* Set the 802.1q priority. */
+ OFPAT_STRIP_VLAN, /* Strip the 802.1q header. */
+ OFPAT_SET_DL_SRC, /* Ethernet source address. */
+ OFPAT_SET_DL_DST, /* Ethernet destination address. */
+ OFPAT_SET_NW_SRC, /* IP source address. */
+ OFPAT_SET_NW_DST, /* IP destination address. */
+ OFPAT_SET_TP_SRC, /* TCP/UDP source port. */
+ OFPAT_SET_TP_DST, /* TCP/UDP destination port. */
+ OFPAT_VENDOR = 0xffff
+};
+
+/* Action structure for OFPAT_OUTPUT, which sends packets out 'port'.
+ * When the 'port' is the OFPP_CONTROLLER, 'max_len' indicates the max
+ * number of bytes to send. A 'max_len' of zero means the entire packet
+ * should be sent. */
+struct ofp_action_output {
+ uint16_t type; /* OFPAT_OUTPUT. */
+ uint16_t len; /* Length is 8. */
+ uint16_t port; /* Output port. */
+ uint16_t max_len; /* Max length to send to controller. */
+};
+OFP_ASSERT(sizeof(struct ofp_action_output) == 8);
+
+/* The VLAN id is 12 bits, so we can use the entire 16 bits to indicate
+ * special conditions. All ones is used to match that no VLAN id was
+ * set. */
+#define OFP_VLAN_NONE 0xffff
+
+/* Action structure for OFPAT_SET_VLAN_VID. */
+struct ofp_action_vlan_vid {
+ uint16_t type; /* OFPAT_SET_VLAN_VID. */
+ uint16_t len; /* Length is 8. */
+ uint16_t vlan_vid; /* VLAN id. */
+ uint8_t pad[2];
+};
+OFP_ASSERT(sizeof(struct ofp_action_vlan_vid) == 8);
+
+/* Action structure for OFPAT_SET_VLAN_PCP. */
+struct ofp_action_vlan_pcp {
+ uint16_t type; /* OFPAT_SET_VLAN_PCP. */
+ uint16_t len; /* Length is 8. */
+ uint8_t vlan_pcp; /* VLAN priority. */
+ uint8_t pad[3]; /* Pads the structure to 8 bytes. */
+};
+OFP_ASSERT(sizeof(struct ofp_action_vlan_pcp) == 8); /* Checks this struct, not ofp_action_vlan_vid. */
+
+/* Action structure for OFPAT_SET_DL_SRC/DST. */
+struct ofp_action_dl_addr {
+ uint16_t type; /* OFPAT_SET_DL_SRC/DST. */
+ uint16_t len; /* Length is 16. */
+ uint8_t dl_addr[OFP_ETH_ALEN]; /* Ethernet address. */
+ uint8_t pad[6];
+};
+OFP_ASSERT(sizeof(struct ofp_action_dl_addr) == 16);
+
+/* Action structure for OFPAT_SET_NW_SRC/DST. */
+struct ofp_action_nw_addr {
+ uint16_t type; /* OFPAT_SET_NW_SRC/DST. */
+ uint16_t len; /* Length is 8. */
+ uint32_t nw_addr; /* IP address. */
+};
+OFP_ASSERT(sizeof(struct ofp_action_nw_addr) == 8);
+
+/* Action structure for OFPAT_SET_TP_SRC/DST. */
+struct ofp_action_tp_port {
+ uint16_t type; /* OFPAT_SET_TP_SRC/DST. */
+ uint16_t len; /* Length is 8. */
+ uint16_t tp_port; /* TCP/UDP port. */
+ uint8_t pad[2];
+};
+OFP_ASSERT(sizeof(struct ofp_action_tp_port) == 8);
+
+/* Action header for OFPAT_VENDOR. The rest of the body is vendor-defined. */
+struct ofp_action_vendor_header {
+ uint16_t type; /* OFPAT_VENDOR. */
+ uint16_t len; /* Length is a multiple of 8. */
+ uint32_t vendor; /* Vendor ID, which takes the same form
+ as in "struct ofp_vendor_header". */
+};
+OFP_ASSERT(sizeof(struct ofp_action_vendor_header) == 8);
+
+/* Action header that is common to all actions. The length includes the
+ * header and any padding used to make the action 64-bit aligned.
+ * NB: The length of an action *must* always be a multiple of eight. */
+struct ofp_action_header {
+ uint16_t type; /* One of OFPAT_*. */
+ uint16_t len; /* Length of action, including this
+ header. This is the length of action,
+ including any padding to make it
+ 64-bit aligned. */
+ uint8_t pad[4];
+};
+OFP_ASSERT(sizeof(struct ofp_action_header) == 8);
+
+union ofp_action {
+ uint16_t type;
+ struct ofp_action_header header;
+ struct ofp_action_vendor_header vendor;
+ struct ofp_action_output output;
+ struct ofp_action_vlan_vid vlan_vid;
+ struct ofp_action_vlan_pcp vlan_pcp;
+ struct ofp_action_nw_addr nw_addr;
+ struct ofp_action_tp_port tp_port;
+};
+OFP_ASSERT(sizeof(union ofp_action) == 8);
+
+/* Send packet (controller -> datapath). */
+struct ofp_packet_out {
+ struct ofp_header header;
+ uint32_t buffer_id; /* ID assigned by datapath (-1 if none). */
+ uint16_t in_port; /* Packet's input port (OFPP_NONE if none). */
+ uint16_t actions_len; /* Size of action array in bytes. */
+ struct ofp_action_header actions[0]; /* Actions. */
+ /* uint8_t data[0]; */ /* Packet data. The length is inferred
+ from the length field in the header.
+ (Only meaningful if buffer_id == -1.) */
+};
+OFP_ASSERT(sizeof(struct ofp_packet_out) == 16);
+
+enum ofp_flow_mod_command {
+ OFPFC_ADD, /* New flow. */
+ OFPFC_MODIFY, /* Modify all matching flows. */
+ OFPFC_MODIFY_STRICT, /* Modify entry strictly matching wildcards */
+ OFPFC_DELETE, /* Delete all matching flows. */
+ OFPFC_DELETE_STRICT /* Strictly match wildcards and priority. */
+};
+
+/* Flow wildcards. */
+enum ofp_flow_wildcards {
+ OFPFW_IN_PORT = 1 << 0, /* Switch input port. */
+ OFPFW_DL_VLAN = 1 << 1, /* VLAN. */
+ OFPFW_DL_SRC = 1 << 2, /* Ethernet source address. */
+ OFPFW_DL_DST = 1 << 3, /* Ethernet destination address. */
+ OFPFW_DL_TYPE = 1 << 4, /* Ethernet frame type. */
+ OFPFW_NW_PROTO = 1 << 5, /* IP protocol. */
+ OFPFW_TP_SRC = 1 << 6, /* TCP/UDP source port. */
+ OFPFW_TP_DST = 1 << 7, /* TCP/UDP destination port. */
+
+ /* IP source address wildcard bit count. 0 is exact match, 1 ignores the
+ * LSB, 2 ignores the 2 least-significant bits, ..., 32 and higher wildcard
+ * the entire field. This is the *opposite* of the usual convention where
+ * e.g. /24 indicates that 8 bits (not 24 bits) are wildcarded. */
+ OFPFW_NW_SRC_SHIFT = 8,
+ OFPFW_NW_SRC_BITS = 6,
+ OFPFW_NW_SRC_MASK = ((1 << OFPFW_NW_SRC_BITS) - 1) << OFPFW_NW_SRC_SHIFT,
+ OFPFW_NW_SRC_ALL = 32 << OFPFW_NW_SRC_SHIFT,
+
+ /* IP destination address wildcard bit count. Same format as source. */
+ OFPFW_NW_DST_SHIFT = 14,
+ OFPFW_NW_DST_BITS = 6,
+ OFPFW_NW_DST_MASK = ((1 << OFPFW_NW_DST_BITS) - 1) << OFPFW_NW_DST_SHIFT,
+ OFPFW_NW_DST_ALL = 32 << OFPFW_NW_DST_SHIFT,
+
+ /* Wildcard all fields. */
+ OFPFW_ALL = ((1 << 20) - 1)
+};
+
+/* The wildcards for ICMP type and code fields use the transport source
+ * and destination port fields, respectively. */
+#define OFPFW_ICMP_TYPE OFPFW_TP_SRC
+#define OFPFW_ICMP_CODE OFPFW_TP_DST
+
+/* Values below this cutoff are 802.3 packets and the two bytes
+ * following MAC addresses are used as a frame length. Otherwise, the
+ * two bytes are used as the Ethernet type.
+ */
+#define OFP_DL_TYPE_ETH2_CUTOFF 0x0600
+
+/* Value of dl_type to indicate that the frame does not include an
+ * Ethernet type.
+ */
+#define OFP_DL_TYPE_NOT_ETH_TYPE 0x05ff
+
+/* The VLAN id is 12-bits, so we can use the entire 16 bits to indicate
+ * special conditions. All ones indicates that no VLAN id was set.
+ */
+#define OFP_VLAN_NONE 0xffff
+
+/* Fields to match against flows */
+struct ofp_match {
+ uint32_t wildcards; /* Wildcard fields. */
+ uint16_t in_port; /* Input switch port. */
+ uint8_t dl_src[OFP_ETH_ALEN]; /* Ethernet source address. */
+ uint8_t dl_dst[OFP_ETH_ALEN]; /* Ethernet destination address. */
+ uint16_t dl_vlan; /* Input VLAN. */
+ uint16_t dl_type; /* Ethernet frame type. */
+ uint8_t nw_proto; /* IP protocol. */
+ uint8_t pad; /* Align to 32-bits. */
+ uint32_t nw_src; /* IP source address. */
+ uint32_t nw_dst; /* IP destination address. */
+ uint16_t tp_src; /* TCP/UDP source port. */
+ uint16_t tp_dst; /* TCP/UDP destination port. */
+};
+OFP_ASSERT(sizeof(struct ofp_match) == 36);
+
+/* The match fields for ICMP type and code use the transport source and
+ * destination port fields, respectively. */
+#define icmp_type tp_src
+#define icmp_code tp_dst
+
+/* Value used in "idle_timeout" and "hard_timeout" to indicate that the entry
+ * is permanent. */
+#define OFP_FLOW_PERMANENT 0
+
+/* By default, choose a priority in the middle. */
+#define OFP_DEFAULT_PRIORITY 0x8000
+
+/* Flow setup and teardown (controller -> datapath).
+ *
+ * The fixed part is 64 bytes (checked by the OFP_ASSERT below) and is
+ * followed by a variable number of actions whose total length is derived
+ * from the length field in 'header'. */
+struct ofp_flow_mod {
+ struct ofp_header header;
+ struct ofp_match match; /* Fields to match */
+
+ /* Flow modification parameters; the actions themselves follow at the end
+ * of the structure. */
+ uint16_t command; /* One of OFPFC_*. */
+ uint16_t idle_timeout; /* Idle time before discarding (seconds). */
+ uint16_t hard_timeout; /* Max time before discarding (seconds). */
+ uint16_t priority; /* Priority level of flow entry. */
+ uint32_t buffer_id; /* Buffered packet to apply to (or -1).
+ Not meaningful for OFPFC_DELETE*. */
+ uint16_t out_port; /* For OFPFC_DELETE* commands, require
+ matching entries to include this as an
+ output port. A value of OFPP_NONE
+ indicates no restriction. */
+ uint8_t pad[2]; /* Align to 32-bits. */
+ uint32_t reserved; /* Reserved for future use. */
+ struct ofp_action_header actions[0]; /* The action length is inferred
+ from the length field in the
+ header. */
+};
+OFP_ASSERT(sizeof(struct ofp_flow_mod) == 64);
+
+/* Why did this flow expire?
+ *
+ * Values are implicitly numbered from 0 and are carried in the 8-bit
+ * 'reason' member of struct ofp_flow_expired. */
+enum ofp_flow_expired_reason {
+ OFPER_IDLE_TIMEOUT, /* Flow idle time exceeded idle_timeout. */
+ OFPER_HARD_TIMEOUT /* Time exceeded hard_timeout. */
+};
+
+/* Flow expiration (datapath -> controller).
+ *
+ * Fixed-size, 72-byte message (checked by the OFP_ASSERT below) describing
+ * a flow that has expired, together with its final counters. */
+struct ofp_flow_expired {
+ struct ofp_header header;
+ struct ofp_match match; /* Description of fields. */
+
+ uint16_t priority; /* Priority level of flow entry. */
+ uint8_t reason; /* One of OFPER_*. */
+ uint8_t pad[1]; /* Align to 32-bits. */
+
+ uint32_t duration; /* Time flow was alive in seconds. */
+ uint8_t pad2[4]; /* Align to 64-bits. */
+ uint64_t packet_count; /* Packet counter for the flow. */
+ uint64_t byte_count; /* Byte counter for the flow. */
+};
+OFP_ASSERT(sizeof(struct ofp_flow_expired) == 72);
+
+/* Values for 'type' in struct ofp_error_msg. These values are immutable:
+ * they will not change in future versions of the protocol (although new
+ * values may be added). Each type has its own set of 'code' values, defined
+ * by the enums that follow. */
+enum ofp_error_type {
+ OFPET_HELLO_FAILED, /* Hello protocol failed. */
+ OFPET_BAD_REQUEST, /* Request was not understood. */
+ OFPET_BAD_ACTION, /* Error in action description. */
+ OFPET_FLOW_MOD_FAILED, /* Problem modifying flow entry. */
+ OFPET_PORT_MOD_FAILED /* OFPT_PORT_MOD failed. */
+};
+
+/* ofp_error_msg 'code' values for OFPET_HELLO_FAILED. 'data' contains an
+ * ASCII text string that may give failure details. */
+enum ofp_hello_failed_code {
+ OFPHFC_INCOMPATIBLE /* No compatible version. */
+};
+
+/* ofp_error_msg 'code' values for OFPET_BAD_REQUEST. 'data' contains at least
+ * the first 64 bytes of the failed request. */
+enum ofp_bad_request_code {
+ OFPBRC_BAD_VERSION, /* ofp_header.version not supported. */
+ OFPBRC_BAD_TYPE, /* ofp_header.type not supported. */
+ OFPBRC_BAD_STAT, /* ofp_stats_request.type not supported. */
+ OFPBRC_BAD_VENDOR, /* Vendor not supported (in ofp_vendor_header
+ * or ofp_stats_request or ofp_stats_reply). */
+ OFPBRC_BAD_SUBTYPE, /* Vendor subtype not supported. */
+ OFPBRC_BAD_LENGTH, /* Wrong request length for type. */
+ OFPBRC_BUFFER_EMPTY, /* Specified buffer has already been used. */
+ OFPBRC_BAD_COOKIE /* Specified buffer does not exist. */
+};
+
+/* ofp_error_msg 'code' values for OFPET_BAD_ACTION. 'data' contains at least
+ * the first 64 bytes of the failed request. */
+enum ofp_bad_action_code {
+ OFPBAC_BAD_TYPE, /* Unknown action type. */
+ OFPBAC_BAD_LEN, /* Length problem in actions. */
+ OFPBAC_BAD_VENDOR, /* Unknown vendor id specified. */
+ OFPBAC_BAD_VENDOR_TYPE, /* Unknown action type for vendor id. */
+ OFPBAC_BAD_OUT_PORT, /* Problem validating output action. */
+ OFPBAC_BAD_ARGUMENT, /* Bad action argument. */
+ OFPBAC_TOO_MANY /* Can't handle this many actions. */
+};
+
+/* ofp_error_msg 'code' values for OFPET_FLOW_MOD_FAILED. 'data' contains
+ * at least the first 64 bytes of the failed request. */
+enum ofp_flow_mod_failed_code {
+ OFPFMFC_ALL_TABLES_FULL, /* Flow not added because of full tables. */
+ OFPFMFC_BAD_COMMAND /* Unknown command. */
+};
+
+/* ofp_error_msg 'code' values for OFPET_PORT_MOD_FAILED. 'data' contains
+ * at least the first 64 bytes of the failed request. Values are implicitly
+ * numbered from 0. */
+enum ofp_port_mod_failed_code {
+ OFPPMFC_BAD_PORT, /* Specified port does not exist. */
+ OFPPMFC_BAD_HW_ADDR, /* Specified hardware address is wrong. */
+};
+
+/* OFPT_ERROR: Error message (datapath -> controller).
+ *
+ * 12-byte fixed header (checked by the OFP_ASSERT below) followed by
+ * variable-length 'data'. */
+struct ofp_error_msg {
+ struct ofp_header header;
+
+ uint16_t type; /* One of the OFPET_* values. */
+ uint16_t code; /* Meaning depends on 'type'; see the
+ per-type *_code enums. */
+ uint8_t data[0]; /* Variable-length data. Interpreted based
+ on the type and code. */
+};
+OFP_ASSERT(sizeof(struct ofp_error_msg) == 12);
+
+/* Kinds of statistics that can be requested; used as the 'type' member of
+ * struct ofp_stats_request and struct ofp_stats_reply. The standard types
+ * are implicitly numbered from 0; the vendor extension is pinned at 0xffff. */
+enum ofp_stats_types {
+ /* Description of this OpenFlow switch.
+ * The request body is empty.
+ * The reply body is struct ofp_desc_stats. */
+ OFPST_DESC,
+
+ /* Individual flow statistics.
+ * The request body is struct ofp_flow_stats_request.
+ * The reply body is an array of struct ofp_flow_stats. */
+ OFPST_FLOW,
+
+ /* Aggregate flow statistics.
+ * The request body is struct ofp_aggregate_stats_request.
+ * The reply body is struct ofp_aggregate_stats_reply. */
+ OFPST_AGGREGATE,
+
+ /* Flow table statistics.
+ * The request body is empty.
+ * The reply body is an array of struct ofp_table_stats. */
+ OFPST_TABLE,
+
+ /* Physical port statistics.
+ * The request body is empty.
+ * The reply body is an array of struct ofp_port_stats. */
+ OFPST_PORT,
+
+ /* Vendor extension.
+ * The request and reply bodies begin with a 32-bit vendor ID, which takes
+ * the same form as in "struct ofp_vendor_header". The request and reply
+ * bodies are otherwise vendor-defined. */
+ OFPST_VENDOR = 0xffff
+};
+
+struct ofp_stats_request {
+ struct ofp_header header;
+ uint16_t type; /* One of the OFPST_* constants. */
+ uint16_t flags; /* OFPSF_REQ_* flags (none yet defined). */
+ uint8_t body[0]; /* Body of the request. */
+};
+OFP_ASSERT(sizeof(struct ofp_stats_request) == 12);
+
+enum ofp_stats_reply_flags {
+ OFPSF_REPLY_MORE = 1 << 0 /* More replies to follow. */
+};
+
+struct ofp_stats_reply {
+ struct ofp_header header;
+ uint16_t type; /* One of the OFPST_* constants. */
+ uint16_t flags; /* OFPSF_REPLY_* flags. */
+ uint8_t body[0]; /* Body of the reply. */
+};
+OFP_ASSERT(sizeof(struct ofp_stats_reply) == 12);
+
+#define DESC_STR_LEN 256
+#define SERIAL_NUM_LEN 32
+/* Body of reply to OFPST_DESC request. Each entry is a NUL-terminated
+ * ASCII string stored in a fixed-size buffer, so the whole body is always
+ * 3 * DESC_STR_LEN + SERIAL_NUM_LEN = 800 bytes (checked below). */
+struct ofp_desc_stats {
+ char mfr_desc[DESC_STR_LEN]; /* Manufacturer description. */
+ char hw_desc[DESC_STR_LEN]; /* Hardware description. */
+ char sw_desc[DESC_STR_LEN]; /* Software description. */
+ char serial_num[SERIAL_NUM_LEN]; /* Serial number. */
+};
+OFP_ASSERT(sizeof(struct ofp_desc_stats) == 800);
+
+/* Body for ofp_stats_request of type OFPST_FLOW. */
+struct ofp_flow_stats_request {
+ struct ofp_match match; /* Fields to match. */
+ uint8_t table_id; /* ID of table to read (from ofp_table_stats)
+ or 0xff for all tables. */
+ uint8_t pad; /* Align to 32 bits. */
+ uint16_t out_port; /* Require matching entries to include this
+ as an output port. A value of OFPP_NONE
+ indicates no restriction. */
+};
+OFP_ASSERT(sizeof(struct ofp_flow_stats_request) == 40);
+
+/* Body of reply to OFPST_FLOW request.
+ *
+ * One variable-length entry per flow: a 72-byte fixed part (checked by the
+ * OFP_ASSERT below) followed by the flow's actions. 'length' gives the size
+ * of the entry. */
+struct ofp_flow_stats {
+ uint16_t length; /* Length of this entry. */
+ uint8_t table_id; /* ID of table flow came from. */
+ uint8_t pad; /* Align to 32 bits. */
+ struct ofp_match match; /* Description of fields. */
+ uint32_t duration; /* Time flow has been alive in seconds. */
+ uint16_t priority; /* Priority of the entry. Only meaningful
+ when this is not an exact-match entry. */
+ uint16_t idle_timeout; /* Number of seconds idle before expiration. */
+ uint16_t hard_timeout; /* Number of seconds before expiration. */
+ uint16_t pad2[3]; /* Pad to 64 bits. */
+ uint64_t packet_count; /* Number of packets in flow. */
+ uint64_t byte_count; /* Number of bytes in flow. */
+ struct ofp_action_header actions[0]; /* Actions. */
+};
+OFP_ASSERT(sizeof(struct ofp_flow_stats) == 72);
+
+/* Body for ofp_stats_request of type OFPST_AGGREGATE. */
+struct ofp_aggregate_stats_request {
+ struct ofp_match match; /* Fields to match. */
+ uint8_t table_id; /* ID of table to read (from ofp_table_stats)
+ or 0xff for all tables. */
+ uint8_t pad; /* Align to 32 bits. */
+ uint16_t out_port; /* Require matching entries to include this
+ as an output port. A value of OFPP_NONE
+ indicates no restriction. */
+};
+OFP_ASSERT(sizeof(struct ofp_aggregate_stats_request) == 40);
+
+/* Body of reply to OFPST_AGGREGATE request. */
+struct ofp_aggregate_stats_reply {
+ uint64_t packet_count; /* Number of packets in flows. */
+ uint64_t byte_count; /* Number of bytes in flows. */
+ uint32_t flow_count; /* Number of flows. */
+ uint8_t pad[4]; /* Align to 64 bits. */
+};
+OFP_ASSERT(sizeof(struct ofp_aggregate_stats_reply) == 24);
+
+/* Body of reply to OFPST_TABLE request.
+ *
+ * One fixed-size, 64-byte entry per table (checked by the OFP_ASSERT
+ * below). */
+struct ofp_table_stats {
+ uint8_t table_id; /* Identifier of table. Lower numbered tables
+ are consulted first. */
+ uint8_t pad[3]; /* Align to 32-bits. */
+ char name[OFP_MAX_TABLE_NAME_LEN]; /* Table name. */
+ uint32_t wildcards; /* Bitmap of OFPFW_* wildcards that are
+ supported by the table. */
+ uint32_t max_entries; /* Max number of entries supported. */
+ uint32_t active_count; /* Number of active entries. */
+ uint64_t lookup_count; /* Number of packets looked up in table. */
+ uint64_t matched_count; /* Number of packets that hit table. */
+};
+OFP_ASSERT(sizeof(struct ofp_table_stats) == 64);
+
+/* Body of reply to OFPST_PORT request. If a counter is unsupported, set
+ * the field to all ones. Each entry is a fixed 104 bytes (checked by the
+ * OFP_ASSERT below). */
+struct ofp_port_stats {
+ uint16_t port_no; /* Port these counters belong to. */
+ uint8_t pad[6]; /* Align to 64-bits. */
+ uint64_t rx_packets; /* Number of received packets. */
+ uint64_t tx_packets; /* Number of transmitted packets. */
+ uint64_t rx_bytes; /* Number of received bytes. */
+ uint64_t tx_bytes; /* Number of transmitted bytes. */
+ uint64_t rx_dropped; /* Number of packets dropped by RX. */
+ uint64_t tx_dropped; /* Number of packets dropped by TX. */
+ uint64_t rx_errors; /* Number of receive errors. This is a super-set
+ of receive errors and should be greater than or
+ equal to the sum of all rx_*_err values. */
+ uint64_t tx_errors; /* Number of transmit errors. This is a super-set
+ of transmit errors. */
+ uint64_t rx_frame_err; /* Number of frame alignment errors. */
+ uint64_t rx_over_err; /* Number of packets with RX overrun. */
+ uint64_t rx_crc_err; /* Number of CRC errors. */
+ uint64_t collisions; /* Number of collisions. */
+};
+OFP_ASSERT(sizeof(struct ofp_port_stats) == 104);
+
+/* Vendor extension.
+ *
+ * Common 12-byte prefix (checked by the OFP_ASSERT below) of every
+ * OFPT_VENDOR message; whatever follows 'vendor' is defined by that
+ * vendor. */
+struct ofp_vendor_header {
+ struct ofp_header header; /* Type OFPT_VENDOR. */
+ uint32_t vendor; /* Vendor ID:
+ * - MSB 0: low-order bytes are IEEE OUI.
+ * - MSB != 0: defined by OpenFlow
+ * consortium. */
+ /* Vendor-defined arbitrary additional data. */
+};
+OFP_ASSERT(sizeof(struct ofp_vendor_header) == 12);
+
+#endif /* openflow/openflow.h */
diff --git a/include/openvswitch/automake.mk b/include/openvswitch/automake.mk
new file mode 100644
index 000000000..889a21f56
--- /dev/null
+++ b/include/openvswitch/automake.mk
@@ -0,0 +1,4 @@
+noinst_HEADERS += \
+ include/openvswitch/brcompat-netlink.h \
+ include/openvswitch/datapath-protocol.h
+
diff --git a/include/openvswitch/brcompat-netlink.h b/include/openvswitch/brcompat-netlink.h
new file mode 100644
index 000000000..016ddfa55
--- /dev/null
+++ b/include/openvswitch/brcompat-netlink.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef OPENVSWITCH_BRCOMPAT_NETLINK_H
+#define OPENVSWITCH_BRCOMPAT_NETLINK_H 1
+
+#define BRC_GENL_FAMILY_NAME "brcompat"
+
+/* Attributes that can be attached to the datapath's netlink messages. */
+enum {
+ BRC_GENL_A_UNSPEC,
+ BRC_GENL_A_DP_NAME, /* Datapath name. */
+ BRC_GENL_A_PORT_NAME, /* Interface name. */
+ BRC_GENL_A_ERR_CODE, /* Positive error code. */
+ BRC_GENL_A_MC_GROUP, /* Generic netlink multicast group. */
+ BRC_GENL_A_PROC_DIR, /* Name of subdirectory in /proc. */
+ BRC_GENL_A_PROC_NAME, /* Name of file in /proc. */
+ BRC_GENL_A_PROC_DATA, /* Contents of file in /proc. */
+
+ __BRC_GENL_A_MAX,
+ BRC_GENL_A_MAX = __BRC_GENL_A_MAX - 1
+};
+
+/* Commands that can be executed on the datapath's netlink interface.
+ *
+ * Values are implicitly numbered from 0 (BRC_GENL_C_UNSPEC), so new commands
+ * must be added immediately before __BRC_GENL_C_MAX to keep existing numbers
+ * stable. BRC_GENL_C_MAX is the highest valid command number. */
+enum brc_genl_command {
+ BRC_GENL_C_UNSPEC,
+
+ /*
+ * "K:" messages are sent by the kernel to userspace.
+ * "U:" messages are sent by userspace to the kernel.
+ */
+ BRC_GENL_C_DP_ADD, /* K: Datapath created. */
+ BRC_GENL_C_DP_DEL, /* K: Datapath destroyed. */
+ BRC_GENL_C_DP_RESULT, /* U: Return code from ovs-brcompatd. */
+ BRC_GENL_C_PORT_ADD, /* K: Port added to datapath. */
+ BRC_GENL_C_PORT_DEL, /* K: Port removed from datapath. */
+ BRC_GENL_C_QUERY_MC, /* U: Get multicast group for brcompat. */
+ BRC_GENL_C_SET_PROC, /* U: Set contents of file in /proc. */
+
+ __BRC_GENL_C_MAX,
+ BRC_GENL_C_MAX = __BRC_GENL_C_MAX - 1
+};
+#endif /* openvswitch/brcompat-netlink.h */
diff --git a/include/openvswitch/datapath-protocol.h b/include/openvswitch/datapath-protocol.h
new file mode 100644
index 000000000..537dd5955
--- /dev/null
+++ b/include/openvswitch/datapath-protocol.h
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* Protocol between secchan and datapath. */
+
+#ifndef OPENVSWITCH_DATAPATH_PROTOCOL_H
+#define OPENVSWITCH_DATAPATH_PROTOCOL_H 1
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#else
+#include <sys/types.h>
+#endif
+#include <linux/if_ether.h>
+
+#define ODP_MAX 256 /* Maximum number of datapaths. */
+
+/* ioctl command numbers for the datapath device, all in the 'O' group.
+ *
+ * NOTE(review): the direction bits look inverted relative to the usual
+ * Linux convention (_IOR = data flows kernel -> user, _IOW = user ->
+ * kernel): the ODP_GET_* and ODP_DP_STATS commands use _IOW while the
+ * ODP_SET_* commands use _IOR. Also, ODP_FLOW_PUT encodes the size of
+ * struct odp_flow although struct odp_flow_put is documented below as its
+ * argument. The direction and size are baked into the command number, so
+ * correcting either changes the kernel/userspace ABI -- confirm before
+ * touching. */
+#define ODP_DP_CREATE _IO('O', 0)
+#define ODP_DP_DESTROY _IO('O', 1)
+#define ODP_DP_STATS _IOW('O', 2, struct odp_stats)
+
+#define ODP_GET_DROP_FRAGS _IOW('O', 3, int)
+#define ODP_SET_DROP_FRAGS _IOR('O', 4, int)
+
+#define ODP_GET_LISTEN_MASK _IOW('O', 5, int)
+#define ODP_SET_LISTEN_MASK _IOR('O', 6, int)
+
+#define ODP_PORT_ADD _IOR('O', 7, struct odp_port)
+#define ODP_PORT_DEL _IOR('O', 8, int)
+#define ODP_PORT_QUERY _IOWR('O', 9, struct odp_port)
+#define ODP_PORT_LIST _IOWR('O', 10, struct odp_portvec)
+
+#define ODP_PORT_GROUP_SET _IOR('O', 11, struct odp_port_group)
+#define ODP_PORT_GROUP_GET _IOWR('O', 12, struct odp_port_group)
+
+#define ODP_FLOW_GET _IOWR('O', 13, struct odp_flow)
+#define ODP_FLOW_GET_MULTIPLE _IOWR('O', 14, struct odp_flowvec)
+#define ODP_FLOW_LIST _IOWR('O', 15, struct odp_flowvec)
+
+#define ODP_FLOW_FLUSH _IO('O', 16)
+#define ODP_FLOW_PUT _IOWR('O', 17, struct odp_flow)
+#define ODP_FLOW_DEL _IOWR('O', 18, struct odp_flow)
+
+#define ODP_EXECUTE _IOR('O', 19, struct odp_execute)
+
+struct odp_stats {
+ /* Flows. */
+ __u32 n_flows; /* Number of flows in flow table. */
+ __u32 cur_capacity; /* Current flow table capacity. */
+ __u32 max_capacity; /* Maximum expansion of flow table capacity. */
+
+ /* Ports. */
+ __u32 n_ports; /* Current number of ports. */
+ __u32 max_ports; /* Maximum supported number of ports. */
+ __u16 max_groups; /* Maximum number of port groups. */
+ __u16 reserved;
+
+ /* Lookups. */
+ __u64 n_frags; /* Number of dropped IP fragments. */
+ __u64 n_hit; /* Number of flow table matches. */
+ __u64 n_missed; /* Number of flow table misses. */
+ __u64 n_lost; /* Number of misses not sent to userspace. */
+
+ /* Queues. */
+ __u16 max_miss_queue; /* Max length of ODPL_MISS queue. */
+ __u16 max_action_queue; /* Max length of ODPL_ACTION queue. */
+};
+
+/* Logical ports. */
+#define ODPP_LOCAL ((__u16)0)
+#define ODPP_NONE ((__u16)-1)
+
+/* Listening channels.
+ *
+ * Each channel has a bit number (_ODPL_*_NR, which is what appears in
+ * struct odp_msg's 'type') and a corresponding single-bit mask (ODPL_*)
+ * built from it; ODPL_ALL is the union of all masks.
+ * NOTE(review): "ODPP_CONTROLLER" below is not defined in this header;
+ * presumably the ODPAT_CONTROLLER action is meant -- confirm. */
+#define _ODPL_MISS_NR 0 /* Packet missed in flow table. */
+#define ODPL_MISS (1 << _ODPL_MISS_NR)
+#define _ODPL_ACTION_NR 1 /* Packet output to ODPP_CONTROLLER. */
+#define ODPL_ACTION (1 << _ODPL_ACTION_NR)
+#define ODPL_ALL (ODPL_MISS | ODPL_ACTION)
+
+/* Format of messages read from datapath fd.
+ *
+ * A 16-byte header immediately followed by the packet data; 'length' covers
+ * both. */
+struct odp_msg {
+ __u32 type; /* _ODPL_MISS_NR or _ODPL_ACTION_NR. */
+ __u32 length; /* Message length, including header. */
+ __u16 port; /* Port on which frame was received. */
+ __u16 reserved;
+ __u32 arg; /* Argument value specified in action. */
+ /* Followed by packet data. */
+};
+
+#define ODP_PORT_INTERNAL (1 << 0) /* This port is simulated. */
+struct odp_port {
+ char devname[16]; /* IFNAMSIZ */
+ __u16 port;
+ __u16 flags;
+ __u32 reserved2;
+};
+
+struct odp_portvec {
+ struct odp_port *ports;
+ int n_ports;
+};
+
+struct odp_port_group {
+ __u16 *ports;
+ __u16 n_ports; /* Number of ports. */
+ __u16 group; /* Group number. */
+};
+
+struct odp_flow_stats {
+ __u64 n_packets; /* Number of matched packets. */
+ __u64 n_bytes; /* Number of matched bytes. */
+ __u64 used_sec; /* Time last used. */
+ __u32 used_nsec;
+ __u8 tcp_flags;
+ __u8 ip_tos;
+ __u16 reserved;
+};
+
+/* Identifies a flow in the datapath. Members are ordered widest-first so
+ * that no implicit padding is needed; the explicit 'reserved' byte rounds
+ * the structure up to a multiple of 64 bits (32 bytes in total). Members
+ * typed __be16/__be32 are in network byte order. */
+struct odp_flow_key {
+ __be32 nw_src; /* IP source address. */
+ __be32 nw_dst; /* IP destination address. */
+ __u16 in_port; /* Input switch port. */
+ __be16 dl_vlan; /* Input VLAN. */
+ __be16 dl_type; /* Ethernet frame type. */
+ __be16 tp_src; /* TCP/UDP source port. */
+ __be16 tp_dst; /* TCP/UDP destination port. */
+ __u8 dl_src[ETH_ALEN]; /* Ethernet source address. */
+ __u8 dl_dst[ETH_ALEN]; /* Ethernet destination address. */
+ __u8 nw_proto; /* IP protocol. */
+ __u8 reserved; /* Pad to 64 bits. */
+};
+
+struct odp_flow {
+ struct odp_flow_stats stats;
+ struct odp_flow_key key;
+ union odp_action *actions;
+ __u32 n_actions;
+};
+
+/* Flags for ODP_FLOW_PUT, combined by bitwise OR in struct odp_flow_put's
+ * 'flags' member. */
+#define ODPPF_CREATE (1 << 0) /* Allow creating a new flow. */
+#define ODPPF_MODIFY (1 << 1) /* Allow modifying an existing flow. */
+#define ODPPF_ZERO_STATS (1 << 2) /* Zero the stats of an existing flow. */
+
+/* ODP_FLOW_PUT argument: the flow to install plus ODPPF_* flags that say
+ * whether creation and/or modification is permitted. */
+struct odp_flow_put {
+ struct odp_flow flow;
+ __u32 flags; /* Bitwise OR of ODPPF_*. */
+};
+
+struct odp_flowvec {
+ struct odp_flow *flows;
+ int n_flows;
+};
+
+/* The VLAN id is 12 bits, so we can use the entire 16 bits to indicate
+ * special conditions. All ones is used to match that no VLAN id was
+ * set. */
+#define ODP_VLAN_NONE 0xffff
+
+/* Action types. */
+#define ODPAT_OUTPUT 0 /* Output to switch port. */
+#define ODPAT_OUTPUT_GROUP 1 /* Output to all ports in group. */
+#define ODPAT_CONTROLLER 2 /* Send copy to controller. */
+#define ODPAT_SET_VLAN_VID 3 /* Set the 802.1q VLAN id. */
+#define ODPAT_SET_VLAN_PCP 4 /* Set the 802.1q priority. */
+#define ODPAT_STRIP_VLAN 5 /* Strip the 802.1q header. */
+#define ODPAT_SET_DL_SRC 6 /* Ethernet source address. */
+#define ODPAT_SET_DL_DST 7 /* Ethernet destination address. */
+#define ODPAT_SET_NW_SRC 8 /* IP source address. */
+#define ODPAT_SET_NW_DST 9 /* IP destination address. */
+#define ODPAT_SET_TP_SRC 10 /* TCP/UDP source port. */
+#define ODPAT_SET_TP_DST 11 /* TCP/UDP destination port. */
+#define ODPAT_N_ACTIONS 12
+
+struct odp_action_output {
+ __u16 type; /* ODPAT_OUTPUT. */
+ __u16 port; /* Output port. */
+ __u16 reserved1;
+ __u16 reserved2;
+};
+
+struct odp_action_output_group {
+ __u16 type; /* ODPAT_OUTPUT_GROUP. */
+ __u16 group; /* Group number. */
+ __u16 reserved1;
+ __u16 reserved2;
+};
+
+/* Action structure for ODPAT_CONTROLLER. */
+struct odp_action_controller {
+ __u16 type; /* ODPAT_CONTROLLER. */
+ __u16 reserved;
+ __u32 arg; /* Copied to struct odp_msg 'arg' member. */
+};
+
+/* Action structure for ODPAT_SET_VLAN_VID. */
+struct odp_action_vlan_vid {
+ __u16 type; /* ODPAT_SET_VLAN_VID. */
+ __be16 vlan_vid; /* VLAN id. */
+ __u16 reserved1;
+ __u16 reserved2;
+};
+
+/* Action structure for ODPAT_SET_VLAN_PCP. */
+struct odp_action_vlan_pcp {
+ __u16 type; /* ODPAT_SET_VLAN_PCP. */
+ __u8 vlan_pcp; /* VLAN priority. */
+ __u8 reserved1;
+ __u16 reserved2;
+ __u16 reserved3;
+};
+
+/* Action structure for ODPAT_SET_DL_SRC/DST. The 2-byte type plus the
+ * ETH_ALEN-byte address fill the structure, so it has no reserved
+ * members. */
+struct odp_action_dl_addr {
+ __u16 type; /* ODPAT_SET_DL_SRC/DST. */
+ __u8 dl_addr[ETH_ALEN]; /* Ethernet address. */
+};
+
+/* Action structure for ODPAT_SET_NW_SRC/DST. */
+struct odp_action_nw_addr {
+ __u16 type; /* ODPAT_SET_NW_SRC/DST. */
+ __u16 reserved;
+ __be32 nw_addr; /* IP address. */
+};
+
+/* Action structure for ODPAT_SET_TP_SRC/DST. */
+struct odp_action_tp_port {
+ __u16 type; /* ODPAT_SET_TP_SRC/DST. */
+ __be16 tp_port; /* TCP/UDP port. */
+ __u16 reserved1;
+ __u16 reserved2;
+};
+
+/* A single datapath action. Every member structure begins with a __u16
+ * 'type' (one of the ODPAT_* values), so 'type' can be read first to decide
+ * which of the other members is valid. */
+union odp_action {
+ __u16 type;
+ struct odp_action_output output;
+ struct odp_action_output_group output_group;
+ struct odp_action_controller controller;
+ struct odp_action_vlan_vid vlan_vid;
+ struct odp_action_vlan_pcp vlan_pcp;
+ struct odp_action_dl_addr dl_addr;
+ struct odp_action_nw_addr nw_addr;
+ struct odp_action_tp_port tp_port;
+};
+
+/* ODP_EXECUTE argument: a list of actions together with a buffer of
+ * 'length' bytes at 'data'.
+ * NOTE(review): presumably the actions are applied to the packet in 'data'
+ * as if it had arrived on 'in_port' -- confirm against the datapath
+ * implementation. */
+struct odp_execute {
+ __u16 in_port;
+ __u16 reserved1;
+ __u32 reserved2;
+
+ union odp_action *actions; /* Array of 'n_actions' actions. */
+ __u32 n_actions;
+
+ const void *data; /* Buffer of 'length' bytes. */
+ __u32 length;
+};
+
+/* Values below this cutoff are 802.3 packets and the two bytes
+ * following MAC addresses are used as a frame length. Otherwise, the
+ * two bytes are used as the Ethernet type.
+ */
+#define ODP_DL_TYPE_ETH2_CUTOFF 0x0600
+
+/* Value of dl_type to indicate that the frame does not include an
+ * Ethernet type.
+ */
+#define ODP_DL_TYPE_NOT_ETH_TYPE 0x05ff
+
+/* The VLAN id is 12-bits, so we can use the entire 16 bits to indicate
+ * special conditions. All ones indicates that no VLAN id was set.
+ *
+ * NOTE(review): this repeats, with an identical value, the ODP_VLAN_NONE
+ * definition that already appears earlier in this header; one of the two
+ * could be dropped.
+ */
+#define ODP_VLAN_NONE 0xffff
+
+#endif /* openvswitch/datapath-protocol.h */
diff --git a/lib/.gitignore b/lib/.gitignore
new file mode 100644
index 000000000..6a3f65ce1
--- /dev/null
+++ b/lib/.gitignore
@@ -0,0 +1,4 @@
+/Makefile
+/Makefile.in
+/dhparams.c
+/coverage-counters.c
diff --git a/lib/automake.mk b/lib/automake.mk
new file mode 100644
index 000000000..4d2924b63
--- /dev/null
+++ b/lib/automake.mk
@@ -0,0 +1,184 @@
+noinst_LIBRARIES += lib/libopenvswitch.a
+
+lib_libopenvswitch_a_SOURCES = \
+ lib/backtrace.c \
+ lib/backtrace.h \
+ lib/bitmap.c \
+ lib/bitmap.h \
+ lib/cfg.c \
+ lib/cfg.h \
+ lib/classifier.c \
+ lib/classifier.h \
+ lib/command-line.c \
+ lib/command-line.h \
+ lib/compiler.h \
+ lib/coverage.c \
+ lib/coverage.h \
+ lib/coverage-counters.c \
+ lib/coverage-counters.h \
+ lib/csum.c \
+ lib/csum.h \
+ lib/daemon.c \
+ lib/daemon.h \
+ lib/dhcp-client.c \
+ lib/dhcp-client.h \
+ lib/dhcp.c \
+ lib/dhcp.h \
+ lib/dhparams.h \
+ lib/dirs.c \
+ lib/dirs.h \
+ lib/dynamic-string.c \
+ lib/dynamic-string.h \
+ lib/fatal-signal.c \
+ lib/fatal-signal.h \
+ lib/fault.c \
+ lib/fault.h \
+ lib/flow.c \
+ lib/flow.h \
+ lib/hash.c \
+ lib/hash.h \
+ lib/hmap.c \
+ lib/hmap.h \
+ lib/leak-checker.c \
+ lib/leak-checker.h \
+ lib/learning-switch.c \
+ lib/learning-switch.h \
+ lib/list.c \
+ lib/list.h \
+ lib/mac-learning.c \
+ lib/mac-learning.h \
+ lib/netdev.c \
+ lib/netdev.h \
+ lib/odp-util.c \
+ lib/odp-util.h \
+ lib/ofp-print.c \
+ lib/ofp-print.h \
+ lib/ofpbuf.c \
+ lib/ofpbuf.h \
+ lib/packets.h \
+ lib/pcap.c \
+ lib/pcap.h \
+ lib/poll-loop.c \
+ lib/poll-loop.h \
+ lib/port-array.c \
+ lib/port-array.h \
+ lib/process.c \
+ lib/process.h \
+ lib/queue.c \
+ lib/queue.h \
+ lib/random.c \
+ lib/random.h \
+ lib/rconn.c \
+ lib/rconn.h \
+ lib/sat-math.h \
+ lib/sha1.c \
+ lib/sha1.h \
+ lib/shash.c \
+ lib/shash.h \
+ lib/signals.c \
+ lib/signals.h \
+ lib/socket-util.c \
+ lib/socket-util.h \
+ lib/stp.c \
+ lib/stp.h \
+ lib/svec.c \
+ lib/svec.h \
+ lib/tag.c \
+ lib/tag.h \
+ lib/timeval.c \
+ lib/timeval.h \
+ lib/type-props.h \
+ lib/unixctl.c \
+ lib/unixctl.h \
+ lib/util.c \
+ lib/util.h \
+ lib/valgrind.h \
+ lib/vconn-provider.h \
+ lib/vconn-ssl.h \
+ lib/vconn-stream.c \
+ lib/vconn-stream.h \
+ lib/vconn-tcp.c \
+ lib/vconn-unix.c \
+ lib/vconn.c \
+ lib/vconn.h \
+ lib/vlog-modules.def \
+ lib/vlog.c \
+ lib/vlog.h \
+ lib/xtoxll.h
+
+if HAVE_NETLINK
+lib_libopenvswitch_a_SOURCES += \
+ lib/dpif.c \
+ lib/dpif.h \
+ lib/netlink-protocol.h \
+ lib/netlink.c \
+ lib/netlink.h
+endif
+
+# SSL support: only built when OpenSSL was detected at configure time.
+#
+# lib/dhparams.c is generated (hence nodist_) by converting the three
+# Diffie-Hellman parameter files to C with "openssl dhparam -C". The sed
+# step rewrites the emitted "get_dhNNNN()" definitions to "get_dhNNNN(void)".
+# Output goes to a .tmp file that is renamed at the end, so an interrupted
+# run never leaves a truncated lib/dhparams.c behind.
+# NOTE(review): the recipe's exit status is that of sed, the last command in
+# the pipeline, so a failing openssl invocation is not detected here.
+if HAVE_OPENSSL
+lib_libopenvswitch_a_SOURCES += \
+ lib/vconn-ssl.c
+nodist_lib_libopenvswitch_a_SOURCES = lib/dhparams.c
+lib/dhparams.c: lib/dh1024.pem lib/dh2048.pem lib/dh4096.pem
+ (echo '#include "lib/dhparams.h"' && \
+ openssl dhparam -C -in $(srcdir)/lib/dh1024.pem -noout && \
+ openssl dhparam -C -in $(srcdir)/lib/dh2048.pem -noout && \
+ openssl dhparam -C -in $(srcdir)/lib/dh4096.pem -noout) \
+ | sed 's/\(get_dh[0-9]*\)()/\1(void)/' > lib/dhparams.c.tmp
+ mv lib/dhparams.c.tmp lib/dhparams.c
+endif
+
+EXTRA_DIST += \
+ lib/dh1024.pem \
+ lib/dh2048.pem \
+ lib/dh4096.pem \
+ lib/dhparams.h
+
+EXTRA_DIST += \
+ lib/common.man \
+ lib/daemon.man \
+ lib/dpif.man \
+ lib/leak-checker.man \
+ lib/vlog.man
+
+
+# lib/dirs.c is generated: it defines string constants holding the
+# installation directories chosen at configure time. It depends on Makefile
+# so that it is regenerated when the configuration changes, and it is written
+# via a .tmp file and renamed so a failed run leaves no partial output.
+# NOTE(review): $(ro_c) is defined elsewhere; presumably it emits a
+# "generated file, do not edit" C comment -- confirm.
+CLEANFILES += lib/dirs.c
+lib/dirs.c: Makefile
+ ($(ro_c) && \
+ echo 'const char ovs_pkgdatadir[] = "$(pkgdatadir)";' && \
+ echo 'const char ovs_rundir[] = "@RUNDIR@";' && \
+ echo 'const char ovs_logdir[] = "@LOGDIR@";' && \
+ echo 'const char ovs_bindir[] = "$(bindir)";') > lib/dirs.c.tmp
+ mv lib/dirs.c.tmp lib/dirs.c
+
+install-data-local:
+ $(MKDIR_P) $(DESTDIR)$(RUNDIR)
+ $(MKDIR_P) $(DESTDIR)$(PKIDIR)
+ $(MKDIR_P) $(DESTDIR)$(LOGDIR)
+
+# All the source files that have coverage counters.
+#
+# lib/coverage-counters.c is generated by running lib/coverage-scan.pl over
+# these files from within $(srcdir), so the paths below are relative to the
+# top of the source tree. A source file that gains coverage counters must be
+# added to this list. The output is written to a .tmp file and then renamed
+# so a failed scan leaves no partial output.
+COVERAGE_FILES = \
+ lib/cfg.c \
+ lib/dpif.c \
+ lib/flow.c \
+ lib/hmap.c \
+ lib/mac-learning.c \
+ lib/netdev.c \
+ lib/netlink.c \
+ lib/odp-util.c \
+ lib/poll-loop.c \
+ lib/process.c \
+ lib/rconn.c \
+ lib/timeval.c \
+ lib/unixctl.c \
+ lib/util.c \
+ lib/vconn.c \
+ secchan/ofproto.c \
+ secchan/pktbuf.c \
+ vswitchd/bridge.c \
+ vswitchd/mgmt.c \
+ vswitchd/ovs-brcompatd.c
+lib/coverage-counters.c: $(COVERAGE_FILES) lib/coverage-scan.pl
+ (cd $(srcdir) && $(PERL) lib/coverage-scan.pl $(COVERAGE_FILES)) > $@.tmp
+ mv $@.tmp $@
+EXTRA_DIST += lib/coverage-scan.pl
diff --git a/lib/backtrace.c b/lib/backtrace.c
new file mode 100644
index 000000000..070603900
--- /dev/null
+++ b/lib/backtrace.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "backtrace.h"
+#include <errno.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include "compiler.h"
+
+#define THIS_MODULE VLM_backtrace
+#include "vlog.h"
+
+/* Scans /proc/self/maps for the mapping tagged "[stack]" and returns the
+ * end address of that mapping, i.e. the highest address of the stack.
+ *
+ * Returns (uintptr_t) -1, after logging a warning, if the file cannot be
+ * opened or contains no parsable "[stack]" line. */
+static uintptr_t UNUSED
+get_max_stack(void)
+{
+    static const char file_name[] = "/proc/self/maps";
+    uintptr_t stack_end = -1;
+    bool found = false;
+    char buf[1024];
+    int lineno = 0;
+    FILE *maps;
+
+    maps = fopen(file_name, "r");
+    if (!maps) {
+        VLOG_WARN("opening %s failed: %s", file_name, strerror(errno));
+        return -1;
+    }
+
+    while (!found && fgets(buf, sizeof buf, maps)) {
+        uintptr_t end;
+
+        lineno++;
+        if (!strstr(buf, "[stack]")) {
+            continue;
+        }
+
+        /* Each line starts with "START-END"; only END is of interest. */
+        if (sscanf(buf, "%*"SCNxPTR"-%"SCNxPTR, &end) == 1) {
+            stack_end = end;
+            found = true;
+        } else {
+            VLOG_WARN("%s:%d: parse error", file_name, lineno);
+        }
+    }
+    fclose(maps);
+
+    if (!found) {
+        VLOG_WARN("%s: no stack found", file_name);
+    }
+    return stack_end;
+}
+
+/* Returns the upper bound of the stack as reported by get_max_stack().
+ * The value is computed on the first call and cached in a function-local
+ * static for later calls. */
+static uintptr_t
+stack_high(void)
+{
+    static uintptr_t cached_top;
+
+    if (cached_top == 0) {
+        cached_top = get_max_stack();
+    }
+    return cached_top;
+}
+
+/* Returns an address at or near the current top of the stack, used as the
+ * lower bound of the range of valid stack addresses.
+ *
+ * On i386 this reads the stack pointer register directly; elsewhere it uses
+ * the address of a local variable as an approximation. */
+static uintptr_t
+stack_low(void)
+{
+#ifdef __i386__
+ uintptr_t low;
+ asm("movl %%esp,%0" : "=g" (low));
+ return low;
+#else
+ /* This causes a warning in GCC that cannot be disabled, so use it only on
+ * non-x86. */
+ int dummy;
+ return (uintptr_t) &dummy;
+#endif
+}
+
+/* Returns true if 'p' lies within [stack_low(), stack_high()), false
+ * otherwise. */
+static bool
+in_stack(void *p)
+{
+    const uintptr_t addr = (uintptr_t) p;
+
+    if (addr < stack_low()) {
+        return false;
+    }
+    return addr < stack_high();
+}
+
+/* Stores the return addresses of up to BACKTRACE_MAX_FRAMES callers in
+ * 'backtrace->frames' and the number stored in 'backtrace->n_frames'.
+ *
+ * The walk follows the chain of saved frame pointers, starting from the
+ * caller's frame, and stops at a null frame pointer, at a frame pointer
+ * that does not lie within the stack, or when the array is full. */
+void
+backtrace_capture(struct backtrace *backtrace)
+{
+    void **fp = __builtin_frame_address(1);
+    size_t count = 0;
+
+    while (fp != NULL && in_stack(fp) && fp[0] != NULL
+           && count < BACKTRACE_MAX_FRAMES) {
+        /* fp[1] is the return address; fp[0] links to the next frame. */
+        backtrace->frames[count] = (uintptr_t) fp[1];
+        count++;
+        fp = fp[0];
+    }
+    backtrace->n_frames = count;
+}
diff --git a/lib/backtrace.h b/lib/backtrace.h
new file mode 100644
index 000000000..49b250c7f
--- /dev/null
+++ b/lib/backtrace.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef BACKTRACE_H
+#define BACKTRACE_H 1
+
+#include <stdint.h>
+
+#define BACKTRACE_MAX_FRAMES 31
+
+/* A captured call stack, as filled in by backtrace_capture(). */
+struct backtrace {
+ int n_frames; /* Number of valid entries in 'frames'. */
+ uintptr_t frames[BACKTRACE_MAX_FRAMES]; /* Captured addresses. */
+};
+
+void backtrace_capture(struct backtrace *);
+
+#endif /* backtrace.h */
diff --git a/lib/bitmap.c b/lib/bitmap.c
new file mode 100644
index 000000000..cab7e6d26
--- /dev/null
+++ b/lib/bitmap.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "bitmap.h"
+#include <string.h>
+
+/* Sets the 'count' bits of 'bitmap' that begin at bit offset 'start' to
+ * 'value'.
+ *
+ * Works in three phases: single bits up to the next word boundary, then
+ * whole words at a time, then any remaining single bits. */
+void
+bitmap_set_multiple(unsigned long *bitmap, size_t start, size_t count,
+                    bool value)
+{
+    /* All-ones when 'value' is true, all-zeros otherwise. */
+    const unsigned long fill = -(unsigned long) value;
+
+    while (count > 0 && start % BITMAP_ULONG_BITS != 0) {
+        bitmap_set(bitmap, start, value);
+        start++;
+        count--;
+    }
+
+    while (count >= BITMAP_ULONG_BITS) {
+        *bitmap_unit__(bitmap, start) = fill;
+        start += BITMAP_ULONG_BITS;
+        count -= BITMAP_ULONG_BITS;
+    }
+
+    while (count > 0) {
+        bitmap_set(bitmap, start, value);
+        start++;
+        count--;
+    }
+}
+
+/* Returns true if the first 'n' bits of bitmaps 'a' and 'b' are identical,
+ * false otherwise. Bits at offset 'n' and beyond are not examined. */
+bool
+bitmap_equal(const unsigned long *a, const unsigned long *b, size_t n)
+{
+    const size_t whole_bytes = n / BITMAP_ULONG_BITS * sizeof(unsigned long);
+    size_t bit = ROUND_DOWN(n, BITMAP_ULONG_BITS);
+
+    /* Complete words can be compared in bulk. */
+    if (memcmp(a, b, whole_bytes) != 0) {
+        return false;
+    }
+
+    /* The final, partial word has to be checked bit by bit. */
+    while (bit < n) {
+        if (bitmap_is_set(a, bit) != bitmap_is_set(b, bit)) {
+            return false;
+        }
+        bit++;
+    }
+    return true;
+}
diff --git a/lib/bitmap.h b/lib/bitmap.h
new file mode 100644
index 000000000..a2d9a50d9
--- /dev/null
+++ b/lib/bitmap.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef BITMAP_H
+#define BITMAP_H 1
+
+#include <limits.h>
+#include <stdlib.h>
+#include "util.h"
+
+#define BITMAP_ULONG_BITS (sizeof(unsigned long) * CHAR_BIT)
+
+/* Returns a pointer to the unsigned long within 'bitmap' that holds the bit
+ * at 'offset'.  The const qualifier on 'bitmap' is cast away. */
+static inline unsigned long *
+bitmap_unit__(const unsigned long *bitmap, size_t offset)
+{
+    size_t unit = offset / BITMAP_ULONG_BITS;
+
+    return (unsigned long *) (bitmap + unit);
+}
+
+/* Returns a mask in which only the bit for 'offset' is set, positioned
+ * within its unsigned long unit. */
+static inline unsigned long
+bitmap_bit__(size_t offset)
+{
+    size_t shift = offset % BITMAP_ULONG_BITS;
+
+    return 1UL << shift;
+}
+
+/* Allocates, with xcalloc(), a bitmap with room for at least 'n_bits' bits
+ * and returns it.  The caller releases it with bitmap_free().
+ *
+ * The size handed to xcalloc() is a byte count, so the bit count, rounded up
+ * to a whole number of unsigned longs, is divided by CHAR_BIT.  (Passing the
+ * rounded bit count directly would allocate CHAR_BIT times too much.) */
+static inline unsigned long *
+bitmap_allocate(size_t n_bits)
+{
+    return xcalloc(1, ROUND_UP(n_bits, BITMAP_ULONG_BITS) / CHAR_BIT);
+}
+
+/* Releases 'bitmap' by passing it to free(). */
+static inline void
+bitmap_free(unsigned long *bitmap)
+{
+ free(bitmap);
+}
+
+/* Returns true if the bit at 'offset' in 'bitmap' is 1, false if it is 0. */
+static inline bool
+bitmap_is_set(const unsigned long *bitmap, size_t offset)
+{
+    unsigned long unit = *bitmap_unit__(bitmap, offset);
+    unsigned long mask = bitmap_bit__(offset);
+
+    return (unit & mask) != 0;
+}
+
+/* Sets the bit at 'offset' in 'bitmap' to 1. */
+static inline void
+bitmap_set1(unsigned long *bitmap, size_t offset)
+{
+    unsigned long *unit = bitmap_unit__(bitmap, offset);
+
+    *unit |= bitmap_bit__(offset);
+}
+
+/* Sets the bit at 'offset' in 'bitmap' to 0. */
+static inline void
+bitmap_set0(unsigned long *bitmap, size_t offset)
+{
+    unsigned long *unit = bitmap_unit__(bitmap, offset);
+
+    *unit &= ~bitmap_bit__(offset);
+}
+
+/* Sets the bit at 'offset' in 'bitmap' to 'value'. */
+static inline void
+bitmap_set(unsigned long *bitmap, size_t offset, bool value)
+{
+    if (!value) {
+        bitmap_set0(bitmap, offset);
+        return;
+    }
+    bitmap_set1(bitmap, offset);
+}
+
+void bitmap_set_multiple(unsigned long *, size_t start, size_t count,
+ bool value);
+bool bitmap_equal(const unsigned long *, const unsigned long *, size_t n);
+
+#endif /* bitmap.h */
diff --git a/lib/cfg.c b/lib/cfg.c
new file mode 100644
index 000000000..b4d1d591a
--- /dev/null
+++ b/lib/cfg.c
@@ -0,0 +1,1182 @@
+/* Copyright (c) 2008, 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#include <config.h>
+#include "cfg.h"
+#include <arpa/inet.h>
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <inttypes.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "coverage.h"
+#include "dynamic-string.h"
+#include "ofpbuf.h"
+#include "packets.h"
+#include "svec.h"
+#include "timeval.h"
+#include "util.h"
+
+#define THIS_MODULE VLM_cfg
+#include "vlog.h"
+
+/* XXX This file really needs a unit test! For a while, cfg_get_string(0,
+ * "bridge.a.controller") would return the value of
+ * "bridge.a.controller.in-band", if it existed, and I'm really not certain
+ * that the fix didn't break other things. */
+
+/* Configuration file name. */
+static char *cfg_name;
+
+/* Put the temporary file in the same directory as cfg_name, so that
+ * they are guaranteed to be in the same file system and therefore we can
+ * rename() tmp_name over cfg_name. */
+static char *tmp_name;
+
+/* Lock information. */
+static char *lock_name;
+static int lock_fd = -1;
+
+/* Flag to indicate whether local modifications have been made. */
+static bool dirty;
+
+static uint8_t cfg_cookie[CFG_COOKIE_LEN];
+
+/* Current configuration. Maintained in sorted order. */
+static struct svec cfg = SVEC_EMPTY_INITIALIZER;
+
+static bool has_double_dot(const char *key, size_t len);
+static bool is_valid_key(const char *key, size_t len,
+ const char *file_name, int line_number,
+ const char *id);
+static char *parse_section(const char *file_name, int line_number,
+ const char *);
+static void parse_setting(const char *file_name, int line_number,
+ const char *section, const char *);
+static int compare_key(const char *a, const char *b);
+static char **find_key_le(const char *key);
+static char **find_key_ge(const char *key);
+static char *find_key(const char *);
+static bool parse_mac(const char *, uint8_t mac[6]);
+static bool parse_dpid(const char *, uint64_t *);
+static bool is_key(const char *);
+static bool is_int(const char *);
+static bool is_bool(const char *);
+static const char *extract_value(const char *key);
+static const char *get_nth_value(int idx, const char *key);
+static bool is_type(const char *s, enum cfg_flags);
+
+#define CC_ALPHA "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#define CC_DIGIT "0123456789"
+#define CC_ALNUM CC_ALPHA CC_DIGIT
+#define CC_SPACE " \t\r\n\v"
+
+#define CC_FILE_NAME CC_ALNUM "._-"
+#define CC_KEY CC_ALNUM "._-@$:+"
+
+/* Sets 'file_name' as the configuration file read by cfg_read().  Returns 0 on
+ * success, otherwise a positive errno value if 'file_name' cannot be opened
+ * for reading.
+ *
+ * Also derives the name of the temporary file ("<file_name>.~tmp~") and of
+ * the lock file (".<base name>.~lock~" in the same directory).  Names set by
+ * an earlier call are freed first; the lock must not be held at that point.
+ *
+ * This function does not actually read the named file.  Use cfg_read() to
+ * (re)read the configuration. */
+int
+cfg_set_file(const char *file_name)
+{
+    const char *slash;
+    int fd;
+
+    if (cfg_name) {
+        assert(lock_fd < 0);
+        free(cfg_name);
+        free(lock_name);
+        free(tmp_name);
+        cfg_name = lock_name = tmp_name = NULL;
+    }
+
+    /* Make sure that we can open this file for reading. */
+    fd = open(file_name, O_RDONLY);
+    if (fd < 0) {
+        return errno;
+    }
+    close(fd);
+
+    cfg_name = xstrdup(file_name);
+
+    /* Put the temporary file in the same directory as cfg_name, so that they
+     * are guaranteed to be in the same file system, to guarantee that
+     * rename(tmp_name, cfg_name) will work. */
+    tmp_name = xasprintf("%s.~tmp~", file_name);
+
+    /* Put the lock file in the same directory as cfg_name, but prefixed by
+     * a dot so as not to garner administrator interest. */
+    slash = strrchr(file_name, '/');
+    if (slash) {
+        /* "%.*s" takes its precision as an int, whereas a pointer difference
+         * has type ptrdiff_t, so convert explicitly. */
+        lock_name = xasprintf("%.*s/.%s.~lock~",
+                              (int) (slash - file_name), file_name,
+                              slash + 1);
+    } else {
+        lock_name = xasprintf(".%s.~lock~", file_name);
+    }
+
+    VLOG_INFO("using \"%s\" as configuration file, \"%s\" as lock file",
+              file_name, lock_name);
+    return 0;
+}
+
+/* Recomputes 'cfg_cookie' as the SHA-1 hash of every entry in 'cfg', each
+ * followed by a new-line.  Returns 0 if successful, -1 if resetting the
+ * context, hashing an entry, or extracting the result reports failure. */
+static int
+update_cookie(void)
+{
+    SHA1Context sha1;
+    int idx;
+
+    if (SHA1Reset(&sha1) != shaSuccess) {
+        return -1;
+    }
+
+    idx = 0;
+    while (idx < cfg.n) {
+        const char *entry = cfg.names[idx];
+
+        if (SHA1Input(&sha1, (uint8_t *) entry,
+                      strlen(entry)) != shaSuccess) {
+            return -1;
+        }
+        /* The result of hashing the separator is not checked. */
+        SHA1Input(&sha1, (uint8_t *) "\n", 1);
+        idx++;
+    }
+
+    return SHA1Result(&sha1, cfg_cookie) != shaSuccess ? -1 : 0;
+}
+
+/* Reads the configuration file named with cfg_set_file() and replaces the
+ * current configuration with its contents.  Returns 0 if successful,
+ * otherwise a positive errno value: ENODEV if no file has been set, or the
+ * error from opening the file.  If the file cannot be opened, the
+ * configuration is left empty. */
+int
+cfg_read(void)
+{
+    struct svec old_cfg;
+    struct ds ds;
+    FILE *file;
+    char *section;
+    int line_number;
+
+    if (!cfg_name) {
+        return ENODEV;
+    }
+
+    /* Save old configuration data and clear the active configuration. */
+    svec_init(&old_cfg);
+    svec_swap(&old_cfg, &cfg);
+
+    /* Read new configuration. */
+    VLOG_DBG("reading configuration from %s", cfg_name);
+
+    file = fopen(cfg_name, "r");
+    if (!file) {
+        /* Capture errno before logging has a chance to overwrite it. */
+        int error = errno;
+
+        VLOG_ERR("failed to open \"%s\": %s", cfg_name, strerror(error));
+
+        /* 'cfg' is now empty.  Keep it null-terminated, because other
+         * functions in this file walk cfg.names until they reach a null
+         * pointer, and release the saved configuration rather than leaking
+         * it. */
+        svec_terminate(&cfg);
+        svec_destroy(&old_cfg);
+        return error;
+    }
+
+    ds_init(&ds);
+    section = NULL;
+    line_number = 0;
+    while (!ds_get_line(&ds, file)) {
+        const char *s = ds_cstr(&ds);
+        size_t indent = strspn(s, CC_SPACE);
+
+        line_number++;
+        s += indent;
+        if (*s == '#' || *s == '\0') {
+            /* Ignore comments and lines that contain only white space. */
+        } else if (*s == '[') {
+            if (!indent) {
+                free(section);
+                section = parse_section(cfg_name, line_number, s);
+            } else {
+                VLOG_ERR("%s:%d: ignoring indented section header",
+                         cfg_name, line_number);
+            }
+        } else if (indent && !section) {
+            VLOG_ERR("%s:%d: ignoring indented line outside any section",
+                     cfg_name, line_number);
+        } else {
+            /* An unindented setting ends the current section. */
+            if (!indent) {
+                free(section);
+                section = NULL;
+            }
+            parse_setting(cfg_name, line_number, section, s);
+        }
+    }
+    ds_destroy(&ds);
+    free(section);
+
+    svec_sort(&cfg);
+    svec_terminate(&cfg);
+    update_cookie();
+
+    fclose(file);
+
+    if (VLOG_IS_DBG_ENABLED()) {
+        struct svec removed, added;
+        size_t i;
+
+        svec_diff(&old_cfg, &cfg, &removed, NULL, &added);
+        if (removed.n || added.n) {
+            VLOG_DBG("configuration changes:");
+            for (i = 0; i < removed.n; i++) {
+                VLOG_DBG("-%s", removed.names[i]);
+            }
+            for (i = 0; i < added.n; i++) {
+                VLOG_DBG("+%s", added.names[i]);
+            }
+        } else {
+            VLOG_DBG("configuration unchanged");
+        }
+        svec_destroy(&added);
+        svec_destroy(&removed);
+    }
+    svec_destroy(&old_cfg);
+
+    dirty = false;
+
+    return 0;
+}
+
+/* Fills 'svec' with the entire configuration: clears 'svec', then appends
+ * the contents of the current configuration to it. */
+void
+cfg_get_all(struct svec *svec)
+{
+ svec_clear(svec);
+ svec_append(svec, &cfg);
+}
+
+/* Copies the configuration cookie into 'cookie', which must have room for
+ * CFG_COOKIE_LEN bytes.  If there are local modifications, the cookie is
+ * recomputed first.  Always returns 0. */
+int
+cfg_get_cookie(uint8_t *cookie)
+{
+    /* Local edits make the stored hash stale, so refresh it. */
+    if (dirty) {
+        update_cookie();
+    }
+    memcpy(cookie, cfg_cookie, sizeof cfg_cookie);
+
+    return 0;
+}
+
+/* Releases the configuration lock, if it is held, by closing the lock file
+ * descriptor.  Does nothing if 'lock_fd' is -1. */
+void
+cfg_unlock(void)
+{
+    if (lock_fd == -1) {
+        return;
+    }
+
+    COVERAGE_INC(cfg_unlock);
+    close(lock_fd);
+    lock_fd = -1;
+}
+
+/* Opens the lock file 'name' for reading and writing, creating it if it does
+ * not exist.  Returns a nonnegative file descriptor if successful, otherwise
+ * a negative errno value.
+ *
+ * errno is copied into a local immediately after each failed open(), so that
+ * the logging calls cannot change the value that is returned.  (Returning
+ * "-errno" after logging could yield 0, which the caller would take for a
+ * valid file descriptor.) */
+static int
+open_lockfile(const char *name)
+{
+    for (;;) {
+        int error;
+        int fd;
+
+        /* Try to open an existing lock file. */
+        fd = open(name, O_RDWR);
+        if (fd >= 0) {
+            return fd;
+        }
+        error = errno;
+        if (error != ENOENT) {
+            VLOG_WARN("%s: failed to open lock file: %s",
+                      name, strerror(error));
+            return -error;
+        }
+
+        /* Try to create a new lock file. */
+        VLOG_INFO("%s: lock file does not exist, creating", name);
+        fd = open(name, O_RDWR | O_CREAT | O_EXCL, 0600);
+        if (fd >= 0) {
+            return fd;
+        }
+        error = errno;
+        if (error != EEXIST) {
+            VLOG_WARN("%s: failed to create lock file: %s",
+                      name, strerror(error));
+            return -error;
+        }
+
+        /* Someone else created the lock file.  Try again. */
+    }
+}
+
+/* Attempts to take a write lock covering all of the file open as 'fd'.  If
+ * 'block' is true the request uses F_SETLKW, otherwise F_SETLK.  Returns 0 if
+ * successful, otherwise the positive errno value from fcntl(). */
+static int
+try_lock(int fd, bool block)
+{
+    struct flock lock;
+    int cmd = block ? F_SETLKW : F_SETLK;
+
+    memset(&lock, 0, sizeof lock);
+    lock.l_type = F_WRLCK;
+    lock.l_whence = SEEK_SET;
+    lock.l_start = 0;
+    lock.l_len = 0;
+
+    if (fcntl(fd, cmd, &lock) == -1) {
+        return errno;
+    }
+    return 0;
+}
+
+/* Locks the configuration file against modification by other processes and
+ * re-reads it from disk.
+ *
+ * The 'timeout' specifies the maximum number of milliseconds to wait for the
+ * config file to become free.  Use 0 to avoid waiting or INT_MAX to wait
+ * forever.
+ *
+ * If 'cookie' is nonnull, it must point to CFG_COOKIE_LEN bytes.  After the
+ * configuration has been re-read, its cookie is compared against 'cookie';
+ * if they differ, the lock is released and EINVAL is returned.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+cfg_lock(uint8_t *cookie, int timeout)
+{
+    long long int start = time_msec();
+    long long int elapsed = 0;
+    int fd;
+    uint8_t curr_cookie[CFG_COOKIE_LEN];
+
+    assert(lock_fd < 0);
+    COVERAGE_INC(cfg_lock);
+    for (;;) {
+        int error;
+
+        /* Open lock file. */
+        fd = open_lockfile(lock_name);
+        if (fd < 0) {
+            return -fd;
+        }
+
+        /* Try to lock it.  This will block (if 'timeout' > 0). */
+        error = try_lock(fd, timeout > 0);
+        time_refresh();
+        elapsed = time_msec() - start;
+        if (!error) {
+            /* Success! */
+            break;
+        }
+
+        /* Lock failed.  Close the lock file and reopen it on the next
+         * iteration, just in case someone deletes it underneath us (even
+         * though that should not happen). */
+        close(fd);
+        if (error != EINTR) {
+            /* Hard error, give up. */
+            COVERAGE_INC(cfg_lock_error);
+            VLOG_WARN("%s: failed to lock file "
+                      "(after %lld ms, with %d-ms timeout): %s",
+                      lock_name, elapsed, timeout, strerror(error));
+            return error;
+        }
+
+        /* Probably, the periodic timer set up by time_init() woke us up.
+         * Just check whether it's time to give up. */
+        if (timeout != INT_MAX && elapsed >= timeout) {
+            COVERAGE_INC(cfg_lock_timeout);
+            VLOG_WARN("%s: giving up on lock file after %lld ms",
+                      lock_name, elapsed);
+            return ETIMEDOUT;
+        }
+        COVERAGE_INC(cfg_lock_retry);
+    }
+    if (elapsed) {
+        VLOG_WARN("%s: waited %lld ms for lock file", lock_name, elapsed);
+    }
+    lock_fd = fd;
+
+    cfg_read();
+
+    if (cookie) {
+        cfg_get_cookie(curr_cookie);
+
+        /* Compare the whole cookie: "sizeof curr_cookie" is the size of the
+         * array, whereas "sizeof *curr_cookie" would be a single byte. */
+        if (memcmp(curr_cookie, cookie, sizeof curr_cookie)) {
+            /* Configuration has changed, so reject. */
+            cfg_unlock();
+            return EINVAL;
+        }
+    }
+
+    return 0;
+}
+
+/* Writes the 'len' bytes in 'data' to the temporary file 'tmp_name', then
+ * renames it over 'cfg_name' and clears the 'dirty' flag.  Returns 0 if
+ * successful, otherwise a positive errno value.
+ *
+ * errno is captured at the first call that fails, before any later library
+ * call or log message can overwrite it.  If a call fails without setting
+ * errno, EIO is reported so that a failure is never returned as 0. */
+static int
+do_write_config(const void *data, size_t len)
+{
+    FILE *file;
+    int error;
+
+    file = fopen(tmp_name, "w");
+    if (file == NULL) {
+        error = errno;
+        VLOG_WARN("could not open %s for writing: %s",
+                  tmp_name, strerror(error));
+        return error;
+    }
+
+    /* Every step runs even after an earlier one has failed, so that 'file'
+     * is always closed; only the first error is kept. */
+    error = 0;
+    if (fwrite(data, 1, len, file) != len) {
+        error = errno ? errno : EIO;
+    }
+    if (fflush(file) && !error) {
+        error = errno ? errno : EIO;
+    }
+    if (fclose(file) && !error) {
+        error = errno ? errno : EIO;
+    }
+    if (error) {
+        VLOG_WARN("problem writing to %s: %s", tmp_name, strerror(error));
+        return error;
+    }
+
+    if (rename(tmp_name, cfg_name) < 0) {
+        error = errno;
+        VLOG_WARN("could not rename %s to %s: %s",
+                  tmp_name, cfg_name, strerror(error));
+        return error;
+    }
+
+    dirty = false;
+
+    return 0;
+}
+
+/* Sorts the current configuration and writes it to the configuration file,
+ * with the entries joined by new-lines, or writes a single comment line if
+ * there are no entries.  Returns 0 if successful, otherwise the positive
+ * errno value returned by do_write_config(). */
+int
+cfg_write(void)
+{
+    char *text;
+    int error;
+
+    svec_sort(&cfg);
+    if (cfg.n) {
+        text = svec_join(&cfg, "\n", "\n");
+    } else {
+        text = xstrdup("# This file intentionally left blank.\n");
+    }
+    error = do_write_config(text, strlen(text));
+    free(text);
+
+    return error;
+}
+
+/* Writes the 'len' bytes in 'data' to the configuration file and, if that
+ * succeeds, re-reads the configuration with cfg_read().  Returns the value
+ * returned by do_write_config(): 0 if successful, otherwise a positive errno
+ * value.  The result of cfg_read() is not reported. */
+int
+cfg_write_data(uint8_t *data, size_t len)
+{
+    int error = do_write_config(data, len);
+
+    if (error) {
+        return error;
+    }
+    cfg_read();
+    return 0;
+}
+
+/* Returns true if the configuration has changed since the last time it was
+ * read or written.  This is the value of the 'dirty' flag, which the entry
+ * add and delete functions set and which cfg_read() and a successful write
+ * clear. */
+bool
+cfg_is_dirty(void)
+{
+ return dirty;
+}
+
+/* Appends every configuration entry to 'buffer', each followed by a
+ * new-line.  No null terminator is appended. */
+void
+cfg_buf_put(struct ofpbuf *buffer)
+{
+    int idx = 0;
+
+    while (idx < cfg.n) {
+        const char *entry = cfg.names[idx++];
+
+        ofpbuf_put(buffer, entry, strlen(entry));
+        ofpbuf_put(buffer, "\n", 1);
+    }
+}
+
+/* Formats the printf()-style format string FORMAT, which must be the last
+ * named parameter of the enclosing variadic function, with that function's
+ * variable arguments, and stores the string returned by xvasprintf() in the
+ * variable DST.  The enclosing function is responsible for freeing DST (the
+ * callers in this file pass it to free()). */
+#define FORMAT_KEY(FORMAT, DST) \
+ do { \
+ va_list args__; \
+ va_start(args__, FORMAT); \
+ (DST) = xvasprintf(FORMAT, args__); \
+ va_end(args__); \
+ } while (0)
+
+/* Returns true if the configuration includes a key whose name is given by
+ * 'key_', a printf()-style format string followed by its arguments. */
+bool
+cfg_has(const char *key_, ...)
+{
+    bool found;
+    char *key;
+
+    FORMAT_KEY(key_, key);
+    found = find_key(key) ? true : false;
+    free(key);
+
+    return found;
+}
+
+/* Returns true if the values configured for the key named by 'key_' (a
+ * printf()-style format string followed by its arguments) satisfy 'flags':
+ *
+ *   - If CFG_REQUIRED is set, at least one value must exist.
+ *
+ *   - Unless CFG_MULTIPLE is set, at most one value may exist.
+ *
+ *   - Every value must be acceptable to is_type() for 'flags'.
+ *
+ * Returns false otherwise. */
+bool
+cfg_is_valid(enum cfg_flags flags, const char *key_, ...)
+{
+    char *key, **first, **last, **p;
+    size_t n;
+    bool retval;
+
+    FORMAT_KEY(key_, key);
+    first = find_key_le(key);
+    last = find_key_ge(key);
+    n = last - first;
+
+    /* CFG_MULTIPLE permits more than one value; without it, more than one
+     * value is an error. */
+    retval = ((!(flags & CFG_REQUIRED) || n)
+              && ((flags & CFG_MULTIPLE) || n <= 1));
+    for (p = first; retval && p < last; p++) {
+        /* An entry without '=' has no value and therefore no valid type. */
+        const char *value = extract_value(*p);
+
+        retval = value && is_type(value, flags);
+    }
+    free(key);
+    return retval;
+}
+
+/* Returns true if the configuration includes at least one key whose name
+ * begins with the section named by 'section_' (a printf()-style format
+ * string followed by its arguments) followed by a dot. */
+bool
+cfg_has_section(const char *section_, ...)
+{
+    struct ds section;
+    bool retval = false;
+    va_list args;
+    size_t i;
+
+    ds_init(&section);
+    va_start(args, section_);
+    ds_put_format_valist(&section, section_, args);
+    ds_put_char(&section, '.');
+    va_end(args);
+
+    /* Bound the scan by cfg.n rather than by a null terminator, so that it
+     * is safe even if 'cfg' has not been passed to svec_terminate(). */
+    for (i = 0; i < cfg.n; i++) { /* XXX this is inefficient */
+        if (!strncmp(section.string, cfg.names[i], section.length)) {
+            retval = true;
+            break;
+        }
+    }
+
+    ds_destroy(&section);
+    return retval;
+}
+
+/* Returns the number of values configured for the key named by 'key_', a
+ * printf()-style format string followed by its arguments.  Returns 0 if the
+ * key has no values. */
+int
+cfg_count(const char *key_, ...)
+{
+    char **first, **end;
+    char *key;
+
+    FORMAT_KEY(key_, key);
+    first = find_key_le(key);
+    end = find_key_ge(key);
+    free(key);
+
+    return end - first;
+}
+
+/* Fills 'svec' with all of the immediate subsections of 'section'.  For
+ * example, if 'section' is "bridge" and keys bridge.a, bridge.b, bridge.b.c,
+ * and bridge.c.x.y.z exist, then 'svec' would be initialized to a, b, and
+ * c.  The caller must first initialize 'svec'; its previous contents are
+ * cleared.  The section name is given by 'section_', a printf()-style format
+ * string followed by its arguments. */
+void
+cfg_get_subsections(struct svec *svec, const char *section_, ...)
+{
+    struct ds section;
+    va_list args;
+    size_t i;
+
+    ds_init(&section);
+    va_start(args, section_);
+    ds_put_format_valist(&section, section_, args);
+    ds_put_char(&section, '.');
+    va_end(args);
+
+    svec_clear(svec);
+
+    /* Bound the scan by cfg.n rather than by a null terminator, so that it
+     * is safe even if 'cfg' has not been passed to svec_terminate(). */
+    for (i = 0; i < cfg.n; i++) { /* XXX this is inefficient */
+        const char *entry = cfg.names[i];
+
+        if (!strncmp(section.string, entry, section.length)) {
+            /* The subsection name runs up to the next '.' or to the '=' that
+             * ends the key. */
+            const char *ss = entry + section.length;
+            size_t ss_len = strcspn(ss, ".=");
+            svec_add_nocopy(svec, xmemdup0(ss, ss_len));
+        }
+    }
+    svec_unique(svec);
+    ds_destroy(&section);
+}
+
+/* Adds an entry to the in-memory configuration and marks the configuration
+ * dirty.  The entry text is given by 'entry_', a printf()-style format
+ * string followed by its arguments.  The configuration is re-sorted and
+ * re-terminated afterward.  Nothing is written to disk. */
+void
+cfg_add_entry(const char *entry_, ...)
+{
+    char *formatted;
+
+    FORMAT_KEY(entry_, formatted);
+
+    /* 'formatted' is not freed here; it is handed to 'cfg'. */
+    svec_add_nocopy(&cfg, formatted);
+    svec_sort(&cfg);
+    svec_terminate(&cfg);
+
+    dirty = true;
+}
+
+/* Passes the entry whose text is given by 'entry_', a printf()-style format
+ * string followed by its arguments, to svec_del() to remove it from the
+ * in-memory configuration, then re-terminates the configuration and marks it
+ * dirty.  Nothing is written to disk. */
+void
+cfg_del_entry(const char *entry_, ...)
+{
+    char *formatted;
+
+    FORMAT_KEY(entry_, formatted);
+
+    svec_del(&cfg, formatted);
+    svec_terminate(&cfg);
+    free(formatted);
+
+    dirty = true;
+}
+
+/* Removes from the in-memory configuration every entry whose key begins with
+ * the section named by 'section_' (a printf()-style format string followed
+ * by its arguments) followed by a dot.  Always marks the configuration
+ * dirty, whether or not any entry matched.  Nothing is written to disk. */
+void
+cfg_del_section(const char *section_, ...)
+{
+    struct ds section;
+    va_list args;
+    size_t i;
+
+    ds_init(&section);
+    va_start(args, section_);
+    ds_put_format_valist(&section, section_, args);
+    ds_put_char(&section, '.');
+    va_end(args);
+
+    /* Bound the scan by cfg.n rather than by a null terminator, so that it
+     * is safe even if 'cfg' has not been passed to svec_terminate(). */
+    for (i = 0; i < cfg.n; i++) {
+        if (!strncmp(section.string, cfg.names[i], section.length)) {
+            /* Free the entry and leave a null pointer in its slot for
+             * svec_compact(), called below. */
+            free(cfg.names[i]);
+            cfg.names[i] = NULL;
+        }
+    }
+    svec_compact(&cfg);
+    svec_terminate(&cfg);
+
+    ds_destroy(&section);
+    dirty = true;
+}
+
+/* Removes from the in-memory configuration every entry that matches, per
+ * fnmatch() with no flags, the pattern given by 'pattern_', a printf()-style
+ * format string followed by its arguments.  The whole entry is matched, not
+ * only its key.  Marks the configuration dirty only if at least one entry
+ * was removed.  Nothing is written to disk. */
+void
+cfg_del_match(const char *pattern_, ...)
+{
+    bool matched = false;
+    char *pattern;
+    size_t i;
+
+    FORMAT_KEY(pattern_, pattern);
+
+    /* Bound the scan by cfg.n rather than by a null terminator, so that it
+     * is safe even if 'cfg' has not been passed to svec_terminate(). */
+    for (i = 0; i < cfg.n; i++) {
+        if (!fnmatch(pattern, cfg.names[i], 0)) {
+            /* Free the entry and leave a null pointer in its slot for
+             * svec_compact(), called below. */
+            free(cfg.names[i]);
+            cfg.names[i] = NULL;
+            matched = true;
+        }
+    }
+    if (matched) {
+        svec_compact(&cfg);
+        svec_terminate(&cfg);
+        dirty = true;
+    }
+
+    free(pattern);
+}
+
+/* Appends to 'svec' all of the key-value pairs whose keys begin with the
+ * section named by 'section_' (a printf()-style format string followed by
+ * its arguments) followed by a dot.  The caller must first initialize
+ * 'svec'; its existing contents are kept. */
+void
+cfg_get_section(struct svec *svec, const char *section_, ...)
+{
+    struct ds section;
+    va_list args;
+    size_t i;
+
+    ds_init(&section);
+    va_start(args, section_);
+    ds_put_format_valist(&section, section_, args);
+    ds_put_char(&section, '.');
+    va_end(args);
+
+    /* Bound the scan by cfg.n rather than by a null terminator, so that it
+     * is safe even if 'cfg' has not been passed to svec_terminate(). */
+    for (i = 0; i < cfg.n; i++) { /* XXX this is inefficient */
+        if (!strncmp(section.string, cfg.names[i], section.length)) {
+            svec_add(svec, cfg.names[i]);
+        }
+    }
+    ds_destroy(&section);
+}
+
+/* Returns value number 'idx' of the key named by 'key_', a printf()-style
+ * format string followed by its arguments.  Returns a null pointer if 'idx'
+ * is greater than or equal to cfg_count(key).  The returned string points
+ * into the configuration: the caller must not modify or free it, or retain
+ * it beyond the next call to cfg_read(). */
+const char *
+cfg_get_string(int idx, const char *key_, ...)
+{
+    const char *value;
+    char *key;
+
+    FORMAT_KEY(key_, key);
+    value = get_nth_value(idx, key);
+    free(key);
+
+    return value;
+}
+
+/* Returns value number 'idx' of the key named by 'key_', a printf()-style
+ * format string followed by its arguments.  Returns a null pointer if 'idx'
+ * is greater than or equal to cfg_count(key) or if that value is not itself
+ * a valid key according to is_key().  The returned string points into the
+ * configuration: the caller must not modify or free it, or retain it beyond
+ * the next call to cfg_read(). */
+const char *
+cfg_get_key(int idx, const char *key_, ...)
+{
+    const char *value;
+    char *key;
+
+    FORMAT_KEY(key_, key);
+    value = get_nth_value(idx, key);
+    free(key);
+
+    if (!value || !is_key(value)) {
+        return NULL;
+    }
+    return value;
+}
+
+/* Returns value number 'idx' of the key named by 'key_' (a printf()-style
+ * format string followed by its arguments), converted to an integer with
+ * atoi().  Returns 0 if 'idx' is greater than or equal to cfg_count(key) or
+ * if that value is not a valid integer according to is_int(). */
+int
+cfg_get_int(int idx, const char *key_, ...)
+{
+    const char *value;
+    int result = 0;
+    char *key;
+
+    FORMAT_KEY(key_, key);
+    value = get_nth_value(idx, key);
+    if (value && is_int(value)) {
+        result = atoi(value);
+    }
+    free(key);
+
+    return result;
+}
+
+/* Returns value number 'idx' of the key named by 'key_' (a printf()-style
+ * format string followed by its arguments), converted to a boolean: true if
+ * the value is exactly "true".  Returns false if 'idx' is greater than or
+ * equal to cfg_count(key) or if that value is not a valid boolean according
+ * to is_bool(). */
+bool
+cfg_get_bool(int idx, const char *key_, ...)
+{
+    const char *value;
+    bool result;
+    char *key;
+
+    FORMAT_KEY(key_, key);
+    value = get_nth_value(idx, key);
+    result = value && is_bool(value) && !strcmp(value, "true");
+    free(key);
+
+    return result;
+}
+
+/* Returns value number 'idx' of the key named by 'key_' (a printf()-style
+ * format string followed by its arguments), converted to an IP address in
+ * network byte order.  Returns 0 if 'idx' is greater than or equal to
+ * cfg_count(key) or if that value is not a valid IP address (as determined
+ * by inet_aton()). */
+uint32_t
+cfg_get_ip(int idx, const char *key_, ...)
+{
+    struct in_addr addr;
+    const char *value;
+    char *key;
+    bool ok;
+
+    FORMAT_KEY(key_, key);
+    value = get_nth_value(idx, key);
+    ok = value && inet_aton(value, &addr);
+    free(key);
+
+    return ok ? addr.s_addr : htonl(0);
+}
+
+/* Returns value number 'idx' of the key named by 'key_' (a printf()-style
+ * format string followed by its arguments), converted to a MAC address in
+ * host byte order.  Returns 0 if 'idx' is greater than or equal to
+ * cfg_count(key) or if that value is not a valid MAC address in the format
+ * "##:##:##:##:##:##". */
+uint64_t
+cfg_get_mac(int idx, const char *key_, ...)
+{
+    uint8_t mac[ETH_ADDR_LEN];
+    const char *value;
+    char *key;
+
+    FORMAT_KEY(key_, key);
+    value = get_nth_value(idx, key);
+    free(key);
+
+    /* A missing or malformed value yields the all-zeros address. */
+    if (value == NULL || !parse_mac(value, mac)) {
+        memset(mac, 0, sizeof mac);
+    }
+    return eth_addr_to_uint64(mac);
+}
+
+/* Returns value number 'idx' of the key named by 'key_' (a printf()-style
+ * format string followed by its arguments), parsed as a datapath ID.
+ * Returns 0 if 'idx' is greater than or equal to cfg_count(key) or if that
+ * value is not a valid datapath ID consisting of exactly 12 hexadecimal
+ * digits. */
+uint64_t
+cfg_get_dpid(int idx, const char *key_, ...)
+{
+    const char *value;
+    uint64_t dpid;
+    char *key;
+
+    FORMAT_KEY(key_, key);
+    value = get_nth_value(idx, key);
+    free(key);
+
+    if (value == NULL || !parse_dpid(value, &dpid)) {
+        return 0;
+    }
+    return dpid;
+}
+
+/* Returns value number 'idx' of the key named by 'key_' (a printf()-style
+ * format string followed by its arguments), converted to an integer.
+ * Returns -1 if 'idx' is greater than or equal to cfg_count(key) or if that
+ * value is not a valid integer between 0 and 4095. */
+int
+cfg_get_vlan(int idx, const char *key_, ...)
+{
+    const char *value;
+    int vlan = -1;
+    char *key;
+
+    FORMAT_KEY(key_, key);
+    value = get_nth_value(idx, key);
+    if (value && is_int(value)) {
+        int n = atoi(value);
+
+        if (n >= 0 && n <= 4095) {
+            vlan = n;
+        }
+    }
+    free(key);
+
+    return vlan;
+}
+
+/* Replaces the contents of 'svec' with all of the string values of the key
+ * named by 'key_', a printf()-style format string followed by its
+ * arguments.  The caller must first initialize 'svec'. */
+void
+cfg_get_all_strings(struct svec *svec, const char *key_, ...)
+{
+    char **entry, **end;
+    char *key;
+
+    FORMAT_KEY(key_, key);
+    svec_clear(svec);
+
+    entry = find_key_le(key);
+    end = find_key_ge(key);
+    while (entry < end) {
+        svec_add(svec, extract_value(*entry));
+        entry++;
+    }
+    free(key);
+}
+
+/* Replaces the contents of 'svec' with those values of the key named by
+ * 'key_' (a printf()-style format string followed by its arguments) that
+ * are themselves valid keys according to is_key().  Other values are
+ * omitted.  The caller must first initialize 'svec'. */
+void
+cfg_get_all_keys(struct svec *svec, const char *key_, ...)
+{
+    char **entry, **end;
+    char *key;
+
+    FORMAT_KEY(key_, key);
+    svec_clear(svec);
+
+    entry = find_key_le(key);
+    end = find_key_ge(key);
+    while (entry < end) {
+        const char *value = extract_value(*entry);
+
+        if (is_key(value)) {
+            svec_add(svec, value);
+        }
+        entry++;
+    }
+    free(key);
+}
+
+/* Returns true if the 'len' bytes starting at 'key' contain two consecutive
+ * '.' characters, false otherwise. */
+static bool
+has_double_dot(const char *key, size_t len)
+{
+    size_t i;
+
+    /* Look at each adjacent pair (i - 1, i).  The loop body does not run at
+     * all when 'len' is less than 2. */
+    for (i = 1; i < len; i++) {
+        if (key[i - 1] == '.' && key[i] == '.') {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Checks the 'len' bytes starting at 'key' for use as a name: it must be
+ * nonempty, must not begin or end with '.', and must not contain "..".
+ * Returns true if it passes.  Otherwise logs an error that cites
+ * 'file_name', 'line_number', and 'id' (the kind of name, e.g. "section")
+ * and returns false. */
+static bool
+is_valid_key(const char *key, size_t len,
+             const char *file_name, int line_number, const char *id)
+{
+    if (len == 0) {
+        VLOG_ERR("%s:%d: missing %s name", file_name, line_number, id);
+        return false;
+    }
+    if (key[0] == '.') {
+        VLOG_ERR("%s:%d: %s name \"%.*s\" begins with invalid character '.'",
+                 file_name, line_number, id, (int) len, key);
+        return false;
+    }
+    if (key[len - 1] == '.') {
+        VLOG_ERR("%s:%d: %s name \"%.*s\" ends with invalid character '.'",
+                 file_name, line_number, id, (int) len, key);
+        return false;
+    }
+    if (has_double_dot(key, len)) {
+        VLOG_ERR("%s:%d: %s name \"%.*s\" contains '..', which is not allowed",
+                 file_name, line_number, id, (int) len, key);
+        return false;
+    }
+    return true;
+}
+
+/* Parses 's', a section header line of the form "[section]" or
+ * "[section "subsection"]", starting at its '['.  On success returns the
+ * section name, with ".subsection" appended when a subsection is given, as a
+ * string that the caller must free (cfg_read() passes it to free()).  On a
+ * syntax error, logs a message that cites 'file_name' and 'line_number' and
+ * returns a null pointer. */
+static char *
+parse_section(const char *file_name, int line_number, const char *s)
+{
+ struct ds section;
+ size_t len;
+
+ ds_init(&section);
+
+ /* Skip [ and any white space. */
+ s++;
+ s += strspn(s, CC_SPACE);
+
+ /* Obtain the section name. */
+ len = strspn(s, CC_KEY);
+ if (!is_valid_key(s, len, file_name, line_number, "section")) {
+ goto error;
+ }
+ ds_put_buffer(&section, s, len);
+ s += len;
+
+ /* Obtain the subsection name, if any. */
+ s += strspn(s, CC_SPACE);
+ if (*s == '"') {
+ s++;
+ len = strspn(s, CC_KEY);
+ if (!is_valid_key(s, len, file_name, line_number, "subsection")) {
+ goto error;
+ }
+ /* The subsection is joined to the section name with a dot. */
+ ds_put_char(&section, '.');
+ ds_put_buffer(&section, s, len);
+ s += len;
+ if (*s != '"') {
+ VLOG_ERR("%s:%d: missing '\"' following subsection name",
+ file_name, line_number);
+ goto error;
+ }
+ s++;
+ s += strspn(s, CC_SPACE);
+ }
+
+ /* Check for ]. */
+ if (*s != ']') {
+ VLOG_ERR("%s:%d: missing ']' following section name",
+ file_name, line_number);
+ goto error;
+ }
+ s++;
+ /* Only white space may follow the closing bracket. */
+ s += strspn(s, CC_SPACE);
+ if (*s != '\0') {
+ VLOG_ERR("%s:%d: trailing garbage following ']'",
+ file_name, line_number);
+ goto error;
+ }
+
+ /* 'section' is not destroyed here; its string is handed to the caller. */
+ return ds_cstr(&section);
+
+error:
+ ds_destroy(&section);
+ return NULL;
+}
+
+/* Parses 's', a "key = value" line, and appends "key=value" to 'cfg'.  If
+ * 'section' is nonnull, the stored key is prefixed with "section.".  White
+ * space around the '=' and at the end of the value is dropped.  On a syntax
+ * error, logs a message that cites 'file_name' and 'line_number' and adds
+ * nothing to 'cfg'. */
+static void
+parse_setting(const char *file_name, int line_number, const char *section,
+ const char *s)
+{
+ struct ds key = DS_EMPTY_INITIALIZER;
+ struct ds value = DS_EMPTY_INITIALIZER;
+ size_t len;
+
+ if (section) {
+ ds_put_format(&key, "%s.", section);
+ }
+
+ /* Obtain the key. */
+ len = strspn(s, CC_KEY);
+ if (!len) {
+ VLOG_ERR("%s:%d: missing key name", file_name, line_number);
+ goto done;
+ }
+ if (!is_valid_key(s, len, file_name, line_number, "key")) {
+ goto done;
+ }
+ ds_put_buffer(&key, s, len);
+ s += len;
+
+ /* Skip the '='. */
+ s += strspn(s, CC_SPACE);
+ if (*s != '=') {
+ VLOG_ERR("%s:%d: missing '=' following key", file_name, line_number);
+ goto done;
+ }
+ s++;
+ s += strspn(s, CC_SPACE);
+
+ /* Obtain the value. */
+ ds_put_cstr(&value, s);
+ /* Trim trailing white space by shortening the recorded length. */
+ while (value.length > 0 && strchr(CC_SPACE, ds_last(&value))) {
+ value.length--;
+ }
+
+ /* Add the setting. */
+ svec_add_nocopy(&cfg, xasprintf("%s=%s", ds_cstr(&key), ds_cstr(&value)));
+
+done:
+ ds_destroy(&key);
+ ds_destroy(&value);
+}
+
+/* Compares the keys at the start of 'a' and 'b', where a key ends at the
+ * first '=' or at the end of the string.  Returns 0 if the keys are equal,
+ * otherwise -1 or 1 according to the first position at which they differ.
+ * The end of a key compares as INT_MAX, so a key compares greater than any
+ * longer key that it is a prefix of.
+ *
+ * NOTE(review): the binary searches in find_key_le() and find_key_ge()
+ * assume that 'cfg' is ordered consistently with this function; confirm
+ * that against the ordering that svec_sort() produces. */
+static int
+compare_key(const char *a, const char *b)
+{
+ for (;;) {
+ /* Map the end of either key to INT_MAX. */
+ int ac = *a == '\0' || *a == '=' ? INT_MAX : *a;
+ int bc = *b == '\0' || *b == '=' ? INT_MAX : *b;
+ if (ac != bc) {
+ return ac < bc ? -1 : 1;
+ } else if (ac == INT_MAX) {
+ /* Both keys ended at the same position: they are equal. */
+ return 0;
+ }
+ a++;
+ b++;
+ }
+}
+
+/* Binary-searches 'cfg', which must be sorted, and returns the address of
+ * the first configuration string whose key does not compare less than 'key'
+ * according to compare_key().  If any entries for 'key' exist, this is the
+ * first of them.  Returns &cfg.names[cfg.n], the position just past the
+ * last configuration string, if every key compares less than 'key'. */
+static char **
+find_key_le(const char *key)
+{
+ int low = 0;
+ int len = cfg.n;
+ while (len > 0) {
+ int half = len >> 1;
+ int middle = low + half;
+ if (compare_key(cfg.names[middle], key) < 0) {
+ /* 'middle' is too small: continue in the upper part. */
+ low = middle + 1;
+ len -= half + 1;
+ } else {
+ /* 'middle' may be the answer: continue in the lower part. */
+ len = half;
+ }
+ }
+ return &cfg.names[low];
+}
+
+/* Binary-searches 'cfg', which must be sorted, and returns the address of
+ * the first configuration string whose key compares greater than 'key'
+ * according to compare_key().  If any entries for 'key' exist, this is the
+ * position just past the last of them, so that
+ * find_key_ge(key) - find_key_le(key) is the number of entries for 'key'.
+ * Returns &cfg.names[cfg.n], the position just past the last configuration
+ * string, if no key compares greater than 'key'. */
+static char **
+find_key_ge(const char *key)
+{
+ int low = 0;
+ int len = cfg.n;
+ while (len > 0) {
+ int half = len >> 1;
+ int middle = low + half;
+ if (compare_key(cfg.names[middle], key) > 0) {
+ /* 'middle' may be the answer: continue in the lower part. */
+ len = half;
+ } else {
+ /* 'middle' is too small: continue in the upper part. */
+ low = middle + 1;
+ len -= half + 1;
+ }
+ }
+ return &cfg.names[low];
+}
+
+/* Returns the first configuration string whose key is 'key', or a null
+ * pointer if there is none. */
+static char *
+find_key(const char *key)
+{
+    char **end = &cfg.names[cfg.n];
+    char **p = find_key_le(key);
+
+    if (p >= end || compare_key(*p, key)) {
+        return NULL;
+    }
+    return *p;
+}
+
+/* Parses 's' as six hexadecimal fields separated by colons and stores them
+ * in 'mac'.  Returns true if sscanf() converted all six fields, false
+ * otherwise.  Text following the sixth field is not examined. */
+static bool
+parse_mac(const char *s, uint8_t mac[6])
+{
+ return sscanf(s, "%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8,
+ &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]) == 6;
+}
+
+/* Parses 's' as a datapath ID, which must consist of exactly 12 hexadecimal
+ * digits.  On success stores the value in '*dpid' and returns true.
+ * Otherwise returns false without modifying '*dpid'. */
+static bool
+parse_dpid(const char *s, uint64_t *dpid)
+{
+    static const char hex_digits[] = "0123456789abcdefABCDEF";
+
+    if (strlen(s) != 12 || strspn(s, hex_digits) != 12) {
+        return false;
+    }
+    *dpid = strtoll(s, NULL, 16);
+    return true;
+}
+
+/* Returns true if 's' is nonempty and consists entirely of characters in
+ * CC_KEY, false otherwise. */
+static bool
+is_key(const char *s)
+{
+    /* XXX needs to check the same things as is_valid_key() too. */
+    size_t n_valid = strspn(s, CC_KEY);
+
+    return *s != '\0' && s[n_valid] == '\0';
+}
+
+/* Returns true if 's' is nonempty and consists entirely of decimal digits,
+ * false otherwise.  No sign is accepted. */
+static bool
+is_int(const char *s)
+{
+    size_t n_digits = strspn(s, CC_DIGIT);
+
+    return *s != '\0' && s[n_digits] == '\0';
+}
+
+/* Returns true if 's' is exactly "true" or exactly "false", false
+ * otherwise. */
+static bool
+is_bool(const char *s)
+{
+    if (!strcmp(s, "true")) {
+        return true;
+    }
+    return !strcmp(s, "false");
+}
+
+/* Given the configuration string 'key' in the form "key=value", returns a
+ * pointer to the value, that is, to the character following the first '='.
+ * Returns a null pointer if 'key' contains no '='. */
+static const char *
+extract_value(const char *key)
+{
+    const char *equals = strchr(key, '=');
+
+    if (!equals) {
+        return NULL;
+    }
+    return equals + 1;
+}
+
+/* Returns value number 'idx' (counting from 0) among the configuration
+ * strings for 'key', or a null pointer if 'idx' is negative or is greater
+ * than or equal to the number of such strings.  The returned pointer points
+ * into 'cfg'. */
+static const char *
+get_nth_value(int idx, const char *key)
+{
+    char **p = find_key_le(key);
+    char **q = find_key_ge(key);
+
+    /* Reject a negative index explicitly: it would pass the upper-bound
+     * test and then index before the first string for 'key'. */
+    return idx >= 0 && idx < q - p ? extract_value(p[idx]) : NULL;
+}
+
+/* Returns true if 's' is acceptable as at least one of the value types
+ * selected in 'flags', false otherwise.  CFG_STRING accepts any value.  The
+ * types are tried in the order CFG_STRING, CFG_KEY, CFG_INT, CFG_BOOL,
+ * CFG_IP, CFG_MAC, CFG_DPID, stopping at the first that accepts 's'. */
+static bool
+is_type(const char *s, enum cfg_flags flags)
+{
+    uint8_t mac[ETH_ADDR_LEN];
+    struct in_addr addr;
+    uint64_t dpid;
+
+    if (flags & CFG_STRING) {
+        return true;
+    }
+    if (flags & CFG_KEY && is_key(s)) {
+        return true;
+    }
+    if (flags & CFG_INT && is_int(s)) {
+        return true;
+    }
+    if (flags & CFG_BOOL && is_bool(s)) {
+        return true;
+    }
+    if (flags & CFG_IP && inet_aton(s, &addr)) {
+        return true;
+    }
+    if (flags & CFG_MAC && parse_mac(s, mac)) {
+        return true;
+    }
+    if (flags & CFG_DPID && parse_dpid(s, &dpid)) {
+        return true;
+    }
+    return false;
+}
diff --git a/lib/cfg.h b/lib/cfg.h
new file mode 100644
index 000000000..85ad66f97
--- /dev/null
+++ b/lib/cfg.h
@@ -0,0 +1,98 @@
+/* Copyright (c) 2008, 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+
+#ifndef VSWITCHD_CFG_H
+#define VSWITCHD_CFG_H 1
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include "compiler.h"
+#include "sha1.h"
+
+struct svec;
+struct ofpbuf;
+
+int cfg_set_file(const char *file_name);
+int cfg_read(void);
+int cfg_lock(uint8_t *cookie, int timeout);
+void cfg_unlock(void);
+int cfg_write(void);
+int cfg_write_data(uint8_t *data, size_t len);
+bool cfg_is_dirty(void);
+
+void cfg_get_all(struct svec *);
+
+#define CFG_COOKIE_LEN SHA1HashSize
+int cfg_get_cookie(uint8_t *cookie);
+
+void cfg_buf_put(struct ofpbuf *buffer);
+void cfg_get_subsections(struct svec *, const char *, ...) PRINTF_FORMAT(2, 3);
+
+/* Flags describing what a configuration key's values may look like and how
+ * many of them there may be.  Flags may be OR'd together, so every flag
+ * occupies its own bit. */
+enum cfg_flags {
+    /* Types allowed. */
+    CFG_STRING = 1 << 0,        /* Arbitrary content. */
+    CFG_KEY = 1 << 1,           /* Valid key name. */
+    CFG_INT = 1 << 2,           /* Integer value. */
+    CFG_BOOL = 1 << 3,          /* Boolean. */
+    CFG_IP = 1 << 4,            /* IPv4 address. */
+    CFG_MAC = 1 << 5,           /* MAC address. */
+    CFG_VLAN = 1 << 6,          /* Integer in range 0...4095. */
+    CFG_DPID = 1 << 7,          /* 12 hexadecimal digits. */
+
+    /* Number allowed. */
+    CFG_REQUIRED = 1 << 8,      /* At least one value required. */
+    CFG_MULTIPLE = 1 << 9       /* More than one value allowed. */
+};
+void cfg_register(const char *key_spec, enum cfg_flags);
+
+void cfg_add_entry(const char *key, ...) PRINTF_FORMAT(1, 2);
+void cfg_del_entry(const char *key, ...) PRINTF_FORMAT(1, 2);
+void cfg_del_section(const char *key, ...) PRINTF_FORMAT(1, 2);
+void cfg_del_match(const char *pattern, ...) PRINTF_FORMAT(1, 2);
+void cfg_get_section(struct svec *svec, const char *key, ...)
+ PRINTF_FORMAT(2, 3);
+
+bool cfg_has(const char *key, ...) PRINTF_FORMAT(1, 2);
+bool cfg_is_valid(enum cfg_flags, const char *key, ...) PRINTF_FORMAT(2, 3);
+bool cfg_has_section(const char *key, ...) PRINTF_FORMAT(1, 2);
+int cfg_count(const char *key, ...) PRINTF_FORMAT(1, 2);
+
+const char *cfg_get_string(int idx, const char *key, ...) PRINTF_FORMAT(2, 3);
+const char *cfg_get_key(int idx, const char *key, ...) PRINTF_FORMAT(2, 3);
+int cfg_get_int(int idx, const char *key, ...) PRINTF_FORMAT(2, 3);
+bool cfg_get_bool(int idx, const char *key, ...) PRINTF_FORMAT(2, 3);
+uint32_t cfg_get_ip(int idx, const char *key, ...) PRINTF_FORMAT(2, 3);
+uint64_t cfg_get_mac(int idx, const char *key, ...) PRINTF_FORMAT(2, 3);
+int cfg_get_vlan(int idx, const char *key, ...) PRINTF_FORMAT(2, 3);
+uint64_t cfg_get_dpid(int idx, const char *key, ...) PRINTF_FORMAT(2, 3);
+
+void cfg_get_all_strings(struct svec *, const char *key, ...)
+ PRINTF_FORMAT(2, 3);
+void cfg_get_all_keys(struct svec *, const char *key, ...) PRINTF_FORMAT(2, 3);
+
+#endif /* vswitchd/cfg.h */
diff --git a/lib/classifier.c b/lib/classifier.c
new file mode 100644
index 000000000..11223a815
--- /dev/null
+++ b/lib/classifier.c
@@ -0,0 +1,832 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "classifier.h"
+#include <assert.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include "flow.h"
+#include "hash.h"
+
+/* Table with one entry per classifier field, generated from CLS_FIELDS in
+ * its declared order.  Each entry holds, in order: the field's offset within
+ * flow_t, the field's size in bytes, its wildcard bit(s), and its all-caps
+ * name as a string.
+ *
+ * One extra entry follows the real fields: it is named "exact", has offset
+ * sizeof(flow_t), and has zero size and zero wildcard bits. */
+const struct cls_field cls_fields[CLS_N_FIELDS + 1] = {
+#define CLS_FIELD(WILDCARDS, MEMBER, NAME) \
+ { offsetof(flow_t, MEMBER), \
+ sizeof ((flow_t *)0)->MEMBER, \
+ WILDCARDS, \
+ #NAME },
+ CLS_FIELDS
+#undef CLS_FIELD
+ { sizeof(flow_t), 0, 0, "exact" },
+};
+
+static uint32_t hash_fields(const flow_t *, int table_idx);
+static bool equal_fields(const flow_t *, const flow_t *, int table_idx);
+
+static int table_idx_from_wildcards(uint32_t wildcards);
+static struct cls_rule *table_insert(struct hmap *, struct cls_rule *);
+static struct cls_rule *insert_exact_rule(struct classifier *,
+ struct cls_rule *);
+static struct cls_bucket *find_bucket(struct hmap *, size_t hash,
+ const struct cls_rule *);
+static struct cls_rule *search_table(const struct hmap *table, int field_idx,
+ const struct cls_rule *);
+static struct cls_rule *search_exact_table(const struct classifier *,
+ size_t hash, const flow_t *);
+static bool rules_match_1wild(const struct cls_rule *fixed,
+ const struct cls_rule *wild, int field_idx);
+
+/* Initializes 'rule' as a classifier rule that matches 'flow' subject to
+ * 'wildcards', with the given 'priority'.  'flow->reserved' must be zero. */
+void
+cls_rule_from_flow(struct cls_rule *rule, const flow_t *flow,
+                   uint32_t wildcards, unsigned int priority)
+{
+    assert(flow->reserved == 0);
+
+    rule->priority = priority;
+    rule->flow = *flow;
+
+    /* The table index is derived from the wildcards as stored in 'rule->wc',
+     * so 'rule->wc' has to be set up first. */
+    flow_wildcards_init(&rule->wc, wildcards);
+    rule->table_idx = table_idx_from_wildcards(rule->wc.wildcards);
+}
+
+/* Initializes 'rule' from the ofp_match in 'match'.
+ *
+ * If the resulting rule has any wildcards it gets the given 'priority';
+ * a rule without wildcards always gets priority UINT16_MAX, regardless of
+ * 'priority'. */
+void
+cls_rule_from_match(struct cls_rule *rule, const struct ofp_match *match,
+                    unsigned int priority)
+{
+    uint32_t match_wildcards;
+
+    flow_from_match(&rule->flow, &match_wildcards, match);
+    flow_wildcards_init(&rule->wc, match_wildcards);
+
+    rule->table_idx = table_idx_from_wildcards(rule->wc.wildcards);
+    if (rule->wc.wildcards) {
+        rule->priority = priority;
+    } else {
+        rule->priority = UINT16_MAX;
+    }
+}
+
+/* Writes a one-line description of 'rule' to stdout, for debugging: its
+ * wildcards (in hexadecimal) and priority, followed by its flow as formatted
+ * by flow_print(), followed by a new-line.
+ *
+ * (The output could be improved and expanded, but this was good enough to
+ * debug the classifier.) */
+void
+cls_rule_print(const struct cls_rule *rule)
+{
+    FILE *out = stdout;
+
+    fprintf(out, "wildcards=%x priority=%u ",
+            rule->wc.wildcards, rule->priority);
+    flow_print(out, &rule->flow);
+    fputc('\n', out);
+}
+
+/* Fixes up the data structures in 'cls' after the rule formerly located at
+ * 'old' has been moved in memory to 'new' (e.g. due to realloc()).  Does
+ * nothing if 'old' and 'new' are the same address.
+ *
+ * This function cannot be realized in all possible flow classifier
+ * implementations, so we will probably have to change the interface if we
+ * change the implementation.  Shouldn't be a big deal though. */
+void
+cls_rule_moved(struct classifier *cls, struct cls_rule *old,
+               struct cls_rule *new)
+{
+    if (old == new) {
+        return;
+    }
+
+    if (!new->wc.wildcards) {
+        /* Exact-match rules are nodes of the exact-match hash table. */
+        hmap_moved(&cls->exact_table, &old->node.hmap, &new->node.hmap);
+    } else {
+        /* Rules with wildcards are linked into a bucket's list. */
+        list_moved(&new->node.list);
+    }
+}
+
+/* Puts 'new' into 'cls' in the place currently occupied by 'old', which must
+ * be in 'cls' (e.g. because the rule was reallocated).
+ *
+ * 'new' and 'old' must be distinct objects that are otherwise exactly the
+ * same: they wildcard the same fields, have the same fixed values for
+ * non-wildcarded fields, and have the same priority.  (Only the wildcards
+ * and priority are asserted here.)
+ *
+ * The caller takes ownership of 'old' and is thus responsible for freeing it,
+ * etc., as necessary.
+ *
+ * This function cannot be realized in all possible flow classifier
+ * implementations, so we will probably have to change the interface if we
+ * change the implementation.  Shouldn't be a big deal though. */
+void
+cls_rule_replace(struct classifier *cls, const struct cls_rule *old,
+                 struct cls_rule *new)
+{
+    assert(old != new);
+    assert(old->wc.wildcards == new->wc.wildcards);
+    assert(old->priority == new->priority);
+
+    if (!new->wc.wildcards) {
+        hmap_replace(&cls->exact_table, &old->node.hmap, &new->node.hmap);
+    } else {
+        list_replace(&new->node.list, &old->node.list);
+    }
+}
+
+/* Sets up 'cls' as an empty classifier: zero rules, with every wildcard
+ * table and the exact-match table initialized. */
+void
+classifier_init(struct classifier *cls)
+{
+    int i = 0;
+
+    hmap_init(&cls->exact_table);
+    while (i < ARRAY_SIZE(cls->tables)) {
+        hmap_init(&cls->tables[i]);
+        i++;
+    }
+    cls->n_rules = 0;
+}
+
+/* Releases the buckets and hash tables owned by 'cls'.  Does nothing if
+ * 'cls' is null.  The rules themselves are not freed; that remains the
+ * caller's responsibility. */
+void
+classifier_destroy(struct classifier *cls)
+{
+    int i;
+
+    if (!cls) {
+        return;
+    }
+
+    for (i = 0; i < CLS_N_FIELDS; i++) {
+        struct hmap *table = &cls->tables[i];
+        struct cls_bucket *bucket, *next_bucket;
+
+        /* The _SAFE variant is needed because each bucket is freed while
+         * the table is being traversed. */
+        HMAP_FOR_EACH_SAFE (bucket, next_bucket,
+                            struct cls_bucket, hmap_node, table) {
+            free(bucket);
+        }
+        hmap_destroy(table);
+    }
+    hmap_destroy(&cls->exact_table);
+}
+
+/* Returns true if the rule count of 'cls' is zero, false if 'cls' contains
+ * at least one classification rule. */
+bool
+classifier_is_empty(const struct classifier *cls)
+{
+    return !cls->n_rules;
+}
+
+/* Returns the number of rules in 'cls', as tracked by its 'n_rules'
+ * counter. */
+int
+classifier_count(const struct classifier *cls)
+{
+ return cls->n_rules;
+}
+
+/* Returns the number of rules in 'cls' that have no wildcards, that is, the
+ * number of entries in its exact-match table. */
+int
+classifier_count_exact(const struct classifier *cls)
+{
+ return hmap_count(&cls->exact_table);
+}
+
+/* Adds 'rule' to 'cls', which takes ownership of it.
+ *
+ * If 'cls' already holds an identical rule (same wildcards, same values of
+ * fixed fields, same priority), 'rule' takes its place and the displaced
+ * rule is returned; the caller then owns the returned rule and is
+ * responsible for freeing it, etc., as necessary.
+ *
+ * Otherwise returns NULL after inserting 'rule' and incrementing the rule
+ * count.  In this case no existing rule is displaced, not even rules that
+ * can no longer have any effect because 'rule' matches a superset of their
+ * flows and has higher priority. */
+struct cls_rule *
+classifier_insert(struct classifier *cls, struct cls_rule *rule)
+{
+    struct cls_rule *displaced;
+
+    /* A rule is exact-match if and only if it is indexed as such. */
+    assert((rule->wc.wildcards == 0) == (rule->table_idx == CLS_F_IDX_EXACT));
+
+    if (rule->wc.wildcards) {
+        displaced = table_insert(&cls->tables[rule->table_idx], rule);
+    } else {
+        displaced = insert_exact_rule(cls, rule);
+    }
+
+    if (displaced == NULL) {
+        cls->n_rules++;
+    }
+    return displaced;
+}
+
+/* Adds the exact-match 'rule' to 'cls', which takes ownership of it.
+ *
+ * The caller must guarantee that rule->wc.wildcards is 0 and that 'cls' does
+ * not already contain a rule with an identical key; neither condition is
+ * checked here. */
+void
+classifier_insert_exact(struct classifier *cls, struct cls_rule *rule)
+{
+    size_t hash = flow_hash(&rule->flow, 0);
+
+    cls->n_rules++;
+    hmap_insert(&cls->exact_table, &rule->node.hmap, hash);
+}
+
+/* Takes 'rule' out of 'cls' and decrements the rule count.  'rule' itself is
+ * not freed; the caller may do so if that is desirable. */
+void
+classifier_remove(struct classifier *cls, struct cls_rule *rule)
+{
+    if (!rule->wc.wildcards) {
+        /* Exact-match rules live directly in cls->exact_table. */
+        hmap_remove(&cls->exact_table, &rule->node.hmap);
+    } else {
+        /* Unlink 'rule' from its bucket's list.  list_remove() returns the
+         * list element just after the one removed. */
+        struct list *remaining = list_remove(&rule->node.list);
+
+        if (list_is_empty(remaining)) {
+            /* The list is now empty, so 'remaining' is the address of the
+             * 'rules' member of the bucket that was just emptied.  Recover
+             * the bucket from it via CONTAINER_OF, then drop the bucket from
+             * its table and free it. */
+            struct cls_bucket *emptied;
+
+            emptied = CONTAINER_OF(remaining, struct cls_bucket, rules);
+            hmap_remove(&cls->tables[rule->table_idx], &emptied->hmap_node);
+            free(emptied);
+        }
+    }
+    cls->n_rules--;
+}
+
+/* Returns the rule in 'cls' that applies to 'flow', or a null pointer if no
+ * rule in 'cls' matches.  An exact-match rule, if one exists, is returned in
+ * preference to any rule with wildcards; otherwise the result is that of
+ * classifier_lookup_wild().  If multiple rules of equal priority match
+ * 'flow', returns one arbitrarily.
+ *
+ * (When multiple rules of equal priority happen to fall into the same bucket,
+ * rules added more recently take priority over rules added less recently, but
+ * this is subject to change and should not be depended upon.) */
+struct cls_rule *
+classifier_lookup(const struct classifier *cls, const flow_t *flow)
+{
+    struct cls_rule *exact = classifier_lookup_exact(cls, flow);
+
+    return exact ? exact : classifier_lookup_wild(cls, flow);
+}
+
+/* Returns the exact-match rule in 'cls' whose flow equals 'flow', or a null
+ * pointer if there is none.  When the exact-match table is empty, 'flow' is
+ * not even hashed. */
+struct cls_rule *
+classifier_lookup_exact(const struct classifier *cls, const flow_t *flow)
+{
+    if (hmap_is_empty(&cls->exact_table)) {
+        return NULL;
+    }
+    return search_exact_table(cls, flow_hash(flow, 0), flow);
+}
+
+/* Returns the highest-priority rule with wildcards in 'cls' that matches
+ * 'flow', or a null pointer if there is none.  Each wildcard table is
+ * searched in turn; among equal priorities, the match from the
+ * lowest-indexed table is kept. */
+struct cls_rule *
+classifier_lookup_wild(const struct classifier *cls, const flow_t *flow)
+{
+    struct cls_rule *winner = NULL;
+    struct cls_rule target;
+    int table_idx;
+
+    /* If every rule is in the exact-match table, there is nothing to
+     * search. */
+    if (cls->n_rules <= hmap_count(&cls->exact_table)) {
+        return NULL;
+    }
+
+    cls_rule_from_flow(&target, flow, 0, 0);
+    for (table_idx = 0; table_idx < CLS_N_FIELDS; table_idx++) {
+        struct cls_rule *candidate;
+
+        candidate = search_table(&cls->tables[table_idx], table_idx, &target);
+        if (!candidate) {
+            continue;
+        }
+        if (!winner || candidate->priority > winner->priority) {
+            winner = candidate;
+        }
+    }
+    return winner;
+}
+
+/* Searches 'cls' for a rule whose flow equals 'target', whose wildcards are
+ * exactly 'wildcards', and whose priority is exactly 'priority'.  Returns
+ * that rule, or a null pointer if there is none.
+ *
+ * If 'wildcards' is zero, the exact-match table is searched and 'priority'
+ * is ignored.  Otherwise 'wildcards' must contain only bits in OFPFW_ALL. */
+struct cls_rule *
+classifier_find_rule_exactly(const struct classifier *cls,
+                             const flow_t *target, uint32_t wildcards,
+                             unsigned int priority)
+{
+    struct cls_bucket *bucket;
+    uint32_t hash;
+    int table_idx;
+
+    if (wildcards == 0) {
+        /* Ignores 'priority'. */
+        return search_exact_table(cls, flow_hash(target, 0), target);
+    }
+
+    assert(wildcards == (wildcards & OFPFW_ALL));
+    table_idx = table_idx_from_wildcards(wildcards);
+    hash = hash_fields(target, table_idx);
+    HMAP_FOR_EACH_WITH_HASH (bucket, struct cls_bucket, hmap_node, hash,
+                             &cls->tables[table_idx]) {
+        struct cls_rule *pos;
+
+        if (equal_fields(&bucket->fixed, target, table_idx)) {
+            LIST_FOR_EACH (pos, struct cls_rule, node.list, &bucket->rules) {
+                /* Once a lower priority than the one sought is seen, the
+                 * search gives up. */
+                if (pos->priority < priority) {
+                    return NULL;
+                }
+                if (pos->priority == priority
+                    && pos->wc.wildcards == wildcards
+                    && flow_equal(target, &pos->flow)) {
+                    return pos;
+                }
+            }
+        }
+    }
+    return NULL;
+}
+
+/* Calls 'callback', passing 'aux' as its second argument, for each rule in
+ * 'cls' that matches 'target' according to rules_match_1wild(), that is,
+ * ignoring the fields that 'target' wildcards.  'include' is a bitmask:
+ * rules with wildcards are visited only if CLS_INC_WILD is set, and
+ * exact-match rules only if CLS_INC_EXACT is set.
+ *
+ * Ignores target->priority.
+ *
+ * 'callback' is allowed to delete the rule that is passed as its argument, but
+ * it must not delete (or move) any other rules in 'cls' that are in the same
+ * table as the argument rule. Two rules are in the same table if their
+ * cls_rule structs have the same table_idx; as a special case, a rule with
+ * wildcards and an exact-match rule will never be in the same table. */
+void
+classifier_for_each_match(const struct classifier *cls,
+ const struct cls_rule *target,
+ int include, cls_cb_func *callback, void *aux)
+{
+ if (include & CLS_INC_WILD) {
+ const struct hmap *table;
+
+ for (table = &cls->tables[0]; table < &cls->tables[CLS_N_FIELDS];
+ table++) {
+ struct cls_bucket *bucket, *next_bucket;
+
+ HMAP_FOR_EACH_SAFE (bucket, next_bucket,
+ struct cls_bucket, hmap_node, table) {
+ /* XXX there is a bit of room for optimization here based on
+ * rejecting entire buckets on their fixed fields, but it will
+ * only be worthwhile for big buckets (which we hope we won't
+ * get anyway, but...) */
+ struct cls_rule *prev_rule, *rule;
+
+ /* We can't just use LIST_FOR_EACH_SAFE here because, if the
+ * callback deletes the last rule in the bucket, then the
+ * bucket itself will be destroyed. The bucket contains the
+ * list head so that's a use-after-free error.
+ *
+ * Instead, the callback for each matching rule is deferred
+ * until the loop has moved on to the next matching rule (or
+ * has finished), so that the iteration never depends on a rule
+ * that the callback may already have deleted. */
+ prev_rule = NULL;
+ LIST_FOR_EACH (rule, struct cls_rule, node.list,
+ &bucket->rules) {
+ if (rules_match_1wild(rule, target, 0)) {
+ if (prev_rule) {
+ callback(prev_rule, aux);
+ }
+ prev_rule = rule;
+ }
+ }
+ /* Report the final matching rule, if any, now that the list is
+ * no longer being traversed. */
+ if (prev_rule) {
+ callback(prev_rule, aux);
+ }
+ }
+ }
+ }
+
+ if (include & CLS_INC_EXACT) {
+ if (target->wc.wildcards) {
+ struct cls_rule *rule, *next_rule;
+
+ /* 'target' has wildcards, so any number of exact-match rules
+ * may match it: check every one. */
+ HMAP_FOR_EACH_SAFE (rule, next_rule, struct cls_rule, node.hmap,
+ &cls->exact_table) {
+ if (rules_match_1wild(rule, target, 0)) {
+ callback(rule, aux);
+ }
+ }
+ } else {
+ /* Optimization: there can be at most one match in the exact
+ * table. */
+ size_t hash = flow_hash(&target->flow, 0);
+ struct cls_rule *rule = search_exact_table(cls, hash,
+ &target->flow);
+ if (rule) {
+ callback(rule, aux);
+ }
+ }
+ }
+}
+
+/* Calls 'callback', passing 'aux' as its second argument, for each rule in
+ * 'cls'.  'include' is a bitmask: rules with wildcards are visited only if
+ * CLS_INC_WILD is set, and exact-match rules only if CLS_INC_EXACT is set.
+ *
+ * 'callback' is allowed to delete the rule that is passed as its argument, but
+ * it must not delete (or move) any other rules in 'cls' that are in the same
+ * table as the argument rule. Two rules are in the same table if their
+ * cls_rule structs have the same table_idx; as a special case, a rule with
+ * wildcards and an exact-match rule will never be in the same table. */
+void
+classifier_for_each(const struct classifier *cls, int include,
+ void (*callback)(struct cls_rule *, void *aux),
+ void *aux)
+{
+ if (include & CLS_INC_WILD) {
+ const struct hmap *tbl;
+
+ for (tbl = &cls->tables[0]; tbl < &cls->tables[CLS_N_FIELDS]; tbl++) {
+ struct cls_bucket *bucket, *next_bucket;
+
+ HMAP_FOR_EACH_SAFE (bucket, next_bucket,
+ struct cls_bucket, hmap_node, tbl) {
+ struct cls_rule *prev_rule, *rule;
+
+ /* We can't just use LIST_FOR_EACH_SAFE here because, if the
+ * callback deletes the last rule in the bucket, then the
+ * bucket itself will be destroyed. The bucket contains the
+ * list head so that's a use-after-free error.
+ *
+ * Instead, the callback for each rule is deferred until the
+ * loop has moved on to the next rule (or has finished). */
+ prev_rule = NULL;
+ LIST_FOR_EACH (rule, struct cls_rule, node.list,
+ &bucket->rules) {
+ if (prev_rule) {
+ callback(prev_rule, aux);
+ }
+ prev_rule = rule;
+ }
+ /* Report the final rule in the bucket, if any, now that the
+ * list is no longer being traversed. */
+ if (prev_rule) {
+ callback(prev_rule, aux);
+ }
+ }
+ }
+ }
+
+ if (include & CLS_INC_EXACT) {
+ struct cls_rule *rule, *next_rule;
+
+ HMAP_FOR_EACH_SAFE (rule, next_rule,
+ struct cls_rule, node.hmap, &cls->exact_table) {
+ callback(rule, aux);
+ }
+ }
+}
+
+static struct cls_bucket *create_bucket(struct hmap *, size_t hash,
+ const flow_t *fixed);
+static struct cls_rule *bucket_insert(struct cls_bucket *, struct cls_rule *);
+
+static inline bool equal_bytes(const void *, const void *, size_t n);
+
+/* Returns a hash computed across the fields in 'flow' whose field indexes
+ * (CLS_F_IDX_*) are less than 'table_idx'. (If 'table_idx' is
+ * CLS_F_IDX_EXACT, hashes all the fields in 'flow').
+ *
+ * The hash is seeded with a constant plus 'table_idx'.  Field bytes are
+ * accumulated in the 12-byte buffer 'tmp'; each time 'tmp' fills up it is
+ * mixed into the state with HASH_MIX(), and whatever is left at the end is
+ * mixed in with HASH_FINAL(). */
+static uint32_t
+hash_fields(const flow_t *flow, int table_idx)
+{
+ /* I just know I'm going to hell for writing code this way.
+ *
+ * GCC generates pretty good code here, with only a single taken
+ * conditional jump per execution. Now the question is, would we be better
+ * off marking this function ALWAYS_INLINE and writing a wrapper that
+ * switches on the value of 'table_idx' to get rid of all the conditional
+ * jumps entirely (except for one in the wrapper)? Honestly I really,
+ * really hope that it doesn't matter in practice.
+ *
+ * We could do better by calculating hashes incrementally, instead of
+ * starting over from the top each time. But that would be even uglier. */
+ uint32_t a, b, c;
+ uint32_t tmp[3];
+ size_t n;
+
+ a = b = c = 0xdeadbeef + table_idx;
+ n = 0;
+
+ /* CLS_FIELDS expands the macro below once per field, in field order.  For
+ * the field whose index equals 'table_idx', the expansion zeroes the unused
+ * tail of 'tmp' and jumps to 'finish'.  For every earlier field, it appends
+ * the field's bytes to 'tmp', mixing and restarting at offset 0 whenever
+ * 'tmp' becomes full partway through a field.
+ *
+ * NOTE(review): if 'table_idx' matches no field (presumably the
+ * CLS_F_IDX_EXACT case), control reaches 'finish' without the memset, so
+ * any tail of 'tmp' beyond 'n' holds whatever was stored there earlier --
+ * confirm this is acceptable for the callers that hash all fields. */
+#define CLS_FIELD(WILDCARDS, MEMBER, NAME) \
+ if (table_idx == CLS_F_IDX_##NAME) { \
+ /* Done. */ \
+ memset((uint8_t *) tmp + n, 0, sizeof tmp - n); \
+ goto finish; \
+ } else { \
+ const size_t size = sizeof flow->MEMBER; \
+ const uint8_t *p1 = (const uint8_t *) &flow->MEMBER; \
+ const size_t p1_size = MIN(sizeof tmp - n, size); \
+ const uint8_t *p2 = p1 + p1_size; \
+ const size_t p2_size = size - p1_size; \
+ \
+ /* Append to 'tmp' as much data as will fit. */ \
+ memcpy((uint8_t *) tmp + n, p1, p1_size); \
+ n += p1_size; \
+ \
+ /* If 'tmp' is full, mix. */ \
+ if (n == sizeof tmp) { \
+ a += tmp[0]; \
+ b += tmp[1]; \
+ c += tmp[2]; \
+ HASH_MIX(a, b, c); \
+ n = 0; \
+ } \
+ \
+ /* Append to 'tmp' any data that didn't fit. */ \
+ memcpy(tmp, p2, p2_size); \
+ n += p2_size; \
+ }
+ CLS_FIELDS
+#undef CLS_FIELD
+
+finish:
+ /* Fold the last (possibly partial) contents of 'tmp' into the state and
+ * produce the final hash value in 'c'. */
+ a += tmp[0];
+ b += tmp[1];
+ c += tmp[2];
+ HASH_FINAL(a, b, c);
+ return c;
+}
+
+/* Compares the fields in 'a' and 'b' whose field indexes (CLS_F_IDX_*) are
+ * less than 'table_idx'. (If 'table_idx' is CLS_F_IDX_EXACT, compares all the
+ * fields in 'a' and 'b').
+ *
+ * Returns true if all the compared fields are equal, false otherwise. */
+static bool
+equal_fields(const flow_t *a, const flow_t *b, int table_idx)
+{
+ /* XXX The generated code could be better here. */
+ /* CLS_FIELDS expands the macro below once per field, in field order: on
+ * reaching the field at 'table_idx' every earlier field has compared equal,
+ * so the answer is true; any earlier field that differs yields false. */
+#define CLS_FIELD(WILDCARDS, MEMBER, NAME) \
+ if (table_idx == CLS_F_IDX_##NAME) { \
+ return true; \
+ } else if (!equal_bytes(&a->MEMBER, &b->MEMBER, sizeof a->MEMBER)) { \
+ return false; \
+ }
+ CLS_FIELDS
+#undef CLS_FIELD
+
+ /* 'table_idx' matched no field, so every field was compared and found
+ * equal. */
+ return true;
+}
+
+/* Maps 'wildcards' to a table index.  Returns CLS_F_IDX_EXACT if 'wildcards'
+ * is zero.  Otherwise returns the index (CLS_F_IDX_*) of the first field, in
+ * CLS_FIELDS order, whose wildcard bit or bits intersect 'wildcards'.
+ *
+ * A nonzero 'wildcards' that intersects no field's wildcard bits is treated
+ * as impossible (NOT_REACHED()). */
+static int
+table_idx_from_wildcards(uint32_t wildcards)
+{
+ if (!wildcards) {
+ return CLS_F_IDX_EXACT;
+ }
+#define CLS_FIELD(WILDCARDS, MEMBER, NAME) \
+ if (wildcards & WILDCARDS) { \
+ return CLS_F_IDX_##NAME; \
+ }
+ CLS_FIELDS
+#undef CLS_FIELD
+ NOT_REACHED();
+}
+
+/* Adds 'rule' to 'table', placing it in the bucket that shares its fixed
+ * fields (those before 'rule->table_idx') and creating that bucket if it
+ * does not exist yet.
+ *
+ * Returns the rule, if any, that was displaced in favor of 'rule'. */
+static struct cls_rule *
+table_insert(struct hmap *table, struct cls_rule *rule)
+{
+    size_t hash = hash_fields(&rule->flow, rule->table_idx);
+    struct cls_bucket *bucket = find_bucket(table, hash, rule);
+
+    bucket = bucket ? bucket : create_bucket(table, hash, &rule->flow);
+    return bucket_insert(bucket, rule);
+}
+
+/* Adds 'rule' to the list of rules in 'bucket'.  'rule' is inserted just
+ * before the first existing rule whose priority is less than or equal to its
+ * own, or wherever the list traversal ends if there is no such rule.
+ *
+ * If that existing rule has the same priority and wildcards as 'rule' and
+ * matches it from field 'rule->table_idx' onward, 'rule' replaces it instead
+ * and the displaced rule is returned.  Otherwise returns NULL. */
+static struct cls_rule *
+bucket_insert(struct cls_bucket *bucket, struct cls_rule *rule)
+{
+    struct cls_rule *pos;
+
+    LIST_FOR_EACH (pos, struct cls_rule, node.list, &bucket->rules) {
+        if (pos->priority > rule->priority) {
+            /* Still among the higher-priority rules: keep looking. */
+            continue;
+        }
+
+        if (pos->priority == rule->priority
+            && pos->wc.wildcards == rule->wc.wildcards
+            && rules_match_1wild(pos, rule, rule->table_idx)) {
+            list_replace(&rule->node.list, &pos->node.list);
+            return pos;
+        }
+
+        /* Found the insertion point. */
+        break;
+    }
+
+    list_insert(&pos->node.list, &rule->node.list);
+    return NULL;
+}
+
+/* Adds the exact-match 'rule' to the exact-match table of 'cls'.  If the
+ * table already holds a rule with an equal flow, that rule is removed first
+ * and returned; otherwise returns a null pointer. */
+static struct cls_rule *
+insert_exact_rule(struct classifier *cls, struct cls_rule *rule)
+{
+    const size_t hash = flow_hash(&rule->flow, 0);
+    struct cls_rule *displaced = search_exact_table(cls, hash, &rule->flow);
+
+    if (displaced != NULL) {
+        hmap_remove(&cls->exact_table, &displaced->node.hmap);
+    }
+    hmap_insert(&cls->exact_table, &rule->node.hmap, hash);
+
+    return displaced;
+}
+
+/* Searches the entries of 'table' that have the given 'hash' for a bucket
+ * whose fixed fields equal those of 'rule->flow' (considering only the
+ * fields before 'rule->table_idx').  Returns the first such bucket, or a
+ * null pointer if there is none. */
+static struct cls_bucket *
+find_bucket(struct hmap *table, size_t hash, const struct cls_rule *rule)
+{
+    const flow_t *wanted = &rule->flow;
+    const int n_fixed = rule->table_idx;
+    struct cls_bucket *candidate;
+
+    HMAP_FOR_EACH_WITH_HASH (candidate, struct cls_bucket, hmap_node, hash,
+                             table) {
+        if (equal_fields(&candidate->fixed, wanted, n_fixed)) {
+            return candidate;
+        }
+    }
+    return NULL;
+}
+
+/* Allocates a new bucket with an empty rule list and a copy of '*fixed' as
+ * its fixed field values, inserts it into 'table' under 'hash', and returns
+ * it. */
+static struct cls_bucket *
+create_bucket(struct hmap *table, size_t hash, const flow_t *fixed)
+{
+    struct cls_bucket *new_bucket;
+
+    new_bucket = xmalloc(sizeof *new_bucket);
+    new_bucket->fixed = *fixed;
+    list_init(&new_bucket->rules);
+
+    hmap_insert(table, &new_bucket->hmap_node, hash);
+    return new_bucket;
+}
+
+/* Returns true if the 'n' bytes in 'a' and 'b' are equal, false otherwise.
+ *
+ * When compiled for i386, only 'n' of 1, 2, 4, or 6 is supported; any other
+ * value calls abort().  Elsewhere any 'n' is accepted. */
+static inline bool ALWAYS_INLINE
+equal_bytes(const void *a, const void *b, size_t n)
+{
+#ifdef __i386__
+ /* For some reason GCC generates stupid code for memcmp() of small
+ * constant integer lengths. Help it out.
+ *
+ * This function is always inlined, and it is always called with 'n' as a
+ * compile-time constant, so the switch statement gets optimized out and
+ * this whole function just expands to an instruction or two. */
+ switch (n) {
+ case 1:
+ return *(uint8_t *) a == *(uint8_t *) b;
+
+ case 2:
+ return *(uint16_t *) a == *(uint16_t *) b;
+
+ case 4:
+ return *(uint32_t *) a == *(uint32_t *) b;
+
+ case 6:
+ /* Compare as one 32-bit word followed by one 16-bit word. */
+ return (*(uint32_t *) a == *(uint32_t *) b
+ && ((uint16_t *) a)[2] == ((uint16_t *) b)[2]);
+
+ default:
+ abort();
+ }
+#else
+ /* I hope GCC is smarter on your platform. */
+ return !memcmp(a, b, n);
+#endif
+}
+
+/* Reads a uint32_t from the (possibly unaligned) address 'p' and returns it,
+ * in the host's native byte order. */
+static inline uint32_t
+read_uint32(const void *p)
+{
+    uint32_t value;
+
+    /* Copying with memcpy() avoids an unaligned dereference.  GCC optimizes
+     * this into a single machine instruction on x86. */
+    memcpy(&value, p, sizeof value);
+    return value;
+}
+
+/* Compares one field of flows 'a_' and 'b_'.  The field starts 'ofs' bytes
+ * into flow_t, is 'len' bytes long, and has wildcard bit or bits 'field_wc'.
+ *
+ * For a field with at most one wildcard bit, returns true if that bit is set
+ * in 'wildcards' or if the field's bytes are equal in 'a_' and 'b_'.
+ *
+ * For the IP source and destination fields (OFPFW_NW_SRC_MASK and
+ * OFPFW_NW_DST_MASK), returns true if the two addresses agree in every bit
+ * that is set in 'nw_src_mask' or 'nw_dst_mask', respectively; 'wildcards'
+ * is not consulted.  The masks must be the ones that correspond to
+ * 'wildcards'.
+ *
+ * Any other 'field_wc' calls abort(). */
+static inline bool ALWAYS_INLINE
+field_matches(const flow_t *a_, const flow_t *b_,
+              uint32_t wildcards, uint32_t nw_src_mask, uint32_t nw_dst_mask,
+              uint32_t field_wc, int ofs, int len)
+{
+    /* This function is always inlined, and it is always called with 'field_wc'
+     * as a compile-time constant, so the "if" conditionals here generate no
+     * code. */
+    const void *a = (const uint8_t *) a_ + ofs;
+    const void *b = (const uint8_t *) b_ + ofs;
+    uint32_t mask;
+
+    if ((field_wc & (field_wc - 1)) == 0) {
+        /* Handle all the single-bit wildcard cases. */
+        if (wildcards & field_wc) {
+            return true;
+        }
+        return equal_bytes(a, b, len);
+    }
+
+    if (field_wc == OFPFW_NW_SRC_MASK) {
+        mask = nw_src_mask;
+    } else if (field_wc == OFPFW_NW_DST_MASK) {
+        mask = nw_dst_mask;
+    } else {
+        abort();
+    }
+    return ((read_uint32(a) ^ read_uint32(b)) & mask) == 0;
+}
+
+/* Returns true if 'a' and 'b' match, ignoring fields for which the wildcards
+ * in 'wildcards' are set. 'nw_src_mask' and 'nw_dst_mask' must be those that
+ * would be set for 'wildcards' by cls_rule_set_masks(). 'field_idx' is the
+ * index of the first field to be compared; fields before 'field_idx' are
+ * assumed to match. (Always returns true if 'field_idx' is CLS_N_FIELDS.) */
+static bool
+rules_match(const struct cls_rule *a, const struct cls_rule *b,
+ uint32_t wildcards, uint32_t nw_src_mask, uint32_t nw_dst_mask,
+ int field_idx)
+{
+ /* This is related to Duff's device (see
+ * http://en.wikipedia.org/wiki/Duff's_device).
+ *
+ * CLS_FIELDS expands the macro below into one "case" label per field, in
+ * field order, with no "break" between them.  The switch therefore jumps
+ * to the case for 'field_idx' and then falls through every later field,
+ * returning false at the first field that does not match. */
+ switch (field_idx) {
+#define CLS_FIELD(WILDCARDS, MEMBER, NAME) \
+ case CLS_F_IDX_##NAME: \
+ if (!field_matches(&a->flow, &b->flow, \
+ wildcards, nw_src_mask, nw_dst_mask, \
+ WILDCARDS, offsetof(flow_t, MEMBER), \
+ sizeof a->flow.MEMBER)) { \
+ return false; \
+ } \
+ /* Fall through */
+ CLS_FIELDS
+#undef CLS_FIELD
+ }
+ return true;
+}
+
+/* Returns true if 'fixed' and 'wild' match, using the wildcards and IP
+ * address masks of 'wild' only.  All fields in 'fixed' must have fixed
+ * values; 'wild' may contain wildcards.
+ *
+ * Comparison starts at the field with index 'field_idx'; fields before it
+ * are assumed to match.  Always returns true if 'field_idx' is
+ * CLS_N_FIELDS. */
+static bool
+rules_match_1wild(const struct cls_rule *fixed, const struct cls_rule *wild,
+                  int field_idx)
+{
+    const uint32_t wildcards = wild->wc.wildcards;
+    const uint32_t src_mask = wild->wc.nw_src_mask;
+    const uint32_t dst_mask = wild->wc.nw_dst_mask;
+
+    return rules_match(fixed, wild, wildcards, src_mask, dst_mask, field_idx);
+}
+
+/* Looks in 'bucket' for a rule that matches 'target'.  Returns the first
+ * matching rule in the bucket's list, or a null pointer if the bucket's
+ * fixed fields differ from those of 'target' or no rule in it matches.
+ *
+ * 'field_idx' must be the index of the first wildcarded field in 'bucket'. */
+static struct cls_rule *
+search_bucket(struct cls_bucket *bucket, int field_idx,
+              const struct cls_rule *target)
+{
+    if (equal_fields(&bucket->fixed, &target->flow, field_idx)) {
+        struct cls_rule *candidate;
+
+        /* The fixed fields are already known to match, so each rule only
+         * needs checking from 'field_idx' onward. */
+        LIST_FOR_EACH (candidate, struct cls_rule, node.list,
+                       &bucket->rules) {
+            if (rules_match_1wild(target, candidate, field_idx)) {
+                return candidate;
+            }
+        }
+    }
+    return NULL;
+}
+
+/* Looks in 'table' for a rule that matches 'target'.  Returns the match
+ * found by search_bucket() in the first bucket that yields one, or a null
+ * pointer if there is no match.
+ *
+ * 'field_idx' must be the index of the first wildcarded field in 'table'. */
+static struct cls_rule *
+search_table(const struct hmap *table, int field_idx,
+             const struct cls_rule *target)
+{
+    struct cls_bucket *bucket;
+    uint32_t hash;
+
+    /* With zero or one bucket there's no need to hash at all. */
+    switch (hmap_count(table)) {
+    case 0:
+        return NULL;
+
+    case 1:
+        bucket = CONTAINER_OF(hmap_first(table), struct cls_bucket, hmap_node);
+        return search_bucket(bucket, field_idx, target);
+
+    default:
+        break;
+    }
+
+    hash = hash_fields(&target->flow, field_idx);
+    HMAP_FOR_EACH_WITH_HASH (bucket, struct cls_bucket, hmap_node,
+                             hash, table) {
+        struct cls_rule *found = search_bucket(bucket, field_idx, target);
+        if (found) {
+            return found;
+        }
+    }
+    return NULL;
+}
+
+/* Searches the exact-match table of 'cls' for a rule whose flow equals
+ * 'target' according to flow_equal().  Only entries stored under 'hash' are
+ * examined, so 'hash' must be the hash under which such a rule would have
+ * been inserted (the callers in this file pass flow_hash(target, 0)).
+ *
+ * Returns the rule found, or a null pointer if there is none. */
+static struct cls_rule *
+search_exact_table(const struct classifier *cls, size_t hash,
+ const flow_t *target)
+{
+ struct cls_rule *rule;
+
+ HMAP_FOR_EACH_WITH_HASH (rule, struct cls_rule, node.hmap,
+ hash, &cls->exact_table) {
+ if (flow_equal(&rule->flow, target)) {
+ return rule;
+ }
+ }
+ return NULL;
+}
diff --git a/lib/classifier.h b/lib/classifier.h
new file mode 100644
index 000000000..a632fb024
--- /dev/null
+++ b/lib/classifier.h
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef CLASSIFIER_H
+#define CLASSIFIER_H 1
+
+/* Flow classifier.
+ *
+ * This flow classifier assumes that we can arrange the fields in a flow in an
+ * order such that the set of wildcarded fields in a rule tend to fall toward
+ * the end of the ordering. That is, if field F is wildcarded, then all the
+ * fields after F tend to be wildcarded as well. If this assumption is
+ * violated, then the classifier will still classify flows correctly, but its
+ * performance will suffer.
+ */
+
+#include "flow.h"
+#include "hmap.h"
+#include "list.h"
+#include "openflow/openflow.h"
+
+/* Number of bytes of fields in a rule.
+ *
+ * NOTE(review): presumably the combined size of the flow_t members listed in
+ * CLS_FIELDS; confirm against flow_t whenever a field is added or resized. */
+#define CLS_N_BYTES 31
+
+/* Fields in a rule.
+ *
+ * This definition sets the ordering of fields, which is important for
+ * performance (see above). To adjust the ordering, change the order of the
+ * lines.
+ *
+ * Each line is a CLS_FIELD(WILDCARDS, MEMBER, NAME) invocation. CLS_FIELD is
+ * not defined here: a user defines it, expands CLS_FIELDS, and then undefines
+ * it again, as the field-index enum in this header does. */
+#define CLS_FIELDS \
+ /* flow_t all-caps */ \
+ /* wildcard bit(s) member name name */ \
+ /* ----------------- ----------- -------- */ \
+ CLS_FIELD(OFPFW_IN_PORT, in_port, IN_PORT) \
+ CLS_FIELD(OFPFW_DL_VLAN, dl_vlan, DL_VLAN) \
+ CLS_FIELD(OFPFW_DL_SRC, dl_src, DL_SRC) \
+ CLS_FIELD(OFPFW_DL_DST, dl_dst, DL_DST) \
+ CLS_FIELD(OFPFW_DL_TYPE, dl_type, DL_TYPE) \
+ CLS_FIELD(OFPFW_NW_SRC_MASK, nw_src, NW_SRC) \
+ CLS_FIELD(OFPFW_NW_DST_MASK, nw_dst, NW_DST) \
+ CLS_FIELD(OFPFW_NW_PROTO, nw_proto, NW_PROTO) \
+ CLS_FIELD(OFPFW_TP_SRC, tp_src, TP_SRC) \
+ CLS_FIELD(OFPFW_TP_DST, tp_dst, TP_DST)
+
+/* Field indexes.
+ *
+ * (These are also indexes into struct classifier's 'tables' array.)
+ *
+ * Expanding CLS_FIELDS with the CLS_FIELD definition below yields one
+ * CLS_F_IDX_<NAME> enumerator per field, numbered from 0 in CLS_FIELDS order.
+ * CLS_F_IDX_EXACT comes directly after the last field, so CLS_N_FIELDS, which
+ * has the same value, is the number of fields. */
+enum {
+#define CLS_FIELD(WILDCARDS, MEMBER, NAME) CLS_F_IDX_##NAME,
+ CLS_FIELDS
+#undef CLS_FIELD
+ CLS_F_IDX_EXACT, /* Exact-match table. */
+ CLS_N_FIELDS = CLS_F_IDX_EXACT
+};
+
+/* Field information.
+ *
+ * Describes one field of a rule: where it lives in flow_t, how long it is,
+ * which OFPFW_* wildcard bit or bits control it, and a printable name.
+ *
+ * 'cls_fields' is declared with CLS_N_FIELDS + 1 elements.
+ * NOTE(review): presumably one element per CLS_F_IDX_* field plus one
+ * trailing element; confirm against the definition in classifier.c. */
+struct cls_field {
+ int ofs; /* Offset in flow_t. */
+ int len; /* Length in bytes. */
+ uint32_t wildcards; /* OFPFW_* bit or bits for this field. */
+ const char *name; /* Name (for debugging). */
+};
+extern const struct cls_field cls_fields[CLS_N_FIELDS + 1];
+
+/* A flow classifier.
+ *
+ * 'tables' holds one hash table per field index (CLS_N_FIELDS of them), each
+ * containing cls_bucket elements. Rules stored in 'exact_table' are kept
+ * separately and are linked directly into that hash table, without buckets.
+ *
+ * NOTE(review): the 'n_rules' comment speaks of hmap_count() over tables[],
+ * but those tables contain buckets rather than rules; confirm in
+ * classifier.c exactly what 'n_rules' counts. */
+struct classifier {
+ int n_rules; /* Sum of hmap_count() over tables[]. */
+ struct hmap tables[CLS_N_FIELDS]; /* Contain cls_bucket elements. */
+ struct hmap exact_table; /* Contain cls_rule elements. */
+};
+
+/* A group of rules with the same fixed values for initial fields.
+ *
+ * A bucket is an element of one of struct classifier's 'tables' (through
+ * 'hmap_node') and owns a list of cls_rule elements linked through their
+ * 'node.list' member. Because 'rules' is ordered from highest to lowest
+ * priority, the first matching rule found in a front-to-back walk is the
+ * highest-priority match in the bucket. */
+struct cls_bucket {
+ struct hmap_node hmap_node; /* Within struct classifier 'tables'. */
+ struct list rules; /* In order from highest to lowest priority. */
+ flow_t fixed; /* Values for fixed fields. */
+};
+
+/* A flow classification rule.
+ *
+ * Use cls_rule_from_flow() or cls_rule_from_match() to initialize a cls_rule
+ * or you will almost certainly not initialize 'table_idx' correctly, with
+ * disastrous results!
+ *
+ * 'node' is a union because a rule is linked in exactly one of two ways: as
+ * a list element within a cls_bucket ('node.list'), or as a hash table
+ * element within the classifier's 'exact_table' ('node.hmap'). */
+struct cls_rule {
+ union {
+ struct list list; /* Within struct cls_bucket 'rules'. */
+ struct hmap_node hmap; /* Within struct classifier 'exact_table'. */
+ } node;
+ flow_t flow; /* All field values. */
+ struct flow_wildcards wc; /* Wildcards for fields. */
+ unsigned int priority; /* Larger numbers are higher priorities. */
+ unsigned int table_idx; /* Index into struct classifier 'tables'. */
+};
+
+void cls_rule_from_flow(struct cls_rule *, const flow_t *, uint32_t wildcards,
+ unsigned int priority);
+void cls_rule_from_match(struct cls_rule *, const struct ofp_match *,
+ unsigned int priority);
+void cls_rule_print(const struct cls_rule *);
+void cls_rule_moved(struct classifier *,
+ struct cls_rule *old, struct cls_rule *new);
+void cls_rule_replace(struct classifier *, const struct cls_rule *old,
+ struct cls_rule *new);
+
+void classifier_init(struct classifier *);
+void classifier_destroy(struct classifier *);
+bool classifier_is_empty(const struct classifier *);
+int classifier_count(const struct classifier *);
+int classifier_count_exact(const struct classifier *);
+struct cls_rule *classifier_insert(struct classifier *, struct cls_rule *);
+void classifier_insert_exact(struct classifier *, struct cls_rule *);
+void classifier_remove(struct classifier *, struct cls_rule *);
+struct cls_rule *classifier_lookup(const struct classifier *, const flow_t *);
+struct cls_rule *classifier_lookup_wild(const struct classifier *,
+ const flow_t *);
+struct cls_rule *classifier_lookup_exact(const struct classifier *,
+ const flow_t *);
+
+typedef void cls_cb_func(struct cls_rule *, void *aux);
+
+/* Bit flags for the 'include' argument of classifier_for_each() and
+ * classifier_for_each_match(), selecting which kinds of rules to visit. */
+enum {
+ CLS_INC_EXACT = 1 << 0, /* Include exact-match flows? */
+ CLS_INC_WILD = 1 << 1, /* Include flows with wildcards? */
+ CLS_INC_ALL = CLS_INC_EXACT | CLS_INC_WILD
+};
+void classifier_for_each(const struct classifier *, int include,
+ cls_cb_func *, void *aux);
+void classifier_for_each_match(const struct classifier *,
+ const struct cls_rule *,
+ int include, cls_cb_func *, void *aux);
+struct cls_rule *classifier_find_rule_exactly(const struct classifier *,
+ const flow_t *target,
+ uint32_t wildcards,
+ unsigned int priority);
+
+#endif /* classifier.h */
diff --git a/lib/command-line.c b/lib/command-line.c
new file mode 100644
index 000000000..e0bd88110
--- /dev/null
+++ b/lib/command-line.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "command-line.h"
+#include <getopt.h>
+#include <limits.h>
+#include "util.h"
+#include "vlog.h"
+
+/* Given the GNU-style long options in 'options', returns a string that may be
+ * passed to getopt() with the corresponding short options.  The caller is
+ * responsible for freeing the string.
+ *
+ * 'options' must end with an element whose 'name' is null.  Only elements
+ * with a null 'flag' and a 'val' in the range 1...UCHAR_MAX yield a short
+ * option.  Each such 'val' appears in the result only once, however many long
+ * options share it (the first one decides whether it takes an argument).
+ * getopt() only ever consults the first occurrence of a character, so this
+ * does not change parsing, and it guarantees that the result fits in the
+ * fixed-size buffer used to build it. */
+char *
+long_options_to_short_options(const struct option options[])
+{
+    /* At most UCHAR_MAX distinct characters, each followed by up to two
+     * colons, plus the null terminator. */
+    char short_options[UCHAR_MAX * 3 + 1];
+    unsigned char seen[UCHAR_MAX + 1] = { 0 };
+    char *p = short_options;
+
+    for (; options->name; options++) {
+        const struct option *o = options;
+
+        if (o->flag != NULL || o->val <= 0 || o->val > UCHAR_MAX) {
+            continue;           /* No short option for this entry. */
+        }
+        if (seen[o->val]) {
+            continue;           /* Alias of a short option already emitted. */
+        }
+        seen[o->val] = 1;
+
+        *p++ = o->val;
+        if (o->has_arg == required_argument) {
+            *p++ = ':';
+        } else if (o->has_arg == optional_argument) {
+            *p++ = ':';
+            *p++ = ':';
+        }
+    }
+    *p = '\0';
+
+    return xstrdup(short_options);
+}
+
+
diff --git a/lib/command-line.h b/lib/command-line.h
new file mode 100644
index 000000000..f58a33606
--- /dev/null
+++ b/lib/command-line.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef COMMAND_LINE_H
+#define COMMAND_LINE_H 1
+
+/* Utilities for command-line parsing. */
+
+struct option;
+char *long_options_to_short_options(const struct option *options);
+
+#endif /* command-line.h */
diff --git a/lib/common.man b/lib/common.man
new file mode 100644
index 000000000..84e81c2ce
--- /dev/null
+++ b/lib/common.man
@@ -0,0 +1,7 @@
+.TP
+\fB-h\fR, \fB--help\fR
+Prints a brief help message to the console.
+
+.TP
+\fB-V\fR, \fB--version\fR
+Prints version information to the console.
diff --git a/lib/compiler.h b/lib/compiler.h
new file mode 100644
index 000000000..a08678b26
--- /dev/null
+++ b/lib/compiler.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef COMPILER_H
+#define COMPILER_H 1
+
+/* Function and type attributes (GCC syntax). */
+#define NO_RETURN __attribute__((__noreturn__))
+#define UNUSED __attribute__((__unused__))
+#define PACKED __attribute__((__packed__))
+/* FMT is the 1-based index of the format string argument and ARG1 the index
+ * of the first argument that the format consumes. */
+#define PRINTF_FORMAT(FMT, ARG1) __attribute__((__format__(printf, FMT, ARG1)))
+#define STRFTIME_FORMAT(FMT) __attribute__((__format__(__strftime__, FMT, 0)))
+#define MALLOC_LIKE __attribute__((__malloc__))
+#define ALWAYS_INLINE __attribute__((always_inline))
+
+/* Branch prediction hints.  The condition is normalized to 0 or 1 with "!!"
+ * because __builtin_expect() compares its first argument for equality with
+ * the expected value: without normalization, likely(flags & 4) would tell the
+ * compiler to expect the value 1, which that expression never has, and a
+ * pointer argument would be converted to an integer.  Both macros evaluate
+ * to 1 if 'x' is nonzero and to 0 otherwise. */
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+#endif /* compiler.h */
diff --git a/lib/coverage-counters.h b/lib/coverage-counters.h
new file mode 100644
index 000000000..fa0cf73d7
--- /dev/null
+++ b/lib/coverage-counters.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef COVERAGE_COUNTERS_H
+#define COVERAGE_COUNTERS_H 1
+
+#include <stddef.h>
+
+extern struct coverage_counter *coverage_counters[];
+extern size_t coverage_n_counters;
+
+#endif /* coverage-counters.h */
diff --git a/lib/coverage-scan.pl b/lib/coverage-scan.pl
new file mode 100755
index 000000000..886223c70
--- /dev/null
+++ b/lib/coverage-scan.pl
@@ -0,0 +1,47 @@
+# Copyright (c) 2009 Nicira Networks.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+use strict;
+use warnings;
+
+# Maps each counter name to a reference to an array of "FILE:LINE" strings,
+# one for every place where that counter is used.
+my %counters;
+
+# Scan every input line.  Only a line that starts (after optional white space)
+# with COVERAGE_INC( or COVERAGE_ADD( is recognized; the counter name is the
+# identifier that follows the opening parenthesis.
+while (<>) {
+ my ($counter) = /^\s*COVERAGE_(?:INC|ADD)\s*\(\s*([a-zA-Z_][a-zA-Z_0-9]*)/
+ or next;
+ push (@{$counters{$counter}}, "$ARGV:$.");
+} continue {
+ # This magic resets $. from one file to the next. See "perldoc -f eof".
+ close ARGV if eof;
+}
+
+# Emit the preamble of the generated C source on standard output.
+print <<EOF;
+#include "coverage-counters.h"
+#include <stddef.h>
+#include "coverage.h"
+#include "util.h"
+
+EOF
+
+# Emit one counter definition per name, in sorted order, preceded by a C
+# comment that lists every location where the counter is used.
+for my $counter (sort(keys(%counters))) {
+ my $locations = join(', ', @{$counters{$counter}});
+ print <<EOF;
+/* $locations */
+struct coverage_counter ${counter}_count = { "$counter", 0, 0 };
+
+EOF
+}
+# Emit the array of pointers to all the counters (same sorted order) and the
+# variable that holds the number of elements in that array.
+print "struct coverage_counter *coverage_counters[] = {\n";
+print " \&${_}_count,\n" foreach (sort(keys(%counters)));
+print "};\n";
+print "size_t coverage_n_counters = ARRAY_SIZE(coverage_counters);\n";
diff --git a/lib/coverage.c b/lib/coverage.c
new file mode 100644
index 000000000..15a66c2fb
--- /dev/null
+++ b/lib/coverage.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "coverage.h"
+#include <inttypes.h>
+#include <stdlib.h>
+#include "coverage-counters.h"
+#include "dynamic-string.h"
+#include "hash.h"
+#include "util.h"
+
+#define THIS_MODULE VLM_coverage
+#include "vlog.h"
+
+static unsigned int epoch;
+
+/* qsort() comparison function for an array of pointers to coverage counters.
+ * Orders counters by descending count and, among counters with equal counts,
+ * alphabetically by name. */
+static int
+compare_coverage_counters(const void *a_, const void *b_)
+{
+    const struct coverage_counter *a = *(const struct coverage_counter *const *) a_;
+    const struct coverage_counter *b = *(const struct coverage_counter *const *) b_;
+
+    if (a->count > b->count) {
+        return -1;
+    }
+    if (a->count < b->count) {
+        return 1;
+    }
+    return strcmp(a->name, b->name);
+}
+
+/* Returns a hash of the current state of the coverage counters.  The hash
+ * depends on which counters are nonzero and on how they rank relative to one
+ * another by count (counters with equal counts form one group), but not on
+ * the count values themselves. */
+static uint32_t
+coverage_hash(void)
+{
+    struct coverage_counter **sorted;
+    uint32_t hash = 0;
+    int n_groups = 0;
+    int start;
+
+    /* Sort a copy of the counter pointers, so that counters with equal counts
+     * become adjacent and zero counts end up last. */
+    sorted = xmalloc(coverage_n_counters * sizeof *sorted);
+    for (start = 0; start < coverage_n_counters; start++) {
+        sorted[start] = coverage_counters[start];
+    }
+    qsort(sorted, coverage_n_counters, sizeof *sorted,
+          compare_coverage_counters);
+
+    /* For each group of nonzero counters, hash the rank at which the group
+     * starts followed by the names of its members. */
+    start = 0;
+    while (start < coverage_n_counters && sorted[start]->count) {
+        int end = start;
+
+        n_groups++;
+        hash = hash_int(start, hash);
+        do {
+            hash = hash_string(sorted[end]->name, hash);
+            end++;
+        } while (end < coverage_n_counters
+                 && sorted[end]->count == sorted[start]->count);
+        start = end;
+    }
+
+    free(sorted);
+
+    return hash_int(n_groups, hash);
+}
+
+/* Records 'hash' in a static bitmap of HIT_BITS bits, indexed by the low bits
+ * of 'hash'.  Returns true if the corresponding bit was already set by an
+ * earlier call, false if this call set it for the first time. */
+static bool
+coverage_hit(uint32_t hash)
+{
+    enum { HIT_BITS = 1024, BITS_PER_WORD = 32 };
+    static uint32_t hit[HIT_BITS / BITS_PER_WORD];
+    BUILD_ASSERT_DECL(IS_POW2(HIT_BITS));
+
+    unsigned int bit = hash & (HIT_BITS - 1);
+    uint32_t *word = &hit[bit / BITS_PER_WORD];
+    uint32_t mask = 1u << (bit % BITS_PER_WORD);
+    bool seen_before = (*word & mask) != 0;
+
+    *word |= mask;
+    return seen_before;
+}
+
+/* Logs one line for counter 'c' at vlog 'level': its name, its count in the
+ * current epoch, and its count over the entire run (the current epoch plus
+ * the total accumulated from earlier epochs). */
+static void
+coverage_log_counter(enum vlog_level level, const struct coverage_counter *c)
+{
+    unsigned long long int entire_run = c->count + c->total;
+
+    VLOG(level, "%-24s %5u / %9llu", c->name, c->count, entire_run);
+}
+
+/* Logs the coverage counters at the given vlog 'level'.
+ *
+ * Does nothing if logging at 'level' is disabled for this module.  If a
+ * counter state with the same hash has been logged before, logs only a
+ * one-line notice.  Otherwise logs the counters that are nonzero in the
+ * current epoch, then those that are zero now but were hit in an earlier
+ * epoch, and finally the number of counters that were never hit. */
+void
+coverage_log(enum vlog_level level)
+{
+    size_t n_never_hit = 0;
+    uint32_t hash;
+    size_t i;
+
+    if (!vlog_is_enabled(THIS_MODULE, level)) {
+        return;
+    }
+
+    hash = coverage_hash();
+    if (coverage_hit(hash)) {
+        VLOG(level, "Skipping details of duplicate event coverage for "
+             "hash=%08"PRIx32" in epoch %u", hash, epoch);
+        return;
+    }
+
+    VLOG(level, "Event coverage (epoch %u/entire run), hash=%08"PRIx32":",
+         epoch, hash);
+
+    /* First pass: counters hit during the current epoch. */
+    for (i = 0; i < coverage_n_counters; i++) {
+        if (coverage_counters[i]->count) {
+            coverage_log_counter(level, coverage_counters[i]);
+        }
+    }
+
+    /* Second pass: counters hit only in earlier epochs; count the rest. */
+    for (i = 0; i < coverage_n_counters; i++) {
+        const struct coverage_counter *c = coverage_counters[i];
+
+        if (c->count) {
+            continue;
+        }
+        if (!c->total) {
+            n_never_hit++;
+            continue;
+        }
+        coverage_log_counter(level, c);
+    }
+    VLOG(level, "%zu events never hit", n_never_hit);
+}
+
+/* Starts a new coverage epoch: increments the epoch number, folds every
+ * counter's count for the epoch just ended into its running total, and resets
+ * that count to 0. */
+void
+coverage_clear(void)
+{
+    struct coverage_counter **cp = coverage_counters;
+    size_t remaining = coverage_n_counters;
+
+    epoch++;
+    while (remaining-- > 0) {
+        struct coverage_counter *counter = *cp++;
+
+        counter->total += counter->count;
+        counter->count = 0;
+    }
+}
diff --git a/lib/coverage.h b/lib/coverage.h
new file mode 100644
index 000000000..98b6a85b2
--- /dev/null
+++ b/lib/coverage.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef COVERAGE_H
+#define COVERAGE_H 1
+
+/* This file implements a simple form of coverage instrumentation. Points in
+ * source code that are of interest must be explicitly annotated with
+ * COVERAGE_INC. The coverage counters may be logged at any time with
+ * coverage_log().
+ *
+ * This form of coverage instrumentation is intended to be so lightweight that
+ * it can be enabled in production builds. It is obviously not a substitute
+ * for traditional coverage instrumentation with e.g. "gcov", but it is still
+ * a useful debugging tool. */
+
+#include "vlog.h"
+
+/* A coverage counter.
+ *
+ * 'count' accumulates hits during the current epoch. coverage_clear() adds
+ * it to 'total' and resets it to 0, so between calls to coverage_clear()
+ * 'total' covers completed epochs only and the count for the entire run is
+ * 'count' + 'total' (which is what coverage_log() reports). */
+struct coverage_counter {
+ const char *name; /* Textual name. */
+ unsigned int count; /* Count within the current epoch. */
+ unsigned long long int total; /* Total count over all epochs. */
+};
+
+/* Increments the counter with the given NAME. Coverage counters need not be
+ * declared explicitly, but when you add the first coverage counter to a given
+ * file, you must also add that file to COVERAGE_FILES in lib/automake.mk.
+ *
+ * The macro refers to an external object named NAME_count, whose definition
+ * is generated by lib/coverage-scan.pl. That script only recognizes an
+ * invocation that begins a line (after optional white space), so write each
+ * COVERAGE_INC at the start of its own line. */
+#define COVERAGE_INC(NAME) \
+ do { \
+ extern struct coverage_counter NAME##_count; \
+ NAME##_count.count++; \
+ } while (0)
+
+/* Adds AMOUNT to the coverage counter with the given NAME.
+ *
+ * Like COVERAGE_INC, this refers to an external object named NAME_count
+ * whose definition is generated by lib/coverage-scan.pl, and the invocation
+ * must begin a line (after optional white space) for that script to
+ * recognize it. */
+#define COVERAGE_ADD(NAME, AMOUNT) \
+ do { \
+ extern struct coverage_counter NAME##_count; \
+ NAME##_count.count += AMOUNT; \
+ } while (0)
+
+void coverage_log(enum vlog_level);
+void coverage_clear(void);
+
+#endif /* coverage.h */
diff --git a/lib/csum.c b/lib/csum.c
new file mode 100644
index 000000000..5266be6de
--- /dev/null
+++ b/lib/csum.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "csum.h"
+
+/* Computes and returns the IP checksum of the 'n' bytes in 'data', by summing
+ * them from an initial partial checksum of 0 and finishing the result. */
+uint16_t
+csum(const void *data, size_t n)
+{
+    uint32_t partial = csum_continue(0, data, n);
+
+    return csum_finish(partial);
+}
+
+/* Returns partial IP checksum 'partial' updated to include the 16 bits in
+ * 'new'.  (Pass 0 for 'partial' to start a new checksum, and pass the final
+ * return value to csum_finish() to obtain the finished checksum.) */
+uint32_t
+csum_add16(uint32_t partial, uint16_t new)
+{
+    partial += new;
+    return partial;
+}
+
+/* Returns partial IP checksum 'partial' updated to include the 32 bits in
+ * 'new', which are added as two 16-bit halves.  (Pass 0 for 'partial' to
+ * start a new checksum, and pass the final return value to csum_finish() to
+ * obtain the finished checksum.) */
+uint32_t
+csum_add32(uint32_t partial, uint32_t new)
+{
+    uint32_t high = new >> 16;
+    uint32_t low = new & 0xffff;
+
+    return partial + high + low;
+}
+
+
+/* Adds the 'n' bytes in 'data' to the partial IP checksum 'partial' and
+ * returns the updated checksum.  (To start a new checksum, pass 0 for
+ * 'partial'.  To obtain the finished checksum, pass the return value to
+ * csum_finish().)
+ *
+ * The bytes are summed as 16-bit words in memory order.  If 'n' is odd, the
+ * final byte is treated as the first byte of a word whose second byte is
+ * zero, as RFC 1071 specifies, so that the result is correct on big-endian as
+ * well as little-endian hosts.  An odd 'n' is therefore appropriate only for
+ * the last piece of the data being summed.
+ *
+ * NOTE(review): 'data' is read through a uint16_t pointer, so this presumably
+ * expects 'data' to be 2-byte aligned; confirm against callers. */
+uint32_t
+csum_continue(uint32_t partial, const void *data_, size_t n)
+{
+    const uint16_t *data = data_;
+
+    for (; n > 1; n -= 2) {
+        partial = csum_add16(partial, *data++);
+    }
+    if (n) {
+        /* Pad the odd trailing byte on the right with a zero byte and add
+         * the pair as one word in memory order. */
+        union {
+            uint8_t bytes[2];
+            uint16_t word;
+        } last;
+
+        last.bytes[0] = *(const uint8_t *) data;
+        last.bytes[1] = 0;
+        partial = csum_add16(partial, last.word);
+    }
+    return partial;
+}
+
+/* Returns the IP checksum corresponding to 'partial', which is a value updated
+ * by some combination of csum_add16(), csum_add32(), and csum_continue().
+ *
+ * The carries that accumulated above the low 16 bits of 'partial' are folded
+ * back into the low 16 bits (end-around carry) until none remain, because
+ * folding once can itself produce a carry (e.g. 0x1ffff folds to 0x10000).
+ * The result is the ones' complement of the folded sum. */
+uint16_t
+csum_finish(uint32_t partial)
+{
+    while (partial >> 16) {
+        partial = (partial & 0xffff) + (partial >> 16);
+    }
+    return ~partial;
+}
+
+/* Returns the new checksum for a packet in which the checksum field previously
+ * contained 'old_csum' and in which a field that contained 'old_u16' was
+ * changed to contain 'new_u16'. */
+uint16_t
+recalc_csum16(uint16_t old_csum, uint16_t old_u16, uint16_t new_u16)
+{
+    /* Ones-complement arithmetic is endian-independent, so this code does not
+     * use htons() or ntohs().
+     *
+     * See RFC 1624 for formula and explanation:  HC' = ~(~HC + ~m + m'). */
+    uint16_t hc_complement = ~old_csum;
+    uint16_t m_complement = ~old_u16;
+    uint16_t m_prime = new_u16;
+    uint32_t sum = hc_complement + m_complement + m_prime;
+
+    /* Fold the carries back in until none remain.  A single fold is not
+     * enough: a sum of 0x1ffff folds to 0x10000, whose carry must be added
+     * back once more to obtain 0x0001. */
+    while (sum >> 16) {
+        sum = (sum & 0xffff) + (sum >> 16);
+    }
+    return ~sum;
+}
+
+/* Returns the new checksum for a packet in which the checksum field previously
+ * contained 'old_csum' and in which a field that contained 'old_u32' was
+ * changed to contain 'new_u32'.  The update is applied as two 16-bit updates:
+ * first for the low halves of the old and new values, then for the high
+ * halves. */
+uint16_t
+recalc_csum32(uint16_t old_csum, uint32_t old_u32, uint32_t new_u32)
+{
+    uint16_t after_low = recalc_csum16(old_csum, (uint16_t) old_u32,
+                                       (uint16_t) new_u32);
+
+    return recalc_csum16(after_low, (uint16_t) (old_u32 >> 16),
+                         (uint16_t) (new_u32 >> 16));
+}
diff --git a/lib/csum.h b/lib/csum.h
new file mode 100644
index 000000000..25de1ac40
--- /dev/null
+++ b/lib/csum.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef CSUM_H
+#define CSUM_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+uint16_t csum(const void *, size_t);
+uint32_t csum_add16(uint32_t partial, uint16_t);
+uint32_t csum_add32(uint32_t partial, uint32_t);
+uint32_t csum_continue(uint32_t partial, const void *, size_t);
+uint16_t csum_finish(uint32_t partial);
+uint16_t recalc_csum16(uint16_t old_csum, uint16_t old_u16, uint16_t new_u16);
+uint16_t recalc_csum32(uint16_t old_csum, uint32_t old_u32, uint32_t new_u32);
+
+#endif /* csum.h */
diff --git a/lib/daemon.c b/lib/daemon.c
new file mode 100644
index 000000000..8b1762388
--- /dev/null
+++ b/lib/daemon.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "daemon.h"
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "fatal-signal.h"
+#include "dirs.h"
+#include "util.h"
+
+#define THIS_MODULE VLM_daemon
+#include "vlog.h"
+
+/* Should we run in the background? */
+static bool detach;
+
+/* Name of pidfile (null if none). */
+static char *pidfile;
+
+/* Create pidfile even if one already exists and is locked? */
+static bool force;
+
+/* Returns the file name that set_pidfile() would use as the pidfile if it
+ * were passed 'name':
+ *
+ *    - If 'name' is null, program_name followed by ".pid", in 'ovs_rundir'.
+ *
+ *    - If 'name' begins with '/', a copy of 'name'.
+ *
+ *    - Otherwise, 'name' within 'ovs_rundir'.
+ *
+ * The caller must free the returned string. */
+char *
+make_pidfile_name(const char *name)
+{
+    if (!name) {
+        return xasprintf("%s/%s.pid", ovs_rundir, program_name);
+    } else if (*name == '/') {
+        return xstrdup(name);
+    } else {
+        return xasprintf("%s/%s", ovs_rundir, name);
+    }
+}
+
+/* Sets up a following call to daemonize() to create a pidfile named 'name'.
+ * If 'name' begins with '/', then it is treated as an absolute path.
+ * Otherwise, it is taken relative to RUNDIR, which is $(prefix)/var/run by
+ * default.
+ *
+ * If 'name' is null, then program_name followed by ".pid" is used.
+ *
+ * Any pidfile name configured by an earlier call is replaced.  'name' may be
+ * the string returned by get_pidfile(). */
+void
+set_pidfile(const char *name)
+{
+    /* Build the new name before releasing the old one, in case 'name' points
+     * into the old one (e.g. set_pidfile(get_pidfile())). */
+    char *new_pidfile = make_pidfile_name(name);
+
+    free(pidfile);
+    pidfile = new_pidfile;
+}
+
+/* Returns an absolute path to the configured pidfile, or a null pointer if no
+ * pidfile is configured. The caller must not modify or free the returned
+ * string.
+ *
+ * make_pidfile() frees the configured name and resets it to null when it
+ * runs, so in a process that has returned from daemonize() this function
+ * returns a null pointer. */
+const char *
+get_pidfile(void)
+{
+ return pidfile;
+}
+
+/* Normally, die_if_already_running() will terminate the program with a message
+ * if a locked pidfile already exists. If this function is called,
+ * die_if_already_running() will merely log a warning.
+ *
+ * This only sets a flag that die_if_already_running() reads, so it must be
+ * called before die_if_already_running() to have any effect. */
+void
+ignore_existing_pidfile(void)
+{
+ force = true;
+}
+
+/* Sets up a following call to daemonize() to detach from the foreground
+ * session, running this process in the background.
+ *
+ * This only records the request; daemonize() does the actual detaching. */
+void
+set_detach(void)
+{
+ detach = true;
+}
+
+/* Returns the pid of the process that holds a lock on the configured pidfile.
+ * Returns 0 if no pidfile is configured, if the pidfile cannot be opened, if
+ * the lock query fails, or if no process holds a lock on it. */
+static pid_t
+already_running(void)
+{
+    struct flock lck;
+    pid_t pid = 0;
+    int fd;
+
+    if (!pidfile) {
+        return 0;
+    }
+
+    fd = open(pidfile, O_RDWR);
+    if (fd < 0) {
+        return 0;
+    }
+
+    /* Ask which process, if any, would block a write lock on the whole
+     * file. */
+    lck.l_type = F_WRLCK;
+    lck.l_whence = SEEK_SET;
+    lck.l_start = 0;
+    lck.l_len = 0;
+    if (fcntl(fd, F_GETLK, &lck) != -1 && lck.l_type != F_UNLCK) {
+        pid = lck.l_pid;
+    }
+    close(fd);
+
+    return pid;
+}
+
+/* Checks whether a running process holds a lock on the configured pidfile.
+ * If one does, terminates the program with an error message, or, if
+ * ignore_existing_pidfile() has been called, just logs a warning.  Otherwise
+ * does nothing. */
+void
+die_if_already_running(void)
+{
+    pid_t pid = already_running();
+
+    if (!pid) {
+        return;
+    }
+
+    if (force) {
+        VLOG_WARN("%s: %s already running as pid %ld",
+                  get_pidfile(), program_name, (long int) pid);
+    } else {
+        ovs_fatal(0, "%s: already running as pid %ld",
+                  get_pidfile(), (long int) pid);
+    }
+}
+
+/* If a pidfile has been configured, creates it and stores the running process'
+ * pid in it.  Ensures that the pidfile will be deleted when the process
+ * exits.
+ *
+ * Failures are logged but are not fatal.  On failure the temporary file used
+ * to build the pidfile is removed again.  In every case the configured
+ * pidfile name is freed and reset to null before returning. */
+static void
+make_pidfile(void)
+{
+    if (pidfile) {
+        /* Create pidfile via temporary file, so that observers never see an
+         * empty pidfile or an unlocked pidfile. */
+        long int pid = getpid();
+        char *tmpfile;
+        int fd;
+
+        tmpfile = xasprintf("%s.tmp%ld", pidfile, pid);
+        fatal_signal_add_file_to_unlink(tmpfile);
+        fd = open(tmpfile, O_CREAT | O_WRONLY | O_TRUNC, 0666);
+        if (fd >= 0) {
+            bool installed = false;
+            struct flock lck;
+
+            lck.l_type = F_WRLCK;
+            lck.l_whence = SEEK_SET;
+            lck.l_start = 0;
+            lck.l_len = 0;
+            if (fcntl(fd, F_SETLK, &lck) != -1) {
+                char *text = xasprintf("%ld\n", pid);
+                size_t length = strlen(text);
+
+                if (write(fd, text, length) == (ssize_t) length) {
+                    fatal_signal_add_file_to_unlink(pidfile);
+                    if (rename(tmpfile, pidfile) < 0) {
+                        VLOG_ERR("failed to rename \"%s\" to \"%s\": %s",
+                                 tmpfile, pidfile, strerror(errno));
+                        fatal_signal_remove_file_to_unlink(pidfile);
+                    } else {
+                        installed = true;
+                    }
+                } else {
+                    VLOG_ERR("%s: write failed: %s", tmpfile, strerror(errno));
+                }
+                free(text);
+            } else {
+                VLOG_ERR("%s: fcntl failed: %s", tmpfile, strerror(errno));
+            }
+
+            if (!installed) {
+                /* Give up: drop the lock and do not leave the temporary file
+                 * behind. */
+                close(fd);
+                unlink(tmpfile);
+            } else {
+                /* Keep 'fd' open to retain the lock. */
+            }
+        } else {
+            VLOG_ERR("%s: create failed: %s", tmpfile, strerror(errno));
+        }
+        fatal_signal_remove_file_to_unlink(tmpfile);
+        free(tmpfile);
+    }
+    free(pidfile);
+    pidfile = NULL;
+}
+
+/* Creates the pidfile, if one was configured with set_pidfile(), and, if
+ * set_detach() was called, detaches from the foreground session.
+ *
+ * When detaching, the process forks.  The child creates the pidfile, writes
+ * one byte to a pipe to tell the parent that it has done so, and then starts
+ * a new session and changes its working directory to "/".  The parent waits
+ * for that byte and then exits, so only the child returns from this
+ * function. */
+void
+daemonize(void)
+{
+    char c = 0;
+    int fds[2];
+    pid_t pid;
+
+    if (!detach) {
+        make_pidfile();
+        return;
+    }
+
+    if (pipe(fds) < 0) {
+        ovs_fatal(errno, "pipe failed");
+    }
+
+    pid = fork();
+    if (pid == -1) {
+        /* Error. */
+        ovs_fatal(errno, "could not fork");
+    } else if (pid == 0) {
+        /* Child process. */
+        close(fds[0]);
+        make_pidfile();
+        write(fds[1], &c, 1);
+        close(fds[1]);
+        setsid();
+        chdir("/");
+    } else {
+        /* Parent process: wait for child to create pidfile, then exit. */
+        close(fds[1]);
+        fatal_signal_fork();
+        if (read(fds[0], &c, 1) != 1) {
+            ovs_fatal(errno, "daemon child failed to signal startup");
+        }
+        exit(0);
+    }
+}
+
+/* Prints a summary of the daemon-related command-line options (-D, -P, -f) to
+ * stdout, showing the default pidfile name built from 'ovs_rundir' and
+ * 'program_name'. */
+void
+daemon_usage(void)
+{
+ printf(
+ "\nDaemon options:\n"
+ " -D, --detach run in background as daemon\n"
+ " -P, --pidfile[=FILE] create pidfile (default: %s/%s.pid)\n"
+ " -f, --force with -P, start even if already running\n",
+ ovs_rundir, program_name);
+}
+
+/* Opens 'pidfile' and reads a PID from it.  Returns the nonnegative PID if
+ * successful, otherwise a negative errno value.
+ *
+ * The PID is accepted only if some process holds a lock on 'pidfile' and the
+ * PID of that process equals the number on the first line of the file;
+ * otherwise the result is -ESRCH.  Every failure is logged as a warning. */
+pid_t
+read_pidfile(const char *pidfile)
+{
+    struct flock lck;
+    char line[128];
+    pid_t result;
+    FILE *file;
+    int error;
+
+    file = fopen(pidfile, "r");
+    if (!file) {
+        error = errno;
+        VLOG_WARN("%s: open: %s", pidfile, strerror(error));
+        return -error;
+    }
+
+    lck.l_type = F_WRLCK;
+    lck.l_whence = SEEK_SET;
+    lck.l_start = 0;
+    lck.l_len = 0;
+    if (fcntl(fileno(file), F_GETLK, &lck)) {
+        error = errno;
+        VLOG_WARN("%s: fcntl: %s", pidfile, strerror(error));
+        result = -error;
+    } else if (lck.l_type == F_UNLCK) {
+        error = ESRCH;
+        VLOG_WARN("%s: pid file is not locked", pidfile);
+        result = -error;
+    } else if (!fgets(line, sizeof line, file)) {
+        if (ferror(file)) {
+            error = errno;
+            VLOG_WARN("%s: read: %s", pidfile, strerror(error));
+        } else {
+            error = ESRCH;
+            VLOG_WARN("%s: read: unexpected end of file", pidfile);
+        }
+        result = -error;
+    } else if (lck.l_pid != strtoul(line, NULL, 10)) {
+        error = ESRCH;
+        VLOG_WARN("l_pid (%ld) != %s pid (%s)",
+                  (long int) lck.l_pid, pidfile, line);
+        result = -error;
+    } else {
+        result = lck.l_pid;
+    }
+
+    fclose(file);
+    return result;
+}
diff --git a/lib/daemon.h b/lib/daemon.h
new file mode 100644
index 000000000..7781dd068
--- /dev/null
+++ b/lib/daemon.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef DAEMON_H
+#define DAEMON_H 1
+
+#include <stdbool.h>
+#include <sys/types.h>
+
+/* Initializers for the daemon-related long options (--detach, --force,
+ * --pidfile), each mapped to its short option letter.  The entries use the
+ * getopt 'no_argument'/'optional_argument' constants, so this is presumably
+ * meant to be pasted into a getopt_long() option table -- confirm against
+ * callers. */
+#define DAEMON_LONG_OPTIONS \
+ {"detach", no_argument, 0, 'D'}, \
+ {"force", no_argument, 0, 'f'}, \
+ {"pidfile", optional_argument, 0, 'P'}
+
+/* 'case' labels that act on the option letters listed in
+ * DAEMON_LONG_OPTIONS.  Expands inside a switch statement; the 'P' handler
+ * passes 'optarg' (possibly null, since the argument is optional) to
+ * set_pidfile(). */
+#define DAEMON_OPTION_HANDLERS \
+ case 'D': \
+ set_detach(); \
+ break; \
+ \
+ case 'P': \
+ set_pidfile(optarg); \
+ break; \
+ \
+ case 'f': \
+ ignore_existing_pidfile(); \
+ break;
+
+char *make_pidfile_name(const char *name);
+void set_pidfile(const char *name);
+const char *get_pidfile(void);
+void set_detach(void);
+void daemonize(void);
+void die_if_already_running(void);
+void ignore_existing_pidfile(void);
+/* Prints help text for the -D, -P and -f options to stdout. */
+void daemon_usage(void);
+/* Returns the PID stored in pidfile 'name', or a negative errno value. */
+pid_t read_pidfile(const char *name);
+
+#endif /* daemon.h */
diff --git a/lib/daemon.man b/lib/daemon.man
new file mode 100644
index 000000000..4ab656806
--- /dev/null
+++ b/lib/daemon.man
@@ -0,0 +1,21 @@
+.TP
+\fB-P\fR[\fIpidfile\fR], \fB--pidfile\fR[\fB=\fIpidfile\fR]
+Causes a file (by default, \fB\*(PN.pid\fR) to be created indicating
+the PID of the running process. If \fIpidfile\fR is not specified, or
+if it does not begin with \fB/\fR, then it is created in
+\fB@RUNDIR@\fR.
+
+.TP
+\fB-f\fR, \fB--force\fR
+By default, when \fB-P\fR or \fB--pidfile\fR is specified and the
+specified pidfile already exists and is locked by a running process,
+\fB\*(PN\fR refuses to start. Specify \fB-f\fR or \fB--force\fR
+to cause it to instead overwrite the pidfile.
+
+When \fB-P\fR or \fB--pidfile\fR is not specified, this option has no
+effect.
+
+.TP
+\fB-D\fR, \fB--detach\fR
+Causes \fB\*(PN\fR to detach itself from the foreground session and
+run as a background process.
diff --git a/lib/dh1024.pem b/lib/dh1024.pem
new file mode 100644
index 000000000..6eaeca9b8
--- /dev/null
+++ b/lib/dh1024.pem
@@ -0,0 +1,10 @@
+-----BEGIN DH PARAMETERS-----
+MIGHAoGBAPSI/VhOSdvNILSd5JEHNmszbDgNRR0PfIizHHxbLY7288kjwEPwpVsY
+jY67VYy4XTjTNP18F1dDox0YbN4zISy1Kv884bEpQBgRjXyEpwpy1obEAxnIByl6
+ypUM2Zafq9AKUJsCRtMIPWakXUGfnHy9iUsiGSa6q6Jew1XpL3jHAgEC
+-----END DH PARAMETERS-----
+
+These are the 1024 bit DH parameters from "Assigned Number for SKIP Protocols"
+(http://www.skip-vpn.org/spec/numbers.html).
+See there for how they were generated.
+Note that g is not a generator, but this is not a problem since p is a safe prime.
diff --git a/lib/dh2048.pem b/lib/dh2048.pem
new file mode 100644
index 000000000..dcd0b8d01
--- /dev/null
+++ b/lib/dh2048.pem
@@ -0,0 +1,12 @@
+-----BEGIN DH PARAMETERS-----
+MIIBCAKCAQEA9kJXtwh/CBdyorrWqULzBej5UxE5T7bxbrlLOCDaAadWoxTpj0BV
+89AHxstDqZSt90xkhkn4DIO9ZekX1KHTUPj1WV/cdlJPPT2N286Z4VeSWc39uK50
+T8X8dryDxUcwYc58yWb/Ffm7/ZFexwGq01uejaClcjrUGvC/RgBYK+X0iP1YTknb
+zSC0neSRBzZrM2w4DUUdD3yIsxx8Wy2O9vPJI8BD8KVbGI2Ou1WMuF040zT9fBdX
+Q6MdGGzeMyEstSr/POGxKUAYEY18hKcKctaGxAMZyAcpesqVDNmWn6vQClCbAkbT
+CD1mpF1Bn5x8vYlLIhkmuquiXsNV6TILOwIBAg==
+-----END DH PARAMETERS-----
+
+These are the 2048 bit DH parameters from "Assigned Number for SKIP Protocols"
+(http://www.skip-vpn.org/spec/numbers.html).
+See there for how they were generated.
diff --git a/lib/dh4096.pem b/lib/dh4096.pem
new file mode 100644
index 000000000..1b35ad8e6
--- /dev/null
+++ b/lib/dh4096.pem
@@ -0,0 +1,18 @@
+-----BEGIN DH PARAMETERS-----
+MIICCAKCAgEA+hRyUsFN4VpJ1O8JLcCo/VWr19k3BCgJ4uk+d+KhehjdRqNDNyOQ
+l/MOyQNQfWXPeGKmOmIig6Ev/nm6Nf9Z2B1h3R4hExf+zTiHnvVPeRBhjdQi81rt
+Xeoh6TNrSBIKIHfUJWBh3va0TxxjQIs6IZOLeVNRLMqzeylWqMf49HsIXqbcokUS
+Vt1BkvLdW48j8PPv5DsKRN3tloTxqDJGo9tKvj1Fuk74A+Xda1kNhB7KFlqMyN98
+VETEJ6c7KpfOo30mnK30wqw3S8OtaIR/maYX72tGOno2ehFDkq3pnPtEbD2CScxc
+alJC+EL7RPk5c/tgeTvCngvc1KZn92Y//EI7G9tPZtylj2b56sHtMftIoYJ9+ODM
+sccD5Piz/rejE3Ome8EOOceUSCYAhXn8b3qvxVI1ddd1pED6FHRhFvLrZxFvBEM9
+ERRMp5QqOaHJkM+Dxv8Cj6MqrCbfC4u+ZErxodzuusgDgvZiLF22uxMZbobFWyte
+OvOzKGtwcTqO/1wV5gKkzu1ZVswVUQd5Gg8lJicwqRWyyNRczDDoG9jVDxmogKTH
+AaqLulO7R8Ifa1SwF2DteSGVtgWEN8gDpN3RBmmPTDngyF2DHb5qmpnznwtFKdTL
+KWbuHn491xNO25CQWMtem80uKw+pTnisBRF/454n1Jnhub144YRBoN8CAQI=
+-----END DH PARAMETERS-----
+
+These are the 4096 bit DH parameters from "Assigned Number for SKIP Protocols"
+(http://www.skip-vpn.org/spec/numbers.html).
+See there for how they were generated.
+Note that g is not a generator, but this is not a problem since p is a safe prime.
diff --git a/lib/dhcp-client.c b/lib/dhcp-client.c
new file mode 100644
index 000000000..225be3de7
--- /dev/null
+++ b/lib/dhcp-client.c
@@ -0,0 +1,1073 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "dhcp-client.h"
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+#include "csum.h"
+#include "dhcp.h"
+#include "dynamic-string.h"
+#include "flow.h"
+#include "netdev.h"
+#include "ofpbuf.h"
+#include "poll-loop.h"
+#include "sat-math.h"
+#include "timeval.h"
+
+#define THIS_MODULE VLM_dhcp_client
+#include "vlog.h"
+
+/* List of DHCP client states, as an X-macro: define DHCLIENT_STATE(NAME,
+ * VALUE) and then expand DHCLIENT_STATES to generate one item per state.
+ *
+ * Each state has its own bit so that code in this file can test for a set of
+ * states with a single mask, e.g. "cli->state & (S_BOUND | S_RENEWING)". */
+#define DHCLIENT_STATES \
+ DHCLIENT_STATE(INIT, 1 << 0) \
+ DHCLIENT_STATE(INIT_REBOOT, 1 << 1) \
+ DHCLIENT_STATE(REBOOTING, 1 << 2) \
+ DHCLIENT_STATE(SELECTING, 1 << 3) \
+ DHCLIENT_STATE(REQUESTING, 1 << 4) \
+ DHCLIENT_STATE(BOUND, 1 << 5) \
+ DHCLIENT_STATE(RENEWING, 1 << 6) \
+ DHCLIENT_STATE(REBINDING, 1 << 7) \
+ DHCLIENT_STATE(RELEASED, 1 << 8)
+/* One S_<NAME> enumerator per entry in DHCLIENT_STATES. */
+enum dhclient_state {
+#define DHCLIENT_STATE(NAME, VALUE) S_##NAME = VALUE,
+ DHCLIENT_STATES
+#undef DHCLIENT_STATE
+};
+
+/* Rate limiter shared by the VLOG_*_RL() calls in this file. */
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60);
+
+/* Returns the name of 'state' (its S_ enumerator without the "S_" prefix) as
+ * a string constant, or "***ERROR***" if 'state' is not one of the states
+ * listed in DHCLIENT_STATES. */
+static const char *
+state_name(enum dhclient_state state)
+{
+    /* Expand the state list into one equality test per state. */
+#define DHCLIENT_STATE(NAME, VALUE) \
+    if (state == S_##NAME) {        \
+        return #NAME;               \
+    }
+    DHCLIENT_STATES
+#undef DHCLIENT_STATE
+
+    return "***ERROR***";
+}
+
+/* A DHCP client bound to one network device. */
+struct dhclient {
+ /* Configuration. */
+ /* Device opened by dhclient_create() and closed by dhclient_destroy(). */
+ struct netdev *netdev;
+
+ /* Optional hooks supplied to dhclient_create(); either may be null.  'aux'
+ * is passed through to both. */
+ void (*modify_request)(struct dhcp_msg *, void *aux);
+ bool (*validate_offer)(const struct dhcp_msg *, void *aux);
+ void *aux;
+
+ /* DHCP state. */
+ enum dhclient_state state;
+ unsigned int state_entered; /* When we transitioned to this state. */
+ uint32_t xid; /* In host byte order. */
+ uint32_t ipaddr, netmask, router;
+ /* Server identifier taken from the DHCPOFFER being accepted. */
+ uint32_t server_ip;
+ /* Copy of the DHCPACK that produced the current binding; freed and reset to
+ * null by state_transition() when the client becomes unbound. */
+ struct dhcp_msg *binding;
+ /* Set when the client goes bound <-> unbound or the binding's contents
+ * change; read and cleared by dhclient_changed(). */
+ bool changed;
+
+ unsigned int retransmit, delay; /* Used by send_reliably(). */
+ /* Upper bound on 'delay', in seconds; see dhclient_set_max_timeout(). */
+ unsigned int max_timeout;
+
+ unsigned int init_delay; /* Used by S_INIT. */
+
+ /* Lease timers, set by receive_ack().  The *_timeout members are compared
+ * against the time elapsed in the corresponding state. */
+ time_t lease_expiration;
+ unsigned int bound_timeout;
+ unsigned int renewing_timeout;
+ unsigned int rebinding_timeout;
+
+ /* Used by dhclient_run() and dhclient_wait() */
+ /* Smallest timeout requested via timeout() since the last reset, or
+ * UINT_MAX for none. */
+ unsigned int min_timeout;
+ /* Packets read by do_receive_msg() in the current dhclient_run() pass. */
+ int received;
+
+ /* Set when we send out a DHCPDISCOVER message. */
+ uint32_t secs;
+
+ /* Scratch buffer for formatting messages with dhcp_msg_to_string(). */
+ struct ds s;
+};
+
+/* Minimum acceptable lease time, in seconds. */
+#define MIN_ACCEPTABLE_LEASE 15
+
+/* Forward declarations of helpers defined later in this file. */
+
+/* State machine bookkeeping. */
+static void state_transition(struct dhclient *, enum dhclient_state);
+static unsigned int elapsed_in_this_state(const struct dhclient *cli);
+static bool timeout(struct dhclient *, unsigned int secs);
+
+/* Message construction, transmission and reception. */
+static void dhclient_msg_init(struct dhclient *, enum dhcp_msg_type,
+ struct dhcp_msg *);
+static void send_reliably(struct dhclient *cli,
+ void (*make_packet)(struct dhclient *,
+ struct dhcp_msg *));
+static bool do_receive_msg(struct dhclient *, struct dhcp_msg *);
+static void do_send_msg(struct dhclient *, const struct dhcp_msg *);
+static bool receive_ack(struct dhclient *);
+
+/* Timer arithmetic. */
+static unsigned int fuzz(unsigned int x, int max_fuzz);
+static unsigned int calc_t2(unsigned int lease);
+static unsigned int calc_t1(unsigned int lease, unsigned int t2);
+
+static unsigned int clamp(unsigned int x, unsigned int min, unsigned int max);
+
+/* Creates a new DHCP client to configure the network device 'netdev_name'
+ * (e.g. "eth0").  The device is opened and brought up as part of creation.
+ *
+ * If 'modify_request' is non-null, then each DHCP message to discover or
+ * request an address will be passed to it (along with auxiliary data 'aux').
+ * It may then add any desired options to the message for transmission.
+ *
+ * If 'validate_offer' is non-null, then each DHCP message that offers an
+ * address will be passed to it (along with auxiliary data 'aux') for
+ * validation: if it returns true, the address will be accepted; otherwise, it
+ * will be rejected.
+ *
+ * The new client starts out in the released state: it will not start
+ * advertising for an IP address until dhclient_init() is called.
+ *
+ * If successful, returns 0 and sets '*cli_' to the new DHCP client.
+ * Otherwise, returns a positive errno value and sets '*cli_' to a null
+ * pointer. */
+int
+dhclient_create(const char *netdev_name,
+                void (*modify_request)(struct dhcp_msg *, void *aux),
+                bool (*validate_offer)(const struct dhcp_msg *, void *aux),
+                void *aux, struct dhclient **cli_)
+{
+    struct dhclient *client;
+    struct netdev *dev;
+    int retval;
+
+    *cli_ = NULL;
+
+    /* XXX install socket filter to catch only DHCP packets. */
+    retval = netdev_open(netdev_name, ETH_TYPE_IP, &dev);
+    if (retval != 0) {
+        VLOG_ERR("could not open %s network device: %s",
+                 netdev_name, strerror(retval));
+        return retval;
+    }
+
+    retval = netdev_turn_flags_on(dev, NETDEV_UP, false);
+    if (retval != 0) {
+        VLOG_ERR("could not bring %s device up: %s",
+                 netdev_name, strerror(retval));
+        netdev_close(dev);
+        return retval;
+    }
+
+    client = xcalloc(1, sizeof *client);
+
+    /* Caller-supplied configuration. */
+    client->netdev = dev;
+    client->modify_request = modify_request;
+    client->validate_offer = validate_offer;
+    client->aux = aux;
+
+    /* Protocol state: released, with nothing bound. */
+    client->state = S_RELEASED;
+    client->state_entered = time_now();
+    client->xid = random_uint32();
+    client->ipaddr = 0;
+    client->server_ip = 0;
+    client->retransmit = client->delay = 0;
+    client->max_timeout = 64;
+    client->min_timeout = 1;
+    ds_init(&client->s);
+    client->changed = true;
+
+    *cli_ = client;
+    return 0;
+}
+
+/* Sets the longest time, in seconds, that 'cli' will wait for a reply from
+ * the DHCP server before retransmitting, to 'max_timeout'.  Values below 2
+ * are raised to 2.  The default is 64 seconds. */
+void
+dhclient_set_max_timeout(struct dhclient *cli, unsigned int max_timeout)
+{
+    if (max_timeout < 2) {
+        max_timeout = 2;
+    }
+    cli->max_timeout = max_timeout;
+}
+
+/* Destroys 'cli': releases its binding, closes its network device and frees
+ * its memory.  Does nothing if 'cli' is null. */
+void
+dhclient_destroy(struct dhclient *cli)
+{
+    if (!cli) {
+        return;
+    }
+
+    dhcp_msg_uninit(cli->binding);
+    free(cli->binding);
+    netdev_close(cli->netdev);
+    ds_destroy(&cli->s);
+    free(cli);
+}
+
+/* Returns the network device that 'cli' operates on.  The device remains
+ * owned by 'cli'; the caller must not destroy it. */
+struct netdev *
+dhclient_get_netdev(struct dhclient *cli)
+{
+    struct netdev *dev = cli->netdev;
+
+    return dev;
+}
+
+/* Forces 'cli' into a (re)initialization state, in which no address is bound
+ * but the client is advertising to obtain one.  With a nonzero
+ * 'requested_ip' the client enters S_INIT_REBOOT and tries to re-bind to that
+ * address; with zero it enters S_INIT and asks for no particular address. */
+void
+dhclient_init(struct dhclient *cli, uint32_t requested_ip)
+{
+    enum dhclient_state start;
+
+    if (requested_ip) {
+        start = S_INIT_REBOOT;
+    } else {
+        start = S_INIT;
+    }
+    state_transition(cli, start);
+
+    cli->ipaddr = requested_ip;
+    cli->init_delay = 0;
+    cli->min_timeout = 0;
+}
+
+/* Forces 'cli' to release its bound IP address (if any), sending a
+ * DHCPRELEASE for it first.  The client then sits in S_RELEASED and will not
+ * advertise for a new address until dhclient_init() is called again. */
+void
+dhclient_release(struct dhclient *cli)
+{
+    if (dhclient_is_bound(cli)) {
+        struct dhcp_msg release;
+
+        dhclient_msg_init(cli, DHCPRELEASE, &release);
+        release.ciaddr = cli->ipaddr;
+        do_send_msg(cli, &release);
+        dhcp_msg_uninit(&release);
+    }
+
+    state_transition(cli, S_RELEASED);
+    cli->min_timeout = UINT_MAX;
+}
+
+/* Puts bound client 'cli' into a renewal state so that its lease is renewed
+ * within 'deadline' seconds, or within the time remaining on the lease if
+ * that is shorter.  Called by dhclient_force_renew() for the S_BOUND,
+ * S_RENEWING and S_REBINDING states.
+ *
+ * NOTE(review): 'deadline' is an int compared against an unsigned value, so a
+ * negative 'deadline' would be treated as a very large one -- confirm callers
+ * never pass one. */
+static void
+do_force_renew(struct dhclient *cli, int deadline)
+{
+ time_t now = time_now();
+ unsigned int lease_left = sat_sub(cli->lease_expiration, now);
+ if (lease_left <= deadline) {
+ /* The lease runs out no later than the deadline.  A renewal already in
+ * progress is left undisturbed; otherwise the remaining lease time
+ * becomes the deadline. */
+ if (cli->state & (S_RENEWING | S_REBINDING)) {
+ return;
+ }
+ deadline = lease_left;
+ }
+ if (cli->state & (S_BOUND | S_RENEWING)) {
+ /* Spend three quarters of the deadline renewing and the remaining
+ * quarter rebinding. */
+ state_transition(cli, S_RENEWING);
+ cli->renewing_timeout = deadline * 3 / 4;
+ cli->rebinding_timeout = deadline * 1 / 4;
+ } else {
+ /* Already rebinding: restart the rebinding timer with the whole
+ * deadline. */
+ state_transition(cli, S_REBINDING);
+ cli->rebinding_timeout = deadline;
+ }
+ /* The state was just (re)entered, so a zero timeout makes the next
+ * dhclient_wait() request an immediate wakeup. */
+ cli->min_timeout = 0;
+}
+
+/* Forces 'cli' to attempt to renew the lease on its current IP address (if
+ * any) within 'deadline' seconds.  If the deadline is not met, then the
+ * client gives up its IP address binding and re-starts the DHCP process.
+ *
+ * A released client is re-initialized instead.  A client that is still trying
+ * to obtain an address is left alone. */
+void
+dhclient_force_renew(struct dhclient *cli, int deadline)
+{
+    enum dhclient_state current;
+
+    /* Drain the receive queue so that we know that any DHCPACK we process is
+     * freshly received. */
+    netdev_drain(cli->netdev);
+
+    current = cli->state;
+    if (current == S_BOUND
+        || current == S_RENEWING
+        || current == S_REBINDING) {
+        do_force_renew(cli, deadline);
+    } else if (current == S_RELEASED) {
+        dhclient_init(cli, 0);
+    }
+    /* S_INIT, S_INIT_REBOOT, S_REBOOTING, S_SELECTING, S_REQUESTING: no lease
+     * to renew yet, nothing to do. */
+}
+
+/* Returns true if 'cli' currently holds an IP address, that is, if it is in
+ * the S_BOUND, S_RENEWING or S_REBINDING state; false otherwise. */
+bool
+dhclient_is_bound(const struct dhclient *cli)
+{
+    const unsigned int bound_states = S_BOUND | S_RENEWING | S_REBINDING;
+
+    return (cli->state & bound_states) != 0;
+}
+
+/* Reports whether 'cli''s 'changed' flag has been raised since the last call
+ * to this function, and clears the flag.  The flag is raised when the client
+ * goes from bound to unbound or vice versa. */
+bool
+dhclient_changed(struct dhclient *cli)
+{
+    if (!cli->changed) {
+        return false;
+    }
+    cli->changed = false;
+    return true;
+}
+
+/* Returns the name of 'cli''s current state as a string constant, which the
+ * caller must not modify or free. */
+const char *
+dhclient_get_state(const struct dhclient *cli)
+{
+    const char *name = state_name(cli->state);
+
+    return name;
+}
+
+/* Returns how long, in seconds, 'cli' has been in its current state. */
+unsigned int
+dhclient_get_state_elapsed(const struct dhclient *cli)
+{
+    unsigned int seconds = elapsed_in_this_state(cli);
+
+    return seconds;
+}
+
+/* Returns the number of seconds left on 'cli''s lease, or 0 if 'cli' is not
+ * bound or its lease has already run out. */
+unsigned int
+dhclient_get_lease_remaining(const struct dhclient *cli)
+{
+    time_t now;
+
+    if (!dhclient_is_bound(cli)) {
+        return 0;
+    }
+
+    now = time_now();
+    if (cli->lease_expiration > now) {
+        return cli->lease_expiration - now;
+    }
+    return 0;
+}
+
+/* Returns the IP address that 'cli' is bound to, or 0 if it is not bound. */
+uint32_t
+dhclient_get_ip(const struct dhclient *cli)
+{
+    if (!dhclient_is_bound(cli)) {
+        return 0;
+    }
+    return cli->ipaddr;
+}
+
+/* Returns the netmask that goes with 'cli''s bound IP address, or 0 if 'cli'
+ * is not bound. */
+uint32_t
+dhclient_get_netmask(const struct dhclient *cli)
+{
+    if (!dhclient_is_bound(cli)) {
+        return 0;
+    }
+    return cli->netmask;
+}
+
+/* Returns 'cli''s default gateway, or 0 if 'cli' is not bound or its binding
+ * did not supply one. */
+uint32_t
+dhclient_get_router(const struct dhclient *cli)
+{
+    if (!dhclient_is_bound(cli)) {
+        return 0;
+    }
+    return cli->router;
+}
+
+/* Returns the DHCP message that established 'cli''s current binding, so that
+ * the caller can read additional options from it, or a null pointer if 'cli'
+ * is not bound. */
+const struct dhcp_msg *
+dhclient_get_config(const struct dhclient *cli)
+{
+    if (!dhclient_is_bound(cli)) {
+        return NULL;
+    }
+    return cli->binding;
+}
+
+/* Applies 'cli''s binding to its network device: sets the device's IPv4
+ * address and netmask and, if the binding has a default gateway, adds a
+ * route to it.  If 'cli' is not bound, the address and netmask are set to 0.
+ *
+ * To use a dhclient as a regular DHCP client that binds and unbinds from IP
+ * addresses in the usual fashion, call this function after dhclient_run() if
+ * anything has changed, like so:
+ *
+ *     dhclient_run(cli);
+ *     if (dhclient_changed(cli)) {
+ *         dhclient_configure_netdev(cli);
+ *     }
+ *
+ * Returns 0 if successful, otherwise the error from the step that failed
+ * (which is also logged). */
+int
+dhclient_configure_netdev(struct dhclient *cli)
+{
+    struct in_addr addr = { dhclient_get_ip(cli) };
+    struct in_addr mask = { dhclient_get_netmask(cli) };
+    struct in_addr router = { dhclient_get_router(cli) };
+    int retval;
+
+    retval = netdev_set_in4(cli->netdev, addr, mask);
+    if (retval) {
+        VLOG_ERR("could not set %s address "IP_FMT"/"IP_FMT": %s",
+                 netdev_get_name(cli->netdev),
+                 IP_ARGS(&addr.s_addr), IP_ARGS(&mask.s_addr),
+                 strerror(retval));
+        return retval;
+    }
+
+    if (!router.s_addr) {
+        /* No default gateway to install. */
+        return 0;
+    }
+
+    retval = netdev_add_router(router);
+    if (retval) {
+        VLOG_ERR("failed to add default route to "IP_FMT" on %s: %s",
+                 IP_ARGS(&router), netdev_get_name(cli->netdev),
+                 strerror(retval));
+    }
+    return retval;
+}
+
+/* If 'cli' is bound and its binding includes at least one DNS server,
+ * rewrites /etc/resolv.conf to match the received parameters:
+ *
+ *   - A "domain" line is written if the binding has a domain name made up
+ *     only of letters, digits, '-', '_' and '.'.
+ *
+ *   - A "nameserver" line is written for each DNS server in the binding.
+ *
+ *   - Lines of the existing file are carried over, except for "nameserver"
+ *     lines and, when a domain line was written, "domain" and "search"
+ *     lines.
+ *
+ * The new contents are assembled in a temporary file in /etc that is then
+ * renamed over /etc/resolv.conf; on failure the temporary file is removed
+ * and /etc/resolv.conf is left untouched.
+ *
+ * Returns 0 if successful (or if there was nothing to do), otherwise a
+ * positive errno value. */
+int
+dhclient_update_resolv_conf(struct dhclient *cli)
+{
+    uint32_t dns_server;
+    char *domain_name;
+    bool has_domain_name;
+    char new_name[128];
+    FILE *old, *new;
+    int error;
+    int i;
+
+    if (!dhclient_is_bound(cli)) {
+        return 0;
+    }
+    if (!dhcp_msg_get_ip(cli->binding, DHCP_CODE_DNS_SERVER, 0, &dns_server)) {
+        VLOG_DBG("binding does not include any DNS servers");
+        return 0;
+    }
+
+    sprintf(new_name, "/etc/resolv.conf.tmp%ld", (long int) getpid());
+    new = fopen(new_name, "w");
+    if (!new) {
+        /* Save errno before logging, which may overwrite it. */
+        error = errno;
+        VLOG_WARN("%s: create: %s", new_name, strerror(error));
+        return error;
+    }
+
+    domain_name = dhcp_msg_get_string(cli->binding, DHCP_CODE_DOMAIN_NAME);
+    has_domain_name = domain_name != NULL;
+    if (domain_name) {
+        if (strspn(domain_name, "-_.0123456789abcdefghijklmnopqrstuvwxyz"
+                   "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == strlen(domain_name)) {
+            fprintf(new, "domain %s\n", domain_name);
+        } else {
+            VLOG_WARN("ignoring invalid domain name %s", domain_name);
+            has_domain_name = false;
+        }
+    } else {
+        VLOG_DBG("binding does not include domain name");
+    }
+    free(domain_name);
+
+    for (i = 0; dhcp_msg_get_ip(cli->binding, DHCP_CODE_DNS_SERVER,
+                                i, &dns_server); i++) {
+        fprintf(new, "nameserver "IP_FMT"\n", IP_ARGS(&dns_server));
+    }
+
+    old = fopen("/etc/resolv.conf", "r");
+    if (old) {
+        char line[128];
+        bool at_line_start = true;
+        bool keep = true;
+
+        while (fgets(line, sizeof line, old)) {
+            /* A line longer than the buffer arrives in several pieces.  Only
+             * the first piece holds the keyword, so the keep/drop decision
+             * made there applies to the rest of the line too. */
+            if (at_line_start) {
+                char *kw = xmemdup0(line, strcspn(line, " \t\r\n"));
+                keep = strcmp(kw, "nameserver")
+                       && (!has_domain_name
+                           || (strcmp(kw, "domain") && strcmp(kw, "search")));
+                free(kw);
+            }
+            if (keep) {
+                fputs(line, new);
+            }
+            at_line_start = strchr(line, '\n') != NULL;
+        }
+        fclose(old);
+    } else {
+        VLOG_DBG("/etc/resolv.conf: open: %s", strerror(errno));
+    }
+
+    /* Never install a file that could not be written out completely. */
+    if (ferror(new)) {
+        VLOG_WARN("%s: write failed", new_name);
+        fclose(new);
+        unlink(new_name);
+        return EIO;
+    }
+    if (fclose(new) < 0) {
+        error = errno;
+        VLOG_WARN("%s: close: %s", new_name, strerror(error));
+        unlink(new_name);
+        return error;
+    }
+
+    if (rename(new_name, "/etc/resolv.conf") < 0) {
+        error = errno;
+        VLOG_WARN("failed to rename %s to /etc/resolv.conf: %s",
+                  new_name, strerror(error));
+        unlink(new_name);
+        return error;
+    }
+
+    return 0;
+}
+
+/* DHCP protocol. */
+
+/* Initializes 'msg' as a DHCPDISCOVER from 'cli', after recording in
+ * 'cli->secs' how long 'cli' has been in its current state.  If 'cli' has an
+ * IP address in mind, asks for it with a "requested IP" option. */
+static void
+make_dhcpdiscover(struct dhclient *cli, struct dhcp_msg *msg)
+{
+    const unsigned int elapsed = elapsed_in_this_state(cli);
+
+    cli->secs = elapsed;
+    dhclient_msg_init(cli, DHCPDISCOVER, msg);
+    if (cli->ipaddr != 0) {
+        dhcp_msg_put_ip(msg, DHCP_CODE_REQUESTED_IP, cli->ipaddr);
+    }
+}
+
+/* Initializes 'msg' as a DHCPREQUEST from 'cli' for address 'cli->ipaddr'.
+ * 'ciaddr' carries the bound address (0 if unbound), and the server
+ * identifier option is included only in the S_REQUESTING state. */
+static void
+make_dhcprequest(struct dhclient *cli, struct dhcp_msg *msg)
+{
+    const bool answering_offer = cli->state == S_REQUESTING;
+
+    dhclient_msg_init(cli, DHCPREQUEST, msg);
+    msg->ciaddr = dhclient_get_ip(cli);
+    if (answering_offer) {
+        dhcp_msg_put_ip(msg, DHCP_CODE_SERVER_IDENTIFIER, cli->server_ip);
+    }
+    dhcp_msg_put_ip(msg, DHCP_CODE_REQUESTED_IP, cli->ipaddr);
+}
+
+/* Common body of the two initialization states.  Picks a randomized start-up
+ * delay the first time through (fuzz(2, 1) seconds), then moves 'cli' to
+ * 'next_state' once that much time has passed in the current state. */
+static void
+do_init(struct dhclient *cli, enum dhclient_state next_state)
+{
+    if (cli->init_delay == 0) {
+        cli->init_delay = fuzz(2, 1);
+    }
+
+    if (!timeout(cli, cli->init_delay)) {
+        return;
+    }
+    state_transition(cli, next_state);
+}
+
+/* S_INIT handler: once the start-up delay has passed, moves on to
+ * S_SELECTING. */
+static void
+dhclient_run_INIT(struct dhclient *cli)
+{
+    const enum dhclient_state next = S_SELECTING;
+
+    do_init(cli, next);
+}
+
+/* S_INIT_REBOOT handler: once the start-up delay has passed, moves on to
+ * S_REBOOTING. */
+static void
+dhclient_run_INIT_REBOOT(struct dhclient *cli)
+{
+    const enum dhclient_state next = S_REBOOTING;
+
+    do_init(cli, next);
+}
+
+/* S_REBOOTING handler: keeps sending DHCPREQUESTs until a reply is handled
+ * by receive_ack().  Falls back to S_INIT after 60 seconds without one. */
+static void
+dhclient_run_REBOOTING(struct dhclient *cli)
+{
+    send_reliably(cli, make_dhcprequest);
+
+    if (receive_ack(cli)) {
+        return;
+    }
+    if (timeout(cli, 60)) {
+        state_transition(cli, S_INIT);
+    }
+}
+
+/* Receives the next DHCP message for 'cli' that is of an acceptable type and
+ * belongs to the current transaction.
+ *
+ * 'msgs' is a bitmap of acceptable message types, with bit (1u << TYPE) set
+ * for each acceptable TYPE.  Messages of any other type, or whose xid differs
+ * from 'cli->xid', are logged at debug level and discarded.
+ *
+ * Returns true and stores the message in '*msg' if one was received, in which
+ * case the caller must eventually pass '*msg' to dhcp_msg_uninit().  Returns
+ * false, with nothing for the caller to release, if no more messages are
+ * available. */
+static bool
+dhcp_receive(struct dhclient *cli, unsigned int msgs, struct dhcp_msg *msg)
+{
+    while (do_receive_msg(cli, msg)) {
+        if (msg->type > 31 || !((1u << msg->type) & msgs)) {
+            VLOG_DBG_RL(&rl, "received unexpected %s in %s state: %s",
+                        dhcp_type_name(msg->type), state_name(cli->state),
+                        dhcp_msg_to_string(msg, false, &cli->s));
+        } else if (msg->xid != cli->xid) {
+            /* Report the xid we expected, so that "xid != ..." in the log
+             * reads correctly. */
+            VLOG_DBG_RL(&rl,
+                        "ignoring %s with xid != %08"PRIx32" in %s state: %s",
+                        dhcp_type_name(msg->type), cli->xid,
+                        state_name(cli->state),
+                        dhcp_msg_to_string(msg, false, &cli->s));
+        } else {
+            return true;
+        }
+        dhcp_msg_uninit(msg);
+    }
+    return false;
+}
+
+/* Checks whether 'msg', an offer or acknowledgment received by 'cli', is
+ * acceptable: it must carry a lease time of at least MIN_ACCEPTABLE_LEASE
+ * seconds and a netmask, and must pass 'cli''s validate_offer hook, if any.
+ * Returns true if so; otherwise logs the reason and returns false. */
+static bool
+validate_offered_options(struct dhclient *cli, const struct dhcp_msg *msg)
+{
+    uint32_t lease, netmask;
+
+    if (!dhcp_msg_get_secs(msg, DHCP_CODE_LEASE_TIME, 0, &lease)) {
+        VLOG_WARN_RL(&rl, "%s lacks lease time: %s", dhcp_type_name(msg->type),
+                     dhcp_msg_to_string(msg, false, &cli->s));
+        return false;
+    }
+
+    if (!dhcp_msg_get_ip(msg, DHCP_CODE_SUBNET_MASK, 0, &netmask)) {
+        VLOG_WARN_RL(&rl, "%s lacks netmask: %s", dhcp_type_name(msg->type),
+                     dhcp_msg_to_string(msg, false, &cli->s));
+        return false;
+    }
+
+    if (lease < MIN_ACCEPTABLE_LEASE) {
+        VLOG_WARN_RL(&rl, "Ignoring %s with %"PRIu32"-second lease time: %s",
+                     dhcp_type_name(msg->type), lease,
+                     dhcp_msg_to_string(msg, false, &cli->s));
+        return false;
+    }
+
+    if (cli->validate_offer && !cli->validate_offer(msg, cli->aux)) {
+        VLOG_DBG_RL(&rl, "client validation hook refused offer: %s",
+                    dhcp_msg_to_string(msg, false, &cli->s));
+        return false;
+    }
+
+    return true;
+}
+
+/* S_SELECTING handler: keeps sending DHCPDISCOVERs and accepts the first
+ * DHCPOFFER that passes validation and names its server, moving 'cli' to
+ * S_REQUESTING for the offered address.  Unacceptable offers are logged and
+ * skipped.
+ *
+ * Every received message is released with dhcp_msg_uninit(), including the
+ * offer that is accepted. */
+static void
+dhclient_run_SELECTING(struct dhclient *cli)
+{
+    struct dhcp_msg msg;
+
+    send_reliably(cli, make_dhcpdiscover);
+    if (cli->server_ip && timeout(cli, 60)) {
+        cli->server_ip = 0;
+        state_transition(cli, S_INIT);
+    }
+
+    while (dhcp_receive(cli, 1u << DHCPOFFER, &msg)) {
+        bool accepted = false;
+
+        if (!validate_offered_options(cli, &msg)) {
+            /* Rejected; validate_offered_options() logged the reason. */
+        } else if (!dhcp_msg_get_ip(&msg, DHCP_CODE_SERVER_IDENTIFIER,
+                                    0, &cli->server_ip)) {
+            VLOG_WARN_RL(&rl, "DHCPOFFER lacks server identifier: %s",
+                         dhcp_msg_to_string(&msg, false, &cli->s));
+        } else {
+            VLOG_DBG_RL(&rl, "accepting DHCPOFFER: %s",
+                        dhcp_msg_to_string(&msg, false, &cli->s));
+            cli->ipaddr = msg.yiaddr;
+            state_transition(cli, S_REQUESTING);
+            accepted = true;
+        }
+
+        /* Release the message on every path, before leaving the loop. */
+        dhcp_msg_uninit(&msg);
+        if (accepted) {
+            break;
+        }
+    }
+}
+
+/* Compares the binding described by 'new' against the one described by
+ * 'old': the assigned address plus the options listed in 'codes' below.
+ * Logs a warning for every difference found (it does not stop at the first).
+ * Returns true if nothing differs, false otherwise. */
+static bool
+same_binding(const struct dhcp_msg *old, const struct dhcp_msg *new)
+{
+    static const int codes[] = {
+        DHCP_CODE_SUBNET_MASK,
+        DHCP_CODE_ROUTER,
+        DHCP_CODE_DNS_SERVER,
+        DHCP_CODE_HOST_NAME,
+        DHCP_CODE_DOMAIN_NAME,
+        DHCP_CODE_IP_TTL,
+        DHCP_CODE_MTU,
+        DHCP_CODE_BROADCAST_ADDRESS,
+        DHCP_CODE_STATIC_ROUTE,
+        DHCP_CODE_ARP_CACHE_TIMEOUT,
+        DHCP_CODE_ETHERNET_ENCAPSULATION,
+        DHCP_CODE_TCP_TTL,
+        DHCP_CODE_SERVER_IDENTIFIER,
+        DHCP_CODE_OFP_CONTROLLER_VCONN,
+        DHCP_CODE_OFP_PKI_URI,
+    };
+    bool unchanged;
+    int idx;
+
+    unchanged = old->yiaddr == new->yiaddr;
+    if (!unchanged) {
+        VLOG_WARN("DHCP binding changed IP address from "IP_FMT" to "IP_FMT,
+                  IP_ARGS(&old->yiaddr), IP_ARGS(&new->yiaddr));
+    }
+
+    for (idx = 0; idx < ARRAY_SIZE(codes); idx++) {
+        const int code = codes[idx];
+        const struct dhcp_option *before = &old->options[code];
+        const struct dhcp_option *after = &new->options[code];
+
+        if (!dhcp_option_equals(before, after)) {
+            struct ds before_text = DS_EMPTY_INITIALIZER;
+            struct ds after_text = DS_EMPTY_INITIALIZER;
+
+            VLOG_WARN("DHCP binding changed option from %s to %s",
+                      dhcp_option_to_string(before, code, &before_text),
+                      dhcp_option_to_string(after, code, &after_text));
+            ds_destroy(&before_text);
+            ds_destroy(&after_text);
+            unchanged = false;
+        }
+    }
+
+    return unchanged;
+}
+
+/* Tries to receive and act on a DHCPACK or DHCPNAK for 'cli'.
+ *
+ *   - DHCPNAK: moves 'cli' to S_INIT and returns true.
+ *
+ *   - Acceptable DHCPACK: stores a copy of the message as 'cli->binding',
+ *     computes the lease timers (substituting calc_t1()/calc_t2() values
+ *     when the server's T1/T2 are missing or out of order), records the
+ *     address, netmask and router, moves 'cli' to S_BOUND and returns true.
+ *
+ *   - Nothing received, or a DHCPACK that fails validation: returns false.
+ *
+ * The received message is released with dhcp_msg_uninit() on every path.
+ * NOTE(review): releasing it after dhcp_msg_copy() assumes that the copy
+ * does not share storage with the original -- confirm in dhcp.c. */
+static bool
+receive_ack(struct dhclient *cli)
+{
+    uint32_t lease = 0, t1 = 0, t2 = 0;
+    struct dhcp_msg msg;
+
+    if (!dhcp_receive(cli, (1u << DHCPACK) | (1u << DHCPNAK), &msg)) {
+        return false;
+    }
+    if (msg.type == DHCPNAK) {
+        dhcp_msg_uninit(&msg);
+        state_transition(cli, S_INIT);
+        return true;
+    }
+    if (!validate_offered_options(cli, &msg)) {
+        dhcp_msg_uninit(&msg);
+        return false;
+    }
+
+    /* Replace the stored binding, noting whether anything in it changed. */
+    if (cli->binding) {
+        if (!same_binding(cli->binding, &msg)) {
+            cli->changed = true;
+        }
+        dhcp_msg_uninit(cli->binding);
+    } else {
+        cli->binding = xmalloc(sizeof *cli->binding);
+    }
+    dhcp_msg_copy(cli->binding, &msg);
+
+    dhcp_msg_get_secs(&msg, DHCP_CODE_LEASE_TIME, 0, &lease);
+    dhcp_msg_get_secs(&msg, DHCP_CODE_T1, 0, &t1);
+    dhcp_msg_get_secs(&msg, DHCP_CODE_T2, 0, &t2);
+    /* validate_offered_options() already checked this. */
+    assert(lease >= MIN_ACCEPTABLE_LEASE);
+
+    /* Enforce 0 < T1 < T2 < lease. */
+    if (!t2 || t2 >= lease) {
+        t2 = calc_t2(lease);
+    }
+    if (!t1 || t1 >= t2) {
+        t1 = calc_t1(lease, t2);
+    }
+
+    cli->lease_expiration = sat_add(time_now(), lease);
+    cli->bound_timeout = t1;
+    cli->renewing_timeout = t2 - t1;
+    cli->rebinding_timeout = lease - t2;
+
+    cli->ipaddr = msg.yiaddr;
+    dhcp_msg_get_ip(&msg, DHCP_CODE_SUBNET_MASK, 0, &cli->netmask);
+    if (!dhcp_msg_get_ip(&msg, DHCP_CODE_ROUTER, 0, &cli->router)) {
+        cli->router = INADDR_ANY;
+    }
+    state_transition(cli, S_BOUND);
+    VLOG_DBG("Bound: %s", dhcp_msg_to_string(&msg, false, &cli->s));
+
+    /* The binding holds its own copy, so release the received message. */
+    dhcp_msg_uninit(&msg);
+    return true;
+}
+
+/* S_REQUESTING handler: keeps sending DHCPREQUESTs for the offered address
+ * until a reply is handled by receive_ack().  Falls back to S_INIT after 60
+ * seconds without one. */
+static void
+dhclient_run_REQUESTING(struct dhclient *cli)
+{
+    send_reliably(cli, make_dhcprequest);
+
+    if (receive_ack(cli)) {
+        return;
+    }
+    if (timeout(cli, 60)) {
+        state_transition(cli, S_INIT);
+    }
+}
+
+/* S_BOUND handler: waits for 'cli->bound_timeout' seconds to pass in this
+ * state, then starts renewing the lease. */
+static void
+dhclient_run_BOUND(struct dhclient *cli)
+{
+    const bool renew_due = timeout(cli, cli->bound_timeout);
+
+    if (renew_due) {
+        state_transition(cli, S_RENEWING);
+    }
+}
+
+/* S_RENEWING handler: keeps sending DHCPREQUESTs until a reply is handled by
+ * receive_ack().  Moves to S_REBINDING after 'cli->renewing_timeout' seconds
+ * without one. */
+static void
+dhclient_run_RENEWING(struct dhclient *cli)
+{
+    send_reliably(cli, make_dhcprequest);
+
+    if (receive_ack(cli)) {
+        return;
+    }
+    if (timeout(cli, cli->renewing_timeout)) {
+        state_transition(cli, S_REBINDING);
+    }
+}
+
+/* S_REBINDING handler: keeps sending DHCPREQUESTs until a reply is handled
+ * by receive_ack().  Gives up and returns to S_INIT after
+ * 'cli->rebinding_timeout' seconds without one. */
+static void
+dhclient_run_REBINDING(struct dhclient *cli)
+{
+    send_reliably(cli, make_dhcprequest);
+
+    if (receive_ack(cli)) {
+        return;
+    }
+    if (timeout(cli, cli->rebinding_timeout)) {
+        state_transition(cli, S_INIT);
+    }
+}
+
+/* S_RELEASED handler.  It exists so that dhclient_run() can dispatch to a
+ * dhclient_run_<STATE>() function for every state; a released client stays
+ * put until dhclient_init() or dhclient_force_renew() restarts it. */
+static void
+dhclient_run_RELEASED(struct dhclient *cli UNUSED)
+{
+ /* Nothing to do. */
+}
+
+/* Processes the DHCP protocol for 'cli': runs the handler for the current
+ * state, and keeps going for as long as each handler leaves 'cli' in a
+ * different state than it found it.  The wakeup timeout and the received
+ * packet count are reset before every handler invocation. */
+void
+dhclient_run(struct dhclient *cli)
+{
+    for (;;) {
+        int state_before = cli->state;
+
+        cli->min_timeout = UINT_MAX;
+        cli->received = 0;
+
+        /* Dispatch to dhclient_run_<STATE>() for the current state. */
+        switch (cli->state) {
+#define DHCLIENT_STATE(NAME, VALUE) \
+        case S_##NAME: dhclient_run_##NAME(cli); break;
+        DHCLIENT_STATES
+#undef DHCLIENT_STATE
+        default:
+            NOT_REACHED();
+        }
+
+        if (cli->state == state_before) {
+            break;
+        }
+    }
+}
+
+/* Sets up the poll loop to wake up when 'cli' next needs to do some work:
+ * when the earliest timeout requested during the last dhclient_run()
+ * expires and, in any state that waits for a reply from a server, when a
+ * packet arrives on the network device. */
+void
+dhclient_wait(struct dhclient *cli)
+{
+    if (cli->min_timeout != UINT_MAX) {
+        time_t now = time_now();
+        unsigned int wake = sat_add(cli->state_entered, cli->min_timeout);
+        if (wake <= now) {
+            poll_immediate_wake();
+        } else {
+            poll_timer_wait(sat_mul(sat_sub(wake, now), 1000));
+        }
+    }
+    /* Reset timeout to 1 second.  This will have no effect ordinarily,
+     * because dhclient_run() will typically set it back to a higher value.
+     * If, however, the caller fails to call dhclient_run() before its next
+     * call to dhclient_wait() we won't potentially block forever. */
+    cli->min_timeout = 1;
+
+    /* Every state whose handler reads replies needs a wakeup on packet
+     * arrival.  That includes S_REBOOTING, which waits for a DHCPACK or
+     * DHCPNAK just as S_REQUESTING does. */
+    if (cli->state & (S_REBOOTING | S_SELECTING | S_REQUESTING
+                      | S_RENEWING | S_REBINDING)) {
+        netdev_recv_wait(cli->netdev);
+    }
+}
+
+/* Moves 'cli' into 'state' (which may be the state it is already in) and
+ * restarts the per-state clock and retransmission timers.
+ *
+ * If the move takes 'cli' from bound to unbound or vice versa, also raises
+ * 'cli->changed' and logs the event; on becoming unbound the stored binding
+ * is freed.  A caller that moves 'cli' to a bound state must have set
+ * 'cli->binding' first. */
+static void
+state_transition(struct dhclient *cli, enum dhclient_state state)
+{
+ /* Sample boundness before the state changes so that it can be compared with
+ * boundness afterward. */
+ bool was_bound = dhclient_is_bound(cli);
+ bool am_bound;
+ if (cli->state != state) {
+ VLOG_DBG("entering %s", state_name(state));
+ cli->state = state;
+ }
+ /* These are reset even when re-entering the same state. */
+ cli->state_entered = time_now();
+ cli->retransmit = cli->delay = 0;
+ am_bound = dhclient_is_bound(cli);
+ if (was_bound != am_bound) {
+ cli->changed = true;
+ if (am_bound) {
+ assert(cli->binding != NULL);
+ VLOG_INFO("%s: obtained address "IP_FMT", netmask "IP_FMT,
+ netdev_get_name(cli->netdev),
+ IP_ARGS(&cli->ipaddr), IP_ARGS(&cli->netmask));
+ if (cli->router) {
+ VLOG_INFO("%s: obtained default gateway "IP_FMT,
+ netdev_get_name(cli->netdev), IP_ARGS(&cli->router));
+ }
+ } else {
+ /* No longer bound: drop the stored DHCPACK. */
+ dhcp_msg_uninit(cli->binding);
+ free(cli->binding);
+ cli->binding = NULL;
+
+ VLOG_INFO("%s: network address unbound",
+ netdev_get_name(cli->netdev));
+ }
+ }
+ /* On entering a state that starts a new exchange with a server, discard
+ * whatever is already queued on the device. */
+ if (cli->state & (S_SELECTING | S_REQUESTING | S_REBOOTING)) {
+ netdev_drain(cli->netdev);
+ }
+}
+
+/* Sends the message built by 'make_packet' if the retransmission timer has
+ * expired, then schedules the next retransmission.
+ *
+ * The delay between transmissions doubles each time, starting at 4 seconds
+ * and capped at 'cli->max_timeout', with a random adjustment from
+ * fuzz(delay, 1).  The message is passed through 'cli''s modify_request
+ * hook, if any, before it goes out. */
+static void
+send_reliably(struct dhclient *cli,
+              void (*make_packet)(struct dhclient *, struct dhcp_msg *))
+{
+    struct dhcp_msg request;
+    unsigned int next_delay;
+
+    if (!timeout(cli, cli->retransmit)) {
+        /* Not yet time to (re)transmit. */
+        return;
+    }
+
+    make_packet(cli, &request);
+    if (cli->modify_request) {
+        cli->modify_request(&request, cli->aux);
+    }
+    do_send_msg(cli, &request);
+
+    /* Double the delay, keeping it within [4, max_timeout]. */
+    next_delay = cli->delay * 2;
+    if (next_delay < 4) {
+        next_delay = 4;
+    }
+    if (next_delay > cli->max_timeout) {
+        next_delay = cli->max_timeout;
+    }
+    cli->delay = next_delay;
+    cli->retransmit += fuzz(cli->delay, 1);
+
+    /* Called for its side effect of registering the new wakeup time. */
+    timeout(cli, cli->retransmit);
+
+    dhcp_msg_uninit(&request);
+}
+
+/* Initializes 'msg' as a client request of the given 'type' from 'cli',
+ * filling in the transaction id, the 'secs' field and the hardware address
+ * of 'cli''s network device.  The caller must eventually release 'msg' with
+ * dhcp_msg_uninit(). */
+static void
+dhclient_msg_init(struct dhclient *cli, enum dhcp_msg_type type,
+                  struct dhcp_msg *msg)
+{
+    const uint8_t *mac = netdev_get_etheraddr(cli->netdev);
+
+    dhcp_msg_init(msg);
+    msg->type = type;
+    msg->op = DHCP_BOOTREQUEST;
+    msg->xid = cli->xid;
+    msg->secs = cli->secs;
+    memcpy(msg->chaddr, mac, ETH_ADDR_LEN);
+}
+
+/* Returns the number of seconds since 'cli' entered its current state.
+ *
+ * If time goes backward this returns a large number, which makes it look
+ * like we've been in the current state a very long time.  That's probably
+ * fine for that corner case--we'll just expire our lease, etc., and try to
+ * get a new one. */
+static unsigned int
+elapsed_in_this_state(const struct dhclient *cli)
+{
+    time_t now = time_now();
+
+    return now - cli->state_entered;
+}
+
+/* Returns true if 'cli' has spent at least 'secs' seconds in its current
+ * state.  Also lowers 'cli->min_timeout' to 'secs' if that is smaller, so
+ * that dhclient_wait() wakes up in time for this deadline. */
+static bool
+timeout(struct dhclient *cli, unsigned int secs)
+{
+    if (secs < cli->min_timeout) {
+        cli->min_timeout = secs;
+    }
+    return time_now() >= sat_add(cli->state_entered, secs);
+}
+
+/* Reads packets from 'cli''s network device until one parses as a DHCP
+ * message addressed to this client.
+ *
+ * Returns true and stores the parsed message in '*msg' on success.  Returns
+ * false if the device has nothing more to read (or reading fails), or once
+ * 50 packets have been examined during the current dhclient_run() pass; in
+ * the latter case the rest of the receive queue is discarded. */
+static bool
+do_receive_msg(struct dhclient *cli, struct dhcp_msg *msg)
+{
+ struct ofpbuf b;
+
+ ofpbuf_init(&b, netdev_get_mtu(cli->netdev) + VLAN_ETH_HEADER_LEN);
+ /* 'cli->received' is zeroed by dhclient_run(), so the limit of 50 packets
+ * is shared by all calls made within one pass. */
+ for (; cli->received < 50; cli->received++) {
+ const struct ip_header *ip;
+ const struct dhcp_header *dhcp;
+ flow_t flow;
+ int error;
+
+ ofpbuf_clear(&b);
+ error = netdev_recv(cli->netdev, &b);
+ if (error) {
+ goto drained;
+ }
+
+ /* Only UDP/IP packets to port 68 that are broadcast or addressed to our
+ * own Ethernet address are of interest. */
+ flow_extract(&b, 0, &flow);
+ if (flow.dl_type != htons(ETH_TYPE_IP)
+ || flow.nw_proto != IP_TYPE_UDP
+ || flow.tp_dst != htons(68)
+ || !(eth_addr_is_broadcast(flow.dl_dst)
+ || eth_addr_equals(flow.dl_dst,
+ netdev_get_etheraddr(cli->netdev)))) {
+ continue;
+ }
+
+ ip = b.l3;
+ if (IP_IS_FRAGMENT(ip->ip_frag_off)) {
+ /* We don't do reassembly. */
+ VLOG_WARN_RL(&rl, "ignoring fragmented DHCP datagram");
+ continue;
+ }
+
+ dhcp = b.l7;
+ if (!dhcp) {
+ VLOG_WARN_RL(&rl, "ignoring DHCP datagram with missing payload");
+ continue;
+ }
+
+ /* Strip everything in front of the payload so that 'b' starts at the
+ * DHCP header. */
+ ofpbuf_pull(&b, (char *)b.l7 - (char*)b.data);
+ error = dhcp_parse(msg, &b);
+ if (!error) {
+ if (VLOG_IS_DBG_ENABLED()) {
+ VLOG_DBG_RL(&rl, "received %s",
+ dhcp_msg_to_string(msg, false, &cli->s));
+ } else {
+ VLOG_INFO_RL(&rl, "received %s", dhcp_type_name(msg->type));
+ }
+ ofpbuf_uninit(&b);
+ return true;
+ }
+ /* A packet that fails to parse is dropped and the loop moves on. */
+ }
+ /* Packet limit reached: throw away whatever is still queued. */
+ netdev_drain(cli->netdev);
+drained:
+ ofpbuf_uninit(&b);
+ return false;
+}
+
+/* Assembles 'msg' into a broadcast Ethernet/IP/UDP frame and sends it on
+ * 'cli''s network device.
+ *
+ * The frame goes from the DHCP client port (68) to the DHCP server port
+ * (67), with the client's bound address (0 if unbound) as IP source.
+ * Failures are logged but not reported to the caller.  A message that does
+ * not fit in a standard Ethernet frame is not sent. */
+static void
+do_send_msg(struct dhclient *cli, const struct dhcp_msg *msg)
+{
+    struct ofpbuf b;
+    struct eth_header eh;
+    struct ip_header nh;
+    struct udp_header th;
+    uint32_t udp_csum;
+    int error;
+
+    ofpbuf_init(&b, ETH_TOTAL_MAX);
+    ofpbuf_reserve(&b, ETH_HEADER_LEN + IP_HEADER_LEN + UDP_HEADER_LEN);
+
+    dhcp_assemble(msg, &b);
+
+    memcpy(eh.eth_src, netdev_get_etheraddr(cli->netdev), ETH_ADDR_LEN);
+    memcpy(eh.eth_dst, eth_addr_broadcast, ETH_ADDR_LEN);
+    eh.eth_type = htons(ETH_TYPE_IP);
+
+    nh.ip_ihl_ver = IP_IHL_VER(5, IP_VERSION);
+    nh.ip_tos = 0;
+    nh.ip_tot_len = htons(IP_HEADER_LEN + UDP_HEADER_LEN + b.size);
+    /* We can't guarantee uniqueness of ip_id versus the host's, screwing up
+     * fragment reassembly, so prevent fragmentation and use an all-zeros
+     * ip_id.  RFC 791 doesn't say we can do this, but Linux does the same
+     * thing for DF packets, so it must not screw anything up. */
+    nh.ip_id = 0;
+    nh.ip_frag_off = htons(IP_DONT_FRAGMENT);
+    nh.ip_ttl = 64;
+    nh.ip_proto = IP_TYPE_UDP;
+    nh.ip_csum = 0;
+    nh.ip_src = dhclient_get_ip(cli);
+    /* XXX need to use UDP socket for nonzero server IPs so that we can get
+     * routing table support.
+     *
+     * if (...have server IP and in appropriate state...) {
+     *    nh.ip_dst = cli->server_ip;
+     * } else {
+     *    nh.ip_dst = INADDR_BROADCAST;
+     * }
+     */
+    nh.ip_dst = INADDR_BROADCAST;
+    nh.ip_csum = csum(&nh, sizeof nh);
+
+    /* Clients send from port 68 to port 67 (RFC 2131 section 4.1); port 68
+     * is also where do_receive_msg() expects the replies. */
+    th.udp_src = htons(68);
+    th.udp_dst = htons(67);
+    th.udp_len = htons(UDP_HEADER_LEN + b.size);
+    th.udp_csum = 0;
+
+    /* UDP checksum over the pseudo-header, the UDP header and the payload.
+     * Every 16-bit word is fed in network byte order, hence htons() for the
+     * protocol word. */
+    udp_csum = csum_add32(0, nh.ip_src);
+    udp_csum = csum_add32(udp_csum, nh.ip_dst);
+    udp_csum = csum_add16(udp_csum, htons(IP_TYPE_UDP));
+    udp_csum = csum_add16(udp_csum, th.udp_len);
+    udp_csum = csum_continue(udp_csum, &th, sizeof th);
+    th.udp_csum = csum_finish(csum_continue(udp_csum, b.data, b.size));
+    if (!th.udp_csum) {
+        /* RFC 768: a computed checksum of zero is transmitted as all ones,
+         * because zero on the wire means "no checksum". */
+        th.udp_csum = 0xffff;
+    }
+
+    ofpbuf_push(&b, &th, sizeof th);
+    ofpbuf_push(&b, &nh, sizeof nh);
+    ofpbuf_push(&b, &eh, sizeof eh);
+
+    /* Don't try to send the frame if it's too long for an Ethernet frame.
+     * We disregard the network device's actual MTU because we don't want the
+     * frame to have to be discarded or fragmented if it travels over a
+     * regular Ethernet at some point.  1500 bytes should be enough for
+     * anyone. */
+    if (b.size <= ETH_TOTAL_MAX) {
+        if (VLOG_IS_DBG_ENABLED()) {
+            VLOG_DBG("sending %s", dhcp_msg_to_string(msg, false, &cli->s));
+        } else {
+            VLOG_INFO("sending %s", dhcp_type_name(msg->type));
+        }
+        error = netdev_send(cli->netdev, &b);
+        if (error) {
+            VLOG_ERR("send failed on %s: %s",
+                     netdev_get_name(cli->netdev), strerror(error));
+        }
+    } else {
+        VLOG_ERR("cannot send %zu-byte Ethernet frame", b.size);
+    }
+
+    ofpbuf_uninit(&b);
+}
+
+/* Returns 'x' plus a random integer in the range [-max_fuzz, +max_fuzz],
+ * saturating at 0 and UINT_MAX instead of wrapping around. */
+static unsigned int
+fuzz(unsigned int x, int max_fuzz)
+{
+    /* Generate number in range [-max_fuzz, +max_fuzz]. */
+    int delta = random_range(max_fuzz * 2 + 1) - max_fuzz;
+    unsigned int sum = x + delta;
+
+    if (delta >= 0) {
+        /* Adding: a result below 'x' means the sum wrapped around. */
+        if (sum >= x) {
+            return sum;
+        }
+        return UINT_MAX;
+    }
+
+    /* Subtracting: a result above 'x' means the difference wrapped. */
+    if (sum <= x) {
+        return sum;
+    }
+    return 0;
+}
+
+/* Returns 'x' limited to the range ['min', 'max']: 'min' if 'x' is below
+ * it, otherwise 'max' if 'x' is above it, otherwise 'x' itself. */
+static unsigned int
+clamp(unsigned int x, unsigned int min, unsigned int max)
+{
+    if (x < min) {
+        return min;
+    }
+    if (x > max) {
+        return max;
+    }
+    return x;
+}
+
+/* Returns a T2 (rebinding) time for a lease of 'lease' seconds: 87.5% of
+ * the lease.  For leases of at least a minute the value is also randomized
+ * by up to 10 seconds either way and kept below the lease time. */
+static unsigned int
+calc_t2(unsigned int lease)
+{
+    unsigned int base = lease * 0.875;
+
+    if (lease < 60) {
+        return base;
+    }
+    return clamp(fuzz(base, 10), 0, lease - 1);
+}
+
+/* Returns a T1 (renewal) time for a lease of 'lease' seconds whose T2 is
+ * 't2': half of the lease.  For leases of at least a minute the value is
+ * also randomized by up to 10 seconds either way and kept below 't2'. */
+static unsigned int
+calc_t1(unsigned int lease, unsigned int t2)
+{
+    unsigned int base = lease / 2;
+
+    if (lease < 60) {
+        return base;
+    }
+    return clamp(fuzz(base, 10), 0, t2 - 1);
+}
diff --git a/lib/dhcp-client.h b/lib/dhcp-client.h
new file mode 100644
index 000000000..b37cac61b
--- /dev/null
+++ b/lib/dhcp-client.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef DHCP_CLIENT_H
+#define DHCP_CLIENT_H 1
+
+#include <stdbool.h>
+#include <stdint.h>
+
+/* Only pointers to these types appear in this header, so forward
+ * declarations are sufficient. */
+struct dhclient;
+struct dhcp_msg;
+struct netdev;
+
+/* Creation and destruction.
+ *
+ * dhclient_create() returns an int and hands back the new client through its
+ * final argument.  NOTE(review): the return value is presumably 0 or a
+ * positive errno value, and 'aux' is presumably passed through to both
+ * callbacks -- confirm against dhcp-client.c. */
+int dhclient_create(const char *netdev,
+ void (*modify_request)(struct dhcp_msg *, void *aux),
+ bool (*validate_offer)(const struct dhcp_msg *, void *aux),
+ void *aux, struct dhclient **);
+void dhclient_set_max_timeout(struct dhclient *, unsigned int max_timeout);
+void dhclient_destroy(struct dhclient *);
+
+struct netdev *dhclient_get_netdev(struct dhclient *);
+
+/* State control and binding status. */
+void dhclient_init(struct dhclient *, uint32_t requested_ip);
+void dhclient_release(struct dhclient *);
+void dhclient_force_renew(struct dhclient *, int deadline);
+bool dhclient_is_bound(const struct dhclient *);
+bool dhclient_changed(struct dhclient *);
+
+/* State inspection.  NOTE(review): the unsigned int results are presumably in
+ * seconds -- confirm against dhcp-client.c. */
+const char *dhclient_get_state(const struct dhclient *);
+unsigned int dhclient_get_state_elapsed(const struct dhclient *);
+unsigned int dhclient_get_lease_remaining(const struct dhclient *);
+
+/* Accessors for the configuration held by the client. */
+uint32_t dhclient_get_ip(const struct dhclient *);
+uint32_t dhclient_get_netmask(const struct dhclient *);
+uint32_t dhclient_get_router(const struct dhclient *);
+const struct dhcp_msg *dhclient_get_config(const struct dhclient *);
+
+/* Helpers that apply the client's configuration; both return an int status. */
+int dhclient_configure_netdev(struct dhclient *);
+int dhclient_update_resolv_conf(struct dhclient *);
+
+/* Run/wait pair for driving the client. */
+void dhclient_run(struct dhclient *);
+void dhclient_wait(struct dhclient *);
+
+#endif /* dhcp-client.h */
diff --git a/lib/dhcp.c b/lib/dhcp.c
new file mode 100644
index 000000000..b7a1f1f0a
--- /dev/null
+++ b/lib/dhcp.c
@@ -0,0 +1,825 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "dhcp.h"
+#include <arpa/inet.h>
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include "dynamic-string.h"
+#include "ofpbuf.h"
+
+#define THIS_MODULE VLM_dhcp
+#include "vlog.h"
+
+/* Information about a DHCP argument type.
+ *
+ * One of these exists for each 'enum dhcp_arg_type' value; see 'types'. */
+struct arg_type {
+ const char *name; /* Name, as spelled in DHCP_ARGS (e.g. "IP"). */
+ size_t size; /* Number of bytes per argument (0 for FIXED). */
+};
+
+/* Table of argument types, indexed by 'enum dhcp_arg_type'.  Generated from
+ * DHCP_ARGS so that the names and sizes cannot drift out of sync with the
+ * enum. */
+static struct arg_type types[] = {
+#define DHCP_ARG(NAME, SIZE) [DHCP_ARG_##NAME] = {#NAME, SIZE},
+ DHCP_ARGS
+#undef DHCP_ARG
+};
+
+/* Information about a DHCP option.
+ *
+ * 'min_args' and 'max_args' count arguments of type 'type', not bytes;
+ * validate_options() divides the option length by the argument size before
+ * comparing against them. */
+struct option_class {
+ const char *name; /* Name. */
+ enum dhcp_arg_type type; /* Argument type. */
+ size_t min_args; /* Minimum number of arguments. */
+ size_t max_args; /* Maximum number of arguments. */
+};
+
+/* Returns the class describing DHCP option 'code', which must be at least 0
+ * and less than DHCP_N_OPTIONS.
+ *
+ * The table is built on the first call.  Options listed in DHCP_OPTS get
+ * their declared name, type, and argument limits; every other code gets a
+ * generated "option-<code>" name and is treated as any number of UINT8
+ * arguments. */
+static const struct option_class *
+get_option_class(int code)
+{
+    static struct option_class classes[DHCP_N_OPTIONS];
+    static bool init = false;
+
+    if (!init) {
+        struct option_class *class;
+
+        init = true;
+#define DHCP_OPT(NAME, CODE, TYPE, MIN, MAX)    \
+        classes[CODE].name = #NAME;             \
+        classes[CODE].type = DHCP_ARG_##TYPE;   \
+        classes[CODE].min_args = MIN;           \
+        classes[CODE].max_args = MAX;
+        DHCP_OPTS
+#undef DHCP_OPT
+
+        /* Fill in defaults for the options that DHCP_OPTS does not name. */
+        for (class = classes; class < &classes[DHCP_N_OPTIONS]; class++) {
+            if (class->name) {
+                continue;
+            }
+            class->name = xasprintf("option-%d", (int) (class - classes));
+            class->type = DHCP_ARG_UINT8;
+            class->min_args = 0;
+            class->max_args = SIZE_MAX;
+        }
+    }
+    assert(code >= 0 && code < DHCP_N_OPTIONS);
+    return &classes[code];
+}
+
+/* Rate limiter shared by every VLOG_DBG_RL() call in this file.
+ *
+ * A single (bad) DHCP message can in theory dump out many, many log messages,
+ * especially at high logging levels, so the burst size is set quite high
+ * here to avoid missing useful information.
+ *
+ * NOTE(review): this object has external linkage and a very generic name.
+ * Unless another translation unit refers to it, it should probably be
+ * 'static' -- confirm before changing. */
+struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 600);
+
+static void copy_data(struct dhcp_msg *);
+
+/* Returns the name of DHCP message type 'type' as spelled in DHCP_MSGS, or a
+ * placeholder string if 'type' is not one of the listed message types.  The
+ * returned string is a literal and must not be freed. */
+const char *
+dhcp_type_name(enum dhcp_msg_type type)
+{
+    const char *name = "<<unknown DHCP message type>>";
+
+    switch (type) {
+#define DHCP_MSG(NAME, VALUE) case NAME: name = #NAME; break;
+        DHCP_MSGS
+#undef DHCP_MSG
+    }
+    return name;
+}
+
+/* Initializes 'msg' as an empty DHCP message by zeroing every byte of it.
+ * The message should be freed with dhcp_msg_uninit() when it is no longer
+ * needed. */
+void
+dhcp_msg_init(struct dhcp_msg *msg)
+{
+    memset(msg, 0, sizeof(struct dhcp_msg));
+}
+
+/* Frees the option data owned by 'msg'.  Does nothing if 'msg' is null.  The
+ * caller is responsible for freeing 'msg' itself, if necessary. */
+void
+dhcp_msg_uninit(struct dhcp_msg *msg)
+{
+    if (!msg) {
+        return;
+    }
+    free(msg->data);
+}
+
+/* Initializes 'dst' as a deep copy of 'src'.  'dst' (and 'src') should be
+ * freed with dhcp_msg_uninit() when it is no longer needed. */
+void
+dhcp_msg_copy(struct dhcp_msg *dst, const struct dhcp_msg *src)
+{
+    /* Start from a shallow copy, whose options still point into 'src''s
+     * buffer... */
+    *dst = *src;
+
+    /* ...then give 'dst' its own buffer and let copy_data() move each option
+     * into it, updating the option pointers as it goes. */
+    dst->data = xmalloc(src->data_used);
+    dst->data_allocated = src->data_used;
+    dst->data_used = 0;
+    copy_data(dst);
+}
+
+/* Ensures that 'msg' has a data buffer with room for at least 'n' more
+ * bytes beyond the 'data_used' bytes already in it.
+ *
+ * A buffer is allocated even when 'n' is 0 and 'msg' has none yet.  Without
+ * that, storing an empty option in a fresh message would leave the option's
+ * 'data' pointer null, which means "option absent" rather than "option
+ * present but empty".
+ *
+ * When the buffer is replaced, the existing options are compacted into the
+ * new one by copy_data() and the old buffer is freed, so any pointer into the
+ * old buffer becomes invalid. */
+static void
+prealloc_data(struct dhcp_msg *msg, size_t n)
+{
+    size_t needed = msg->data_used + n;
+
+    if (!msg->data || needed > msg->data_allocated) {
+        uint8_t *old_data = msg->data;
+
+        /* Double the requirement to amortize the cost of later growth. */
+        msg->data_allocated = MAX(needed * 2, 64);
+        msg->data = xmalloc(msg->data_allocated);
+        if (old_data) {
+            copy_data(msg);
+            free(old_data);
+        }
+    }
+}
+
+/* Copies the 'n' bytes at 'data' to the end of the used part of 'msg''s
+ * buffer and returns a pointer to the copy.  The caller must already have
+ * made sure that the buffer has room (see prealloc_data()). */
+static void *
+append_data(struct dhcp_msg *msg, const void *data, size_t n)
+{
+    uint8_t *dst = msg->data + msg->data_used;
+
+    msg->data_used += n;
+    return memcpy(dst, data, n);
+}
+
+/* Repacks every option present in 'msg' into 'msg->data', starting from
+ * offset 0, and updates each option's 'data' pointer to its new location.
+ *
+ * The options' current 'data' pointers are the copy sources, so they may
+ * point into a different (old or foreign) buffer.  'msg->data' must be large
+ * enough to hold all of the options. */
+static void
+copy_data(struct dhcp_msg *msg)
+{
+    struct dhcp_option *opt;
+
+    msg->data_used = 0;
+    for (opt = msg->options; opt < &msg->options[DHCP_N_OPTIONS]; opt++) {
+        if (!opt->data) {
+            continue;
+        }
+        assert(msg->data_used + opt->n <= msg->data_allocated);
+        opt->data = append_data(msg, opt->data, opt->n);
+    }
+}
+
+/* Appends the 'n' bytes in 'data' to the DHCP option in 'msg' represented by
+ * 'code', which must be at least 0 and less than DHCP_N_OPTIONS.  Requests to
+ * add to the PAD or END pseudo-options are silently ignored.
+ *
+ * 'data' must not point into 'msg''s own buffer, because that buffer may be
+ * reallocated here. */
+void
+dhcp_msg_put(struct dhcp_msg *msg, int code,
+             const void *data, size_t n)
+{
+    struct dhcp_option *opt;
+
+    if (code == DHCP_CODE_PAD || code == DHCP_CODE_END) {
+        return;
+    }
+
+    opt = &msg->options[code];
+
+    /* Reserve room for the new bytes plus, in case the option has to be
+     * moved, a second copy of its existing bytes. */
+    prealloc_data(msg, n + opt->n);
+    if (!opt->n) {
+        opt->data = append_data(msg, data, n);
+    } else {
+        /* An option's bytes must stay contiguous.  Unless the option is
+         * already the last thing in the buffer, move it to the end first. */
+        bool at_end = &msg->data[msg->data_used - opt->n] == opt->data;
+
+        if (!at_end) {
+            opt->data = append_data(msg, opt->data, opt->n);
+        }
+        append_data(msg, data, n);
+    }
+    opt->n += n;
+}
+
+/* Appends the boolean value 'b_', as an octet with value 0 (false) or 1
+ * (true), to the DHCP option in 'msg' represented by 'code' (which must be at
+ * least 0 and less than DHCP_N_OPTIONS). */
+void
+dhcp_msg_put_bool(struct dhcp_msg *msg, int code, bool b_)
+{
+    char octet = b_ ? 1 : 0;
+
+    dhcp_msg_put(msg, code, &octet, 1);
+}
+
+/* Appends the duration 'secs_', in seconds, as a 32-bit number in network
+ * byte order, to the DHCP option in 'msg' represented by 'code' (which must
+ * be at least 0 and less than DHCP_N_OPTIONS). */
+void
+dhcp_msg_put_secs(struct dhcp_msg *msg, int code, uint32_t secs_)
+{
+    uint32_t be_secs = htonl(secs_);
+
+    dhcp_msg_put(msg, code, &be_secs, sizeof be_secs);
+}
+
+/* Appends the IP address 'ip', which the caller supplies already in network
+ * byte order, to the DHCP option in 'msg' represented by 'code' (which must
+ * be at least 0 and less than DHCP_N_OPTIONS).  The four bytes are stored
+ * as-is. */
+void
+dhcp_msg_put_ip(struct dhcp_msg *msg, int code, uint32_t ip)
+{
+    dhcp_msg_put(msg, code, &ip, sizeof(uint32_t));
+}
+
+/* Appends the ASCII string 'string', without its null terminator, to the
+ * DHCP option in 'msg' represented by 'code' (which must be at least 0 and
+ * less than DHCP_N_OPTIONS). */
+void
+dhcp_msg_put_string(struct dhcp_msg *msg, int code, const char *string)
+{
+    size_t length = strlen(string);
+
+    dhcp_msg_put(msg, code, string, length);
+}
+
+/* Appends the octet 'x' to the DHCP option in 'msg' represented by 'code'
+ * (which must be at least 0 and less than DHCP_N_OPTIONS). */
+void
+dhcp_msg_put_uint8(struct dhcp_msg *msg, int code, uint8_t x)
+{
+    dhcp_msg_put(msg, code, &x, sizeof(uint8_t));
+}
+
+/* Appends the 'n' octets in 'data' to the DHCP option in 'msg' represented by
+ * 'code' (which must be at least 0 and less than DHCP_N_OPTIONS). */
+void
+dhcp_msg_put_uint8_array(struct dhcp_msg *msg, int code,
+                         const uint8_t data[], size_t n)
+{
+    const void *octets = data;
+
+    dhcp_msg_put(msg, code, octets, n);
+}
+
+/* Appends the 16-bit value 'x_', converted to network byte order, to the
+ * DHCP option in 'msg' represented by 'code' (which must be at least 0 and
+ * less than DHCP_N_OPTIONS). */
+void
+dhcp_msg_put_uint16(struct dhcp_msg *msg, int code, uint16_t x_)
+{
+    uint16_t be_x = htons(x_);
+
+    dhcp_msg_put(msg, code, &be_x, sizeof be_x);
+}
+
+
+/* Appends the 'n' 16-bit values in 'data', each converted to network byte
+ * order, to the DHCP option in 'msg' represented by 'code' (which must be at
+ * least 0 and less than DHCP_N_OPTIONS). */
+void
+dhcp_msg_put_uint16_array(struct dhcp_msg *msg, int code,
+                          const uint16_t data[], size_t n)
+{
+    const uint16_t *end = data + n;
+    const uint16_t *p;
+
+    for (p = data; p < end; p++) {
+        dhcp_msg_put_uint16(msg, code, *p);
+    }
+}
+
+/* Returns a pointer to the 'size' bytes starting at byte offset 'offset' in
+ * the DHCP option in 'msg' represented by 'code' (which must be at least 0
+ * and less than DHCP_N_OPTIONS).  If the option has fewer than
+ * 'offset + size' bytes, returns a null pointer.
+ *
+ * The returned pointer refers to packed option data and so is not
+ * necessarily aligned for any type wider than a byte. */
+const void *
+dhcp_msg_get(const struct dhcp_msg *msg, int code,
+             size_t offset, size_t size)
+{
+    const struct dhcp_option *opt = &msg->options[code];
+
+    /* Compare by subtraction rather than computing 'offset + size', which
+     * could wrap around for huge arguments and pass the check. */
+    if (offset > opt->n || size > opt->n - offset) {
+        return NULL;
+    }
+    return (const char *) opt->data + offset;
+}
+
+/* Stores in '*out' the boolean value at byte offset 'offset' in the DHCP
+ * option in 'msg' represented by 'code' (which must be at least 0 and less
+ * than DHCP_N_OPTIONS).  Any nonzero octet counts as true.  Returns true if
+ * successful, false if the option has fewer than 'offset + 1' bytes, in
+ * which case '*out' is left untouched. */
+bool
+dhcp_msg_get_bool(const struct dhcp_msg *msg, int code, size_t offset,
+                  bool *out)
+{
+    const uint8_t *octet = dhcp_msg_get(msg, code, offset, sizeof *octet);
+
+    if (!octet) {
+        return false;
+    }
+    *out = *octet != 0;
+    return true;
+}
+
+/* Stores in '*out' the 32-bit count of seconds at offset 'offset' (in
+ * 4-byte increments) in the DHCP option in 'msg' represented by 'code'
+ * (which must be at least 0 and less than DHCP_N_OPTIONS).  The value is
+ * converted to native byte order.  Returns true if successful, false if the
+ * option has fewer than '4 * (offset + 1)' bytes, in which case '*out' is
+ * left untouched. */
+bool
+dhcp_msg_get_secs(const struct dhcp_msg *msg, int code, size_t offset,
+                  uint32_t *out)
+{
+    uint32_t be_secs;
+    const void *p = dhcp_msg_get(msg, code, offset * sizeof be_secs,
+                                 sizeof be_secs);
+
+    if (!p) {
+        return false;
+    }
+
+    /* Option data is packed at arbitrary byte offsets, so 'p' may be
+     * unaligned.  Copy the value out instead of dereferencing a uint32_t
+     * pointer. */
+    memcpy(&be_secs, p, sizeof be_secs);
+    *out = ntohl(be_secs);
+    return true;
+}
+
+/* Stores in '*out' the IP address at offset 'offset' (in 4-byte increments)
+ * in the DHCP option in 'msg' represented by 'code' (which must be at least 0
+ * and less than DHCP_N_OPTIONS).  The IP address is stored in network byte
+ * order.  Returns true if successful, false if the option has fewer than
+ * '4 * (offset + 1)' bytes, in which case '*out' is left untouched. */
+bool
+dhcp_msg_get_ip(const struct dhcp_msg *msg, int code,
+                size_t offset, uint32_t *out)
+{
+    const void *p = dhcp_msg_get(msg, code, offset * sizeof *out,
+                                 sizeof *out);
+
+    if (!p) {
+        return false;
+    }
+
+    /* Option data is packed at arbitrary byte offsets, so 'p' may be
+     * unaligned.  Copy the value out instead of dereferencing a uint32_t
+     * pointer. */
+    memcpy(out, p, sizeof *out);
+    return true;
+}
+
+/* Returns a null-terminated copy of the contents of the DHCP option in 'msg'
+ * represented by 'code' (which must be at least 0 and less than
+ * DHCP_N_OPTIONS).  The caller is responsible for freeing the string with
+ * free().
+ *
+ * If 'msg' has no option represented by 'code', returns a null pointer.  (If
+ * the option was specified but had no content, then an empty string is
+ * returned, not a null pointer.) */
+char *
+dhcp_msg_get_string(const struct dhcp_msg *msg, int code)
+{
+    const struct dhcp_option *opt = &msg->options[code];
+
+    if (!opt->data) {
+        return NULL;
+    }
+    return xmemdup0(opt->data, opt->n);
+}
+
+/* Stores in '*out' the octet at byte offset 'offset' in the DHCP option in
+ * 'msg' represented by 'code' (which must be at least 0 and less than
+ * DHCP_N_OPTIONS).  Returns true if successful, false if the option has fewer
+ * than 'offset + 1' bytes, in which case '*out' is left untouched. */
+bool
+dhcp_msg_get_uint8(const struct dhcp_msg *msg, int code,
+                   size_t offset, uint8_t *out)
+{
+    const uint8_t *octet = dhcp_msg_get(msg, code, offset, sizeof *octet);
+
+    if (!octet) {
+        return false;
+    }
+    *out = *octet;
+    return true;
+}
+
+/* Stores in '*out' the 16-bit value at offset 'offset' (in 2-byte units) in
+ * the DHCP option in 'msg' represented by 'code' (which must be at least 0
+ * and less than DHCP_N_OPTIONS).  The value is converted to native byte
+ * order.  Returns true if successful, false if the option has fewer than
+ * '2 * (offset + 1)' bytes, in which case '*out' is left untouched. */
+bool
+dhcp_msg_get_uint16(const struct dhcp_msg *msg, int code,
+                    size_t offset, uint16_t *out)
+{
+    uint16_t be_value;
+    const void *p = dhcp_msg_get(msg, code, offset * sizeof be_value,
+                                 sizeof be_value);
+
+    if (!p) {
+        return false;
+    }
+
+    /* Option data is packed at arbitrary byte offsets, so 'p' may be
+     * unaligned.  Copy the value out instead of dereferencing a uint16_t
+     * pointer. */
+    memcpy(&be_value, p, sizeof be_value);
+    *out = ntohs(be_value);
+    return true;
+}
+
+/* Appends a string representing 'duration' seconds to 'ds', in the form
+ * "<d>d<h>h<m>min<s>s" with zero-valued components omitted (for example,
+ * 3661 becomes "1h1min1s").  A duration of zero is written as "0s". */
+static void
+put_duration(struct ds *ds, unsigned int duration)
+{
+    unsigned int days = duration / 86400;
+    unsigned int hours = duration % 86400 / 3600;
+    unsigned int minutes = duration % 3600 / 60;
+    unsigned int seconds = duration % 60;
+
+    if (!duration) {
+        ds_put_cstr(ds, "0s");
+        return;
+    }
+
+    if (days) {
+        ds_put_format(ds, "%ud", days);
+    }
+    if (hours) {
+        ds_put_format(ds, "%uh", hours);
+    }
+    if (minutes) {
+        ds_put_format(ds, "%umin", minutes);
+    }
+    if (seconds) {
+        ds_put_format(ds, "%us", seconds);
+    }
+}
+
+/* Appends a string representation of 'opt', which has the given 'code', to
+ * 'ds', and returns the accumulated contents of 'ds'.
+ *
+ * The output is the option name in lowercase with '_' replaced by '-', then
+ * '=', then the value:
+ *
+ *   - "null" if the option is absent, "empty" if present with no content.
+ *   - A quoted printable string for STRING options.
+ *   - Otherwise the arguments formatted according to the option's type,
+ *     separated by ":" (UINT8) or ", " (all other types).  Any trailing bytes
+ *     that do not make up a whole argument are listed as "**leftovers:...**".
+ *
+ * Option data is packed at arbitrary byte offsets within the message buffer,
+ * so multi-byte values are copied out with memcpy() before use rather than
+ * read through possibly unaligned uint16_t/uint32_t pointers. */
+const char *
+dhcp_option_to_string(const struct dhcp_option *opt, int code, struct ds *ds)
+{
+    const struct option_class *class = get_option_class(code);
+    const struct arg_type *type = &types[class->type];
+    size_t offset;
+    const char *cp;
+
+    for (cp = class->name; *cp; cp++) {
+        unsigned char c = *cp;
+        ds_put_char(ds, c == '_' ? '-' : tolower(c));
+    }
+    ds_put_char(ds, '=');
+
+    if (!opt->data || !opt->n) {
+        ds_put_cstr(ds, opt->data ? "empty" : "null");
+        return ds_cstr(ds);
+    }
+
+    if (class->type == DHCP_ARG_STRING) {
+        ds_put_char(ds, '"');
+        ds_put_printable(ds, opt->data, opt->n);
+        ds_put_char(ds, '"');
+        return ds_cstr(ds);
+    }
+
+    for (offset = 0; offset + type->size <= opt->n; offset += type->size) {
+        const uint8_t *p = (const uint8_t *) opt->data + offset;
+        /* Passed only to IP_ARGS(), exactly as before; never dereferenced
+         * here as a 32-bit value. */
+        const uint32_t *ip = (const void *) p;
+        uint16_t u16;
+        uint32_t u32;
+
+        if (offset) {
+            ds_put_cstr(ds, class->type == DHCP_ARG_UINT8 ? ":" : ", ");
+        }
+        switch (class->type) {
+        case DHCP_ARG_FIXED:
+            /* FIXED options (PAD and END) never carry data. */
+            NOT_REACHED();
+        case DHCP_ARG_IP:
+            ds_put_format(ds, IP_FMT, IP_ARGS(ip));
+            break;
+        case DHCP_ARG_UINT8:
+            ds_put_format(ds, "%02"PRIx8, *p);
+            break;
+        case DHCP_ARG_UINT16:
+            memcpy(&u16, p, sizeof u16);
+            ds_put_format(ds, "%"PRIu16, ntohs(u16));
+            break;
+        case DHCP_ARG_UINT32:
+            memcpy(&u32, p, sizeof u32);
+            ds_put_format(ds, "%"PRIu32, ntohl(u32));
+            break;
+        case DHCP_ARG_SECS:
+            memcpy(&u32, p, sizeof u32);
+            put_duration(ds, ntohl(u32));
+            break;
+        case DHCP_ARG_STRING:
+            /* Strings were handled in full above. */
+            NOT_REACHED();
+        case DHCP_ARG_BOOLEAN:
+            if (*p == 0) {
+                ds_put_cstr(ds, "false");
+            } else if (*p == 1) {
+                ds_put_cstr(ds, "true");
+            } else {
+                /* Not a valid boolean: show the raw value, highlighted. */
+                ds_put_format(ds, "**%"PRIu8"**", *p);
+            }
+            break;
+        }
+    }
+    if (offset != opt->n) {
+        if (offset) {
+            ds_put_cstr(ds, ", ");
+        }
+        ds_put_cstr(ds, "**leftovers:");
+        for (; offset < opt->n; offset++) {
+            const uint8_t *p = (const uint8_t *) opt->data + offset;
+            ds_put_format(ds, " %"PRIu8, *p);
+        }
+        ds_put_cstr(ds, "**");
+    }
+    return ds_cstr(ds);
+}
+
+/* Returns true if 'a' and 'b' have the same content, false otherwise.  Two
+ * options are equal if both are absent, or if both are present with the same
+ * length and bytes.  An absent option never equals a present one, even an
+ * empty one. */
+bool
+dhcp_option_equals(const struct dhcp_option *a, const struct dhcp_option *b)
+{
+    if ((a->data != NULL) != (b->data != NULL) || a->n != b->n) {
+        return false;
+    }
+
+    /* Skip memcmp() when there is nothing to compare.  Passing it null
+     * pointers is undefined behavior even with a length of 0. */
+    return !a->data || !a->n || !memcmp(a->data, b->data, a->n);
+}
+
+/* Replaces the contents of 'ds' by a string representation of 'msg' and
+ * returns that string.  If 'multiline' is false the fields are separated by
+ * spaces on a single line; otherwise each field is on its own line and the
+ * result ends with a new-line.
+ *
+ * The flags and the four address fields are printed only when nonzero.
+ * Options are printed only when present. */
+const char *
+dhcp_msg_to_string(const struct dhcp_msg *msg, bool multiline, struct ds *ds)
+{
+    const char separator = multiline ? '\n' : ' ';
+    const char *op_name;
+    int code;
+
+    if (msg->op == DHCP_BOOTREQUEST) {
+        op_name = "request";
+    } else if (msg->op == DHCP_BOOTREPLY) {
+        op_name = "reply";
+    } else {
+        op_name = "error";
+    }
+
+    ds_clear(ds);
+    ds_put_format(ds, "op=%s", op_name);
+    ds_put_format(ds, "%ctype=%s", separator, dhcp_type_name(msg->type));
+    ds_put_format(ds, "%cxid=0x%08"PRIx32, separator, msg->xid);
+    ds_put_format(ds, "%csecs=", separator);
+    put_duration(ds, msg->secs);
+
+    if (msg->flags) {
+        ds_put_format(ds, "%cflags=", separator);
+        if (msg->flags & DHCP_FLAGS_BROADCAST) {
+            ds_put_cstr(ds, "[BROADCAST]");
+        }
+        if (msg->flags & DHCP_FLAGS_MBZ) {
+            /* Bits that must be zero but are not: show them in hex. */
+            ds_put_format(ds, "[0x%04"PRIx16"]", msg->flags & DHCP_FLAGS_MBZ);
+        }
+    }
+
+    if (msg->ciaddr) {
+        ds_put_format(ds, "%cciaddr="IP_FMT, separator, IP_ARGS(&msg->ciaddr));
+    }
+    if (msg->yiaddr) {
+        ds_put_format(ds, "%cyiaddr="IP_FMT, separator, IP_ARGS(&msg->yiaddr));
+    }
+    if (msg->siaddr) {
+        ds_put_format(ds, "%csiaddr="IP_FMT, separator, IP_ARGS(&msg->siaddr));
+    }
+    if (msg->giaddr) {
+        ds_put_format(ds, "%cgiaddr="IP_FMT, separator, IP_ARGS(&msg->giaddr));
+    }
+    ds_put_format(ds, "%cchaddr="ETH_ADDR_FMT,
+                  separator, ETH_ADDR_ARGS(msg->chaddr));
+
+    for (code = 0; code < DHCP_N_OPTIONS; code++) {
+        const struct dhcp_option *opt = &msg->options[code];
+
+        if (!opt->data) {
+            continue;
+        }
+        ds_put_char(ds, separator);
+        dhcp_option_to_string(opt, code, ds);
+    }
+
+    if (multiline) {
+        ds_put_char(ds, separator);
+    }
+    return ds_cstr(ds);
+}
+
+/* Parses the 'size' bytes at 'data' as a sequence of DHCP options and adds
+ * each one to 'msg' under code '<option code> + option_offset'.  'name'
+ * identifies the region being parsed in log messages.
+ *
+ * Parsing stops at the END option, at the end of the region, or at the first
+ * truncated option.  PAD options are skipped. */
+static void
+parse_options(struct dhcp_msg *msg, const char *name, void *data, size_t size,
+              int option_offset)
+{
+    struct ofpbuf b;
+    uint8_t *code;
+
+    b.data = data;
+    b.size = size;
+    while ((code = ofpbuf_try_pull(&b, 1)) != NULL
+           && *code != DHCP_CODE_END) {
+        uint8_t *len;
+        void *payload;
+
+        if (*code == DHCP_CODE_PAD) {
+            /* PAD is a bare code byte with no length or payload. */
+            continue;
+        }
+
+        len = ofpbuf_try_pull(&b, 1);
+        if (!len) {
+            VLOG_DBG_RL(&rl, "reached end of %s expecting length byte", name);
+            break;
+        }
+
+        payload = ofpbuf_try_pull(&b, *len);
+        if (!payload) {
+            VLOG_DBG_RL(&rl, "expected %"PRIu8" bytes of option-%"PRIu8" "
+                        "payload with only %zu bytes of %s left",
+                        *len, *code, b.size, name);
+            break;
+        }
+        dhcp_msg_put(msg, *code + option_offset, payload, *len);
+    }
+}
+
+/* Checks each option present in 'msg' against its class.  The option's
+ * length must be a whole number of arguments of the class's type, and the
+ * number of arguments must lie within the class's limits.  Options that fail
+ * either check are logged and then removed from 'msg'. */
+static void
+validate_options(struct dhcp_msg *msg)
+{
+    int code;
+
+    for (code = 0; code < DHCP_N_OPTIONS; code++) {
+        struct dhcp_option *opt = &msg->options[code];
+        const struct option_class *class = get_option_class(code);
+        struct arg_type *type = &types[class->type];
+        size_t n_elems, remainder;
+        bool valid = true;
+        struct ds ds;
+
+        if (!opt->data) {
+            continue;
+        }
+
+        n_elems = opt->n / type->size;
+        remainder = opt->n % type->size;
+        if (remainder) {
+            VLOG_DBG_RL(&rl, "%s option has %zu %zu-byte %s arguments "
+                        "with %zu bytes left over",
+                        class->name, n_elems, type->size,
+                        type->name, remainder);
+            valid = false;
+        }
+        if (n_elems < class->min_args || n_elems > class->max_args) {
+            VLOG_DBG_RL(&rl, "%s option has %zu %zu-byte %s arguments but "
+                        "between %zu and %zu are required",
+                        class->name, n_elems, type->size, type->name,
+                        class->min_args, class->max_args);
+            valid = false;
+        }
+        if (valid) {
+            continue;
+        }
+
+        ds_init(&ds);
+        VLOG_DBG_RL(&rl, "%s option contains: %s", class->name,
+                    dhcp_option_to_string(opt, code, &ds));
+        ds_destroy(&ds);
+
+        /* Drop the option.  Its bytes stay in the message buffer but are no
+         * longer referenced. */
+        opt->n = 0;
+        opt->data = NULL;
+    }
+}
+
+/* Attempts to parse 'b_' as a DHCP message.  If successful, initializes
+ * '*msg' to the parsed message and returns 0; the caller must eventually free
+ * it with dhcp_msg_uninit().  Otherwise, returns a positive errno value
+ * (EPROTO); '*msg' is indeterminate, owns no memory, and must not be passed
+ * to dhcp_msg_uninit().
+ *
+ * Options are taken from the options field and, if the "option overload"
+ * option says so, from the 'file' and 'sname' header fields.  If the vendor
+ * class is "OpenFlow", the vendor-specific option is additionally parsed as
+ * nested options stored at codes offset by DHCP_VENDOR_OFS. */
+int
+dhcp_parse(struct dhcp_msg *msg, const struct ofpbuf *b_)
+{
+    struct ofpbuf b = *b_;
+    struct dhcp_header *dhcp;
+    uint32_t *cookie;
+    uint8_t type;
+    char *vendor_class;
+
+    /* Initialize up front so that the error path can always format and free
+     * 'msg', even if a header check fails before any field is filled in. */
+    dhcp_msg_init(msg);
+
+    dhcp = ofpbuf_try_pull(&b, sizeof *dhcp);
+    if (!dhcp) {
+        VLOG_DBG_RL(&rl, "buffer too small for DHCP header (%zu bytes)",
+                    b.size);
+        goto error;
+    }
+
+    if (dhcp->op != DHCP_BOOTREPLY && dhcp->op != DHCP_BOOTREQUEST) {
+        VLOG_DBG_RL(&rl, "invalid DHCP op (%"PRIu8")", dhcp->op);
+        goto error;
+    }
+    if (dhcp->htype != ARP_HRD_ETHERNET) {
+        VLOG_DBG_RL(&rl, "invalid DHCP htype (%"PRIu8")", dhcp->htype);
+        goto error;
+    }
+    if (dhcp->hlen != ETH_ADDR_LEN) {
+        VLOG_DBG_RL(&rl, "invalid DHCP hlen (%"PRIu8")", dhcp->hlen);
+        goto error;
+    }
+
+    msg->op = dhcp->op;
+    msg->xid = ntohl(dhcp->xid);
+    msg->secs = ntohs(dhcp->secs);
+    msg->flags = ntohs(dhcp->flags);
+    msg->ciaddr = dhcp->ciaddr;
+    msg->yiaddr = dhcp->yiaddr;
+    msg->siaddr = dhcp->siaddr;
+    msg->giaddr = dhcp->giaddr;
+    memcpy(msg->chaddr, dhcp->chaddr, ETH_ADDR_LEN);
+
+    /* The cookie is 4 bytes.  Use the size of the pointed-to object, not of
+     * the pointer, which is 8 bytes on LP64 systems and would swallow the
+     * first 4 bytes of options. */
+    cookie = ofpbuf_try_pull(&b, sizeof *cookie);
+    if (cookie) {
+        if (ntohl(*cookie) == DHCP_OPTS_COOKIE) {
+            uint8_t overload;
+
+            parse_options(msg, "options", b.data, b.size, 0);
+            if (dhcp_msg_get_uint8(msg, DHCP_CODE_OPTION_OVERLOAD,
+                                   0, &overload)) {
+                if (overload & 1) {
+                    parse_options(msg, "file", dhcp->file, sizeof dhcp->file,
+                                  0);
+                }
+                if (overload & 2) {
+                    parse_options(msg, "sname",
+                                  dhcp->sname, sizeof dhcp->sname, 0);
+                }
+            }
+        } else {
+            VLOG_DBG_RL(&rl, "bad DHCP options cookie: %08"PRIx32,
+                        ntohl(*cookie));
+        }
+    } else {
+        VLOG_DBG_RL(&rl, "DHCP packet has no options");
+    }
+
+    vendor_class = dhcp_msg_get_string(msg, DHCP_CODE_VENDOR_CLASS);
+    if (vendor_class && !strcmp(vendor_class, "OpenFlow")) {
+        const struct dhcp_option *vnd
+            = &msg->options[DHCP_CODE_VENDOR_SPECIFIC];
+
+        if (vnd->data && vnd->n) {
+            /* Parse a private copy.  The option's bytes live in 'msg''s own
+             * buffer, which dhcp_msg_put() may reallocate and free while the
+             * nested options are being added. */
+            size_t vnd_size = vnd->n;
+            void *vnd_copy = xmalloc(vnd_size);
+
+            memcpy(vnd_copy, vnd->data, vnd_size);
+            parse_options(msg, "vendor-specific", vnd_copy, vnd_size,
+                          DHCP_VENDOR_OFS);
+            free(vnd_copy);
+        }
+    }
+    free(vendor_class);
+
+    validate_options(msg);
+    if (!dhcp_msg_get_uint8(msg, DHCP_CODE_DHCP_MSG_TYPE, 0, &type)) {
+        VLOG_DBG_RL(&rl, "missing DHCP message type");
+        goto error;
+    }
+    msg->type = type;
+    return 0;
+
+error:
+    if (VLOG_IS_DBG_ENABLED()) {
+        struct ds ds;
+
+        ds_init(&ds);
+        ds_put_hex_dump(&ds, b_->data, b_->size, 0, true);
+        VLOG_DBG_RL(&rl, "invalid DHCP message dump:\n%s", ds_cstr(&ds));
+
+        ds_clear(&ds);
+        dhcp_msg_to_string(msg, false, &ds);
+        VLOG_DBG_RL(&rl, "partially dissected DHCP message: %s", ds_cstr(&ds));
+
+        ds_destroy(&ds);
+    }
+
+    /* Free only after the dump above, which reads the option data. */
+    dhcp_msg_uninit(msg);
+    return EPROTO;
+}
+
+/* Appends to 'b' a single DHCP option with the given 'code' whose payload is
+ * the 'n' bytes at 'data'.  'n' must fit in the one-byte length field, that
+ * is, it must be less than 256. */
+static void
+put_option_chunk(struct ofpbuf *b, uint8_t code, void *data, size_t n)
+{
+    uint8_t code_and_len[2];
+
+    assert(n < 256);
+    code_and_len[0] = code;
+    code_and_len[1] = n;
+    ofpbuf_put(b, code_and_len, sizeof code_and_len);
+    ofpbuf_put(b, data, n);
+}
+
+/* Appends to 'b' the DHCP option 'code' with the 'n' bytes of content at
+ * 'data'.  Nothing is emitted if 'data' is null (option absent).  If 'data'
+ * is nonnull and 'n' is 0, an option with an empty payload is emitted.
+ * Content too long for one option is split across several options with the
+ * same code. */
+static void
+put_option(struct ofpbuf *b, uint8_t code, void *data, size_t n)
+{
+    uint8_t *p = data;
+
+    if (!data) {
+        return;
+    }
+    if (!n) {
+        /* Option should be present but carry no data. */
+        put_option_chunk(b, code, NULL, 0);
+        return;
+    }
+
+    /* Divide the data into chunks of 255 bytes or less.  Make intermediate
+     * chunks multiples of 8 bytes in case the recipient validates a chunk at
+     * a time instead of the concatenated value. */
+    do {
+        size_t chunk = n > 255 ? 248 : n;
+
+        put_option_chunk(b, code, p, chunk);
+        p += chunk;
+        n -= chunk;
+    } while (n);
+}
+
+/* Appends to 'b' the wire-format DHCP message represented by 'msg': the
+ * fixed header, the options cookie, the message type option (if 'msg->type'
+ * is nonzero), every standard option present in 'msg', the vendor options
+ * wrapped inside a single vendor-specific option, and finally the END
+ * option. */
+void
+dhcp_assemble(const struct dhcp_msg *msg, struct ofpbuf *b)
+{
+    const uint8_t end = DHCP_CODE_END;
+    uint32_t cookie = htonl(DHCP_OPTS_COOKIE);
+    struct dhcp_header header;
+    struct ofpbuf vendor;
+    int code;
+
+    /* Fixed header.  Fields not set here ('sname', 'file', and the unused
+     * tail of 'chaddr') stay zeroed. */
+    memset(&header, 0, sizeof header);
+    header.op = msg->op;
+    header.htype = ARP_HRD_ETHERNET;
+    header.hlen = ETH_ADDR_LEN;
+    header.hops = 0;
+    header.xid = htonl(msg->xid);
+    header.secs = htons(msg->secs);
+    header.flags = htons(msg->flags);
+    header.ciaddr = msg->ciaddr;
+    header.yiaddr = msg->yiaddr;
+    header.siaddr = msg->siaddr;
+    header.giaddr = msg->giaddr;
+    memcpy(header.chaddr, msg->chaddr, ETH_ADDR_LEN);
+    ofpbuf_put(b, &header, sizeof header);
+    ofpbuf_put(b, &cookie, sizeof cookie);
+
+    /* Put DHCP message type first.  (The ordering is not required but it
+     * seems polite.) */
+    if (msg->type) {
+        uint8_t type = msg->type;
+        put_option(b, DHCP_CODE_DHCP_MSG_TYPE, &type, 1);
+    }
+
+    /* Put the standard options. */
+    for (code = 0; code < DHCP_VENDOR_OFS; code++) {
+        const struct dhcp_option *opt = &msg->options[code];
+        put_option(b, code, opt->data, opt->n);
+    }
+
+    /* Collect the vendor options in a scratch buffer, then emit the whole
+     * buffer as the payload of the vendor-specific option. */
+    ofpbuf_init(&vendor, 0);
+    for (code = DHCP_VENDOR_OFS; code < DHCP_N_OPTIONS; code++) {
+        const struct dhcp_option *opt = &msg->options[code];
+        put_option(&vendor, code - DHCP_VENDOR_OFS, opt->data, opt->n);
+    }
+    if (vendor.size) {
+        put_option(b, DHCP_CODE_VENDOR_SPECIFIC, vendor.data, vendor.size);
+    }
+    ofpbuf_uninit(&vendor);
+
+    /* Put end-of-options option. */
+    ofpbuf_put(b, &end, sizeof end);
+}
+
+
diff --git a/lib/dhcp.h b/lib/dhcp.h
new file mode 100644
index 000000000..eaeb6120f
--- /dev/null
+++ b/lib/dhcp.h
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef DHCP_H
+#define DHCP_H 1
+
+#include <stdint.h>
+#include "packets.h"
+#include "util.h"
+
+struct ds;
+struct ofpbuf;
+
+/* Values for 'op' field.  dhcp_parse() rejects any other value. */
+#define DHCP_BOOTREQUEST 1 /* Message sent by DHCP client. */
+#define DHCP_BOOTREPLY 2 /* Message sent by DHCP server. */
+
+/* Bits in 'flags' field (in host byte order). */
+#define DHCP_FLAGS_BROADCAST 0x8000 /* Server must broadcast all replies. */
+#define DHCP_FLAGS_MBZ 0x7fff /* Must be zero. */
+
+/* First four bytes of 'options' field, in host byte order.  The value on the
+ * wire is in network byte order. */
+#define DHCP_OPTS_COOKIE 0x63825363
+
+/* Size in bytes of the fixed part of a DHCP message, which is laid out as
+ * struct dhcp_header.  The BUILD_ASSERT_DECL below checks that the structure
+ * really has this size.
+ *
+ * Multi-byte integer fields are in network byte order on the wire. */
+#define DHCP_HEADER_LEN 236
+struct dhcp_header {
+ uint8_t op; /* DHCP_BOOTREQUEST or DHCP_BOOTREPLY. */
+ uint8_t htype; /* ARP_HRD_ETHERNET (typically). */
+ uint8_t hlen; /* ETH_ADDR_LEN (typically). */
+ uint8_t hops; /* Hop count; set to 0 by client. */
+ uint32_t xid; /* Transaction ID. */
+ uint16_t secs; /* Since client started address acquisition. */
+ uint16_t flags; /* DHCP_FLAGS_*. */
+ uint32_t ciaddr; /* Client IP, if it has a lease for one. */
+ uint32_t yiaddr; /* Client ("your") IP address. */
+ uint32_t siaddr; /* Next server IP address. */
+ uint32_t giaddr; /* Relay agent IP address. */
+ uint8_t chaddr[16]; /* Client hardware address. */
+ char sname[64]; /* Optional server host name. */
+ char file[128]; /* Boot file name. */
+ /* Followed by variable-length options field. */
+};
+BUILD_ASSERT_DECL(DHCP_HEADER_LEN == sizeof(struct dhcp_header));
+
+/* List of DHCP option argument types, as DHCP_ARG(NAME, SIZE) entries where
+ * SIZE is the number of bytes per argument.  To use the list, define
+ * DHCP_ARG to expand each entry as needed, expand DHCP_ARGS, and then
+ * undefine DHCP_ARG again. */
+#define DHCP_ARGS \
+ DHCP_ARG(FIXED, 0) /* Fixed-length option (PAD and END only). */ \
+ DHCP_ARG(IP, 4) /* IP addresses. */ \
+ DHCP_ARG(SECS, 4) /* 32-bit duration in seconds. */ \
+ DHCP_ARG(STRING, 1) /* NVT string, optionally null-terminated. */ \
+ DHCP_ARG(UINT8, 1) /* 8-bit unsigned integer. */ \
+ DHCP_ARG(UINT16, 2) /* 16-bit unsigned integer. */ \
+ DHCP_ARG(UINT32, 4) /* 32-bit unsigned integer. */ \
+ DHCP_ARG(BOOLEAN, 1) /* Boolean octet (0 or 1). */
+
+/* DHCP option argument types.
+ *
+ * One DHCP_ARG_<NAME> enumerator is generated for each DHCP_ARGS entry, in
+ * list order starting from 0. */
+enum dhcp_arg_type {
+#define DHCP_ARG(NAME, SIZE) DHCP_ARG_##NAME,
+ DHCP_ARGS
+#undef DHCP_ARG
+};
+
+/* List of DHCP message types, as DHCP_MSG(NAME, VALUE) entries where VALUE is
+ * the number carried in the DHCP message type option.  To use the list,
+ * define DHCP_MSG to expand each entry as needed, expand DHCP_MSGS, and then
+ * undefine DHCP_MSG again. */
+#define DHCP_MSGS \
+ DHCP_MSG(DHCPDISCOVER, 1) /* Client->server: What IPs are available? */ \
+ DHCP_MSG(DHCPOFFER, 2) /* Server->client: This IP is available. */ \
+ DHCP_MSG(DHCPREQUEST, 3) /* Client->server: I want that IP. */ \
+ DHCP_MSG(DHCPDECLINE, 4) /* Client->server: That IP is in use!. */ \
+ DHCP_MSG(DHCPACK, 5) /* Server->client: You can have that IP. */ \
+ DHCP_MSG(DHCPNAK, 6) /* Server->client: You can't have that IP. */ \
+ DHCP_MSG(DHCPRELEASE, 7) /* Client->server: I'm done with this IP. */ \
+ DHCP_MSG(DHCPINFORM, 8) /* Client->server: I'm using this IP. */
+
+/* Message type 8 was originally misspelled "DCHPINFORM".  Keep the old
+ * spelling available so that existing code continues to compile. */
+#define DCHPINFORM DHCPINFORM
+
+/* DHCP message type (this is the argument for the DHCP_MSG_TYPE option).
+ *
+ * The enumerators are generated from DHCP_MSGS, each with the value listed
+ * there.  dhcp_type_name() maps an enumerator back to its name. */
+enum dhcp_msg_type {
+#define DHCP_MSG(NAME, VALUE) NAME = VALUE,
+ DHCP_MSGS
+#undef DHCP_MSG
+};
+const char *dhcp_type_name(enum dhcp_msg_type);
+
+/* DHCP allows for 256 standardized options and 256 vendor-specific options.
+ * We put them in a single array, with the standard options at the
+ * beginning.  A vendor-specific option with code C is therefore stored at
+ * index C + DHCP_VENDOR_OFS. */
+#define DHCP_N_OPTIONS 512
+#define DHCP_VENDOR_OFS 256
+
+/* DHCP options.
+ *
+ * Each entry is DHCP_OPT(NAME, CODE, TYPE, MIN, MAX) or, for vendor-specific
+ * options, DHCP_VNDOPT with the same arguments.  TYPE is one of the DHCP_ARGS
+ * names, and MIN and MAX bound the number of arguments of that type (not the
+ * number of bytes).  To use the list, define DHCP_OPT to expand each entry as
+ * needed, expand DHCP_OPTS, and then undefine DHCP_OPT again.
+ *
+ * The entries are not in strictly increasing code order. */
+#define DHCP_OPTS \
+ /* arg min max */ \
+ /* name code type args args */ \
+ DHCP_OPT(PAD, 0, FIXED, 0, 0) \
+ DHCP_OPT(END, 255, FIXED, 0, 0) \
+ DHCP_OPT(SUBNET_MASK, 1, IP, 1, 1) \
+ DHCP_OPT(TIME_OFFSET, 2, SECS, 1, 1) \
+ DHCP_OPT(ROUTER, 3, IP, 1, SIZE_MAX) \
+ /* Time Server Option is obsolete. */ \
+ /* Name Server Option is obsolete. */ \
+ DHCP_OPT(DNS_SERVER, 6, IP, 1, SIZE_MAX) \
+ /* Log Server Option is obsolete. */ \
+ /* Cookie Server Option is obsolete. */ \
+ DHCP_OPT(LPR_SERVER, 9, IP, 1, SIZE_MAX) \
+ /* Impress Server Option is obsolete. */ \
+ /* Resource Location Server Option is obsolete. */ \
+ DHCP_OPT(HOST_NAME, 12, STRING, 1, SIZE_MAX) \
+ DHCP_OPT(BOOT_FILE_SIZE, 13, UINT16, 1, 1) \
+ /* Merit Dump File option is obsolete. */ \
+ DHCP_OPT(DOMAIN_NAME, 15, STRING, 1, SIZE_MAX) \
+ /* Swap Server option is obsolete. */ \
+ DHCP_OPT(ROOT_PATH, 17, STRING, 1, SIZE_MAX) \
+ DHCP_OPT(EXTENSIONS_PATH, 18, STRING, 1, SIZE_MAX) \
+ DHCP_OPT(IP_FORWARDING, 19, BOOLEAN, 1, 1) \
+ DHCP_OPT(SOURCE_ROUTING, 20, BOOLEAN, 1, 1) \
+ DHCP_OPT(POLICY_FILTER, 21, IP, 2, SIZE_MAX) \
+ DHCP_OPT(MAX_DGRAM_REASSEMBLY, 22, UINT16, 1, 1) \
+ DHCP_OPT(IP_TTL, 23, UINT8, 1, 1) \
+ DHCP_OPT(PATH_MTU_TIMEOUT, 24, SECS, 1, 1) \
+ DHCP_OPT(PATH_MTU_PLATEAU, 25, UINT16, 2, SIZE_MAX) \
+ DHCP_OPT(MTU, 26, UINT16, 1, 1) \
+ DHCP_OPT(ALL_SUBNETS_ARE_LOCAL, 27, BOOLEAN, 1, 1) \
+ DHCP_OPT(BROADCAST_ADDRESS, 28, IP, 1, 1) \
+ DHCP_OPT(PERFORM_MASK_DISCOVERY, 29, BOOLEAN, 1, 1) \
+ DHCP_OPT(MASK_SUPPLIER, 30, BOOLEAN, 1, 1) \
+ DHCP_OPT(PERFORM_ROUTER_DISCOVERY, 31, BOOLEAN, 1, 1) \
+ DHCP_OPT(ROUTER_SOLICITATION, 32, IP, 1, 1) \
+ DHCP_OPT(STATIC_ROUTE, 33, IP, 2, SIZE_MAX) \
+ /* Trailer Encapsulation Option is obsolete. */ \
+ DHCP_OPT(ARP_CACHE_TIMEOUT, 35, SECS, 1, 1) \
+ DHCP_OPT(ETHERNET_ENCAPSULATION, 36, BOOLEAN, 1, 1) \
+ DHCP_OPT(TCP_TTL, 37, UINT8, 1, 1) \
+ DHCP_OPT(TCP_KEEPALIVE_INTERVAL, 38, SECS, 1, 1) \
+ DHCP_OPT(TCP_KEEPALIVE_GARBAGE, 39, BOOLEAN, 1, 1) \
+ DHCP_OPT(NIS_DOMAIN, 40, STRING, 1, SIZE_MAX) \
+ DHCP_OPT(NIS_SERVERS, 41, IP, 1, SIZE_MAX) \
+ DHCP_OPT(NTP_SERVERS, 42, IP, 1, SIZE_MAX) \
+ DHCP_OPT(VENDOR_SPECIFIC, 43, UINT8, 1, SIZE_MAX) \
+ DHCP_OPT(NETBIOS_NS, 44, IP, 1, SIZE_MAX) \
+ DHCP_OPT(NETBIOS_DDS, 45, IP, 1, SIZE_MAX) \
+ DHCP_OPT(NETBIOS_NODE_TYPE, 46, UINT8, 1, 1) \
+ DHCP_OPT(NETBIOS_SCOPE, 47, STRING, 1, SIZE_MAX) \
+ DHCP_OPT(X_FONT_SERVER, 48, IP, 1, SIZE_MAX) \
+ DHCP_OPT(XDM, 49, IP, 1, SIZE_MAX) \
+ DHCP_OPT(NISPLUS_DOMAIN, 64, STRING, 1, SIZE_MAX) \
+ DHCP_OPT(NISPLUS_SERVERS, 65, IP, 1, SIZE_MAX) \
+ DHCP_OPT(MOBILE_IP_HOME_AGENT, 68, IP, 0, SIZE_MAX) \
+ DHCP_OPT(SMTP_SERVER, 69, IP, 1, SIZE_MAX) \
+ DHCP_OPT(POP3_SERVER, 70, IP, 1, SIZE_MAX) \
+ DHCP_OPT(NNTP_SERVER, 71, IP, 1, SIZE_MAX) \
+ DHCP_OPT(WWW_SERVER, 72, IP, 1, SIZE_MAX) \
+ DHCP_OPT(FINGER_SERVER, 73, IP, 1, SIZE_MAX) \
+ DHCP_OPT(IRC_SERVER, 74, IP, 1, SIZE_MAX) \
+ /* StreetTalk Server Option is obsolete. */ \
+ /* StreetTalk Directory Assistance Server Option is obsolete. */ \
+ DHCP_OPT(REQUESTED_IP, 50, IP, 1, 1) \
+ DHCP_OPT(LEASE_TIME, 51, SECS, 1, 1) \
+ DHCP_OPT(OPTION_OVERLOAD, 52, UINT8, 1, 1) \
+ DHCP_OPT(TFTP_SERVER, 66, STRING, 1, SIZE_MAX) \
+ DHCP_OPT(BOOTFILE_NAME, 67, STRING, 1, SIZE_MAX) \
+ DHCP_OPT(DHCP_MSG_TYPE, 53, UINT8, 1, 1) \
+ DHCP_OPT(SERVER_IDENTIFIER, 54, IP, 1, 1) \
+ DHCP_OPT(PARAMETER_REQUEST_LIST, 55, UINT8, 1, SIZE_MAX) \
+ DHCP_OPT(MESSAGE, 56, STRING, 1, SIZE_MAX) \
+ DHCP_OPT(MAX_DHCP_MSG_SIZE, 57, UINT16, 1, 1) \
+ DHCP_OPT(T1, 58, SECS, 1, 1) \
+ DHCP_OPT(T2, 59, SECS, 1, 1) \
+ DHCP_OPT(VENDOR_CLASS, 60, STRING, 1, SIZE_MAX) \
+ DHCP_OPT(CLIENT_ID, 61, UINT8, 2, SIZE_MAX) \
+ DHCP_VNDOPT(OFP_CONTROLLER_VCONN, 1, STRING, 1, SIZE_MAX) \
+ DHCP_VNDOPT(OFP_PKI_URI, 2, STRING, 1, SIZE_MAX)
+
+/* Shorthand for defining vendor options (used above).
+ *
+ * Expands to a DHCP_OPT() entry whose code is CODE offset by
+ * DHCP_VENDOR_OFS. It can be defined after DHCP_OPTS because a macro body is
+ * only expanded where the macro is finally used. */
+#define DHCP_VNDOPT(NAME, CODE, ARG, MIN, MAX) \
+ DHCP_OPT(NAME, (CODE) + DHCP_VENDOR_OFS, ARG, MIN, MAX)
+
+/* DHCP option codes.
+ *
+ * DHCP_OPT is temporarily defined so that expanding DHCP_OPTS emits one
+ * "DHCP_CODE_<NAME> = <VALUE>," enumerator per listed option; the other
+ * DHCP_OPT arguments are ignored here. */
+enum {
+#define DHCP_OPT(NAME, VALUE, ARGTYPE, MIN_ARGS, MAX_ARGS) \
+ DHCP_CODE_##NAME = VALUE,
+DHCP_OPTS
+#undef DHCP_OPT
+};
+
+/* The contents of a DHCP option.
+ *
+ * DHCP options can (rarely) be present but lack content. To represent such an
+ * option, 'n' is 0 and 'data' is non-null (but does not point to anything
+ * useful). */
+struct dhcp_option {
+ size_t n; /* Number of bytes of data. */
+ void *data; /* Data. */
+};
+
+const char *dhcp_option_to_string(const struct dhcp_option *, int code,
+ struct ds *);
+bool dhcp_option_equals(const struct dhcp_option *,
+ const struct dhcp_option *);
+
+/* Abstracted DHCP protocol message, to make them easier to manipulate than
+ * through raw protocol buffers. */
+struct dhcp_msg {
+ /* For use by calling code. */
+ uint8_t op; /* DHCP_BOOTREQUEST or DHCP_BOOTREPLY. */
+ uint32_t xid; /* Transaction ID. */
+ uint16_t secs; /* Since client started address acquisition. */
+ uint16_t flags; /* DHCP_FLAGS_*. */
+ uint32_t ciaddr; /* Client IP, if it has a lease for one. */
+ uint32_t yiaddr; /* Client ("your") IP address. */
+ uint32_t siaddr; /* Next server IP address. */
+ uint32_t giaddr; /* Relay agent IP address. */
+ uint8_t chaddr[ETH_ADDR_LEN]; /* Client hardware address. */
+ enum dhcp_msg_type type; /* DHCP_CODE_DHCP_MSG_TYPE option argument. */
+ struct dhcp_option options[DHCP_N_OPTIONS]; /* Indexed by option code. */
+
+ /* For direct use only by dhcp_msg_*() functions. */
+ uint8_t *data;
+ size_t data_used, data_allocated;
+};
+
+void dhcp_msg_init(struct dhcp_msg *);
+void dhcp_msg_uninit(struct dhcp_msg *);
+void dhcp_msg_copy(struct dhcp_msg *, const struct dhcp_msg *);
+void dhcp_msg_put(struct dhcp_msg *, int code, const void *, size_t);
+void dhcp_msg_put_bool(struct dhcp_msg *, int code, bool);
+void dhcp_msg_put_secs(struct dhcp_msg *, int code, uint32_t);
+void dhcp_msg_put_ip(struct dhcp_msg *, int code, uint32_t);
+void dhcp_msg_put_string(struct dhcp_msg *, int code, const char *);
+void dhcp_msg_put_uint8(struct dhcp_msg *, int code, uint8_t);
+void dhcp_msg_put_uint8_array(struct dhcp_msg *, int code,
+ const uint8_t[], size_t n);
+void dhcp_msg_put_uint16(struct dhcp_msg *, int code, uint16_t);
+void dhcp_msg_put_uint16_array(struct dhcp_msg *, int code,
+ const uint16_t[], size_t n);
+const void *dhcp_msg_get(const struct dhcp_msg *, int code, size_t offset,
+ size_t size);
+bool dhcp_msg_get_bool(const struct dhcp_msg *, int code,
+ size_t offset, bool *);
+bool dhcp_msg_get_secs(const struct dhcp_msg *, int code,
+ size_t offset, uint32_t *);
+bool dhcp_msg_get_ip(const struct dhcp_msg *, int code,
+ size_t offset, uint32_t *);
+char *dhcp_msg_get_string(const struct dhcp_msg *, int code);
+bool dhcp_msg_get_uint8(const struct dhcp_msg *, int code,
+ size_t offset, uint8_t *);
+bool dhcp_msg_get_uint16(const struct dhcp_msg *, int code,
+ size_t offset, uint16_t *);
+const char *dhcp_msg_to_string(const struct dhcp_msg *, bool multiline,
+ struct ds *);
+int dhcp_parse(struct dhcp_msg *, const struct ofpbuf *);
+void dhcp_assemble(const struct dhcp_msg *, struct ofpbuf *);
+
+#endif /* dhcp.h */
diff --git a/lib/dhparams.h b/lib/dhparams.h
new file mode 100644
index 000000000..0377bb940
--- /dev/null
+++ b/lib/dhparams.h
@@ -0,0 +1,10 @@
+#ifndef DHPARAMS_H
+#define DHPARAMS_H 1
+
+#include <openssl/dh.h>
+
+DH *get_dh1024(void);
+DH *get_dh2048(void);
+DH *get_dh4096(void);
+
+#endif /* dhparams.h */
diff --git a/lib/dirs.h b/lib/dirs.h
new file mode 100644
index 000000000..c90c013fa
--- /dev/null
+++ b/lib/dirs.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef DIRS_H
+#define DIRS_H 1
+
+extern const char ovs_pkgdatadir[]; /* /usr/local/share/openvswitch */
+extern const char ovs_rundir[]; /* /usr/local/var/run */
+extern const char ovs_logdir[]; /* /usr/local/var/log */
+extern const char ovs_bindir[]; /* /usr/local/bin */
+
+#endif /* dirs.h */
diff --git a/lib/dpif.c b/lib/dpif.c
new file mode 100644
index 000000000..73ad5df00
--- /dev/null
+++ b/lib/dpif.c
@@ -0,0 +1,1060 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "dpif.h"
+
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <net/if.h>
+#include <linux/rtnetlink.h>
+#include <linux/ethtool.h>
+#include <linux/sockios.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <unistd.h>
+
+#include "coverage.h"
+#include "dynamic-string.h"
+#include "flow.h"
+#include "netlink.h"
+#include "odp-util.h"
+#include "ofp-print.h"
+#include "ofpbuf.h"
+#include "packets.h"
+#include "poll-loop.h"
+#include "util.h"
+#include "valgrind.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_dpif
+
+/* Rate limit for individual messages going to or from the datapath, output at
+ * DBG level. This is very high because, if these are enabled, it is because
+ * we really need to see them. */
+static struct vlog_rate_limit dpmsg_rl = VLOG_RATE_LIMIT_INIT(600, 600);
+
+/* Not really much point in logging many dpif errors. */
+static struct vlog_rate_limit error_rl = VLOG_RATE_LIMIT_INIT(9999, 5);
+
+static int get_minor_from_name(const char *name, unsigned int *minor);
+static int name_to_minor(const char *name, unsigned int *minor);
+static int lookup_minor(const char *name, unsigned int *minor);
+static int open_by_minor(unsigned int minor, struct dpif *);
+static int make_openvswitch_device(unsigned int minor, char **fnp);
+static void check_rw_odp_flow(struct odp_flow *);
+
+/* Opens the existing datapath identified by 'name' and stores the handle in
+ * '*dpif'.
+ *
+ * Returns 0 on success, otherwise a positive errno value. On failure
+ * 'dpif->fd' is left as -1. */
+int
+dpif_open(const char *name, struct dpif *dpif)
+{
+    int mask = 0;               /* Initialized to make Valgrind happy. */
+    int retval;
+
+    dpif->fd = -1;
+
+    retval = name_to_minor(name, &dpif->minor);
+    if (!retval) {
+        retval = open_by_minor(dpif->minor, dpif);
+    }
+    if (retval) {
+        return retval;
+    }
+
+    /* Opening the device node succeeds even if the datapath itself was never
+     * created. In that case every command except ODP_DP_CREATE returns
+     * ENODEV, so probe with a harmless command. */
+    if (!ioctl(dpif->fd, ODP_GET_LISTEN_MASK, &mask)) {
+        return 0;
+    }
+
+    retval = errno;
+    if (retval != ENODEV) {
+        VLOG_WARN("dp%u: probe returned unexpected error: %s",
+                  dpif->minor, strerror(retval));
+    }
+    dpif_close(dpif);
+    return retval;
+}
+
+/* Closes the file descriptor held by 'dpif' and marks the handle as closed.
+ * A null 'dpif' is a no-op. */
+void
+dpif_close(struct dpif *dpif)
+{
+    if (!dpif) {
+        return;
+    }
+    close(dpif->fd);
+    dpif->fd = -1;
+}
+
+/* Issues ioctl 'cmd' with argument 'arg' on the datapath 'dpif'.
+ *
+ * If 'cmd_name' is nonnull, the outcome is logged under that name (a
+ * rate-limited warning on failure, a rate-limited debug message on success).
+ * Pass NULL for 'cmd_name' when the caller does its own logging.
+ *
+ * Returns 0 on success, otherwise the positive errno value from ioctl(). */
+static int
+do_ioctl(const struct dpif *dpif, int cmd, const char *cmd_name,
+         const void *arg)
+{
+    int error = 0;
+
+    if (ioctl(dpif->fd, cmd, arg)) {
+        error = errno;
+    }
+    if (!cmd_name) {
+        return error;
+    }
+
+    if (error) {
+        VLOG_WARN_RL(&error_rl, "dp%u: ioctl(%s) failed (%s)",
+                     dpif->minor, cmd_name, strerror(error));
+    } else {
+        VLOG_DBG_RL(&dpmsg_rl, "dp%u: ioctl(%s): success",
+                    dpif->minor, cmd_name);
+    }
+    return error;
+}
+
+/* Creates a new datapath and stores an open handle to it in '*dpif'.
+ *
+ * If 'name' has the form "dp<N>" or "nl:<N>", the datapath with minor number
+ * N is created. Otherwise each minor number below ODP_MAX is tried in turn
+ * until one is not busy, and 'name' is passed to the kernel as the argument
+ * of ODP_DP_CREATE.
+ *
+ * Returns 0 on success, otherwise a positive errno value. ENOBUFS means that
+ * every minor number was reported busy (EBUSY). */
+int
+dpif_create(const char *name, struct dpif *dpif)
+{
+ unsigned int minor;
+ int error;
+
+ if (!get_minor_from_name(name, &minor)) {
+ /* Minor was specified in 'name', go ahead and create it. */
+ error = open_by_minor(minor, dpif);
+ if (error) {
+ return error;
+ }
+
+ if (!strncmp(name, "nl:", 3)) {
+ /* Legacy "nl:<N>" names are passed to the kernel as "of<N>". */
+ char devname[128];
+ sprintf(devname, "of%u", minor);
+ error = ioctl(dpif->fd, ODP_DP_CREATE, devname) < 0 ? errno : 0;
+ } else {
+ error = ioctl(dpif->fd, ODP_DP_CREATE, name) < 0 ? errno : 0;
+ }
+ if (error) {
+ dpif_close(dpif);
+ }
+ return error;
+ } else {
+ /* No minor in 'name': take the first minor that is not busy. */
+ for (minor = 0; minor < ODP_MAX; minor++) {
+ error = open_by_minor(minor, dpif);
+ if (error) {
+ return error;
+ }
+
+ error = ioctl(dpif->fd, ODP_DP_CREATE, name) < 0 ? errno : 0;
+ if (!error) {
+ return 0;
+ }
+ dpif_close(dpif);
+ /* EBUSY means "try the next minor"; anything else is fatal. */
+ if (error != EBUSY) {
+ return error;
+ }
+ }
+ return ENOBUFS;
+ }
+}
+
+/* Copies the device name of the datapath's local port (ODPP_LOCAL) into the
+ * 'name_size'-byte buffer 'name', truncating if necessary. 'name_size' must
+ * be positive.
+ *
+ * Returns 0 on success, otherwise a positive errno value, in which case
+ * 'name' is set to the empty string. */
+int
+dpif_get_name(struct dpif *dpif, char *name, size_t name_size)
+{
+    struct odp_port local_port;
+    int retval;
+
+    assert(name_size > 0);
+    name[0] = '\0';
+
+    retval = dpif_port_query_by_number(dpif, ODPP_LOCAL, &local_port);
+    if (retval) {
+        return retval;
+    }
+    ovs_strlcpy(name, local_port.devname, name_size);
+    return 0;
+}
+
+/* Destroys the datapath behind 'dpif' by issuing ODP_DP_DESTROY. The handle
+ * itself is not closed here.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_delete(struct dpif *dpif)
+{
+    int error;
+
+    COVERAGE_INC(dpif_destroy);
+    error = do_ioctl(dpif, ODP_DP_DESTROY, "ODP_DP_DESTROY", NULL);
+    return error;
+}
+
+/* Fetches datapath statistics into '*stats'. The structure is zeroed first,
+ * so it holds all-zero values if the ioctl fails.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_get_dp_stats(const struct dpif *dpif, struct odp_stats *stats)
+{
+    int error;
+
+    memset(stats, 0, sizeof *stats);
+    error = do_ioctl(dpif, ODP_DP_STATS, "ODP_DP_STATS", stats);
+    return error;
+}
+
+/* Queries whether the datapath drops IP fragments and stores the answer in
+ * '*drop_frags'.
+ *
+ * Returns 0 on success, otherwise a positive errno value, in which case
+ * '*drop_frags' is set to false. */
+int
+dpif_get_drop_frags(const struct dpif *dpif, bool *drop_frags)
+{
+    int tmp = 0;
+    int error = do_ioctl(dpif, ODP_GET_DROP_FRAGS, "ODP_GET_DROP_FRAGS", &tmp);
+
+    /* Only trust 'tmp' when the ioctl succeeded. The ternary used to be
+     * inverted, which reported false on success and read an uninitialized
+     * value on failure. */
+    *drop_frags = error ? false : tmp & 1;
+    return error;
+}
+
+/* Tells the datapath whether to drop IP fragments ('drop_frags' true) or
+ * not.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_set_drop_frags(struct dpif *dpif, bool drop_frags)
+{
+    /* The ioctl takes a pointer to int, so widen the bool first. */
+    int value = drop_frags ? 1 : 0;
+
+    return do_ioctl(dpif, ODP_SET_DROP_FRAGS, "ODP_SET_DROP_FRAGS", &value);
+}
+
+/* Retrieves the datapath's listen mask into '*listen_mask'.
+ *
+ * Returns 0 on success, otherwise a positive errno value, in which case
+ * '*listen_mask' is set to 0. */
+int
+dpif_get_listen_mask(const struct dpif *dpif, int *listen_mask)
+{
+    int retval;
+
+    retval = do_ioctl(dpif, ODP_GET_LISTEN_MASK, "ODP_GET_LISTEN_MASK",
+                      listen_mask);
+    if (!retval) {
+        return 0;
+    }
+    *listen_mask = 0;
+    return retval;
+}
+
+/* Sets the datapath's listen mask to 'listen_mask'.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_set_listen_mask(struct dpif *dpif, int listen_mask)
+{
+    int *maskp = &listen_mask;
+
+    return do_ioctl(dpif, ODP_SET_LISTEN_MASK, "ODP_SET_LISTEN_MASK", maskp);
+}
+
+/* Discards messages queued on 'dpif'. At most as many messages are read as
+ * the datapath statistics report for the combined miss and action queue
+ * limits; reading stops early once dpif_recv() reports EAGAIN.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_purge(struct dpif *dpif)
+{
+    struct odp_stats stats;
+    unsigned int n_read;
+    int retval;
+
+    COVERAGE_INC(dpif_purge);
+
+    retval = dpif_get_dp_stats(dpif, &stats);
+    if (retval) {
+        return retval;
+    }
+
+    n_read = 0;
+    while (n_read < stats.max_miss_queue + stats.max_action_queue) {
+        struct ofpbuf *msg;
+
+        retval = dpif_recv(dpif, &msg);
+        if (retval == EAGAIN) {
+            /* Queue drained. */
+            return 0;
+        } else if (retval) {
+            return retval;
+        }
+        ofpbuf_delete(msg);
+        n_read++;
+    }
+    return 0;
+}
+
+/* Attaches network device 'devname' to the datapath as port 'port_no' with
+ * the given 'flags'.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_port_add(struct dpif *dpif, const char *devname, uint16_t port_no,
+              uint16_t flags)
+{
+    struct odp_port port;
+    int error;
+
+    COVERAGE_INC(dpif_port_add);
+    memset(&port, 0, sizeof port);
+    strncpy(port.devname, devname, sizeof port.devname);
+    port.port = port_no;
+    port.flags = flags;
+
+    /* Capture errno right away: the logging calls below may overwrite it
+     * before it is returned. */
+    error = ioctl(dpif->fd, ODP_PORT_ADD, &port) ? errno : 0;
+    if (!error) {
+        VLOG_DBG_RL(&dpmsg_rl, "dp%u: added %s as port %"PRIu16,
+                    dpif->minor, devname, port_no);
+    } else {
+        VLOG_WARN_RL(&error_rl, "dp%u: failed to add %s as port "
+                     "%"PRIu16": %s", dpif->minor, devname, port_no,
+                     strerror(error));
+    }
+    return error;
+}
+
+/* Detaches port number 'port_no' from the datapath.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_port_del(struct dpif *dpif, uint16_t port_no)
+{
+    /* The ioctl takes a pointer to int, so widen the port number. */
+    int port_arg = port_no;
+    int error;
+
+    COVERAGE_INC(dpif_port_del);
+    error = do_ioctl(dpif, ODP_PORT_DEL, "ODP_PORT_DEL", &port_arg);
+    return error;
+}
+
+/* Looks up the datapath port numbered 'port_no' and fills in '*port'.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_port_query_by_number(const struct dpif *dpif, uint16_t port_no,
+                          struct odp_port *port)
+{
+    int error;
+
+    memset(port, 0, sizeof *port);
+    port->port = port_no;
+
+    /* Capture errno right away: the logging calls below may overwrite it
+     * before it is returned. */
+    error = ioctl(dpif->fd, ODP_PORT_QUERY, port) ? errno : 0;
+    if (!error) {
+        VLOG_DBG_RL(&dpmsg_rl, "dp%u: port %"PRIu16" is device %s",
+                    dpif->minor, port_no, port->devname);
+    } else {
+        VLOG_WARN_RL(&error_rl, "dp%u: failed to query port %"PRIu16": %s",
+                     dpif->minor, port_no, strerror(error));
+    }
+    return error;
+}
+
+/* Looks up the datapath port attached to network device 'devname' and fills
+ * in '*port'.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_port_query_by_name(const struct dpif *dpif, const char *devname,
+                        struct odp_port *port)
+{
+    int error;
+
+    memset(port, 0, sizeof *port);
+    strncpy(port->devname, devname, sizeof port->devname);
+
+    /* Capture errno right away: the logging calls below may overwrite it
+     * before it is returned. */
+    error = ioctl(dpif->fd, ODP_PORT_QUERY, port) ? errno : 0;
+    if (!error) {
+        VLOG_DBG_RL(&dpmsg_rl, "dp%u: device %s is on port %"PRIu16,
+                    dpif->minor, devname, port->port);
+    } else {
+        VLOG_WARN_RL(&error_rl, "dp%u: failed to query port %s: %s",
+                     dpif->minor, devname, strerror(error));
+    }
+    return error;
+}
+
+/* Retrieves the datapath's ports as a newly allocated array.
+ *
+ * On success, returns 0, stores the array in '*ports' and its length in
+ * '*n_ports'; the caller owns the array and releases it with free(). On
+ * failure, returns a positive errno value and sets '*ports' to NULL and
+ * '*n_ports' to 0.
+ *
+ * The port count is read first and the list second, so the two can disagree
+ * if ports change in between. In that case the attempt is discarded and
+ * repeated. */
+int
+dpif_port_list(const struct dpif *dpif,
+               struct odp_port **ports, size_t *n_ports)
+{
+    struct odp_portvec pv;
+    struct odp_stats stats;
+    int error;
+
+    for (;;) {
+        error = dpif_get_dp_stats(dpif, &stats);
+        if (error) {
+            break;
+        }
+
+        *ports = xcalloc(1, stats.n_ports * sizeof **ports);
+        pv.ports = *ports;
+        pv.n_ports = stats.n_ports;
+        error = do_ioctl(dpif, ODP_PORT_LIST, "ODP_PORT_LIST", &pv);
+        if (error) {
+            free(*ports);
+            break;
+        }
+
+        if (pv.n_ports == stats.n_ports) {
+            *n_ports = pv.n_ports;
+            return 0;
+        }
+
+        /* The port count changed under us. Release this attempt before
+         * retrying; it used to be leaked on every retry. */
+        free(*ports);
+    }
+
+    *ports = NULL;
+    *n_ports = 0;
+    return error;
+}
+
+/* Replaces the membership of port group 'group' with the 'n_ports' port
+ * numbers in 'ports'. 'n_ports' must not exceed UINT16_MAX.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_port_group_set(struct dpif *dpif, uint16_t group,
+                    const uint16_t ports[], size_t n_ports)
+{
+    struct odp_port_group request;
+
+    COVERAGE_INC(dpif_port_group_set);
+    assert(n_ports <= UINT16_MAX);
+
+    request.group = group;
+    /* The request structure's member is non-const, hence the cast. */
+    request.ports = (uint16_t *) ports;
+    request.n_ports = n_ports;
+
+    return do_ioctl(dpif, ODP_PORT_GROUP_SET, "ODP_PORT_GROUP_SET", &request);
+}
+
+/* Reads the members of port group 'group' into 'ports', which has room for
+ * 'n_ports' entries ('n_ports' must not exceed UINT16_MAX).
+ *
+ * Returns 0 on success, otherwise a positive errno value, in which case
+ * '*n_out' is set to 0.
+ *
+ * Careful: '*n_out' can be greater than 'n_ports' on return, if 'n_ports' is
+ * less than the number of ports in 'group'. */
+int
+dpif_port_group_get(const struct dpif *dpif, uint16_t group,
+                    uint16_t ports[], size_t n_ports, size_t *n_out)
+{
+    struct odp_port_group request;
+    int retval;
+
+    assert(n_ports <= UINT16_MAX);
+
+    request.group = group;
+    request.ports = ports;
+    request.n_ports = n_ports;
+
+    retval = do_ioctl(dpif, ODP_PORT_GROUP_GET, "ODP_PORT_GROUP_GET",
+                      &request);
+    if (retval) {
+        *n_out = 0;
+    } else {
+        *n_out = request.n_ports;
+    }
+    return retval;
+}
+
+/* Issues ODP_FLOW_FLUSH on the datapath.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_flow_flush(struct dpif *dpif)
+{
+    int error;
+
+    COVERAGE_INC(dpif_flow_flush);
+    error = do_ioctl(dpif, ODP_FLOW_FLUSH, "ODP_FLOW_FLUSH", NULL);
+    return error;
+}
+
+/* Returns the log level for a flow operation message: warning level if
+ * 'error' is nonzero, debug level otherwise. */
+static enum vlog_level
+flow_message_log_level(int error)
+{
+    if (error) {
+        return VLL_WARN;
+    }
+    return VLL_DBG;
+}
+
+static bool
+should_log_flow_message(int error)
+{
+ return !vlog_should_drop(THIS_MODULE, flow_message_log_level(error),
+ error ? &error_rl : &dpmsg_rl);
+}
+
+/* Logs one message describing the outcome of flow operation 'operation' on
+ * 'dpif'.
+ *
+ * The message has the form "dp<minor>: [failed to ]<operation> [(<error
+ * string>) ]<flow>[, <stats>][, actions:<actions>]". It is logged at warning
+ * level if 'error' is nonzero and at debug level otherwise. 'stats' may be
+ * null to omit statistics; actions are omitted only if 'actions' is null and
+ * 'n_actions' is 0.
+ *
+ * No rate limiting is applied here; callers check
+ * should_log_flow_message() first. */
+static void
+log_flow_message(const struct dpif *dpif, int error,
+ const char *operation,
+ const flow_t *flow, const struct odp_flow_stats *stats,
+ const union odp_action *actions, size_t n_actions)
+{
+ struct ds ds = DS_EMPTY_INITIALIZER;
+ ds_put_format(&ds, "dp%u: ", dpif->minor);
+ if (error) {
+ ds_put_cstr(&ds, "failed to ");
+ }
+ ds_put_format(&ds, "%s ", operation);
+ if (error) {
+ ds_put_format(&ds, "(%s) ", strerror(error));
+ }
+ flow_format(&ds, flow);
+ if (stats) {
+ ds_put_cstr(&ds, ", ");
+ format_odp_flow_stats(&ds, stats);
+ }
+ if (actions || n_actions) {
+ ds_put_cstr(&ds, ", actions:");
+ format_odp_actions(&ds, actions, n_actions);
+ }
+ vlog(THIS_MODULE, flow_message_log_level(error), "%s", ds_cstr(&ds));
+ ds_destroy(&ds);
+}
+
+/* Issues flow ioctl 'cmd' on 'flow' and logs the outcome, describing the
+ * operation as 'operation'.
+ *
+ * If 'show_stats' is true, the flow's statistics are included in the log
+ * message on success, and 'flow->n_actions' is reset to 0 on failure.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+static int
+do_flow_ioctl(const struct dpif *dpif, int cmd, struct odp_flow *flow,
+              const char *operation, bool show_stats)
+{
+    int retval = do_ioctl(dpif, cmd, NULL, flow);
+
+    if (show_stats && retval) {
+        flow->n_actions = 0;
+    }
+
+    if (should_log_flow_message(retval)) {
+        const struct odp_flow_stats *stats = NULL;
+
+        if (show_stats && !retval) {
+            stats = &flow->stats;
+        }
+        log_flow_message(dpif, retval, operation, &flow->key, stats,
+                         flow->actions, flow->n_actions);
+    }
+    return retval;
+}
+
+/* Issues ODP_FLOW_PUT with 'put' on the datapath and logs the outcome.
+ *
+ * The log message names the operation as "put" followed by a bracketed tag
+ * for each flag set in 'put->flags' ("[create]", "[modify]", "[zero]"), plus
+ * the remaining unrecognized flag bits in hexadecimal, if any.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_flow_put(struct dpif *dpif, struct odp_flow_put *put)
+{
+ int error = do_ioctl(dpif, ODP_FLOW_PUT, NULL, put);
+ COVERAGE_INC(dpif_flow_put);
+ if (should_log_flow_message(error)) {
+ struct ds operation = DS_EMPTY_INITIALIZER;
+ ds_put_cstr(&operation, "put");
+ if (put->flags & ODPPF_CREATE) {
+ ds_put_cstr(&operation, "[create]");
+ }
+ if (put->flags & ODPPF_MODIFY) {
+ ds_put_cstr(&operation, "[modify]");
+ }
+ if (put->flags & ODPPF_ZERO_STATS) {
+ ds_put_cstr(&operation, "[zero]");
+ }
+ /* Every flag with a name above; anything else is printed as raw bits. */
+#define ODPPF_ALL (ODPPF_CREATE | ODPPF_MODIFY | ODPPF_ZERO_STATS)
+ if (put->flags & ~ODPPF_ALL) {
+ ds_put_format(&operation, "[%x]", put->flags & ~ODPPF_ALL);
+ }
+ /* Statistics are only included in the message on success. */
+ log_flow_message(dpif, error, ds_cstr(&operation), &put->flow.key,
+ !error ? &put->flow.stats : NULL,
+ put->flow.actions, put->flow.n_actions);
+ ds_destroy(&operation);
+ }
+ return error;
+}
+
+/* Issues ODP_FLOW_DEL for 'flow'. 'flow->stats' is zeroed before the call.
+ * 'flow->actions' and 'flow->n_actions' must be initialized by the caller
+ * (see check_rw_odp_flow()).
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_flow_del(struct dpif *dpif, struct odp_flow *flow)
+{
+    int error;
+
+    COVERAGE_INC(dpif_flow_del);
+    check_rw_odp_flow(flow);
+    memset(&flow->stats, 0, sizeof flow->stats);
+    error = do_flow_ioctl(dpif, ODP_FLOW_DEL, flow, "delete flow", true);
+    return error;
+}
+
+/* Issues ODP_FLOW_GET for 'flow'. 'flow->stats' is zeroed before the call.
+ * 'flow->actions' and 'flow->n_actions' must be initialized by the caller
+ * (see check_rw_odp_flow()).
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_flow_get(const struct dpif *dpif, struct odp_flow *flow)
+{
+    int error;
+
+    COVERAGE_INC(dpif_flow_query);
+    check_rw_odp_flow(flow);
+    memset(&flow->stats, 0, sizeof flow->stats);
+    error = do_flow_ioctl(dpif, ODP_FLOW_GET, flow, "get flow", true);
+    return error;
+}
+
+/* Issues ODP_FLOW_GET_MULTIPLE for the 'n' flows in 'flows' in a single
+ * request. Each element's 'actions' and 'n_actions' must be initialized by
+ * the caller (see check_rw_odp_flow()).
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_flow_get_multiple(const struct dpif *dpif,
+                       struct odp_flow flows[], size_t n)
+{
+    struct odp_flowvec request;
+    struct odp_flow *f;
+
+    COVERAGE_ADD(dpif_flow_query_multiple, n);
+    request.flows = flows;
+    request.n_flows = n;
+    for (f = flows; f < &flows[n]; f++) {
+        check_rw_odp_flow(f);
+    }
+    return do_ioctl(dpif, ODP_FLOW_GET_MULTIPLE, "ODP_FLOW_GET_MULTIPLE",
+                    &request);
+}
+
+/* Issues ODP_FLOW_LIST with the 'n'-element array 'flows' and stores the flow
+ * count reported back by the ioctl in '*n_out'.
+ *
+ * Every element's 'actions' and 'n_actions' members are cleared before the
+ * request is issued.
+ *
+ * Returns 0 on success, otherwise a positive errno value, in which case
+ * '*n_out' is set to 0. */
+int
+dpif_flow_list(const struct dpif *dpif, struct odp_flow flows[], size_t n,
+ size_t *n_out)
+{
+ struct odp_flowvec fv;
+ uint32_t i;
+ int error;
+
+ COVERAGE_INC(dpif_flow_query_list);
+ fv.flows = flows;
+ fv.n_flows = n;
+ if (RUNNING_ON_VALGRIND) {
+ /* Zero the whole array when running under Valgrind. */
+ memset(flows, 0, n * sizeof *flows);
+ } else {
+ /* Otherwise clear only the action pointer and count. */
+ for (i = 0; i < n; i++) {
+ flows[i].actions = NULL;
+ flows[i].n_actions = 0;
+ }
+ }
+ error = do_ioctl(dpif, ODP_FLOW_LIST, NULL, &fv);
+ if (error) {
+ *n_out = 0;
+ VLOG_WARN_RL(&error_rl, "dp%u: flow list failed (%s)",
+ dpif->minor, strerror(error));
+ } else {
+ COVERAGE_ADD(dpif_flow_query_list_n, fv.n_flows);
+ *n_out = fv.n_flows;
+ VLOG_DBG_RL(&dpmsg_rl, "dp%u: listed %zu flows", dpif->minor, *n_out);
+ }
+ return error;
+}
+
+/* Lists the datapath's flows into a newly allocated array.
+ *
+ * On success, returns 0, stores the array in '*flowsp' and the number of
+ * flows listed in '*np'; the caller owns the array and releases it with
+ * free(). On failure, returns a positive errno value and sets '*flowsp' to
+ * NULL and '*np' to 0.
+ *
+ * The array is sized from the datapath statistics. A warning is logged if
+ * the listing reports a different number of flows. */
+int
+dpif_flow_list_all(const struct dpif *dpif,
+                   struct odp_flow **flowsp, size_t *np)
+{
+    struct odp_stats stats;
+    struct odp_flow *list;
+    size_t n_listed;
+    int retval;
+
+    *flowsp = NULL;
+    *np = 0;
+
+    retval = dpif_get_dp_stats(dpif, &stats);
+    if (retval) {
+        return retval;
+    }
+
+    list = xmalloc(sizeof *list * stats.n_flows);
+    retval = dpif_flow_list(dpif, list, stats.n_flows, &n_listed);
+    if (!retval) {
+        if (n_listed != stats.n_flows) {
+            VLOG_WARN_RL(&error_rl, "dp%u: datapath stats reported %"PRIu32" "
+                         "flows but flow listing reported %zu",
+                         dpif->minor, stats.n_flows, n_listed);
+        }
+        *flowsp = list;
+        *np = n_listed;
+    } else {
+        free(list);
+    }
+    return retval;
+}
+
+/* Issues ODP_EXECUTE to apply the 'n_actions' actions in 'actions' to the
+ * packet in 'buf', passing 'in_port' as the input port.
+ *
+ * If 'n_actions' is 0 the ioctl is skipped entirely and the call is treated
+ * as a success. The outcome is logged either way, subject to rate
+ * limiting.
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+int
+dpif_execute(struct dpif *dpif, uint16_t in_port,
+ const union odp_action actions[], size_t n_actions,
+ const struct ofpbuf *buf)
+{
+ int error;
+
+ COVERAGE_INC(dpif_execute);
+ if (n_actions > 0) {
+ struct odp_execute execute;
+ memset(&execute, 0, sizeof execute);
+ execute.in_port = in_port;
+ execute.actions = (union odp_action *) actions;
+ execute.n_actions = n_actions;
+ execute.data = buf->data;
+ execute.length = buf->size;
+ error = do_ioctl(dpif, ODP_EXECUTE, NULL, &execute);
+ } else {
+ error = 0;
+ }
+
+ /* Format the packet only if the message will really be logged. */
+ if (!(error ? VLOG_DROP_WARN(&error_rl) : VLOG_DROP_DBG(&dpmsg_rl))) {
+ struct ds ds = DS_EMPTY_INITIALIZER;
+ char *packet = ofp_packet_to_string(buf->data, buf->size, buf->size);
+ ds_put_format(&ds, "dp%u: execute ", dpif->minor);
+ format_odp_actions(&ds, actions, n_actions);
+ if (error) {
+ ds_put_format(&ds, " failed (%s)", strerror(error));
+ }
+ ds_put_format(&ds, " on packet %s", packet);
+ vlog(THIS_MODULE, error ? VLL_WARN : VLL_DBG, "%s", ds_cstr(&ds));
+ ds_destroy(&ds);
+ free(packet);
+ }
+ return error;
+}
+
+/* Reads one message from the datapath into a newly allocated buffer.
+ *
+ * On success, returns 0 and stores the buffer in '*bufp'; the buffer begins
+ * with a struct odp_msg header and the caller must release it with
+ * ofpbuf_delete(). On failure, returns a positive errno value and sets
+ * '*bufp' to NULL:
+ *
+ * - The errno value from read() if it fails (EAGAIN is not logged).
+ *
+ * - ERANGE if the data read is shorter than a struct odp_msg or shorter
+ * than the length given in the message header.
+ *
+ * - EPROTO on end of file. */
+int
+dpif_recv(struct dpif *dpif, struct ofpbuf **bufp)
+{
+ struct ofpbuf *buf;
+ int retval;
+ int error;
+
+ buf = ofpbuf_new(65536);
+ retval = read(dpif->fd, ofpbuf_tail(buf), ofpbuf_tailroom(buf));
+ if (retval < 0) {
+ error = errno;
+ if (error != EAGAIN) {
+ VLOG_WARN_RL(&error_rl, "dp%u: read failed: %s",
+ dpif->minor, strerror(error));
+ }
+ } else if (retval >= sizeof(struct odp_msg)) {
+ struct odp_msg *msg = buf->data;
+ if (msg->length <= retval) {
+ /* A complete message was read: account for the bytes read. */
+ buf->size += retval;
+ if (VLOG_IS_DBG_ENABLED()) {
+ void *payload = msg + 1;
+ size_t length = buf->size - sizeof *msg;
+ char *s = ofp_packet_to_string(payload, length, length);
+ VLOG_DBG_RL(&dpmsg_rl, "dp%u: received %s message of length "
+ "%zu on port %"PRIu16": %s", dpif->minor,
+ (msg->type == _ODPL_MISS_NR ? "miss"
+ : msg->type == _ODPL_ACTION_NR ? "action"
+ : "<unknown>"),
+ msg->length - sizeof(struct odp_msg),
+ msg->port, s);
+ free(s);
+ }
+ *bufp = buf;
+ COVERAGE_INC(dpif_recv);
+ return 0;
+ } else {
+ /* The header claims more data than read() returned. */
+ VLOG_WARN_RL(&error_rl, "dp%u: discarding message truncated "
+ "from %zu bytes to %d",
+ dpif->minor, msg->length, retval);
+ error = ERANGE;
+ }
+ } else if (!retval) {
+ VLOG_WARN_RL(&error_rl, "dp%u: unexpected end of file", dpif->minor);
+ error = EPROTO;
+ } else {
+ VLOG_WARN_RL(&error_rl,
+ "dp%u: discarding too-short message (%d bytes)",
+ dpif->minor, retval);
+ error = ERANGE;
+ }
+
+ /* Common failure exit: nothing is handed back to the caller. */
+ *bufp = NULL;
+ ofpbuf_delete(buf);
+ return error;
+}
+
+/* Registers the datapath's file descriptor with poll_fd_wait() for POLLIN,
+ * i.e. for readiness to read with dpif_recv(). */
+void
+dpif_recv_wait(struct dpif *dpif)
+{
+ poll_fd_wait(dpif->fd, POLLIN);
+}
+
+/* State for watching a datapath for changes to the network devices attached
+ * to it. Created by dpifmon_create(). */
+struct dpifmon {
+ struct dpif dpif; /* Datapath handle opened by dpifmon_create(). */
+ struct nl_sock *sock; /* NETLINK_ROUTE socket for RTNLGRP_LINK. */
+ int local_ifindex; /* ifindex of the device named by dpif_get_name(). */
+};
+
+/* Creates a monitor for the datapath named 'datapath_name' and stores it in
+ * '*monp'.
+ *
+ * This opens the datapath, looks up the interface index of its local port
+ * device, and creates a NETLINK_ROUTE socket for the RTNLGRP_LINK group.
+ *
+ * Returns 0 on success, otherwise a positive errno value, in which case
+ * '*monp' is set to NULL and everything acquired so far is released. */
+int
+dpifmon_create(const char *datapath_name, struct dpifmon **monp)
+{
+ struct dpifmon *mon;
+ char local_name[IFNAMSIZ];
+ int error;
+
+ mon = *monp = xmalloc(sizeof *mon);
+
+ error = dpif_open(datapath_name, &mon->dpif);
+ if (error) {
+ goto error;
+ }
+ error = dpif_get_name(&mon->dpif, local_name, sizeof local_name);
+ if (error) {
+ goto error_close_dpif;
+ }
+
+ mon->local_ifindex = if_nametoindex(local_name);
+ if (!mon->local_ifindex) {
+ error = errno;
+ VLOG_WARN("could not get ifindex of %s device: %s",
+ local_name, strerror(errno));
+ goto error_close_dpif;
+ }
+
+ error = nl_sock_create(NETLINK_ROUTE, RTNLGRP_LINK, 0, 0, &mon->sock);
+ if (error) {
+ VLOG_WARN("could not create rtnetlink socket: %s", strerror(error));
+ goto error_close_dpif;
+ }
+
+ return 0;
+
+ /* Cleanup ladder: each label undoes the steps completed before the jump. */
+error_close_dpif:
+ dpif_close(&mon->dpif);
+error:
+ free(mon);
+ *monp = NULL;
+ return error;
+}
+
+/* Destroys 'mon', releasing the datapath handle, the netlink socket, and the
+ * monitor structure itself, which dpifmon_create() allocated with xmalloc().
+ * A null 'mon' is a no-op. */
+void
+dpifmon_destroy(struct dpifmon *mon)
+{
+    if (mon) {
+        dpif_close(&mon->dpif);
+        nl_sock_destroy(mon->sock);
+        /* The structure was previously leaked on every destroy. */
+        free(mon);
+    }
+}
+
+/* Checks the monitor's netlink socket for a link notification that concerns
+ * the monitored datapath.
+ *
+ * A notification is "for us" if its IFLA_MASTER attribute equals the
+ * datapath's local port ifindex or, when that attribute is absent, if the
+ * named device answers an ODP_PORT_QUERY on the datapath. Notifications that
+ * are not for us are skipped and the next one is read.
+ *
+ * Returns 0 and stores a copy of the device name, made with xstrdup(), in
+ * '*devnamep' when a relevant notification was received. Otherwise returns a
+ * positive errno value and leaves '*devnamep' NULL: the error from
+ * nl_sock_recv() (EAGAIN is not logged), or ENOBUFS if a message could not be
+ * parsed.
+ *
+ * NOTE(review): presumably the caller must free() '*devnamep' -- confirm
+ * against callers. */
+int
+dpifmon_poll(struct dpifmon *mon, char **devnamep)
+{
+ static struct vlog_rate_limit slow_rl = VLOG_RATE_LIMIT_INIT(1, 5);
+ static const struct nl_policy rtnlgrp_link_policy[] = {
+ [IFLA_IFNAME] = { .type = NL_A_STRING },
+ [IFLA_MASTER] = { .type = NL_A_U32, .optional = true },
+ };
+ struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
+ struct ofpbuf *buf;
+ int error;
+
+ *devnamep = NULL;
+again:
+ error = nl_sock_recv(mon->sock, &buf, false);
+ switch (error) {
+ case 0:
+ if (!nl_policy_parse(buf, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
+ rtnlgrp_link_policy,
+ attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
+ VLOG_WARN_RL(&slow_rl, "received bad rtnl message");
+ error = ENOBUFS;
+ } else {
+ const char *devname = nl_attr_get_string(attrs[IFLA_IFNAME]);
+ bool for_us;
+
+ if (attrs[IFLA_MASTER]) {
+ uint32_t master_ifindex = nl_attr_get_u32(attrs[IFLA_MASTER]);
+ for_us = master_ifindex == mon->local_ifindex;
+ } else {
+ /* It's for us if that device is one of our ports. This is
+ * open-coded instead of using dpif_port_query_by_name() to
+ * avoid logging a warning on failure. */
+ struct odp_port port;
+ memset(&port, 0, sizeof port);
+ strncpy(port.devname, devname, sizeof port.devname);
+ for_us = !ioctl(mon->dpif.fd, ODP_PORT_QUERY, &port);
+ }
+
+ if (!for_us) {
+ /* Not for us, try again. */
+ ofpbuf_delete(buf);
+ COVERAGE_INC(dpifmon_poll_false_wakeup);
+ goto again;
+ }
+ COVERAGE_INC(dpifmon_poll_changed);
+ /* 'devname' points into 'buf', so copy it before 'buf' is freed. */
+ *devnamep = xstrdup(devname);
+ }
+ ofpbuf_delete(buf);
+ break;
+
+ case EAGAIN:
+ /* Nothing to do. */
+ break;
+
+ case ENOBUFS:
+ VLOG_WARN_RL(&slow_rl, "dpifmon socket overflowed");
+ break;
+
+ default:
+ VLOG_WARN_RL(&slow_rl, "error on dpifmon socket: %s", strerror(error));
+ break;
+ }
+ return error;
+}
+
+/* Does nothing in this implementation; 'mon' is unused. */
+void
+dpifmon_run(struct dpifmon *mon UNUSED)
+{
+ /* Nothing to do in this implementation. */
+}
+
+/* Registers the monitor's netlink socket with nl_sock_wait() for POLLIN,
+ * i.e. for readiness to read with dpifmon_poll(). */
+void
+dpifmon_wait(struct dpifmon *mon)
+{
+ nl_sock_wait(mon->sock, POLLIN);
+}
+
+static int get_openvswitch_major(void);
+static int get_major(const char *target, int default_major);
+
+/* Determines the datapath minor number for network device 'name' by asking
+ * the device's driver for its ethtool driver info.
+ *
+ * The device must report driver "openvswitch", and its bus_info string must
+ * begin with the minor number.
+ *
+ * Returns 0 and stores the minor number in '*minor' on success. Otherwise
+ * returns a positive errno value and sets '*minor' to -1 (that is,
+ * UINT_MAX). */
+static int
+lookup_minor(const char *name, unsigned int *minor)
+{
+    struct ethtool_drvinfo drvinfo;
+    struct ifreq ifr;
+    int error;
+    int sock;
+
+    *minor = -1;
+    sock = socket(AF_INET, SOCK_DGRAM, 0);
+    if (sock < 0) {
+        /* Save errno before logging, which may overwrite it. */
+        error = errno;
+        VLOG_WARN("socket(AF_INET) failed: %s", strerror(error));
+        return error;
+    }
+
+    memset(&ifr, 0, sizeof ifr);
+    strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
+    ifr.ifr_data = (caddr_t) &drvinfo;
+
+    memset(&drvinfo, 0, sizeof drvinfo);
+    drvinfo.cmd = ETHTOOL_GDRVINFO;
+    if (ioctl(sock, SIOCETHTOOL, &ifr)) {
+        error = errno;
+        VLOG_WARN("ioctl(SIOCETHTOOL) failed: %s", strerror(error));
+    } else if (strcmp(drvinfo.driver, "openvswitch")) {
+        VLOG_WARN("%s is not an openvswitch device", name);
+        error = EOPNOTSUPP;
+    } else if (!isdigit((unsigned char) drvinfo.bus_info[0])) {
+        /* The cast keeps isdigit() defined for bytes above 0x7f. */
+        VLOG_WARN("%s ethtool info does not contain an openvswitch minor",
+                  name);
+        error = EPROTOTYPE;
+    } else {
+        *minor = atoi(drvinfo.bus_info);
+        error = 0;
+    }
+
+    close(sock);
+    return error;
+}
+
+/* Ensures that the character device node "/dev/net/dp<minor>" exists and
+ * refers to the openvswitch major number and 'minor'.
+ *
+ * A node that exists but is not a character device, or that has the wrong
+ * device number, is unlinked and recreated. "/dev/net" is created if it is
+ * missing.
+ *
+ * On success, returns 0 and stores the node's path in '*fnp' as a string
+ * allocated with xstrdup(), which the caller must free(). On failure,
+ * returns a positive errno value and sets '*fnp' to NULL. */
+static int
+make_openvswitch_device(unsigned int minor, char **fnp)
+{
+    dev_t dev = makedev(get_openvswitch_major(), minor);
+    const char dirname[] = "/dev/net";
+    struct stat s;
+    char fn[128];
+    int error;
+
+    *fnp = NULL;
+    sprintf(fn, "%s/dp%u", dirname, minor);
+    if (!stat(fn, &s)) {
+        if (!S_ISCHR(s.st_mode)) {
+            VLOG_WARN_RL(&error_rl, "%s is not a character device, fixing",
+                         fn);
+        } else if (s.st_rdev != dev) {
+            VLOG_WARN_RL(&error_rl,
+                         "%s is device %u:%u instead of %u:%u, fixing",
+                         fn, major(s.st_rdev), minor(s.st_rdev),
+                         major(dev), minor(dev));
+        } else {
+            goto success;
+        }
+        if (unlink(fn)) {
+            /* Here and below: save errno before logging, which may
+             * overwrite it. */
+            error = errno;
+            VLOG_WARN_RL(&error_rl, "%s: unlink failed (%s)",
+                         fn, strerror(error));
+            return error;
+        }
+    } else if (errno == ENOENT) {
+        if (stat(dirname, &s)) {
+            if (errno == ENOENT) {
+                if (mkdir(dirname, 0755)) {
+                    error = errno;
+                    VLOG_WARN_RL(&error_rl, "%s: mkdir failed (%s)",
+                                 dirname, strerror(error));
+                    return error;
+                }
+            } else {
+                error = errno;
+                VLOG_WARN_RL(&error_rl, "%s: stat failed (%s)",
+                             dirname, strerror(error));
+                return error;
+            }
+        }
+    } else {
+        error = errno;
+        VLOG_WARN_RL(&error_rl, "%s: stat failed (%s)", fn, strerror(error));
+        return error;
+    }
+
+    /* The device needs to be created. */
+    if (mknod(fn, S_IFCHR | 0700, dev)) {
+        error = errno;
+        VLOG_WARN_RL(&error_rl,
+                     "%s: creating character device %u:%u failed (%s)",
+                     fn, major(dev), minor(dev), strerror(error));
+        return error;
+    }
+
+success:
+    *fnp = xstrdup(fn);
+    return 0;
+}
+
+
+/* Returns the character device major number used by openvswitch.
+ *
+ * The number is obtained from get_major() on first use, with 248 as the
+ * default, and cached in a static variable for later calls. */
+static int
+get_openvswitch_major(void)
+{
+    enum { DEFAULT_MAJOR = 248 };
+    static unsigned int cached_major;
+
+    if (cached_major == 0) {
+        cached_major = get_major("openvswitch", DEFAULT_MAJOR);
+    }
+    return cached_major;
+}
+
+/* Looks up the character device major number registered under the name
+ * 'target' in /proc/devices.
+ *
+ * Returns the major number if found. If the file cannot be opened or has no
+ * entry for 'target' in its character device section, logs the problem and
+ * returns 'default_major'. */
+static int
+get_major(const char *target, int default_major)
+{
+    const char fn[] = "/proc/devices";
+    char line[128];
+    FILE *file;
+    int ln;
+
+    file = fopen(fn, "r");
+    if (!file) {
+        VLOG_ERR("opening %s failed (%s)", fn, strerror(errno));
+        goto error;
+    }
+
+    for (ln = 1; fgets(line, sizeof line, file); ln++) {
+        char name[64];
+        int major;
+
+        if (!strncmp(line, "Character", 9)
+            || line[strspn(line, " \t\r\n")] == '\0') {
+            /* Section header or blank line: nothing to do. fgets() keeps
+             * the newline, so a blank line is "\n", not "". */
+        } else if (!strncmp(line, "Block", 5)) {
+            /* We only want character devices, so skip the rest of the file. */
+            break;
+        } else if (sscanf(line, "%d %63s", &major, name) == 2) {
+            /* Require both fields: sscanf() also returns nonzero for EOF
+             * and for a lone number, either of which would leave 'name'
+             * uninitialized. */
+            if (!strcmp(name, target)) {
+                fclose(file);
+                return major;
+            }
+        } else {
+            static bool warned;
+            if (!warned) {
+                VLOG_WARN("%s:%d: syntax error", fn, ln);
+            }
+            warned = true;
+        }
+    }
+
+    /* Not found: close the file, which used to be leaked on this path. */
+    fclose(file);
+
+    VLOG_ERR("%s: %s major not found (is the module loaded?), using "
+             "default major %d", fn, target, default_major);
+error:
+    VLOG_INFO("using default major %d for %s", default_major, target);
+    return default_major;
+}
+
+/* Translates datapath name 'name' into a minor number stored in '*minor'.
+ *
+ * Names that embed the minor number ("dp<N>", "nl:<N>") are parsed directly.
+ * Any other name is treated as a network device name and resolved with
+ * lookup_minor().
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+static int
+name_to_minor(const char *name, unsigned int *minor)
+{
+    return get_minor_from_name(name, minor) ? lookup_minor(name, minor) : 0;
+}
+
+/* Parses a datapath name that embeds its minor number.
+ *
+ * Accepts "dp<N>" and, for compatibility only, "nl:<N>", where <N> starts
+ * with a decimal digit.
+ *
+ * Returns 0 and stores N in '*minor' on success, otherwise EINVAL (leaving
+ * '*minor' untouched). */
+static int
+get_minor_from_name(const char *name, unsigned int *minor)
+{
+    /* The isdigit() arguments are cast to unsigned char because passing a
+     * negative plain char is undefined behavior. */
+    if (!strncmp(name, "dp", 2) && isdigit((unsigned char) name[2])) {
+        *minor = atoi(name + 2);
+        return 0;
+    } else if (!strncmp(name, "nl:", 3) && isdigit((unsigned char) name[3])) {
+        /* This is for compatibility only and will be dropped. */
+        *minor = atoi(name + 3);
+        return 0;
+    } else {
+        return EINVAL;
+    }
+}
+
+/* Opens the device node for datapath 'minor', creating or repairing the node
+ * first if necessary, and records the result in '*dpif'.
+ *
+ * Returns 0 on success. Otherwise returns a positive errno value and leaves
+ * 'dpif->minor' and 'dpif->fd' both set to -1. */
+static int
+open_by_minor(unsigned int minor, struct dpif *dpif)
+{
+    char *path;
+    int retval;
+    int fd;
+
+    dpif->minor = -1;
+    dpif->fd = -1;
+
+    retval = make_openvswitch_device(minor, &path);
+    if (retval) {
+        return retval;
+    }
+
+    fd = open(path, O_RDONLY | O_NONBLOCK);
+    if (fd >= 0) {
+        dpif->minor = minor;
+        dpif->fd = fd;
+        retval = 0;
+    } else {
+        retval = errno;
+        VLOG_WARN("%s: open failed (%s)", path, strerror(retval));
+    }
+    free(path);
+    return retval;
+}
+
+/* Sanity-checks the "actions" and "n_actions" members of 'flow' before it is
+ * handed to the kernel.
+ *
+ * odp_flow objects are often built on the stack, and it is easy to forget to
+ * initialize these two members. When that happens the kernel writes through
+ * whatever random pointer is in "actions", corrupting memory.
+ *
+ * Overwriting the first action with a 0xcc pattern helps in three ways:
+ *
+ * - It forces a segfault here if "actions" points to an invalid region,
+ * rather than an EFAULT from the kernel that is easily missed in the
+ * log.
+ *
+ * - It leaves a distinctive value that is likely to cause an
+ * easy-to-identify error if it is later dereferenced, etc.
+ *
+ * - It makes Valgrind warn about uninitialized memory if "actions" or
+ * "n_actions" was never set.
+ */
+static void
+check_rw_odp_flow(struct odp_flow *flow)
+{
+    if (!flow->n_actions) {
+        return;
+    }
+    memset(flow->actions, 0xcc, sizeof *flow->actions);
+}
diff --git a/lib/dpif.h b/lib/dpif.h
new file mode 100644
index 000000000..22f835554
--- /dev/null
+++ b/lib/dpif.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+
+#ifndef DPIF_H
+#define DPIF_H 1
+
+/* Operations for the datapath running in the local kernel. The interface can
+ * generalize to multiple types of local datapaths, but the implementation only
+ * supports the openflow kernel module. */
+
+#include "openvswitch/datapath-protocol.h"
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct ofpbuf;
+
+/* A datapath interface.  Opaque. */
+struct dpif {
+    unsigned int minor;         /* For use in error messages. */
+    int fd;                     /* Descriptor that open_by_minor() in dpif.c
+                                 * obtains from open(); -1 if that failed. */
+};
+
+int dpif_open(const char *name, struct dpif *);
+int dpif_create(const char *name, struct dpif *);
+void dpif_close(struct dpif *);
+
+static inline unsigned int dpif_id(const struct dpif *dpif);
+int dpif_get_name(struct dpif *, char *name, size_t name_size);
+
+int dpif_delete(struct dpif *);
+
+int dpif_get_dp_stats(const struct dpif *, struct odp_stats *);
+int dpif_get_drop_frags(const struct dpif *, bool *drop_frags);
+int dpif_set_drop_frags(struct dpif *, bool drop_frags);
+
+int dpif_get_listen_mask(const struct dpif *, int *listen_mask);
+int dpif_set_listen_mask(struct dpif *, int listen_mask);
+int dpif_purge(struct dpif *);
+
+int dpif_port_add(struct dpif *, const char *devname, uint16_t port_no,
+ uint16_t flags);
+int dpif_port_del(struct dpif *, uint16_t port_no);
+int dpif_port_query_by_number(const struct dpif *, uint16_t port_no,
+ struct odp_port *);
+int dpif_port_query_by_name(const struct dpif *, const char *devname,
+ struct odp_port *);
+int dpif_port_list(const struct dpif *, struct odp_port **, size_t *n_ports);
+
+int dpif_port_group_set(struct dpif *, uint16_t group,
+ const uint16_t ports[], size_t n_ports);
+int dpif_port_group_get(const struct dpif *, uint16_t group,
+ uint16_t ports[], size_t n_ports, size_t *n_out);
+
+int dpif_flow_flush(struct dpif *);
+int dpif_flow_put(struct dpif *, struct odp_flow_put *);
+int dpif_flow_del(struct dpif *, struct odp_flow *);
+int dpif_flow_get(const struct dpif *, struct odp_flow *);
+int dpif_flow_get_multiple(const struct dpif *, struct odp_flow[], size_t n);
+int dpif_flow_list(const struct dpif *, struct odp_flow[], size_t n,
+ size_t *n_out);
+int dpif_flow_list_all(const struct dpif *,
+ struct odp_flow **flowsp, size_t *np);
+
+int dpif_execute(struct dpif *, uint16_t in_port,
+ const union odp_action[], size_t n_actions,
+ const struct ofpbuf *);
+
+int dpif_recv(struct dpif *, struct ofpbuf **);
+void dpif_recv_wait(struct dpif *);
+
+/* Returns the minor number stored in 'dpif'. */
+static inline unsigned int
+dpif_id(const struct dpif *dpif)
+{
+    return dpif->minor;
+}
+
+struct dpifmon;
+
+int dpifmon_create(const char *datapath_name, struct dpifmon **);
+void dpifmon_destroy(struct dpifmon *);
+
+int dpifmon_poll(struct dpifmon *, char **devnamep);
+
+void dpifmon_run(struct dpifmon *);
+void dpifmon_wait(struct dpifmon *);
+
+#endif /* dpif.h */
diff --git a/lib/dpif.man b/lib/dpif.man
new file mode 100644
index 000000000..72175b0a3
--- /dev/null
+++ b/lib/dpif.man
@@ -0,0 +1,16 @@
+.RS
+.TP
+\fBdp\fIN\fR
+Datapath number \fIN\fR, where \fIN\fR is a number between 0 and 255,
+inclusive.
+
+.TP
+\fIname\fR
+The name of the network device associated with the datapath's local
+port. (\fB\*(PN\fR internally converts this into a datapath number,
+as above.)
+
+.TP
+\fBnl:\fIN\fR
+This is an obsolete synonym for \fBdp\fIN\fR.
+.RE
diff --git a/lib/dynamic-string.c b/lib/dynamic-string.c
new file mode 100644
index 000000000..6214c8a10
--- /dev/null
+++ b/lib/dynamic-string.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "dynamic-string.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include "timeval.h"
+#include "util.h"
+
+/* Initializes 'ds' as an empty string with no storage allocated. */
+void
+ds_init(struct ds *ds)
+{
+    ds->allocated = 0;
+    ds->length = 0;
+    ds->string = NULL;
+}
+
+/* Resets the length of 'ds' to zero.  The buffer is kept for reuse and is not
+ * modified here, so no null terminator is written by this function. */
+void
+ds_clear(struct ds *ds)
+{
+    ds->length = 0;
+}
+
+/* Shortens 'ds' to 'new_length' bytes if it is currently longer than that;
+ * otherwise leaves it unchanged. */
+void
+ds_truncate(struct ds *ds, size_t new_length)
+{
+    if (new_length >= ds->length) {
+        return;
+    }
+
+    ds->string[new_length] = '\0';
+    ds->length = new_length;
+}
+
+/* Ensures that 'ds' has a buffer with room for at least 'min_length' bytes
+ * plus a null terminator.
+ *
+ * When growth is needed the capacity is increased by the larger of
+ * 'min_length' and the current capacity (so it at least doubles), with a
+ * floor of 8 bytes. */
+void
+ds_reserve(struct ds *ds, size_t min_length)
+{
+    if (ds->string && min_length <= ds->allocated) {
+        /* Already big enough. */
+        return;
+    }
+
+    ds->allocated += MAX(min_length, ds->allocated);
+    ds->allocated = MAX(8, ds->allocated);
+    /* One extra byte for the null terminator. */
+    ds->string = xrealloc(ds->string, ds->allocated + 1);
+}
+
+/* Extends 'ds' by 'n' bytes, null-terminates it, and returns a pointer to the
+ * first of the new bytes.  The new bytes are not initialized; the caller is
+ * expected to fill them in. */
+char *
+ds_put_uninit(struct ds *ds, size_t n)
+{
+    size_t old_length = ds->length;
+
+    ds_reserve(ds, old_length + n);
+    ds->length = old_length + n;
+    ds->string[ds->length] = '\0';
+
+    /* Computed after ds_reserve(), which may have moved the buffer. */
+    return ds->string + old_length;
+}
+
+/* Appends the single byte 'c' to 'ds'. */
+void
+ds_put_char(struct ds *ds, char c)
+{
+    char *tail = ds_put_uninit(ds, 1);
+    *tail = c;
+}
+
+/* Appends 'n' copies of the byte 'c' to 'ds'. */
+void
+ds_put_char_multiple(struct ds *ds, char c, size_t n)
+{
+    char *tail = ds_put_uninit(ds, n);
+    memset(tail, c, n);
+}
+
+/* Appends the 'n' bytes starting at 's' to 'ds'. */
+void
+ds_put_buffer(struct ds *ds, const char *s, size_t n)
+{
+    char *tail = ds_put_uninit(ds, n);
+    memcpy(tail, s, n);
+}
+
+/* Appends the null-terminated string 's' (without its terminator) to 'ds'. */
+void
+ds_put_cstr(struct ds *ds, const char *s)
+{
+    ds_put_buffer(ds, s, strlen(s));
+}
+
+/* Appends printf-style formatted output to 'ds'.  This is the variadic
+ * front end for ds_put_format_valist(), which does the actual work. */
+void
+ds_put_format(struct ds *ds, const char *format, ...)
+{
+    va_list args;
+
+    va_start(args, format);
+    ds_put_format_valist(ds, format, args);
+    va_end(args);
+}
+
+/* Appends to 'ds' the output of printf-style 'format' applied to 'args_'.
+ *
+ * 'args_' itself is never consumed: each formatting pass works on its own
+ * va_copy(), so the list can be walked a second time after the buffer grows.
+ *
+ * If vsnprintf() reports an output error (a negative return value), 'ds' is
+ * left with its previous contents. */
+void
+ds_put_format_valist(struct ds *ds, const char *format, va_list args_)
+{
+    va_list args;
+    size_t available;
+    int needed;
+
+    /* First pass: format into whatever space is already allocated.  With no
+     * buffer yet, vsnprintf(NULL, 0, ...) just measures the output. */
+    va_copy(args, args_);
+    available = ds->string ? ds->allocated - ds->length + 1 : 0;
+    needed = vsnprintf(ds->string ? &ds->string[ds->length] : NULL,
+                       available, format, args);
+    va_end(args);
+
+    if (needed < 0) {
+        /* Must be checked before 'needed' is compared with a size_t:
+         * otherwise the negative value converts to a huge unsigned one.
+         * Restore the terminator in case partial output was written. */
+        if (ds->string) {
+            ds->string[ds->length] = '\0';
+        }
+        return;
+    }
+    if ((size_t) needed < available) {
+        /* Everything, including the terminator, fit. */
+        ds->length += needed;
+        return;
+    }
+
+    /* Second pass: grow to the measured size and format again. */
+    ds_reserve(ds, ds->length + needed);
+
+    va_copy(args, args_);
+    available = ds->allocated - ds->length + 1;
+    needed = vsnprintf(&ds->string[ds->length], available, format, args);
+    va_end(args);
+
+    assert(needed >= 0 && (size_t) needed < available);
+    ds->length += needed;
+}
+
+/* Appends the 'n' bytes at 's' to 'ds'.  Bytes outside the range 0x20...0x7e,
+ * and the backslash and double-quote characters, are written as a backslash
+ * followed by three octal digits; all other bytes are copied as-is. */
+void
+ds_put_printable(struct ds *ds, const char *s, size_t n)
+{
+    const char *end = s + n;
+
+    /* Reserve the minimum needed up front; escapes may grow it further. */
+    ds_reserve(ds, ds->length + n);
+    for (; s < end; s++) {
+        unsigned char c = *s;
+
+        if (c >= 0x20 && c <= 0x7e && c != '\\' && c != '"') {
+            ds_put_char(ds, c);
+        } else {
+            ds_put_format(ds, "\\%03o", (int) c);
+        }
+    }
+}
+
+/* Appends to 'ds' the time 'tm' formatted by strftime() according to
+ * 'template'.  If 'tm' is null, the local time for time_now() is used.
+ *
+ * An empty 'template' appends nothing. */
+void
+ds_put_strftime(struct ds *ds, const char *template, const struct tm *tm)
+{
+    /* strftime() returns 0 both when the buffer is too small and when the
+     * formatted result is empty, so the retry loop below cannot tell the two
+     * apart.  Without this check an empty template would make it grow the
+     * buffer forever. */
+    if (template[0] == '\0') {
+        return;
+    }
+
+    if (!tm) {
+        time_t now = time_now();
+        tm = localtime(&now);
+    }
+    for (;;) {
+        size_t avail = ds->string ? ds->allocated - ds->length + 1 : 0;
+        size_t used = strftime(&ds->string[ds->length], avail, template, tm);
+        if (used) {
+            ds->length += used;
+            return;
+        }
+        /* Too small (or nothing allocated yet): grow and try again. */
+        ds_reserve(ds, ds->length + (avail < 32 ? 64 : 2 * avail));
+    }
+}
+
+/* Replaces the contents of 'ds' with the next line read from 'file', not
+ * including the terminating new-line.
+ *
+ * Returns 0 if a line was read (a final line without a new-line counts), or
+ * EOF if end of file was hit before any byte was read. */
+int
+ds_get_line(struct ds *ds, FILE *file)
+{
+    int c;
+
+    ds_clear(ds);
+    while ((c = getc(file)) != EOF) {
+        if (c == '\n') {
+            return 0;
+        }
+        ds_put_char(ds, c);
+    }
+    return ds->length ? 0 : EOF;
+}
+
+/* Returns the contents of 'ds' as a null-terminated string, allocating a
+ * buffer first if 'ds' does not have one yet.  The returned pointer is the
+ * buffer owned by 'ds', not a copy. */
+char *
+ds_cstr(struct ds *ds)
+{
+    if (ds->string == NULL) {
+        /* ds_reserve() allocates whenever there is no buffer at all. */
+        ds_reserve(ds, 0);
+    }
+
+    ds->string[ds->length] = '\0';
+    return ds->string;
+}
+
+/* Frees the buffer owned by 'ds'.  The members of 'ds' are not reset here. */
+void
+ds_destroy(struct ds *ds)
+{
+    free(ds->string);
+}
+
+/* Writes the 'size' bytes in 'buf' to 'string' as hex bytes arranged 16 per
+ * line.  Numeric offsets are also included, starting at 'ofs' for the first
+ * byte in 'buf'.  If 'ascii' is true then the corresponding ASCII characters
+ * are also rendered alongside.
+ *
+ * When 'ofs' is not a multiple of 16 the first line is padded on the left,
+ * and a short last line is padded on the right, so that every byte lands in
+ * the column of its offset. */
+void
+ds_put_hex_dump(struct ds *ds, const void *buf_, size_t size,
+                uintptr_t ofs, bool ascii)
+{
+    const uint8_t *buf = buf_;
+    const size_t per_line = 16; /* Maximum bytes per line. */
+
+    while (size > 0) {
+        size_t start, end, n;
+        size_t i;
+
+        /* Number of bytes on this line. */
+        start = ofs % per_line;
+        end = per_line;
+        if (end - start > size) {
+            end = start + size;
+        }
+        n = end - start;
+
+        /* Print line. */
+        ds_put_format(ds, "%08jx ", (uintmax_t) ROUND_DOWN(ofs, per_line));
+
+        /* Each byte occupies three columns in the hex area (two digits plus
+         * a separator), so skipped leading bytes need three blanks each. */
+        ds_put_char_multiple(ds, ' ', 3 * start);
+        for (i = start; i < end; i++) {
+            ds_put_format(ds, "%02hhx%c",
+                          buf[i - start], i == per_line / 2 - 1 ? '-' : ' ');
+        }
+        if (ascii) {
+            /* Pad out the hex area for missing trailing bytes (three columns
+             * each) so the ASCII gutter starts in a fixed column. */
+            ds_put_char_multiple(ds, ' ', 3 * (per_line - end));
+            ds_put_char(ds, '|');
+            /* The ASCII area uses one column per byte. */
+            ds_put_char_multiple(ds, ' ', start);
+            for (i = start; i < end; i++) {
+                int c = buf[i - start];
+                ds_put_char(ds, c >= 32 && c < 127 ? c : '.');
+            }
+            ds_put_char_multiple(ds, ' ', per_line - end);
+            ds_put_char(ds, '|');
+        }
+        ds_put_char(ds, '\n');
+
+        ofs += n;
+        buf += n;
+        size -= n;
+    }
+}
+
+/* Returns the last byte of 'ds' as an unsigned char value, or EOF if 'ds' is
+ * empty. */
+int
+ds_last(const struct ds *ds)
+{
+    if (ds->length > 0) {
+        return (unsigned char) ds->string[ds->length - 1];
+    } else {
+        return EOF;
+    }
+}
+
+/* Removes the last byte of 'ds' if it equals 'c' (after conversion to char).
+ * At most one byte is removed. */
+void
+ds_chomp(struct ds *ds, int c)
+{
+    if (ds->length == 0 || ds->string[ds->length - 1] != (char) c) {
+        return;
+    }
+
+    ds->length--;
+    ds->string[ds->length] = '\0';
+}
diff --git a/lib/dynamic-string.h b/lib/dynamic-string.h
new file mode 100644
index 000000000..30da5fad9
--- /dev/null
+++ b/lib/dynamic-string.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef DYNAMIC_STRING_H
+#define DYNAMIC_STRING_H 1
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include "compiler.h"
+
+struct tm;
+
+/* A dynamically growing string.  Initialize with ds_init() or
+ * DS_EMPTY_INITIALIZER before use. */
+struct ds {
+    char *string;       /* Null-terminated string. */
+    size_t length;      /* Bytes used, not including null terminator. */
+    size_t allocated;   /* Bytes allocated, not including null terminator. */
+};
+
+#define DS_EMPTY_INITIALIZER { NULL, 0, 0 }
+
+void ds_init(struct ds *);
+void ds_clear(struct ds *);
+void ds_truncate(struct ds *, size_t new_length);
+void ds_reserve(struct ds *, size_t min_length);
+char *ds_put_uninit(struct ds *, size_t n);
+void ds_put_char(struct ds *, char);
+void ds_put_char_multiple(struct ds *, char, size_t n);
+void ds_put_buffer(struct ds *, const char *, size_t n);
+void ds_put_cstr(struct ds *, const char *);
+void ds_put_format(struct ds *, const char *, ...) PRINTF_FORMAT(2, 3);
+void ds_put_format_valist(struct ds *, const char *, va_list)
+ PRINTF_FORMAT(2, 0);
+void ds_put_printable(struct ds *, const char *, size_t);
+void ds_put_strftime(struct ds *, const char *, const struct tm *)
+ STRFTIME_FORMAT(2);
+void ds_put_hex_dump(struct ds *ds, const void *buf_, size_t size,
+ uintptr_t ofs, bool ascii);
+int ds_get_line(struct ds *, FILE *);
+
+char *ds_cstr(struct ds *);
+void ds_destroy(struct ds *);
+
+int ds_last(const struct ds *);
+void ds_chomp(struct ds *, int c);
+
+#endif /* dynamic-string.h */
diff --git a/lib/fatal-signal.c b/lib/fatal-signal.c
new file mode 100644
index 000000000..ccb7fe451
--- /dev/null
+++ b/lib/fatal-signal.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <config.h>
+#include "fatal-signal.h"
+#include <assert.h>
+#include <errno.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "util.h"
+
+/* Signals to catch. */
+static const int fatal_signals[] = { SIGTERM, SIGINT, SIGHUP, SIGALRM };
+
+/* Signals to catch as a sigset_t. */
+static sigset_t fatal_signal_set;
+
+/* Hooks to call upon catching a signal */
+struct hook {
+    void (*func)(void *aux);    /* Callback to invoke. */
+    void *aux;                  /* Argument passed to 'func'. */
+    bool run_at_exit;           /* Also invoke 'func' from the atexit()
+                                 * handler, not only on a signal? */
+};
+#define MAX_HOOKS 32
+static struct hook hooks[MAX_HOOKS];
+static size_t n_hooks;
+
+/* Number of nesting signal blockers. */
+static int block_level = 0;
+
+/* Signal mask saved by outermost signal blocker. */
+static sigset_t saved_signal_mask;
+
+/* Disabled by fatal_signal_fork()? */
+static bool disabled;
+
+static void call_sigprocmask(int how, sigset_t* new_set, sigset_t* old_set);
+static void atexit_handler(void);
+static void call_hooks(int sig_nr);
+
+/* Registers 'hook' to be called when a process termination signal is raised.
+ * If 'run_at_exit' is true, 'hook' is also called during normal process
+ * termination, e.g. when exit() is called or when main() returns. */
+void
+fatal_signal_add_hook(void (*func)(void *aux), void *aux, bool run_at_exit)
+{
+    struct hook *h;
+
+    /* Keep fatal signals out while the hook table is being modified. */
+    fatal_signal_block();
+    assert(n_hooks < MAX_HOOKS);
+    h = &hooks[n_hooks];
+    h->func = func;
+    h->aux = aux;
+    h->run_at_exit = run_at_exit;
+    /* Publish the new entry only after it is completely filled in. */
+    n_hooks++;
+    fatal_signal_unblock();
+}
+
+/* Blocks program termination signals until fatal_signal_unblock() is called.
+ * May be called multiple times with nesting; if so, fatal_signal_unblock()
+ * must be called the same number of times to unblock signals.
+ *
+ * This is needed while adjusting a data structure that will be accessed by a
+ * fatal signal hook, so that the hook is not invoked while the data structure
+ * is in an inconsistent state. */
+void
+fatal_signal_block(void)
+{
+    static bool inited = false;
+    if (!inited) {
+        size_t i;
+
+        /* One-time setup on the first call: build the signal set, install
+         * handlers, and register the atexit() hook. */
+        inited = true;
+        sigemptyset(&fatal_signal_set);
+        for (i = 0; i < ARRAY_SIZE(fatal_signals); i++) {
+            int sig_nr = fatal_signals[i];
+            struct sigaction old_sa;
+
+            sigaddset(&fatal_signal_set, sig_nr);
+            /* Query the current disposition without changing it. */
+            if (sigaction(sig_nr, NULL, &old_sa)) {
+                ovs_fatal(errno, "sigaction");
+            }
+            /* Take over only signals that still have default handling, so
+             * that handlers installed by other code and ignored signals are
+             * left alone. */
+            if (old_sa.sa_handler == SIG_DFL
+                && signal(sig_nr, fatal_signal_handler) == SIG_ERR) {
+                ovs_fatal(errno, "signal");
+            }
+        }
+        atexit(atexit_handler);
+    }
+
+    /* Only the outermost blocker changes the process signal mask; it saves
+     * the previous mask for fatal_signal_unblock() to restore. */
+    if (++block_level == 1) {
+        call_sigprocmask(SIG_BLOCK, &fatal_signal_set, &saved_signal_mask);
+    }
+}
+
+/* Unblocks the program termination signals that were blocked by
+ * fatal_signal_block().  If multiple calls to fatal_signal_block() are
+ * nested, fatal_signal_unblock() must be called the same number of times to
+ * unblock signals. */
+void
+fatal_signal_unblock(void)
+{
+    assert(block_level > 0);
+
+    block_level--;
+    if (!block_level) {
+        /* Outermost unblock: restore the mask saved by the matching
+         * outermost fatal_signal_block(). */
+        call_sigprocmask(SIG_SETMASK, &saved_signal_mask, NULL);
+    }
+}
+
+/* Handles fatal signal number 'sig_nr'.
+ *
+ * Ordinarily this is the actual signal handler. When other code needs to
+ * handle one of our signals, however, it can register for that signal and, if
+ * and when necessary, call this function to do fatal signal processing for it
+ * and terminate the process. Currently only timeval.c does this, for SIGALRM.
+ * (It is not important whether the other code sets up its signal handler
+ * before or after this file, because this file will only set up a signal
+ * handler in the case where the signal has its default handling.) */
+void
+fatal_signal_handler(int sig_nr)
+{
+    /* Run the registered hooks first, while the process is still alive. */
+    call_hooks(sig_nr);
+
+    /* Re-raise the signal with the default handling so that the program
+     * termination status reflects that we were killed by this signal */
+    signal(sig_nr, SIG_DFL);
+    raise(sig_nr);
+}
+
+/* atexit() callback registered by fatal_signal_block().  Runs the hooks for
+ * normal termination (signal number 0) unless fatal_signal_fork() has
+ * disabled them. */
+static void
+atexit_handler(void)
+{
+    if (disabled) {
+        return;
+    }
+    call_hooks(0);
+}
+
+/* Invokes the registered hooks, in registration order, at most once per
+ * process.
+ *
+ * With a nonzero 'sig_nr' (signal delivery) every hook runs.  With 'sig_nr'
+ * of 0 (normal exit) only hooks registered with 'run_at_exit' run. */
+static void
+call_hooks(int sig_nr)
+{
+    static volatile sig_atomic_t recurse = 0;
+    size_t i;
+
+    if (recurse) {
+        /* Already running (or already ran): do nothing. */
+        return;
+    }
+    recurse = 1;
+
+    for (i = 0; i < n_hooks; i++) {
+        struct hook *h = &hooks[i];
+
+        if (sig_nr || h->run_at_exit) {
+            h->func(h->aux);
+        }
+    }
+}
+
+static char **files;
+static size_t n_files, max_files;
+
+static void unlink_files(void *aux);
+static void do_unlink_files(void);
+
+/* Registers 'file' to be unlinked when the program terminates via exit() or a
+ * fatal signal. */
+void
+fatal_signal_add_file_to_unlink(const char *file)
+{
+    static bool added_hook = false;
+    if (!added_hook) {
+        /* First use: register the hook that unlinks the files, both on a
+         * fatal signal and at normal exit. */
+        added_hook = true;
+        fatal_signal_add_hook(unlink_files, NULL, true);
+    }
+
+    /* Block fatal signals while the array is being modified so that the
+     * hook never sees it in an inconsistent state. */
+    fatal_signal_block();
+    if (n_files >= max_files) {
+        files = x2nrealloc(files, &max_files, sizeof *files);
+    }
+    /* Store a private copy; the caller keeps ownership of 'file'. */
+    files[n_files++] = xstrdup(file);
+    fatal_signal_unblock();
+}
+
+/* Unregisters 'file' from being unlinked when the program terminates via
+ * exit() or a fatal signal. */
+void
+fatal_signal_remove_file_to_unlink(const char *file)
+{
+    size_t i;
+
+    /* Block fatal signals while the array is being modified. */
+    fatal_signal_block();
+    for (i = 0; i < n_files; i++) {
+        if (!strcmp(files[i], file)) {
+            free(files[i]);
+            /* Fill the hole with the last entry; order is not preserved. */
+            files[i] = files[--n_files];
+            /* Only the first matching entry is removed. */
+            break;
+        }
+    }
+    fatal_signal_unblock();
+}
+
+/* Hook registered by fatal_signal_add_file_to_unlink(); adapts the hook
+ * signature to do_unlink_files().  'aux' is ignored. */
+static void
+unlink_files(void *aux UNUSED)
+{
+    do_unlink_files();
+}
+
+/* Calls unlink() on every registered file.  Return values are ignored, and
+ * the list itself is left unchanged. */
+static void
+do_unlink_files(void)
+{
+    size_t i = 0;
+
+    while (i < n_files) {
+        unlink(files[i]);
+        i++;
+    }
+}
+
+/* Disables the fatal signal hook mechanism. Following a fork, one of the
+ * resulting processes can call this function to allow it to terminate without
+ * triggering fatal signal processing or removing files. Fatal signal
+ * processing is still enabled in the other process. */
+void
+fatal_signal_fork(void)
+{
+    size_t i;
+
+    /* Makes atexit_handler() skip the hooks in this process. */
+    disabled = true;
+
+    for (i = 0; i < ARRAY_SIZE(fatal_signals); i++) {
+        int sig_nr = fatal_signals[i];
+        /* Reset the signal to its default action.  signal() returns the
+         * previous disposition; if the signal was being ignored, put the
+         * ignore back. */
+        if (signal(sig_nr, SIG_DFL) == SIG_IGN) {
+            signal(sig_nr, SIG_IGN);
+        }
+    }
+}
+
+/* Wrapper around sigprocmask() that reports a failure on stderr instead of
+ * returning it to the caller. */
+static void
+call_sigprocmask(int how, sigset_t* new_set, sigset_t* old_set)
+{
+    if (sigprocmask(how, new_set, old_set) != 0) {
+        fprintf(stderr, "sigprocmask: %s\n", strerror(errno));
+    }
+}
diff --git a/lib/fatal-signal.h b/lib/fatal-signal.h
new file mode 100644
index 000000000..a92b10ddb
--- /dev/null
+++ b/lib/fatal-signal.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef FATAL_SIGNAL_H
+#define FATAL_SIGNAL_H 1
+
+#include <stdbool.h>
+
+/* Basic interface. */
+void fatal_signal_add_hook(void (*)(void *aux), void *aux, bool run_at_exit);
+void fatal_signal_block(void);
+void fatal_signal_unblock(void);
+void fatal_signal_fork(void);
+
+/* Convenience functions for unlinking files upon termination.
+ *
+ * These functions also unlink the files upon normal process termination via
+ * exit(). */
+void fatal_signal_add_file_to_unlink(const char *);
+void fatal_signal_remove_file_to_unlink(const char *);
+
+/* Interface for other code that catches one of our signals and needs to pass
+ * it through. */
+void fatal_signal_handler(int sig_nr);
+
+#endif /* fatal-signal.h */
diff --git a/lib/fault.c b/lib/fault.c
new file mode 100644
index 000000000..6a0ff143c
--- /dev/null
+++ b/lib/fault.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "fault.h"
+#include <dlfcn.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <signal.h>
+#include "util.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_fault
+
+/* Signal handler installed by register_fault_handlers().  Logs the signal
+ * number and a backtrace, flushes stdout and stderr, and then re-raises the
+ * signal with its default action. */
+static void
+fault_handler(int sig_nr)
+{
+    VLOG_EMER("Caught signal %d.", sig_nr);
+    log_backtrace();
+    fflush(stdout);
+    fflush(stderr);
+
+    /* Restore default handling before re-raising, so that this handler is
+     * not entered again for the same signal. */
+    signal(sig_nr, SIG_DFL);
+    raise(sig_nr);
+}
+
+/* Writes a backtrace of the current call stack to stderr, one return address
+ * per line.  When dladdr() can name the containing symbol, the symbol name
+ * and the offset into it are printed as well.
+ *
+ * The walk starts at __builtin_frame_address(0) and follows the saved frame
+ * pointers until a null one is reached. */
+void
+log_backtrace(void)
+{
+    /* During the loop:
+
+       frame[0] points to the next frame.
+       frame[1] points to the return address. */
+    void **frame;
+    for (frame = __builtin_frame_address(0);
+         frame != NULL && frame[0] != NULL;
+         frame = frame[0]) {
+        Dl_info addrinfo;
+        if (!dladdr(frame[1], &addrinfo) || !addrinfo.dli_sname) {
+            fprintf(stderr, " 0x%08"PRIxPTR"\n", (uintptr_t) frame[1]);
+        } else {
+            /* The offset is computed and printed as a uintptr_t.  A raw
+             * pointer difference has type ptrdiff_t, which does not match
+             * "%x" where pointers are wider than unsigned int. */
+            fprintf(stderr, " 0x%08"PRIxPTR" (%s+0x%"PRIxPTR")\n",
+                    (uintptr_t) frame[1], addrinfo.dli_sname,
+                    (uintptr_t) frame[1] - (uintptr_t) addrinfo.dli_saddr);
+        }
+    }
+    fflush(stderr);
+}
+
+/* Installs fault_handler() for SIGABRT, SIGBUS, SIGFPE, SIGILL, and
+ * SIGSEGV. */
+void
+register_fault_handlers(void)
+{
+    static const int fault_signals[] = {
+        SIGABRT, SIGBUS, SIGFPE, SIGILL, SIGSEGV
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof fault_signals / sizeof fault_signals[0]; i++) {
+        signal(fault_signals[i], fault_handler);
+    }
+}
diff --git a/lib/fault.h b/lib/fault.h
new file mode 100644
index 000000000..fa984d40a
--- /dev/null
+++ b/lib/fault.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef FAULT_H
+#define FAULT_H 1
+
+void register_fault_handlers(void);
+void log_backtrace(void);
+
+#endif /* fault.h */
diff --git a/lib/flow.c b/lib/flow.c
new file mode 100644
index 000000000..e55f4fe3e
--- /dev/null
+++ b/lib/flow.c
@@ -0,0 +1,280 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <config.h>
+#include <sys/types.h>
+#include "flow.h"
+#include <inttypes.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <string.h>
+#include "coverage.h"
+#include "dynamic-string.h"
+#include "hash.h"
+#include "ofpbuf.h"
+#include "openflow/openflow.h"
+#include "openvswitch/datapath-protocol.h"
+#include "packets.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_flow
+
+/* Pulls an IP header, including any options, off the front of 'packet'.
+ *
+ * Returns the value of ofpbuf_pull() for the header, or a null pointer if
+ * 'packet' is shorter than a minimal IP header, if the header length field
+ * is below the minimum, or if it exceeds the bytes available. */
+static struct ip_header *
+pull_ip(struct ofpbuf *packet)
+{
+    struct ip_header *ip;
+    int ip_len;
+
+    if (packet->size < IP_HEADER_LEN) {
+        return NULL;
+    }
+
+    ip = packet->data;
+    /* The IHL field counts 32-bit words. */
+    ip_len = IP_IHL(ip->ip_ihl_ver) * 4;
+    if (ip_len < IP_HEADER_LEN || packet->size < ip_len) {
+        return NULL;
+    }
+    return ofpbuf_pull(packet, ip_len);
+}
+
+/* Pulls a TCP header, including any options, off the front of 'packet'.
+ *
+ * Returns the value of ofpbuf_pull() for the header, or a null pointer if
+ * 'packet' is shorter than a minimal TCP header, if the data offset field is
+ * below the minimum, or if it exceeds the bytes available. */
+static struct tcp_header *
+pull_tcp(struct ofpbuf *packet)
+{
+    struct tcp_header *tcp;
+    int tcp_len;
+
+    if (packet->size < TCP_HEADER_LEN) {
+        return NULL;
+    }
+
+    tcp = packet->data;
+    /* The data offset field counts 32-bit words. */
+    tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4;
+    if (tcp_len < TCP_HEADER_LEN || packet->size < tcp_len) {
+        return NULL;
+    }
+    return ofpbuf_pull(packet, tcp_len);
+}
+
+/* Tries to pull a UDP_HEADER_LEN-byte UDP header off the front of 'packet'
+ * with ofpbuf_try_pull().  flow_extract() treats a null result as "no UDP
+ * header". */
+static struct udp_header *
+pull_udp(struct ofpbuf *packet)
+{
+    return ofpbuf_try_pull(packet, UDP_HEADER_LEN);
+}
+
+/* Tries to pull an ICMP_HEADER_LEN-byte ICMP header off the front of 'packet'
+ * with ofpbuf_try_pull().  flow_extract() treats a null result as "no ICMP
+ * header". */
+static struct icmp_header *
+pull_icmp(struct ofpbuf *packet)
+{
+    return ofpbuf_try_pull(packet, ICMP_HEADER_LEN);
+}
+
+/* Tries to pull an ETH_HEADER_LEN-byte Ethernet header off the front of
+ * 'packet' with ofpbuf_try_pull().  flow_extract() treats a null result as
+ * "no Ethernet header". */
+static struct eth_header *
+pull_eth(struct ofpbuf *packet)
+{
+    return ofpbuf_try_pull(packet, ETH_HEADER_LEN);
+}
+
+/* Tries to pull a VLAN_HEADER_LEN-byte VLAN header off the front of 'packet'
+ * with ofpbuf_try_pull().  flow_extract() treats a null result as "no VLAN
+ * header". */
+static struct vlan_header *
+pull_vlan(struct ofpbuf *packet)
+{
+    return ofpbuf_try_pull(packet, VLAN_HEADER_LEN);
+}
+
+/* Fills in 'flow' from the headers of 'packet', which arrived on 'in_port',
+ * and sets the 'l2', 'l3', 'l4', and 'l7' pointers in 'packet' to the
+ * headers that were found (layers not found are left null).
+ *
+ * Parsing works on a local copy of the ofpbuf structure, so the 'data' and
+ * 'size' members of 'packet' itself are not advanced.
+ *
+ * Returns 1 if 'packet' is an IP fragment, 0 otherwise. */
+int
+flow_extract(struct ofpbuf *packet, uint16_t in_port, flow_t *flow)
+{
+    struct ofpbuf b = *packet;
+    struct eth_header *eth;
+    int retval = 0;
+
+    COVERAGE_INC(flow_extract);
+
+    /* Start from an all-zero key so that fields not set below (and any
+     * padding) have a fixed value. */
+    memset(flow, 0, sizeof *flow);
+    flow->dl_vlan = htons(OFP_VLAN_NONE);
+    flow->in_port = in_port;
+
+    packet->l2 = b.data;
+    packet->l3 = NULL;
+    packet->l4 = NULL;
+    packet->l7 = NULL;
+
+    eth = pull_eth(&b);
+    if (eth) {
+        if (ntohs(eth->eth_type) >= OFP_DL_TYPE_ETH2_CUTOFF) {
+            /* This is an Ethernet II frame */
+            flow->dl_type = eth->eth_type;
+        } else {
+            /* This is an 802.2 frame */
+            struct llc_header *llc = ofpbuf_at(&b, 0, sizeof *llc);
+            struct snap_header *snap = ofpbuf_at(&b, sizeof *llc,
+                                                 sizeof *snap);
+            if (llc == NULL) {
+                /* No LLC header: give up.  Note that the Ethernet addresses
+                 * have not been copied into 'flow' at this point. */
+                return 0;
+            }
+            if (snap
+                && llc->llc_dsap == LLC_DSAP_SNAP
+                && llc->llc_ssap == LLC_SSAP_SNAP
+                && llc->llc_cntl == LLC_CNTL_SNAP
+                && !memcmp(snap->snap_org, SNAP_ORG_ETHERNET,
+                           sizeof snap->snap_org)) {
+                /* LLC/SNAP carrying an Ethernet type. */
+                flow->dl_type = snap->snap_type;
+                ofpbuf_pull(&b, LLC_SNAP_HEADER_LEN);
+            } else {
+                /* Plain LLC: there is no Ethernet type to report. */
+                flow->dl_type = htons(OFP_DL_TYPE_NOT_ETH_TYPE);
+                ofpbuf_pull(&b, sizeof(struct llc_header));
+            }
+        }
+
+        /* Check for a VLAN tag */
+        if (flow->dl_type == htons(ETH_TYPE_VLAN)) {
+            struct vlan_header *vh = pull_vlan(&b);
+            if (vh) {
+                /* Report the encapsulated type and only the VLAN ID bits of
+                 * the TCI. */
+                flow->dl_type = vh->vlan_next_type;
+                flow->dl_vlan = vh->vlan_tci & htons(VLAN_VID_MASK);
+            }
+        }
+        memcpy(flow->dl_src, eth->eth_src, ETH_ADDR_LEN);
+        memcpy(flow->dl_dst, eth->eth_dst, ETH_ADDR_LEN);
+
+        /* 'l3' is set for every frame with an Ethernet header, whether or
+         * not the payload turns out to be IP. */
+        packet->l3 = b.data;
+        if (flow->dl_type == htons(ETH_TYPE_IP)) {
+            const struct ip_header *nh = pull_ip(&b);
+            if (nh) {
+                flow->nw_src = nh->ip_src;
+                flow->nw_dst = nh->ip_dst;
+                flow->nw_proto = nh->ip_proto;
+                packet->l4 = b.data;
+                /* Transport headers are parsed only for unfragmented
+                 * packets. */
+                if (!IP_IS_FRAGMENT(nh->ip_frag_off)) {
+                    if (flow->nw_proto == IP_TYPE_TCP) {
+                        const struct tcp_header *tcp = pull_tcp(&b);
+                        if (tcp) {
+                            flow->tp_src = tcp->tcp_src;
+                            flow->tp_dst = tcp->tcp_dst;
+                            packet->l7 = b.data;
+                        } else {
+                            /* Avoid tricking other code into thinking that
+                             * this packet has an L4 header. */
+                            flow->nw_proto = 0;
+                        }
+                    } else if (flow->nw_proto == IP_TYPE_UDP) {
+                        const struct udp_header *udp = pull_udp(&b);
+                        if (udp) {
+                            flow->tp_src = udp->udp_src;
+                            flow->tp_dst = udp->udp_dst;
+                            packet->l7 = b.data;
+                        } else {
+                            /* Avoid tricking other code into thinking that
+                             * this packet has an L4 header. */
+                            flow->nw_proto = 0;
+                        }
+                    } else if (flow->nw_proto == IP_TYPE_ICMP) {
+                        const struct icmp_header *icmp = pull_icmp(&b);
+                        if (icmp) {
+                            /* The ICMP type and code are passed through
+                             * htons() before being stored. */
+                            flow->icmp_type = htons(icmp->icmp_type);
+                            flow->icmp_code = htons(icmp->icmp_code);
+                            packet->l7 = b.data;
+                        } else {
+                            /* Avoid tricking other code into thinking that
+                             * this packet has an L4 header. */
+                            flow->nw_proto = 0;
+                        }
+                    }
+                } else {
+                    retval = 1;
+                }
+            }
+        }
+    }
+    return retval;
+}
+
+/* Extracts the flow stats for a packet. The 'flow' and 'packet'
+ * arguments must have been initialized through a call to flow_extract().
+ */
+void
+flow_extract_stats(const flow_t *flow, struct ofpbuf *packet,
+                   struct odp_flow_stats *stats)
+{
+    const struct ip_header *ip;
+
+    /* Every packet counts once, for its full size. */
+    memset(stats, 0, sizeof *stats);
+    stats->n_packets = 1;
+    stats->n_bytes = packet->size;
+
+    if (flow->dl_type != htons(ETH_TYPE_IP) || !packet->l4) {
+        /* No IP header was found: no TOS or TCP flags to report. */
+        return;
+    }
+
+    ip = packet->l3;
+    stats->ip_tos = ip->ip_tos;
+    if (flow->nw_proto == IP_TYPE_TCP && packet->l7) {
+        const struct tcp_header *tcp = packet->l4;
+        stats->tcp_flags = TCP_FLAGS(tcp->tcp_ctl);
+    }
+}
+
+/* Fills in OpenFlow 'match' from 'flow' and 'wildcards'.
+ *
+ * 'wildcards' and the input port are converted to network byte order, and
+ * the datapath's local port number is translated to OFPP_LOCAL.  The other
+ * fields are copied without conversion. */
+void
+flow_to_match(const flow_t *flow, uint32_t wildcards, struct ofp_match *match)
+{
+    uint16_t in_port = flow->in_port;
+
+    if (in_port == ODPP_LOCAL) {
+        in_port = OFPP_LOCAL;
+    }
+
+    match->wildcards = htonl(wildcards);
+    match->in_port = htons(in_port);
+
+    /* Link layer. */
+    memcpy(match->dl_src, flow->dl_src, ETH_ADDR_LEN);
+    memcpy(match->dl_dst, flow->dl_dst, ETH_ADDR_LEN);
+    match->dl_vlan = flow->dl_vlan;
+    match->dl_type = flow->dl_type;
+
+    /* Network and transport layers. */
+    match->nw_proto = flow->nw_proto;
+    match->nw_src = flow->nw_src;
+    match->nw_dst = flow->nw_dst;
+    match->tp_src = flow->tp_src;
+    match->tp_dst = flow->tp_dst;
+
+    match->pad = 0;
+}
+
+/* Fills in 'flow' from OpenFlow 'match'.  If 'wildcards' is nonnull, the
+ * wildcards from 'match' are stored there in host byte order.
+ *
+ * The input port is converted to host byte order, and OFPP_LOCAL is
+ * translated to the datapath's local port number.  The other fields are
+ * copied without conversion. */
+void
+flow_from_match(flow_t *flow, uint32_t *wildcards,
+                const struct ofp_match *match)
+{
+    if (wildcards) {
+        *wildcards = ntohl(match->wildcards);
+    }
+
+    if (match->in_port == htons(OFPP_LOCAL)) {
+        flow->in_port = ODPP_LOCAL;
+    } else {
+        flow->in_port = ntohs(match->in_port);
+    }
+
+    /* Link layer. */
+    memcpy(flow->dl_src, match->dl_src, ETH_ADDR_LEN);
+    memcpy(flow->dl_dst, match->dl_dst, ETH_ADDR_LEN);
+    flow->dl_vlan = match->dl_vlan;
+    flow->dl_type = match->dl_type;
+
+    /* Network and transport layers. */
+    flow->nw_proto = match->nw_proto;
+    flow->nw_src = match->nw_src;
+    flow->nw_dst = match->nw_dst;
+    flow->tp_src = match->tp_src;
+    flow->tp_dst = match->tp_dst;
+
+    flow->reserved = 0;
+}
+
+/* Returns 'flow' formatted by flow_format() as a null-terminated string.
+ * The string is the buffer of a local dynamic string that is never
+ * destroyed here, so the caller takes it over; flow_print() releases it with
+ * free(). */
+char *
+flow_to_string(const flow_t *flow)
+{
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    flow_format(&ds, flow);
+    return ds_cstr(&ds);
+}
+
+/* Appends a one-line description of 'flow' to 'ds': input port, VLAN,
+ * Ethernet source and destination, Ethernet type, IP protocol, IP source
+ * and destination, and transport source and destination ports.  The VLAN,
+ * Ethernet type, and transport ports are passed through ntohs() before
+ * printing. */
+void
+flow_format(struct ds *ds, const flow_t *flow)
+{
+    ds_put_format(ds, "port%04x:vlan%d mac"ETH_ADDR_FMT"->"ETH_ADDR_FMT" "
+                  "type%04x proto%"PRId8" ip"IP_FMT"->"IP_FMT" port%d->%d",
+                  flow->in_port, ntohs(flow->dl_vlan),
+                  ETH_ADDR_ARGS(flow->dl_src), ETH_ADDR_ARGS(flow->dl_dst),
+                  ntohs(flow->dl_type), flow->nw_proto,
+                  IP_ARGS(&flow->nw_src), IP_ARGS(&flow->nw_dst),
+                  ntohs(flow->tp_src), ntohs(flow->tp_dst));
+}
+
+/* Writes the flow_to_string() description of 'flow' to 'stream', with no
+ * trailing new-line. */
+void
+flow_print(FILE *stream, const flow_t *flow)
+{
+    char *s = flow_to_string(flow);
+    fputs(s, stream);
+    /* The string returned by flow_to_string() is ours to free. */
+    free(s);
+}
diff --git a/lib/flow.h b/lib/flow.h
new file mode 100644
index 000000000..c7ec77014
--- /dev/null
+++ b/lib/flow.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifndef FLOW_H
+#define FLOW_H 1
+
+#include <netinet/in.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include "openflow/openflow.h"
+#include "hash.h"
+#include "openflow/openflow.h"
+#include "openvswitch/datapath-protocol.h"
+#include "util.h"
+
+struct ds;
+struct ofp_match;
+struct ofpbuf;
+
+typedef struct odp_flow_key flow_t;
+
+/* Functions that take a packet buffer (defined outside this header). */
+int flow_extract(struct ofpbuf *, uint16_t in_port, flow_t *);
+void flow_extract_stats(const flow_t *flow, struct ofpbuf *packet,
+ struct odp_flow_stats *stats);
+
+/* Conversion between flow_t and struct ofp_match.  'wildcards' is in host
+ * byte order on both sides of the conversion. */
+void flow_to_match(const flow_t *, uint32_t wildcards, struct ofp_match *);
+void flow_from_match(flow_t *, uint32_t *wildcards, const struct ofp_match *);
+
+/* Formatting.  flow_to_string() returns a string that the caller must
+ * free(). */
+char *flow_to_string(const flow_t *);
+void flow_format(struct ds *, const flow_t *);
+void flow_print(FILE *, const flow_t *);
+
+/* Comparison and hashing, defined inline below. */
+static inline int flow_compare(const flow_t *, const flow_t *);
+static inline bool flow_equal(const flow_t *, const flow_t *);
+static inline size_t flow_hash(const flow_t *, uint32_t basis);
+
+/* Compares 'a' and 'b' byte by byte over the whole structure.  Returns a
+ * negative, zero, or positive value in the manner of memcmp(). */
+static inline int
+flow_compare(const flow_t *a, const flow_t *b)
+{
+    const size_t n_bytes = sizeof *a;
+
+    return memcmp(a, b, n_bytes);
+}
+
+/* Returns true if flow_compare() finds no difference between 'a' and 'b',
+ * false otherwise. */
+static inline bool
+flow_equal(const flow_t *a, const flow_t *b)
+{
+    return flow_compare(a, b) == 0;
+}
+
+/* Returns a hash of 'flow', starting from 'basis', computed by treating the
+ * structure as an array of 32-bit words.  The build fails if the size of
+ * flow_t is not a multiple of 4 bytes. */
+static inline size_t
+flow_hash(const flow_t *flow, uint32_t basis)
+{
+    BUILD_ASSERT_DECL(!(sizeof *flow % sizeof(uint32_t)));
+    const uint32_t *words = (const uint32_t *) flow;
+    const size_t n_words = sizeof *flow / sizeof(uint32_t);
+
+    return hash_words(words, n_words, basis);
+}
+
+/* Information on wildcards for a flow, as a supplement to flow_t.
+ *
+ * flow_wildcards_init() derives both masks from 'wildcards' with
+ * flow_nw_bits_to_mask(), so the masks are in network byte order even though
+ * 'wildcards' itself is in host byte order. */
+struct flow_wildcards {
+ uint32_t wildcards; /* enum ofp_flow_wildcards (in host order). */
+ uint32_t nw_src_mask; /* 1-bit in each significant nw_src bit. */
+ uint32_t nw_dst_mask; /* 1-bit in each significant nw_dst bit. */
+};
+
+/* Converts a wildcard bit count into an address mask.
+ *
+ * The count is read from the 6 bits of 'wildcards' starting at bit 'shift'
+ * (that is, bits 'shift' through 'shift + 5').  It uses the encoding of enum
+ * ofp_flow_wildcards: 0 means exact match, 1 ignores the least-significant
+ * bit, 2 ignores the 2 least-significant bits, and so on; 32 and above
+ * wildcard the whole field.  Note that this is the *opposite* of the usual
+ * prefix-length convention, in which /24 wildcards 8 bits rather than 24.
+ *
+ * 'wildcards' is in host byte order.  The result is in network byte order,
+ * with a 1 in every bit that must match and a 0 in every wildcarded bit. */
+static inline uint32_t
+flow_nw_bits_to_mask(uint32_t wildcards, int shift)
+{
+    const uint32_t n_wild = (wildcards >> shift) & 0x3f;
+
+    if (n_wild >= 32) {
+        /* Entire field wildcarded (also avoids shifting by the type width). */
+        return 0;
+    }
+    return htonl(~((1u << n_wild) - 1));
+}
+
+/* Initializes 'wc' from 'wildcards' (in host byte order).  Bits outside
+ * OFPFW_ALL are discarded, and the nw_src and nw_dst masks are derived from
+ * the remaining bits with flow_nw_bits_to_mask(). */
+static inline void
+flow_wildcards_init(struct flow_wildcards *wc, uint32_t wildcards)
+{
+    const uint32_t valid = wildcards & OFPFW_ALL;
+
+    wc->wildcards = valid;
+    wc->nw_src_mask = flow_nw_bits_to_mask(valid, OFPFW_NW_SRC_SHIFT);
+    wc->nw_dst_mask = flow_nw_bits_to_mask(valid, OFPFW_NW_DST_SHIFT);
+}
+
+#endif /* flow.h */
diff --git a/lib/hash.c b/lib/hash.c
new file mode 100644
index 000000000..af8f21bb1
--- /dev/null
+++ b/lib/hash.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <config.h>
+#include "hash.h"
+#include <string.h>
+
+/* Returns the hash of the 'n' 32-bit words at 'p', starting from 'basis'.
+ * 'p' must be properly aligned.
+ *
+ * Words are consumed three at a time through HASH_MIX; the final one to
+ * three words go through HASH_FINAL instead.  When 'n' is 0 neither macro
+ * runs and the initial value derived from 'basis' is returned unchanged. */
+uint32_t
+hash_words(const uint32_t *p, size_t n, uint32_t basis)
+{
+ uint32_t a, b, c;
+
+ /* Seed all three state words from the length (in bytes) and 'basis'. */
+ a = b = c = 0xdeadbeef + (((uint32_t) n) << 2) + basis;
+
+ /* "> 3", not ">= 3": an exact trailing group of three words is left for
+ * the switch below so that it receives the final mixing step. */
+ while (n > 3) {
+ a += p[0];
+ b += p[1];
+ c += p[2];
+ HASH_MIX(a, b, c);
+ n -= 3;
+ p += 3;
+ }
+
+ /* Fold in the last 0 to 3 words. */
+ switch (n) {
+ case 3:
+ c += p[2];
+ /* fall through */
+ case 2:
+ b += p[1];
+ /* fall through */
+ case 1:
+ a += p[0];
+ HASH_FINAL(a, b, c);
+ /* fall through */
+ case 0:
+ break;
+ }
+ return c;
+}
+
+/* Returns the hash of the 'n' bytes at 'p', starting from 'basis'.
+ *
+ * 'p' need not be aligned: the input is copied into a local array of three
+ * 32-bit words before it is mixed.  Because the bytes are reinterpreted as
+ * 32-bit words through memcpy(), the result depends on the host's byte
+ * order.  HASH_FINAL runs only when a partial group (1 to 11 bytes) remains
+ * after the full 12-byte groups. */
+uint32_t
+hash_bytes(const void *p_, size_t n, uint32_t basis)
+{
+ const uint8_t *p = p_;
+ uint32_t a, b, c;
+ uint32_t tmp[3];
+
+ /* Seed all three state words from the length and 'basis'. */
+ a = b = c = 0xdeadbeef + n + basis;
+
+ /* Full groups of sizeof tmp (12) bytes. */
+ while (n >= sizeof tmp) {
+ memcpy(tmp, p, sizeof tmp);
+ a += tmp[0];
+ b += tmp[1];
+ c += tmp[2];
+ HASH_MIX(a, b, c);
+ n -= sizeof tmp;
+ p += sizeof tmp;
+ }
+
+ /* Trailing partial group, zero-padded up to 12 bytes. */
+ if (n) {
+ tmp[0] = tmp[1] = tmp[2] = 0;
+ memcpy(tmp, p, n);
+ a += tmp[0];
+ b += tmp[1];
+ c += tmp[2];
+ HASH_FINAL(a, b, c);
+ }
+
+ return c;
+}
diff --git a/lib/hash.h b/lib/hash.h
new file mode 100644
index 000000000..39894d9af
--- /dev/null
+++ b/lib/hash.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifndef HASH_H
+#define HASH_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+/* This is the public domain lookup3 hash by Bob Jenkins from
+ * http://burtleburtle.net/bob/c/lookup3.c, modified for style. */
+
+/* Rotates 'x' left by 'k' bits within 32 bits.  'x' is presumably a
+ * uint32_t, as it is in every use in hash.c.  'k' must be strictly between
+ * 0 and 32, because either shift would otherwise equal the operand width.
+ * Both arguments are evaluated more than once. */
+#define HASH_ROT(x, k) (((x) << (k)) | ((x) >> (32 - (k))))
+
+/* Mixes the three state words 'a', 'b' and 'c' in place.  Each argument must
+ * be a modifiable lvalue and is evaluated several times. */
+#define HASH_MIX(a, b, c) \
+ do { \
+ a -= c; a ^= HASH_ROT(c, 4); c += b; \
+ b -= a; b ^= HASH_ROT(a, 6); a += c; \
+ c -= b; c ^= HASH_ROT(b, 8); b += a; \
+ a -= c; a ^= HASH_ROT(c, 16); c += b; \
+ b -= a; b ^= HASH_ROT(a, 19); a += c; \
+ c -= b; c ^= HASH_ROT(b, 4); b += a; \
+ } while (0)
+
+/* Final mixing of the three state words 'a', 'b' and 'c' in place; the hash
+ * functions in hash.c return 'c' afterward.  Each argument must be a
+ * modifiable lvalue and is evaluated several times. */
+#define HASH_FINAL(a, b, c) \
+ do { \
+ c ^= b; c -= HASH_ROT(b, 14); \
+ a ^= c; a -= HASH_ROT(c, 11); \
+ b ^= a; b -= HASH_ROT(a, 25); \
+ c ^= b; c -= HASH_ROT(b, 16); \
+ a ^= c; a -= HASH_ROT(c, 4); \
+ b ^= a; b -= HASH_ROT(a, 14); \
+ c ^= b; c -= HASH_ROT(b, 24); \
+ } while (0)
+
+uint32_t hash_words(const uint32_t *, size_t n_word, uint32_t basis);
+uint32_t hash_bytes(const void *, size_t n_bytes, uint32_t basis);
+
+/* Returns the hash of the null-terminated string 's', starting from 'basis'.
+ * The terminating null byte is not included in the hash. */
+static inline uint32_t hash_string(const char *s, uint32_t basis)
+{
+    const size_t length = strlen(s);
+
+    return hash_bytes(s, length, basis);
+}
+
+/* This is Bob Jenkins' integer hash from
+ * http://burtleburtle.net/bob/hash/integer.html, modified for style.
+ *
+ * Returns the mixed value of 'x' plus 'basis'.  'basis' takes no part in the
+ * mixing itself; it is only added to the result.  The order of the steps
+ * below matters and must not be changed. */
+static inline uint32_t hash_int(uint32_t x, uint32_t basis)
+{
+ x -= x << 6;
+ x ^= x >> 17;
+ x -= x << 9;
+ x ^= x << 4;
+ x -= x << 3;
+ x ^= x << 10;
+ x ^= x >> 15;
+ return x + basis;
+}
+
+#endif /* hash.h */
diff --git a/lib/hmap.c b/lib/hmap.c
new file mode 100644
index 000000000..ea08ab80e
--- /dev/null
+++ b/lib/hmap.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "hmap.h"
+#include <assert.h>
+#include <stdint.h>
+#include "coverage.h"
+#include "util.h"
+
+/* Makes 'hmap' an empty hash table.  It starts with a single bucket, the
+ * embedded 'one' member, so no memory is allocated. */
+void
+hmap_init(struct hmap *hmap)
+{
+    hmap->one = NULL;
+    hmap->buckets = &hmap->one;
+    hmap->mask = 0;
+    hmap->n = 0;
+}
+
+/* Releases the bucket array owned by 'hmap', if it has a dynamically
+ * allocated one.  A null 'hmap' is accepted and ignored.  The nodes
+ * themselves belong to the client, which must free them if necessary. */
+void
+hmap_destroy(struct hmap *hmap)
+{
+    if (!hmap) {
+        return;
+    }
+    if (hmap->buckets != &hmap->one) {
+        /* The embedded single bucket must never be passed to free(). */
+        free(hmap->buckets);
+    }
+}
+
+/* Exchanges the contents of hash maps 'a' and 'b'.
+ *
+ * A map that uses its embedded 'one' bucket holds a pointer into itself, so
+ * after the structures are exchanged such a pointer has to be redirected to
+ * the embedded bucket of the map that now owns the contents. */
+void
+hmap_swap(struct hmap *a, struct hmap *b)
+{
+    const bool a_was_embedded = a->buckets == &a->one;
+    const bool b_was_embedded = b->buckets == &b->one;
+    struct hmap saved = *a;
+
+    *a = *b;
+    *b = saved;
+
+    if (b_was_embedded) {
+        a->buckets = &a->one;
+    }
+    if (a_was_embedded) {
+        b->buckets = &b->one;
+    }
+}
+
+/* Rehashes 'hmap' into a table with 'new_mask + 1' buckets.  'new_mask' must
+ * be one less than a power of 2 and must not be SIZE_MAX.  A 'new_mask' of 0
+ * selects the embedded single bucket, so nothing is allocated.
+ *
+ * Bumps the hmap_pathological coverage counter for each old bucket that held
+ * more than 5 nodes. */
+static void
+resize(struct hmap *hmap, size_t new_mask)
+{
+    struct hmap rehashed;
+    size_t idx;
+
+    assert(!(new_mask & (new_mask + 1)));
+    assert(new_mask != SIZE_MAX);
+
+    hmap_init(&rehashed);
+    if (new_mask) {
+        const size_t n_buckets = new_mask + 1;
+
+        rehashed.buckets = xmalloc(sizeof *rehashed.buckets * n_buckets);
+        rehashed.mask = new_mask;
+        for (idx = 0; idx < n_buckets; idx++) {
+            rehashed.buckets[idx] = NULL;
+        }
+    }
+
+    for (idx = 0; idx <= hmap->mask; idx++) {
+        struct hmap_node *node = hmap->buckets[idx];
+        int chain_len = 0;
+
+        while (node) {
+            /* Save the successor first: insertion overwrites node->next. */
+            struct hmap_node *next = node->next;
+
+            hmap_insert_fast(&rehashed, node, node->hash);
+            chain_len++;
+            node = next;
+        }
+        if (chain_len > 5) {
+            COVERAGE_INC(hmap_pathological);
+        }
+    }
+
+    /* 'hmap' takes over the new table; the old bucket array is freed. */
+    hmap_swap(hmap, &rehashed);
+    hmap_destroy(&rehashed);
+}
+
+/* Returns the bucket mask suited to a hash map holding 'capacity' nodes:
+ * 'capacity / 2' with every bit below its most significant 1-bit also set,
+ * and never exactly 1. */
+static size_t
+calc_mask(size_t capacity)
+{
+    size_t mask = capacity / 2;
+    size_t shift;
+
+    /* Smear the top set bit downward with shifts of 1, 2, 4, ... up to half
+     * the width of size_t. */
+    for (shift = 1; shift < sizeof mask * 8; shift *= 2) {
+        mask |= mask >> shift;
+    }
+
+    /* If we need to dynamically allocate buckets we might as well allocate at
+     * least 4 of them. */
+    mask |= (mask & 1) << 1;
+
+    return mask;
+}
+
+/* Grows 'hmap' if its current node count calls for more buckets than it has,
+ * to optimize the performance of searches.  Otherwise does nothing. */
+void
+hmap_expand(struct hmap *hmap)
+{
+    const size_t wanted = calc_mask(hmap->n);
+
+    if (wanted <= hmap->mask) {
+        return;
+    }
+    COVERAGE_INC(hmap_expand);
+    resize(hmap, wanted);
+}
+
+/* Reduces the number of buckets in 'hmap' if its current node count calls
+ * for fewer than it has, to optimize the performance of iteration.
+ * Otherwise does nothing. */
+void
+hmap_shrink(struct hmap *hmap)
+{
+    const size_t wanted = calc_mask(hmap->n);
+
+    if (wanted >= hmap->mask) {
+        return;
+    }
+    COVERAGE_INC(hmap_shrink);
+    resize(hmap, wanted);
+}
+
+/* Grows 'hmap', if necessary, so that searches perform well once it holds up
+ * to 'n' elements.  The map is never shrunk by this function.  (Bear in mind
+ * that iteration is slow in a hash map whose allocated capacity is much
+ * higher than its current number of nodes.) */
+void
+hmap_reserve(struct hmap *hmap, size_t n)
+{
+    const size_t wanted = calc_mask(n);
+
+    if (wanted <= hmap->mask) {
+        return;
+    }
+    COVERAGE_INC(hmap_reserve);
+    resize(hmap, wanted);
+}
diff --git a/lib/hmap.h b/lib/hmap.h
new file mode 100644
index 000000000..7f4b00fc3
--- /dev/null
+++ b/lib/hmap.h
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef HMAP_H
+#define HMAP_H 1
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include "util.h"
+
+/* A hash map node, to be embedded inside the data structure being mapped.
+ *
+ * Both members are written by hmap_insert_fast().  The bucket that holds a
+ * node is selected by 'hash & mask' of the containing map, and 'next' chains
+ * together the nodes that share a bucket. */
+struct hmap_node {
+ size_t hash; /* Hash value. */
+ struct hmap_node *next; /* Next in linked list. */
+};
+
+/* Returns the hash value embedded in 'node'. */
+static inline size_t hmap_node_hash(const struct hmap_node *node)
+{
+ return node->hash;
+}
+
+/* A hash map.
+ *
+ * An empty map as set up by hmap_init() or HMAP_INITIALIZER points 'buckets'
+ * at the embedded 'one' member and has a 'mask' of 0, so it needs no dynamic
+ * allocation until it is resized. */
+struct hmap {
+ struct hmap_node **buckets; /* Array of 'mask + 1' bucket list heads. */
+ struct hmap_node *one; /* The only bucket while 'mask' is 0. */
+ size_t mask; /* Bucket index is 'hash & mask'. */
+ size_t n; /* Number of nodes currently in the map. */
+};
+
+/* Initializer for an empty hash map.  HMAP must be the address of the
+ * 'struct hmap' being initialized, because the initial 'buckets' pointer
+ * refers to that map's own 'one' member.  This yields the same state that
+ * hmap_init() sets up. */
+#define HMAP_INITIALIZER(HMAP) { &(HMAP)->one, NULL, 0, 0 }
+
+/* Initialization. */
+void hmap_init(struct hmap *);
+void hmap_destroy(struct hmap *);
+void hmap_swap(struct hmap *a, struct hmap *b);
+static inline size_t hmap_count(const struct hmap *);
+static inline bool hmap_is_empty(const struct hmap *);
+
+/* Adjusting capacity. */
+void hmap_expand(struct hmap *);
+void hmap_shrink(struct hmap *);
+void hmap_reserve(struct hmap *, size_t capacity);
+
+/* Insertion and deletion. */
+static inline void hmap_insert_fast(struct hmap *,
+ struct hmap_node *, size_t hash);
+static inline void hmap_insert(struct hmap *, struct hmap_node *, size_t hash);
+static inline void hmap_remove(struct hmap *, struct hmap_node *);
+static inline void hmap_moved(struct hmap *,
+ struct hmap_node *, struct hmap_node *);
+static inline void hmap_replace(struct hmap *, const struct hmap_node *old,
+ struct hmap_node *new);
+
+/* Search.
+ *
+ * HMAP_FOR_EACH_WITH_HASH iterates NODE over every element of HMAP whose
+ * node carries exactly the hash value HASH.  STRUCT is the element type and
+ * MEMBER is the name of its embedded 'struct hmap_node'.  Equal hash values
+ * do not imply equal keys, so the loop body presumably still has to compare
+ * keys itself.
+ *
+ * The loop ends when the address of NODE's MEMBER is a null pointer, that
+ * is, when the underlying hmap_*_with_hash() call returned NULL.
+ * NOTE(review): this relies on CONTAINER_OF (defined in util.h, not visible
+ * here) mapping a null member pointer back to such a NODE -- confirm. */
+#define HMAP_FOR_EACH_WITH_HASH(NODE, STRUCT, MEMBER, HASH, HMAP) \
+ for ((NODE) = CONTAINER_OF(hmap_first_with_hash(HMAP, HASH), \
+ STRUCT, MEMBER); \
+ &(NODE)->MEMBER != NULL; \
+ (NODE) = CONTAINER_OF(hmap_next_with_hash(&(NODE)->MEMBER), \
+ STRUCT, MEMBER))
+
+static inline struct hmap_node *hmap_first_with_hash(const struct hmap *,
+ size_t hash);
+static inline struct hmap_node *hmap_next_with_hash(const struct hmap_node *);
+
+/* Iteration.
+ *
+ * Both macros visit every element of HMAP, in arbitrary order.  STRUCT is
+ * the element type and MEMBER is the name of its embedded
+ * 'struct hmap_node'.
+ *
+ * The _SAFE version is needed when NODE may be freed. It is not needed when
+ * NODE may be removed from the hash map but its members remain accessible and
+ * intact.
+ *
+ * The _SAFE version fetches the successor into NEXT before the loop body
+ * runs, so the body may free NODE.  NEXT must be a variable of the same type
+ * as NODE. */
+#define HMAP_FOR_EACH(NODE, STRUCT, MEMBER, HMAP) \
+ for ((NODE) = CONTAINER_OF(hmap_first(HMAP), STRUCT, MEMBER); \
+ &(NODE)->MEMBER != NULL; \
+ (NODE) = CONTAINER_OF(hmap_next(HMAP, &(NODE)->MEMBER), \
+ STRUCT, MEMBER))
+
+#define HMAP_FOR_EACH_SAFE(NODE, NEXT, STRUCT, MEMBER, HMAP) \
+ for ((NODE) = CONTAINER_OF(hmap_first(HMAP), STRUCT, MEMBER); \
+ (&(NODE)->MEMBER != NULL \
+ ? (NEXT) = CONTAINER_OF(hmap_next(HMAP, &(NODE)->MEMBER), \
+ STRUCT, MEMBER), 1 \
+ : 0); \
+ (NODE) = (NEXT))
+
+static inline struct hmap_node *hmap_first(const struct hmap *);
+static inline struct hmap_node *hmap_next(const struct hmap *,
+ const struct hmap_node *);
+
+/* Returns the number of nodes currently in 'hmap'.  This reads the counter
+ * that insertion and removal keep up to date; no buckets are walked. */
+static inline size_t
+hmap_count(const struct hmap *hmap)
+{
+ return hmap->n;
+}
+
+/* Returns true if 'hmap' holds no nodes at present, false if it holds at
+ * least one. */
+static inline bool
+hmap_is_empty(const struct hmap *hmap)
+{
+    return !hmap->n;
+}
+
+/* Links 'node' into 'hmap' at the head of the bucket selected by 'hash', and
+ * records 'hash' in the node.  The table is never expanded by this function;
+ * use hmap_insert() for automatic expansion. */
+static inline void
+hmap_insert_fast(struct hmap *hmap, struct hmap_node *node, size_t hash)
+{
+    struct hmap_node **head = &hmap->buckets[hash & hmap->mask];
+
+    node->next = *head;
+    node->hash = hash;
+    *head = node;
+    hmap->n++;
+}
+
+/* Inserts 'node', with the given 'hash', into 'hmap'.  Afterward, if the map
+ * holds more than about two nodes per bucket, expands it to keep searches
+ * fast. */
+static inline void
+hmap_insert(struct hmap *hmap, struct hmap_node *node, size_t hash)
+{
+    hmap_insert_fast(hmap, node, hash);
+
+    if (hmap->mask < hmap->n / 2) {
+        hmap_expand(hmap);
+    }
+}
+
+/* Unlinks 'node' from 'hmap'.  'node' must currently be in 'hmap'; the
+ * search for it does not stop at the end of the bucket.  node->next is left
+ * as it was.  The table is not shrunk; call hmap_shrink() directly if
+ * desired. */
+static inline void
+hmap_remove(struct hmap *hmap, struct hmap_node *node)
+{
+    struct hmap_node **link;
+
+    /* Find the pointer within the bucket's chain that refers to 'node'. */
+    for (link = &hmap->buckets[node->hash & hmap->mask];
+         *link != node;
+         link = &(*link)->next) {
+        continue;
+    }
+
+    *link = node->next;
+    hmap->n--;
+}
+
+/* Repairs 'hmap' after a node that used to live at 'old_node' has been
+ * relocated in memory to 'node' (e.g. by realloc()).  'node' must already
+ * hold the contents of the old node: its hash selects the bucket to search,
+ * and its 'next' pointer is left as is.  'old_node' is only compared against
+ * and is never dereferenced. */
+static inline void
+hmap_moved(struct hmap *hmap,
+           struct hmap_node *old_node, struct hmap_node *node)
+{
+    struct hmap_node **link;
+
+    for (link = &hmap->buckets[node->hash & hmap->mask];
+         *link != old_node;
+         link = &(*link)->next) {
+        continue;
+    }
+
+    *link = node;
+}
+
+/* Puts 'new' in the position in 'hmap' currently occupied by 'old'.  The 'new'
+ * node must hash to the same value as 'old'.  The client is responsible for
+ * ensuring that the replacement does not violate any client-imposed
+ * invariants (e.g. uniqueness of keys within a map).
+ *
+ * 'old' must currently be in 'hmap'.  The hash value and the chain successor
+ * of 'old' are both copied into 'new', so 'new' need not be initialized
+ * beforehand.
+ *
+ * Afterward, 'old' is not part of 'hmap', and the client is responsible for
+ * freeing it (if this is desirable). */
+static inline void
+hmap_replace(struct hmap *hmap,
+             const struct hmap_node *old, struct hmap_node *new)
+{
+    struct hmap_node **bucket = &hmap->buckets[old->hash & hmap->mask];
+
+    /* Find the pointer within the bucket's chain that refers to 'old'. */
+    while (*bucket != old) {
+        bucket = &(*bucket)->next;
+    }
+
+    *bucket = new;
+    new->hash = old->hash;
+
+    /* Keep the rest of the chain.  Without this, every node that followed
+     * 'old' in its bucket would be lost from the map, and later traversals
+     * would follow whatever value 'new->next' happened to contain. */
+    new->next = old->next;
+}
+
+/* Returns the first node with hash value 'hash' in the chain that starts at
+ * 'node' ('node' itself included), or a null pointer if there is none.
+ * 'node' may be null. */
+static inline struct hmap_node *
+hmap_next_with_hash__(const struct hmap_node *node, size_t hash)
+{
+    for (; node != NULL; node = node->next) {
+        if (node->hash == hash) {
+            break;
+        }
+    }
+    return (struct hmap_node *) node;
+}
+
+/* Returns the first node in 'hmap' whose hash value is 'hash', or a null
+ * pointer if no node in 'hmap' has that hash value. */
+static inline struct hmap_node *
+hmap_first_with_hash(const struct hmap *hmap, size_t hash)
+{
+    const struct hmap_node *head = hmap->buckets[hash & hmap->mask];
+
+    return hmap_next_with_hash__(head, hash);
+}
+
+/* Returns the node after 'node', in the same hash map, that has the same
+ * hash value as 'node', or a null pointer if there are no more such nodes.
+ *
+ * Some nodes may be skipped if the hash map has been reallocated since 'node'
+ * was visited, and nodes with the same hash value that were added since then
+ * will be skipped.  (It is fine to call this function after 'node' has been
+ * removed from the hash map, because removal preserves node->next; it is of
+ * course not fine after 'node' has been freed.) */
+static inline struct hmap_node *
+hmap_next_with_hash(const struct hmap_node *node)
+{
+    const struct hmap_node *rest = node->next;
+
+    return hmap_next_with_hash__(rest, node->hash);
+}
+
+/* Returns the head of the first nonempty bucket of 'hmap' whose index is
+ * 'start' or greater, or a null pointer if all of those buckets are empty. */
+static inline struct hmap_node *
+hmap_next__(const struct hmap *hmap, size_t start)
+{
+    size_t idx = start;
+
+    while (idx <= hmap->mask) {
+        struct hmap_node *head = hmap->buckets[idx];
+
+        if (head) {
+            return head;
+        }
+        idx++;
+    }
+    return NULL;
+}
+
+/* Returns the first node in 'hmap', in arbitrary order, or a null pointer if
+ * 'hmap' is empty.  "First" means the head of the lowest-numbered nonempty
+ * bucket. */
+static inline struct hmap_node *
+hmap_first(const struct hmap *hmap)
+{
+ return hmap_next__(hmap, 0);
+}
+
+/* Returns the node that follows 'node' in 'hmap', in arbitrary order, or a
+ * null pointer if 'node' is the last node in 'hmap'.
+ *
+ * Some nodes may be skipped or visited twice if the hash map has been
+ * reallocated since 'node' was visited.  (It is fine to call this function
+ * after 'node' has been removed from the hash map, because removal preserves
+ * node->next; it is of course not fine after 'node' has been freed.) */
+static inline struct hmap_node *
+hmap_next(const struct hmap *hmap, const struct hmap_node *node)
+{
+    if (node->next) {
+        /* More nodes remain in the same bucket. */
+        return node->next;
+    } else {
+        /* End of this bucket: move on to the next nonempty one. */
+        return hmap_next__(hmap, (node->hash & hmap->mask) + 1);
+    }
+}
+
+#endif /* hmap.h */
diff --git a/lib/leak-checker.c b/lib/leak-checker.c
new file mode 100644
index 000000000..a25fa71dc
--- /dev/null
+++ b/lib/leak-checker.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "leak-checker.h"
+#include <inttypes.h>
+#include "backtrace.h"
+
+#define THIS_MODULE VLM_leak_checker
+#include "vlog.h"
+
+#ifndef HAVE_MALLOC_HOOKS
+/* Stub implementations, used when the libc lacks the malloc hooks that the
+ * leak checker needs.  Every function declared in leak-checker.h gets a
+ * definition here, so that clients link the same way in either kind of
+ * build. */
+
+/* Warns that leak checking is unavailable.  'file_name' is ignored. */
+void
+leak_checker_start(const char *file_name UNUSED)
+{
+    VLOG_WARN("not enabling leak checker because the libc in use does not "
+              "have the required hooks");
+}
+
+/* Does nothing: the leak checker can never be running in this build. */
+void
+leak_checker_stop(void)
+{
+}
+
+/* Does nothing: there is no log file to limit. */
+void
+leak_checker_set_limit(off_t max_size UNUSED)
+{
+}
+
+/* Does nothing: there is no log to write the claim to. */
+void
+leak_checker_claim(const void *p UNUSED)
+{
+}
+
+/* Prints the help text for the leak checker option, noting that it has no
+ * effect in this build. */
+void
+leak_checker_usage(void)
+{
+    printf(" --check-leaks=FILE (accepted but ignored in this build)\n");
+}
+#else /* HAVE_MALLOC_HOOKS */
+#include <errno.h>
+#include <fcntl.h>
+#include <malloc.h>
+#include <sys/stat.h>
+
+/* Function types of the three libc allocation hooks.  The trailing
+ * 'const void *' argument is the one that the hook functions below name
+ * 'caller' and ignore. */
+typedef void *malloc_hook_type(size_t, const void *);
+typedef void *realloc_hook_type(void *, size_t, const void *);
+typedef void free_hook_type(void *, const void *);
+
+/* One complete set of allocation hooks, so that all three can be saved and
+ * installed together by get_hooks() and set_hooks(). */
+struct hooks {
+ malloc_hook_type *malloc_hook_func;
+ realloc_hook_type *realloc_hook_func;
+ free_hook_type *free_hook_func;
+};
+
+static malloc_hook_type hook_malloc;
+static realloc_hook_type hook_realloc;
+static free_hook_type hook_free;
+
+/* 'libc_hooks' holds the hooks that were in place before ours were installed;
+ * each hook function refreshes it after calling into libc.  'our_hooks' is
+ * the fixed set of logging hooks defined in this file. */
+static struct hooks libc_hooks;
+static const struct hooks our_hooks = { hook_malloc, hook_realloc, hook_free };
+
+/* Log file, or NULL while leak checking is not running. */
+static FILE *file;
+/* Size that the log file may reach before leak checking stops itself, as
+ * compared against st_size in reset_hooks().  0 disables the check. */
+static off_t limit = 10 * 1000 * 1000;
+
+/* Stores the allocation hooks that are installed right now into 'hooks'. */
+static void
+get_hooks(struct hooks *hooks)
+{
+    hooks->free_hook_func = __free_hook;
+    hooks->realloc_hook_func = __realloc_hook;
+    hooks->malloc_hook_func = __malloc_hook;
+}
+
+/* Installs the allocation hooks in 'hooks', replacing whichever are
+ * installed right now. */
+static void
+set_hooks(const struct hooks *hooks)
+{
+    __free_hook = hooks->free_hook_func;
+    __realloc_hook = hooks->realloc_hook_func;
+    __malloc_hook = hooks->malloc_hook_func;
+}
+
+/* Starts logging allocation calls to 'file_name', which is created or
+ * truncated.  Does nothing if leak checking is already running.  If the file
+ * cannot be created, logs a warning and leaves leak checking off. */
+void
+leak_checker_start(const char *file_name)
+{
+    if (file) {
+        return;
+    }
+
+    file = fopen(file_name, "w");
+    if (!file) {
+        VLOG_WARN("failed to create \"%s\": %s",
+                  file_name, strerror(errno));
+        return;
+    }
+
+    /* Line-buffer the log so that each record is written out when its line
+     * is complete. */
+    setvbuf(file, NULL, _IOLBF, 0);
+    VLOG_WARN("enabled memory leak logging to \"%s\"", file_name);
+
+    /* Remember libc's hooks, then put ours in their place. */
+    get_hooks(&libc_hooks);
+    set_hooks(&our_hooks);
+}
+
+/* Stops leak checking, if it is running: reinstates the libc allocation
+ * hooks, closes the log file, and logs a warning.  Does nothing if leak
+ * checking is not running. */
+void
+leak_checker_stop(void)
+{
+    if (file) {
+        /* Reinstate the libc hooks *before* closing the log.  fclose() may
+         * free memory that stdio allocated for the stream; with our hooks
+         * still installed, that free() would re-enter hook_free(), which
+         * writes to 'file' while it is being torn down.  (When we get here
+         * from reset_hooks() the libc hooks are already in place and this is
+         * a harmless repeat.) */
+        set_hooks(&libc_hooks);
+        fclose(file);
+        file = NULL;
+        VLOG_WARN("disabled memory leak logging");
+    }
+}
+
+/* Sets to 'limit_' the size that the log file may reach before leak checking
+ * stops itself.  reset_hooks() compares this against the file's st_size
+ * about once every 100 logged operations.  A value of 0 disables the
+ * check. */
+void
+leak_checker_set_limit(off_t limit_)
+{
+ limit = limit_;
+}
+
+/* Prints the help text for the --check-leaks option to stdout. */
+void
+leak_checker_usage(void)
+{
+ printf(" --check-leaks=FILE log malloc and free calls to FILE\n");
+}
+
+/* Appends one record to the leak checker log: the printf-style message given
+ * by 'format' and its arguments, then a colon, then the address of each
+ * frame captured by backtrace_capture() (each preceded by a space, in
+ * hexadecimal), then a new-line.
+ *
+ * Every caller in this file switches to the libc allocation hooks before
+ * calling this function, so memory that stdio allocates here is not itself
+ * logged. */
+static void PRINTF_FORMAT(1, 2)
+log_callers(const char *format, ...)
+{
+    struct backtrace backtrace;
+    va_list args;
+    int i;
+
+    va_start(args, format);
+    vfprintf(file, format, args);
+    va_end(args);
+
+    putc(':', file);
+    backtrace_capture(&backtrace);
+    for (i = 0; i < backtrace.n_frames; i++) {
+        /* "%x" takes an unsigned int, which is too narrow for an address
+         * wherever pointers are wider than int.  Convert explicitly and use
+         * the matching <inttypes.h> conversion.
+         * NOTE(review): the element type of 'frames' is declared in
+         * backtrace.h; the cast keeps this correct whether it is an integer
+         * or a pointer type. */
+        fprintf(file, " 0x%"PRIxPTR, (uintptr_t) backtrace.frames[i]);
+    }
+    putc('\n', file);
+}
+
+/* Called at the end of each hook: puts our hooks back in place, unless leak
+ * checking is not running or has to stop.
+ *
+ * Leak checking is stopped if the log file has a write error or, when a
+ * limit is set, if fstat() fails or reports a size above the limit.  The
+ * size is examined only about once every 100 calls. */
+static void
+reset_hooks(void)
+{
+    static int count;
+
+    if (!file) {
+        return;
+    }
+
+    if (ferror(file)) {
+        VLOG_WARN("error writing leak checker log file");
+        leak_checker_stop();
+        return;
+    }
+
+    if (count++ >= 100 && limit) {
+        struct stat st;
+
+        count = 0;
+        if (fstat(fileno(file), &st) < 0) {
+            VLOG_WARN("cannot fstat leak checker log file: %s",
+                      strerror(errno));
+            leak_checker_stop();
+            return;
+        }
+        if (st.st_size > limit) {
+            VLOG_WARN("leak checker log file size exceeded limit");
+            leak_checker_stop();
+            return;
+        }
+    }
+
+    set_hooks(&our_hooks);
+}
+
+/* malloc() hook: performs the allocation through libc, logs the requested
+ * size and the resulting pointer along with a backtrace, and returns the
+ * pointer.  'caller' is ignored. */
+static void *
+hook_malloc(size_t size, const void *caller UNUSED)
+{
+ void *p;
+
+ /* Put libc's own hooks back for the duration of the real call, so that
+ * malloc() does not re-enter this hook. */
+ set_hooks(&libc_hooks);
+ p = malloc(size);
+ /* Re-read the hooks afterward; presumably libc may have changed them
+ * during the call. */
+ get_hooks(&libc_hooks);
+
+ log_callers("malloc(%zu) -> %p", size, p);
+
+ /* Reinstall our hooks, unless leak checking has to stop. */
+ reset_hooks();
+ return p;
+}
+
+/* Writes a "claim" record for 'p', with a backtrace, to the leak checker
+ * log.  Does nothing if leak checking is not running or if 'p' is null. */
+void
+leak_checker_claim(const void *p)
+{
+    if (file && p) {
+        /* Log with libc's hooks in place, then restore ours. */
+        set_hooks(&libc_hooks);
+        log_callers("claim(%p)", p);
+        reset_hooks();
+    }
+}
+
+/* free() hook: releases 'p' through libc and logs the call with a backtrace.
+ * Freeing a null pointer is neither passed to libc nor logged.  'caller' is
+ * ignored. */
+static void
+hook_free(void *p, const void *caller UNUSED)
+{
+ if (!p) {
+ return;
+ }
+
+ /* Put libc's own hooks back for the duration of the real call, so that
+ * free() does not re-enter this hook. */
+ set_hooks(&libc_hooks);
+ free(p);
+ /* Re-read the hooks afterward; presumably libc may have changed them
+ * during the call. */
+ get_hooks(&libc_hooks);
+
+ /* 'p' is only printed here, never dereferenced. */
+ log_callers("free(%p)", p);
+
+ /* Reinstall our hooks, unless leak checking has to stop. */
+ reset_hooks();
+}
+
+/* realloc() hook: resizes 'p' through libc and returns the result.  The call
+ * is logged, with a backtrace, only when the returned pointer differs from
+ * 'p'; a reallocation that leaves the block in place is not logged.
+ * 'caller' is ignored. */
+static void *
+hook_realloc(void *p, size_t size, const void *caller UNUSED)
+{
+ void *q;
+
+ /* Put libc's own hooks back for the duration of the real call, so that
+ * realloc() does not re-enter this hook. */
+ set_hooks(&libc_hooks);
+ q = realloc(p, size);
+ /* Re-read the hooks afterward; presumably libc may have changed them
+ * during the call. */
+ get_hooks(&libc_hooks);
+
+ if (p != q) {
+ log_callers("realloc(%p, %zu) -> %p", p, size, q);
+ }
+
+ /* Reinstall our hooks, unless leak checking has to stop. */
+ reset_hooks();
+
+ return q;
+}
+#endif /* HAVE_MALLOC_HOOKS */
diff --git a/lib/leak-checker.h b/lib/leak-checker.h
new file mode 100644
index 000000000..c2259dabe
--- /dev/null
+++ b/lib/leak-checker.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef LEAK_CHECKER_H
+#define LEAK_CHECKER_H 1
+
+#include <sys/types.h>
+
+/* Building blocks for a program's command-line parsing.  Judging by their
+ * contents, the three macros are meant to be expanded, respectively, inside
+ * the program's enum of option values, inside its array of long options,
+ * and inside the switch statement that handles parsed options. */
+
+/* Enumerators for the two leak checker options. */
+#define LEAK_CHECKER_OPTION_ENUMS \
+ OPT_CHECK_LEAKS, \
+ OPT_LEAK_LIMIT
+/* Long option table entries: --check-leaks and --leak-limit, each of which
+ * requires an argument. */
+#define LEAK_CHECKER_LONG_OPTIONS \
+ {"check-leaks", required_argument, 0, OPT_CHECK_LEAKS}, \
+ {"leak-limit", required_argument, 0, OPT_LEAK_LIMIT}
+/* Switch cases for the two options.  The --leak-limit argument is converted
+ * with atol(), so it is not checked for validity. */
+#define LEAK_CHECKER_OPTION_HANDLERS \
+ case OPT_CHECK_LEAKS: \
+ leak_checker_start(optarg); \
+ break; \
+ case OPT_LEAK_LIMIT: \
+ leak_checker_set_limit(atol(optarg)); \
+ break;
+/* Starts logging allocation calls to 'file_name'; no effect if already
+ * running. */
+void leak_checker_start(const char *file_name);
+/* Sets the log file size at which leak checking stops itself (0 for no
+ * limit). */
+void leak_checker_set_limit(off_t limit);
+/* Stops leak checking and closes the log file, if running. */
+void leak_checker_stop(void);
+/* Writes a "claim" record for a nonnull pointer to the log, if running. */
+void leak_checker_claim(const void *);
+/* Prints help text for the --check-leaks option. */
+void leak_checker_usage(void);
+
+#endif /* leak-checker.h */
diff --git a/lib/leak-checker.man b/lib/leak-checker.man
new file mode 100644
index 000000000..7b376e1a9
--- /dev/null
+++ b/lib/leak-checker.man
@@ -0,0 +1,7 @@
+.TP
+\fB--check-leaks=\fIfile\fR
+.
+Logs information about memory allocation and deallocation to
+\fIfile\fR, to allow for debugging memory leaks in \fB\*(PN\fR. This
+option slows down \fB\*(PN\fR considerably, so it should only be used
+when a memory leak is suspected.
diff --git a/lib/learning-switch.c b/lib/learning-switch.c
new file mode 100644
index 000000000..efc583867
--- /dev/null
+++ b/lib/learning-switch.c
@@ -0,0 +1,644 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "learning-switch.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <time.h>
+
+#include "flow.h"
+#include "mac-learning.h"
+#include "ofpbuf.h"
+#include "ofp-print.h"
+#include "openflow/openflow.h"
+#include "poll-loop.h"
+#include "queue.h"
+#include "rconn.h"
+#include "stp.h"
+#include "timeval.h"
+#include "vconn.h"
+#include "xtoxll.h"
+
+#define THIS_MODULE VLM_learning_switch
+#include "vlog.h"
+
+enum port_state {
+ P_DISABLED = 1 << 0,
+ P_LISTENING = 1 << 1,
+ P_LEARNING = 1 << 2,
+ P_FORWARDING = 1 << 3,
+ P_BLOCKING = 1 << 4
+};
+
+struct lswitch {
+ /* If nonnegative, the switch sets up flows that expire after the given
+ * number of seconds (or never expire, if the value is OFP_FLOW_PERMANENT).
+ * Otherwise, the switch processes every packet. */
+ int max_idle;
+
+ unsigned long long int datapath_id;
+ uint32_t capabilities;
+ time_t last_features_request;
+ struct mac_learning *ml; /* NULL to act as hub instead of switch. */
+
+ /* Number of outgoing queued packets on the rconn. */
+ struct rconn_packet_counter *queued;
+
+ /* Spanning tree protocol implementation.
+ *
+ * We implement STP states by, whenever a port's STP state changes,
+ * querying all the flows on the switch and then deleting any of them that
+ * are inappropriate for a port's STP state. */
+ long long int next_query; /* Next time at which to query all flows. */
+ long long int last_query; /* Last time we sent a query. */
+ long long int last_reply; /* Last time we received a query reply. */
+ unsigned int port_states[STP_MAX_PORTS];
+ uint32_t query_xid; /* XID used for query. */
+ int n_flows, n_no_recv, n_no_send;
+};
+
+/* The log messages here could actually be useful in debugging, so keep the
+ * rate limit relatively high. */
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 300);
+
+static void queue_tx(struct lswitch *, struct rconn *, struct ofpbuf *);
+static void send_features_request(struct lswitch *, struct rconn *);
+static void schedule_query(struct lswitch *, long long int delay);
+static bool may_learn(const struct lswitch *, uint16_t port_no);
+static bool may_recv(const struct lswitch *, uint16_t port_no,
+ bool any_actions);
+static bool may_send(const struct lswitch *, uint16_t port_no);
+
+typedef void packet_handler_func(struct lswitch *, struct rconn *, void *);
+static packet_handler_func process_switch_features;
+static packet_handler_func process_packet_in;
+static packet_handler_func process_echo_request;
+static packet_handler_func process_port_status;
+static packet_handler_func process_phy_port;
+static packet_handler_func process_stats_reply;
+
+/* Creates and returns a new learning switch.
+ *
+ * If 'learn_macs' is true, the new switch will learn the ports on which MAC
+ * addresses appear. Otherwise, the new switch will flood all packets.
+ *
+ * If 'max_idle' is nonnegative, the new switch will set up flows that expire
+ * after the given number of seconds (or never expire, if 'max_idle' is
+ * OFP_FLOW_PERMANENT). Otherwise, the new switch will process every packet.
+ *
+ * 'rconn' is used to send out an OpenFlow features request. */
+struct lswitch *
+lswitch_create(struct rconn *rconn, bool learn_macs, int max_idle)
+{
+ struct lswitch *sw;
+ size_t i;
+
+ sw = xcalloc(1, sizeof *sw);
+ sw->max_idle = max_idle;
+ sw->datapath_id = 0;
+ sw->last_features_request = time_now() - 1;
+ sw->ml = learn_macs ? mac_learning_create() : NULL;
+ sw->queued = rconn_packet_counter_create();
+ sw->next_query = LLONG_MIN;
+ sw->last_query = LLONG_MIN;
+ sw->last_reply = LLONG_MIN;
+ for (i = 0; i < STP_MAX_PORTS; i++) {
+ sw->port_states[i] = P_DISABLED;
+ }
+ send_features_request(sw, rconn);
+ return sw;
+}
+
+/* Destroys 'sw'. */
+void
+lswitch_destroy(struct lswitch *sw)
+{
+ if (sw) {
+ mac_learning_destroy(sw->ml);
+ rconn_packet_counter_destroy(sw->queued);
+ free(sw);
+ }
+}
+
+/* Takes care of necessary 'sw' activity, except for receiving packets (which
+ * the caller must do). */
+void
+lswitch_run(struct lswitch *sw, struct rconn *rconn)
+{
+ long long int now = time_msec();
+
+ if (sw->ml) {
+ mac_learning_run(sw->ml, NULL);
+ }
+
+ /* If we're waiting for more replies, keep waiting for up to 10 s. */
+ if (sw->last_reply != LLONG_MIN) {
+ if (now - sw->last_reply > 10000) {
+ VLOG_ERR_RL(&rl, "%012llx: No more flow stat replies last 10 s",
+ sw->datapath_id);
+ sw->last_reply = LLONG_MIN;
+ sw->last_query = LLONG_MIN;
+ schedule_query(sw, 0);
+ } else {
+ return;
+ }
+ }
+
+ /* If we're waiting for any reply at all, keep waiting for up to 10 s. */
+ if (sw->last_query != LLONG_MIN) {
+ if (now - sw->last_query > 10000) {
+ VLOG_ERR_RL(&rl, "%012llx: No flow stat replies in last 10 s",
+ sw->datapath_id);
+ sw->last_query = LLONG_MIN;
+ schedule_query(sw, 0);
+ } else {
+ return;
+ }
+ }
+
+ /* If it's time to send another query, do so. */
+ if (sw->next_query != LLONG_MIN && now >= sw->next_query) {
+ sw->next_query = LLONG_MIN;
+ if (!rconn_is_connected(rconn)) {
+ schedule_query(sw, 1000);
+ } else {
+ struct ofp_stats_request *osr;
+ struct ofp_flow_stats_request *ofsr;
+ struct ofpbuf *b;
+ int error;
+
+ VLOG_DBG("%012llx: Sending flow stats request to implement STP",
+ sw->datapath_id);
+
+ sw->last_query = now;
+ sw->query_xid = random_uint32();
+ sw->n_flows = 0;
+ sw->n_no_recv = 0;
+ sw->n_no_send = 0;
+ osr = make_openflow_xid(sizeof *osr + sizeof *ofsr,
+ OFPT_STATS_REQUEST, sw->query_xid, &b);
+ osr->type = htons(OFPST_FLOW);
+ osr->flags = htons(0);
+ ofsr = (struct ofp_flow_stats_request *) osr->body;
+ ofsr->match.wildcards = htonl(OFPFW_ALL);
+ ofsr->table_id = 0xff;
+ ofsr->out_port = htons(OFPP_NONE);
+
+ error = rconn_send(rconn, b, NULL);
+ if (error) {
+ VLOG_WARN_RL(&rl, "%012llx: sending flow stats request "
+ "failed: %s", sw->datapath_id, strerror(error));
+ ofpbuf_delete(b);
+ schedule_query(sw, 1000);
+ }
+ }
+ }
+}
+
+static void
+wait_timeout(long long int started)
+{
+    /* Time left, in ms, of the 10000 ms allowed since 'started'. */
+    long long int remaining = 10000 - (time_msec() - started);
+    if (remaining > 0) {
+        poll_timer_wait(remaining);
+    } else {
+        poll_immediate_wake();
+    }
+}
+
+void
+lswitch_wait(struct lswitch *sw)
+{
+ if (sw->ml) {
+ mac_learning_wait(sw->ml);
+ }
+
+ if (sw->last_reply != LLONG_MIN) {
+ wait_timeout(sw->last_reply);
+ } else if (sw->last_query != LLONG_MIN) {
+ wait_timeout(sw->last_query);
+ }
+}
+
+/* Processes 'msg', which should be an OpenFlow received on 'rconn', according
+ * to the learning switch state in 'sw'. The most likely result of processing
+ * is that flow-setup and packet-out OpenFlow messages will be sent out on
+ * 'rconn'. */
+void
+lswitch_process_packet(struct lswitch *sw, struct rconn *rconn,
+ const struct ofpbuf *msg)
+{
+ struct processor {
+ uint8_t type;
+ size_t min_size;
+ packet_handler_func *handler;
+ };
+ static const struct processor processors[] = {
+ {
+ OFPT_ECHO_REQUEST,
+ sizeof(struct ofp_header),
+ process_echo_request
+ },
+ {
+ OFPT_FEATURES_REPLY,
+ sizeof(struct ofp_switch_features),
+ process_switch_features
+ },
+ {
+ OFPT_PACKET_IN,
+ offsetof(struct ofp_packet_in, data),
+ process_packet_in
+ },
+ {
+ OFPT_PORT_STATUS,
+ sizeof(struct ofp_port_status),
+ process_port_status
+ },
+ {
+ OFPT_STATS_REPLY,
+ offsetof(struct ofp_stats_reply, body),
+ process_stats_reply
+ },
+ {
+ OFPT_FLOW_EXPIRED,
+ sizeof(struct ofp_flow_expired),
+ NULL
+ },
+ };
+ const size_t n_processors = ARRAY_SIZE(processors);
+ const struct processor *p;
+ struct ofp_header *oh;
+
+ oh = msg->data;
+ if (sw->datapath_id == 0
+ && oh->type != OFPT_ECHO_REQUEST
+ && oh->type != OFPT_FEATURES_REPLY) {
+ send_features_request(sw, rconn);
+ return;
+ }
+
+ for (p = processors; p < &processors[n_processors]; p++) {
+ if (oh->type == p->type) {
+ if (msg->size < p->min_size) {
+ VLOG_WARN_RL(&rl, "%012llx: %s: too short (%zu bytes) for "
+ "type %"PRIu8" (min %zu)", sw->datapath_id,
+ rconn_get_name(rconn), msg->size, oh->type,
+ p->min_size);
+ return;
+ }
+ if (p->handler) {
+ (p->handler)(sw, rconn, msg->data);
+ }
+ return;
+ }
+ }
+ if (VLOG_IS_DBG_ENABLED()) {
+ char *p = ofp_to_string(msg->data, msg->size, 2);
+ VLOG_DBG_RL(&rl, "%012llx: OpenFlow packet ignored: %s",
+ sw->datapath_id, p);
+ free(p);
+ }
+}
+
+static void
+send_features_request(struct lswitch *sw, struct rconn *rconn)
+{
+ time_t now = time_now();
+ if (now >= sw->last_features_request + 1) {
+ struct ofpbuf *b;
+ struct ofp_switch_config *osc;
+
+ /* Send OFPT_FEATURES_REQUEST. */
+ make_openflow(sizeof(struct ofp_header), OFPT_FEATURES_REQUEST, &b);
+ queue_tx(sw, rconn, b);
+
+ /* Send OFPT_SET_CONFIG. */
+ osc = make_openflow(sizeof *osc, OFPT_SET_CONFIG, &b);
+ osc->flags = htons(OFPC_SEND_FLOW_EXP);
+ osc->miss_send_len = htons(OFP_DEFAULT_MISS_SEND_LEN);
+ queue_tx(sw, rconn, b);
+
+ sw->last_features_request = now;
+ }
+}
+
+static void
+queue_tx(struct lswitch *sw, struct rconn *rconn, struct ofpbuf *b)
+{
+ int retval = rconn_send_with_limit(rconn, b, sw->queued, 10);
+ if (retval && retval != ENOTCONN) {
+ if (retval == EAGAIN) {
+ VLOG_INFO_RL(&rl, "%012llx: %s: tx queue overflow",
+ sw->datapath_id, rconn_get_name(rconn));
+ } else {
+ VLOG_WARN_RL(&rl, "%012llx: %s: send: %s",
+ sw->datapath_id, rconn_get_name(rconn),
+ strerror(retval));
+ }
+ }
+}
+
+static void
+schedule_query(struct lswitch *sw, long long int delay)
+{
+    long long int when = time_msec() + delay;   /* Proposed query time. */
+    if (sw->next_query == LLONG_MIN || when < sw->next_query) {
+        sw->next_query = when;  /* None pending, or this one is sooner. */
+    }
+}
+
+static void
+process_switch_features(struct lswitch *sw, struct rconn *rconn, void *osf_)
+{
+ struct ofp_switch_features *osf = osf_;
+ size_t n_ports = ((ntohs(osf->header.length)
+ - offsetof(struct ofp_switch_features, ports))
+ / sizeof *osf->ports);
+ size_t i;
+
+ sw->datapath_id = ntohll(osf->datapath_id);
+ sw->capabilities = ntohl(osf->capabilities);
+ for (i = 0; i < n_ports; i++) {
+ process_phy_port(sw, rconn, &osf->ports[i]);
+ }
+ if (sw->capabilities & OFPC_STP) {
+ schedule_query(sw, 1000);
+ }
+}
+
+static void
+process_packet_in(struct lswitch *sw, struct rconn *rconn, void *opi_)
+{
+ struct ofp_packet_in *opi = opi_;
+ uint16_t in_port = ntohs(opi->in_port);
+ uint16_t out_port = OFPP_FLOOD;
+
+ size_t pkt_ofs, pkt_len;
+ struct ofpbuf pkt;
+ flow_t flow;
+
+ /* Extract flow data from 'opi' into 'flow'. */
+ pkt_ofs = offsetof(struct ofp_packet_in, data);
+ pkt_len = ntohs(opi->header.length) - pkt_ofs;
+ pkt.data = opi->data;
+ pkt.size = pkt_len;
+ flow_extract(&pkt, in_port, &flow);
+
+ if (may_learn(sw, in_port) && sw->ml) {
+ if (mac_learning_learn(sw->ml, flow.dl_src, 0, in_port)) {
+ VLOG_DBG_RL(&rl, "%012llx: learned that "ETH_ADDR_FMT" is on "
+ "port %"PRIu16, sw->datapath_id,
+ ETH_ADDR_ARGS(flow.dl_src), in_port);
+ }
+ }
+
+ if (eth_addr_is_reserved(flow.dl_src)) {
+ goto drop_it;
+ }
+
+ if (!may_recv(sw, in_port, false)) {
+ /* STP prevents receiving anything on this port. */
+ goto drop_it;
+ }
+
+ if (sw->ml) {
+ int learned_port = mac_learning_lookup(sw->ml, flow.dl_dst, 0);
+ if (learned_port >= 0 && may_send(sw, learned_port)) {
+ out_port = learned_port;
+ }
+ }
+
+ if (in_port == out_port) {
+ /* Don't send out packets on their input ports. */
+ goto drop_it;
+ } else if (sw->max_idle >= 0 && (!sw->ml || out_port != OFPP_FLOOD)) {
+ /* The output port is known, or we always flood everything, so add a
+ * new flow. */
+ queue_tx(sw, rconn, make_add_simple_flow(&flow, ntohl(opi->buffer_id),
+ out_port, sw->max_idle));
+
+ /* If the switch didn't buffer the packet, we need to send a copy. */
+ if (ntohl(opi->buffer_id) == UINT32_MAX) {
+ queue_tx(sw, rconn,
+ make_unbuffered_packet_out(&pkt, in_port, out_port));
+ }
+ } else {
+ /* We don't know that MAC, or we don't set up flows. Send along the
+ * packet without setting up a flow. */
+ struct ofpbuf *b;
+ if (ntohl(opi->buffer_id) == UINT32_MAX) {
+ b = make_unbuffered_packet_out(&pkt, in_port, out_port);
+ } else {
+ b = make_buffered_packet_out(ntohl(opi->buffer_id),
+ in_port, out_port);
+ }
+ queue_tx(sw, rconn, b);
+ }
+ return;
+
+drop_it:
+ if (sw->max_idle >= 0) {
+ /* Set up a flow to drop packets. */
+ queue_tx(sw, rconn, make_add_flow(&flow, ntohl(opi->buffer_id),
+ sw->max_idle, 0));
+ } else {
+ /* Just drop the packet, since we don't set up flows at all.
+ * XXX we should send a packet_out with no actions if buffer_id !=
+ * UINT32_MAX, to avoid clogging the kernel buffers. */
+ }
+ return;
+}
+
+static void
+process_echo_request(struct lswitch *sw, struct rconn *rconn, void *rq_)
+{
+ struct ofp_header *rq = rq_;
+ queue_tx(sw, rconn, make_echo_reply(rq));
+}
+
+static void
+process_port_status(struct lswitch *sw, struct rconn *rconn, void *ops_)
+{
+ struct ofp_port_status *ops = ops_;
+ process_phy_port(sw, rconn, &ops->desc);
+}
+
+static void
+process_phy_port(struct lswitch *sw, struct rconn *rconn UNUSED, void *opp_)
+{
+ const struct ofp_phy_port *opp = opp_;
+ uint16_t port_no = ntohs(opp->port_no);
+ if (sw->capabilities & OFPC_STP && port_no < STP_MAX_PORTS) {
+ uint32_t config = ntohl(opp->config);
+ uint32_t state = ntohl(opp->state);
+ unsigned int *port_state = &sw->port_states[port_no];
+ unsigned int new_port_state;
+
+ if (!(config & (OFPPC_NO_STP | OFPPC_PORT_DOWN))
+ && !(state & OFPPS_LINK_DOWN))
+ {
+ switch (state & OFPPS_STP_MASK) {
+ case OFPPS_STP_LISTEN:
+ new_port_state = P_LISTENING;
+ break;
+ case OFPPS_STP_LEARN:
+ new_port_state = P_LEARNING;
+ break;
+ case OFPPS_STP_FORWARD:
+ new_port_state = P_FORWARDING;
+ break;
+ case OFPPS_STP_BLOCK:
+ new_port_state = P_BLOCKING;
+ break;
+ default:
+ new_port_state = P_DISABLED;
+ break;
+ }
+ } else {
+ new_port_state = P_FORWARDING;
+ }
+ if (*port_state != new_port_state) {
+ *port_state = new_port_state;
+ schedule_query(sw, 1000);
+ }
+ }
+}
+
+static unsigned int
+get_port_state(const struct lswitch *sw, uint16_t port_no)
+{
+    bool tracked = port_no < STP_MAX_PORTS && sw->capabilities & OFPC_STP;
+    /* Ports without tracked STP state are treated as forwarding. */
+    return tracked ? sw->port_states[port_no] : P_FORWARDING;
+}
+
+static bool
+may_learn(const struct lswitch *sw, uint16_t port_no)
+{
+ return get_port_state(sw, port_no) & (P_LEARNING | P_FORWARDING);
+}
+
+static bool
+may_recv(const struct lswitch *sw, uint16_t port_no, bool any_actions)
+{
+    /* P_LEARNING only blocks receiving when 'any_actions' is false. */
+    unsigned int blocked = P_DISABLED | P_LISTENING | P_BLOCKING;
+    blocked |= any_actions ? 0 : P_LEARNING;
+    return !(get_port_state(sw, port_no) & blocked);
+}
+
+static bool
+may_send(const struct lswitch *sw, uint16_t port_no)
+{
+ return get_port_state(sw, port_no) & P_FORWARDING;
+}
+
+/* Deletes flow 'ofs' via 'rconn' if it conflicts with 'sw''s port states. */
+static void
+process_flow_stats(struct lswitch *sw, struct rconn *rconn,
+                   const struct ofp_flow_stats *ofs)
+{
+    const char *end = (char *) ofs + ntohs(ofs->length);
+    bool delete = false;
+
+    /* Delete the flow if it matches on an input port that may_recv() rejects,
+     * telling may_recv() whether the flow has any actions at all. */
+    if (!(ofs->match.wildcards & htonl(OFPFW_IN_PORT))) {
+        if (!may_recv(sw, ntohs(ofs->match.in_port),
+                      end > (char *) ofs->actions)) {
+            delete = true;
+            sw->n_no_recv++;
+        }
+    }
+
+    /* Delete the flow if it outputs to a port that may_send() rejects. */
+    if (!delete) {
+        const struct ofp_action_header *a;
+        size_t len;
+
+        for (a = ofs->actions; (char *) a < end; a += len / 8) {
+            len = ntohs(a->len);
+            if (len > end - (char *) a) {
+                VLOG_DBG_RL(&rl, "%012llx: action exceeds available space "
+                            "(%zu > %td)",
+                            sw->datapath_id, len, end - (char *) a);
+                break;
+            } else if (!len || len % 8) {
+                /* A zero length would never advance 'a': stop here too. */
+                VLOG_DBG_RL(&rl, "%012llx: action length (%zu) not multiple "
+                            "of 8 bytes", sw->datapath_id, len);
+                break;
+            }
+            if (a->type == htons(OFPAT_OUTPUT)) {
+                struct ofp_action_output *oao = (struct ofp_action_output *) a;
+                if (!may_send(sw, ntohs(oao->port))) {
+                    delete = true;
+                    sw->n_no_send++;
+                    break;
+                }
+            }
+        }
+    }
+
+    if (delete) {
+        struct ofp_flow_mod *ofm;
+        struct ofpbuf *b;
+
+        ofm = make_openflow(offsetof(struct ofp_flow_mod, actions),
+                            OFPT_FLOW_MOD, &b);
+        ofm->match = ofs->match;
+        /* NOTE(review): confirm 'command' needs no htons() (field width). */
+        ofm->command = OFPFC_DELETE_STRICT;
+        if (rconn_send(rconn, b, NULL)) {
+            ofpbuf_delete(b);   /* Not sent: 'b' is still ours to free. */
+        }
+    }
+}
+
+static void
+process_stats_reply(struct lswitch *sw, struct rconn *rconn, void *osr_)
+{
+ struct ofp_stats_reply *osr = osr_;
+ struct flow_stats_iterator i;
+ const struct ofp_flow_stats *fs;
+
+ if (sw->last_query == LLONG_MIN
+ || osr->type != htons(OFPST_FLOW)
+ || osr->header.xid != sw->query_xid) {
+ return;
+ }
+ for (fs = flow_stats_first(&i, osr); fs; fs = flow_stats_next(&i)) {
+ sw->n_flows++;
+ process_flow_stats(sw, rconn, fs);
+ }
+ if (!(osr->flags & htons(OFPSF_REPLY_MORE))) {
+ VLOG_DBG("%012llx: Deleted %d of %d received flows to "
+ "implement STP, %d because of no-recv, %d because of "
+ "no-send", sw->datapath_id,
+ sw->n_no_recv + sw->n_no_send, sw->n_flows,
+ sw->n_no_recv, sw->n_no_send);
+ sw->last_query = LLONG_MIN;
+ sw->last_reply = LLONG_MIN;
+ } else {
+ sw->last_reply = time_msec();
+ }
+}
+
diff --git a/lib/learning-switch.h b/lib/learning-switch.h
new file mode 100644
index 000000000..5f2f36ec3
--- /dev/null
+++ b/lib/learning-switch.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef LEARNING_SWITCH_H
+#define LEARNING_SWITCH_H 1
+
+#include <stdbool.h>
+
+struct ofpbuf;
+struct rconn;
+
+struct lswitch *lswitch_create(struct rconn *, bool learn_macs, int max_idle);
+void lswitch_run(struct lswitch *, struct rconn *);
+void lswitch_wait(struct lswitch *);
+void lswitch_destroy(struct lswitch *);
+void lswitch_process_packet(struct lswitch *, struct rconn *,
+ const struct ofpbuf *);
+
+
+#endif /* learning-switch.h */
diff --git a/lib/list.c b/lib/list.c
new file mode 100644
index 000000000..087a5f445
--- /dev/null
+++ b/lib/list.c
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <config.h>
+#include "list.h"
+#include <assert.h>
+
+/* Initializes 'list' as an empty list. */
+void
+list_init(struct list *list)
+{
+ list->next = list->prev = list;
+}
+
+/* Inserts 'elem' just before 'before'. */
+void
+list_insert(struct list *before, struct list *elem)
+{
+ elem->prev = before->prev;
+ elem->next = before;
+ before->prev->next = elem;
+ before->prev = elem;
+}
+
+/* Removes elements 'first' through 'last' (exclusive) from their current list,
+ then inserts them just before 'before'. */
+void
+list_splice(struct list *before, struct list *first, struct list *last)
+{
+ if (first == last)
+ return;
+ last = last->prev;
+
+ /* Cleanly remove 'first'...'last' from its current list. */
+ first->prev->next = last->next;
+ last->next->prev = first->prev;
+
+ /* Splice 'first'...'last' into new list. */
+ first->prev = before->prev;
+ last->next = before;
+ before->prev->next = first;
+ before->prev = last;
+}
+
+/* Inserts 'elem' at the beginning of 'list', so that it becomes the front in
+ 'list'. */
+void
+list_push_front(struct list *list, struct list *elem)
+{
+ list_insert(list->next, elem);
+}
+
+/* Inserts 'elem' at the end of 'list', so that it becomes the back in
+ * 'list'. */
+void
+list_push_back(struct list *list, struct list *elem)
+{
+ list_insert(list, elem);
+}
+
+/* Puts 'element' in the position currently occupied by 'position'.
+ * Afterward, 'position' is not part of a list. */
+void
+list_replace(struct list *element, const struct list *position)
+{
+ element->next = position->next;
+ element->next->prev = element;
+ element->prev = position->prev;
+ element->prev->next = element;
+}
+
+/* Adjusts pointers around 'list' to compensate for 'list' having been moved
+ * around in memory (e.g. as a consequence of realloc()). */
+void
+list_moved(struct list *list)
+{
+ list->prev->next = list->next->prev = list;
+}
+
+/* Removes 'elem' from its list and returns the element that followed it.
+ Undefined behavior if 'elem' is not in a list. */
+struct list *
+list_remove(struct list *elem)
+{
+ elem->prev->next = elem->next;
+ elem->next->prev = elem->prev;
+ return elem->next;
+}
+
+/* Removes the front element from 'list' and returns it. Undefined behavior if
+ 'list' is empty before removal. */
+struct list *
+list_pop_front(struct list *list)
+{
+ struct list *front = list->next;
+ list_remove(front);
+ return front;
+}
+
+/* Removes the back element from 'list' and returns it.
+ Undefined behavior if 'list' is empty before removal. */
+struct list *
+list_pop_back(struct list *list)
+{
+ struct list *back = list->prev;
+ list_remove(back);
+ return back;
+}
+
+/* Returns the front element in 'list'.
+ Undefined behavior if 'list' is empty. */
+struct list *
+list_front(struct list *list)
+{
+ assert(!list_is_empty(list));
+ return list->next;
+}
+
+/* Returns the back element in 'list'.
+ Undefined behavior if 'list' is empty. */
+struct list *
+list_back(struct list *list)
+{
+ assert(!list_is_empty(list));
+ return list->prev;
+}
+
+/* Returns the number of elements in 'list'.
+ Runs in O(n) in the number of elements. */
+size_t
+list_size(const struct list *list)
+{
+    size_t n = 0;
+    const struct list *node;
+    for (node = list->next; node != list; node = node->next) {
+        n++;    /* One per element; the head itself is not counted. */
+    }
+    return n;
+}
+
+/* Returns true if 'list' is empty, false otherwise. */
+bool
+list_is_empty(const struct list *list)
+{
+ return list->next == list;
+}
diff --git a/lib/list.h b/lib/list.h
new file mode 100644
index 000000000..a421ad51c
--- /dev/null
+++ b/lib/list.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifndef LIST_H
+#define LIST_H 1
+
+/* Doubly linked list. */
+
+#include <stdbool.h>
+#include <stddef.h>
+#include "util.h"
+
+/* Doubly linked list head or element. */
+struct list
+ {
+ struct list *prev; /* Previous list element. */
+ struct list *next; /* Next list element. */
+ };
+
+#define LIST_INITIALIZER(LIST) { LIST, LIST }
+
+void list_init(struct list *);
+
+/* List insertion. */
+void list_insert(struct list *, struct list *);
+void list_splice(struct list *before, struct list *first, struct list *last);
+void list_push_front(struct list *, struct list *);
+void list_push_back(struct list *, struct list *);
+void list_replace(struct list *, const struct list *);
+void list_moved(struct list *);
+
+/* List removal. */
+struct list *list_remove(struct list *);
+struct list *list_pop_front(struct list *);
+struct list *list_pop_back(struct list *);
+
+/* List elements. */
+struct list *list_front(struct list *);
+struct list *list_back(struct list *);
+
+/* List properties. */
+size_t list_size(const struct list *);
+bool list_is_empty(const struct list *);
+
+#define LIST_FOR_EACH(ITER, STRUCT, MEMBER, LIST) \
+ for (ITER = CONTAINER_OF((LIST)->next, STRUCT, MEMBER); \
+ &(ITER)->MEMBER != (LIST); \
+ ITER = CONTAINER_OF((ITER)->MEMBER.next, STRUCT, MEMBER))
+#define LIST_FOR_EACH_REVERSE(ITER, STRUCT, MEMBER, LIST) \
+ for (ITER = CONTAINER_OF((LIST)->prev, STRUCT, MEMBER); \
+ &(ITER)->MEMBER != (LIST); \
+ ITER = CONTAINER_OF((ITER)->MEMBER.prev, STRUCT, MEMBER))
+#define LIST_FOR_EACH_SAFE(ITER, NEXT, STRUCT, MEMBER, LIST) \
+ for (ITER = CONTAINER_OF((LIST)->next, STRUCT, MEMBER); \
+ (NEXT = CONTAINER_OF((ITER)->MEMBER.next, STRUCT, MEMBER), \
+ &(ITER)->MEMBER != (LIST)); \
+ ITER = NEXT)
+
+#endif /* list.h */
diff --git a/lib/mac-learning.c b/lib/mac-learning.c
new file mode 100644
index 000000000..f03668082
--- /dev/null
+++ b/lib/mac-learning.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "mac-learning.h"
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdlib.h>
+
+#include "coverage.h"
+#include "hash.h"
+#include "list.h"
+#include "poll-loop.h"
+#include "tag.h"
+#include "timeval.h"
+#include "util.h"
+
+#define THIS_MODULE VLM_mac_learning
+#include "vlog.h"
+
+#define MAC_HASH_BITS 10
+#define MAC_HASH_MASK (MAC_HASH_SIZE - 1)
+#define MAC_HASH_SIZE (1u << MAC_HASH_BITS)
+
+#define MAC_MAX 1024
+
+/* A MAC learning table entry. */
+struct mac_entry {
+ struct list hash_node; /* Element in a mac_learning 'table' list. */
+ struct list lru_node; /* Element in 'lrus' or 'free' list. */
+ time_t expires; /* Expiration time. */
+ uint8_t mac[ETH_ADDR_LEN]; /* Known MAC address. */
+ uint16_t vlan; /* VLAN tag. */
+ int port; /* Port on which MAC was most recently seen. */
+ tag_type tag; /* Tag for this learning entry. */
+};
+
+/* MAC learning table. */
+struct mac_learning {
+ struct list free; /* Not-in-use entries. */
+ struct list lrus; /* In-use entries, least recently used at the
+ front, most recently used at the back. */
+ struct list table[MAC_HASH_SIZE]; /* Hash table. */
+ struct mac_entry entries[MAC_MAX]; /* All entries. */
+ uint32_t secret; /* Secret hashed into unknown-MAC tags. */
+};
+
+static uint32_t
+mac_table_hash(const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan)
+{
+ return hash_bytes(mac, ETH_ADDR_LEN, vlan);
+}
+
+static struct mac_entry *
+mac_entry_from_lru_node(struct list *list)
+{
+ return CONTAINER_OF(list, struct mac_entry, lru_node);
+}
+
+/* Returns a tag that represents that 'mac' is on an unknown port in 'vlan'.
+ * (When we learn where 'mac' is in 'vlan', this allows flows that were
+ * flooded to be revalidated.) */
+static tag_type
+make_unknown_mac_tag(const struct mac_learning *ml,
+ const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan)
+{
+ uint32_t h = hash_int(ml->secret, mac_table_hash(mac, vlan));
+ return tag_create_deterministic(h);
+}
+
+/* Returns the bucket of 'ml''s hash table in which 'mac' on 'vlan' belongs. */
+static struct list *
+mac_table_bucket(const struct mac_learning *ml,
+                 const uint8_t mac[ETH_ADDR_LEN], uint16_t vlan)
+{
+    /* Mask with MAC_HASH_MASK (not MAC_HASH_BITS) to reach every bucket. */
+    uint32_t hash = mac_table_hash(mac, vlan);
+    return (struct list *) &ml->table[hash & MAC_HASH_MASK];
+}
+
+static struct mac_entry *
+search_bucket(struct list *bucket, const uint8_t mac[ETH_ADDR_LEN],
+ uint16_t vlan)
+{
+ struct mac_entry *e;
+ LIST_FOR_EACH (e, struct mac_entry, hash_node, bucket) {
+ if (eth_addr_equals(e->mac, mac) && e->vlan == vlan) {
+ return e;
+ }
+ }
+ return NULL;
+}
+
+/* If the LRU list is not empty, stores the least-recently-used entry in '*e'
+ * and returns true. Otherwise, if the LRU list is empty, stores NULL in '*e'
+ * and return false. */
+static bool
+get_lru(struct mac_learning *ml, struct mac_entry **e)
+{
+    if (list_is_empty(&ml->lrus)) {
+        *e = NULL;
+        return false;
+    }
+    /* The front of 'lrus' is the least recently used entry. */
+    *e = mac_entry_from_lru_node(ml->lrus.next);
+    return true;
+}
+
+/* Removes 'e' from the 'ml' hash table. 'e' must not already be on the free
+ * list. */
+static void
+free_mac_entry(struct mac_learning *ml, struct mac_entry *e)
+{
+ list_remove(&e->hash_node);
+ list_remove(&e->lru_node);
+ list_push_front(&ml->free, &e->lru_node);
+}
+
+/* Creates and returns a new MAC learning table. */
+struct mac_learning *
+mac_learning_create(void)
+{
+ struct mac_learning *ml;
+ int i;
+
+ ml = xmalloc(sizeof *ml);
+ list_init(&ml->lrus);
+ list_init(&ml->free);
+ for (i = 0; i < MAC_HASH_SIZE; i++) {
+ list_init(&ml->table[i]);
+ }
+ for (i = 0; i < MAC_MAX; i++) {
+ struct mac_entry *s = &ml->entries[i];
+ list_push_front(&ml->free, &s->lru_node);
+ }
+ ml->secret = random_uint32();
+ return ml;
+}
+
+/* Destroys MAC learning table 'ml'. */
+void
+mac_learning_destroy(struct mac_learning *ml)
+{
+ free(ml);
+}
+
+/* Attempts to make 'ml' learn from the fact that a frame from 'src_mac' was
+ * just observed arriving from 'src_port' on the given 'vlan'.
+ *
+ * Returns nonzero if we actually learned something from this, zero if it just
+ * confirms what we already knew. The nonzero return value is the tag of flows
+ * that now need revalidation.
+ *
+ * The 'vlan' parameter is used to maintain separate per-VLAN learning tables.
+ * Specify 0 if this behavior is undesirable. */
+tag_type
+mac_learning_learn(struct mac_learning *ml,
+ const uint8_t src_mac[ETH_ADDR_LEN], uint16_t vlan,
+ uint16_t src_port)
+{
+ struct mac_entry *e;
+ struct list *bucket;
+
+ if (eth_addr_is_multicast(src_mac)) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 30);
+ VLOG_DBG_RL(&rl, "multicast packet source "ETH_ADDR_FMT,
+ ETH_ADDR_ARGS(src_mac));
+ return 0;
+ }
+
+ bucket = mac_table_bucket(ml, src_mac, vlan);
+ e = search_bucket(bucket, src_mac, vlan);
+ if (!e) {
+ if (!list_is_empty(&ml->free)) {
+ e = mac_entry_from_lru_node(ml->free.next);
+ } else {
+ e = mac_entry_from_lru_node(ml->lrus.next);
+ list_remove(&e->hash_node);
+ }
+ memcpy(e->mac, src_mac, ETH_ADDR_LEN);
+ list_push_front(bucket, &e->hash_node);
+ e->port = -1;
+ e->vlan = vlan;
+ e->tag = make_unknown_mac_tag(ml, src_mac, vlan);
+ }
+
+ /* Make the entry most-recently-used. */
+ list_remove(&e->lru_node);
+ list_push_back(&ml->lrus, &e->lru_node);
+ e->expires = time_now() + 60;
+
+ /* Did we learn something? */
+ if (e->port != src_port) {
+ tag_type old_tag = e->tag;
+ e->port = src_port;
+ e->tag = tag_create_random();
+ COVERAGE_INC(mac_learning_learned);
+ return old_tag;
+ }
+ return 0;
+}
+
+/* Looks up MAC 'dst' for VLAN 'vlan' in 'ml'. Returns the port on which a
+ * frame destined for 'dst' should be sent, -1 if unknown. */
+int
+mac_learning_lookup(const struct mac_learning *ml,
+ const uint8_t dst[ETH_ADDR_LEN], uint16_t vlan)
+{
+ tag_type tag = 0;
+ return mac_learning_lookup_tag(ml, dst, vlan, &tag);
+}
+
+/* Looks up MAC 'dst' for VLAN 'vlan' in 'ml'. Returns the port on which a
+ * frame destined for 'dst' should be sent, -1 if unknown.
+ *
+ * Adds to '*tag' (which the caller must have initialized) the tag that should
+ * be attached to any flow created based on the return value, if any, to allow
+ * those flows to be revalidated when the MAC learning entry changes. */
+int
+mac_learning_lookup_tag(const struct mac_learning *ml,
+                        const uint8_t dst[ETH_ADDR_LEN], uint16_t vlan,
+                        tag_type *tag)
+{
+    struct mac_entry *entry;
+
+    /* Multicast destinations are never looked up and add no tag. */
+    if (eth_addr_is_multicast(dst)) {
+        return -1;
+    }
+    entry = search_bucket(mac_table_bucket(ml, dst, vlan), dst, vlan);
+    if (!entry) {
+        *tag |= make_unknown_mac_tag(ml, dst, vlan);
+        return -1;
+    }
+    *tag |= entry->tag;
+    return entry->port;
+}
+
+/* Expires all the mac-learning entries in 'ml'. The tags in 'ml' are
+ * discarded, so the client is responsible for revalidating any flows that
+ * depend on 'ml', if necessary. */
+void
+mac_learning_flush(struct mac_learning *ml)
+{
+ struct mac_entry *e;
+ while (get_lru(ml, &e)){
+ free_mac_entry(ml, e);
+ }
+}
+
+void
+mac_learning_run(struct mac_learning *ml, struct tag_set *set)
+{
+ struct mac_entry *e;
+ while (get_lru(ml, &e) && time_now() >= e->expires) {
+ COVERAGE_INC(mac_learning_expired);
+ if (set) {
+ tag_set_add(set, e->tag);
+ }
+ free_mac_entry(ml, e);
+ }
+}
+
+void
+mac_learning_wait(struct mac_learning *ml)
+{
+ if (!list_is_empty(&ml->lrus)) {
+ struct mac_entry *e = mac_entry_from_lru_node(ml->lrus.next);
+ poll_timer_wait((e->expires - time_now()) * 1000);
+ }
+}
diff --git a/lib/mac-learning.h b/lib/mac-learning.h
new file mode 100644
index 000000000..fc1d62040
--- /dev/null
+++ b/lib/mac-learning.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef MAC_LEARNING_H
+#define MAC_LEARNING_H 1
+
+#include "packets.h"
+#include "tag.h"
+
+struct mac_learning *mac_learning_create(void);
+void mac_learning_destroy(struct mac_learning *);
+tag_type mac_learning_learn(struct mac_learning *,
+ const uint8_t src[ETH_ADDR_LEN], uint16_t vlan,
+ uint16_t src_port);
+int mac_learning_lookup(const struct mac_learning *,
+ const uint8_t dst[ETH_ADDR_LEN], uint16_t vlan);
+int mac_learning_lookup_tag(const struct mac_learning *,
+ const uint8_t dst[ETH_ADDR_LEN],
+ uint16_t vlan, tag_type *tag);
+void mac_learning_flush(struct mac_learning *);
+void mac_learning_run(struct mac_learning *, struct tag_set *);
+void mac_learning_wait(struct mac_learning *);
+
+#endif /* mac-learning.h */
diff --git a/lib/netdev.c b/lib/netdev.c
new file mode 100644
index 000000000..7fd070eb6
--- /dev/null
+++ b/lib/netdev.c
@@ -0,0 +1,1556 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "netdev.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <arpa/inet.h>
+#include <inttypes.h>
+#include <linux/if_tun.h>
+#include <linux/types.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/sockios.h>
+#include <linux/version.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <netpacket/packet.h>
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <net/if_packet.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "coverage.h"
+#include "dynamic-string.h"
+#include "fatal-signal.h"
+#include "list.h"
+#include "netlink.h"
+#include "ofpbuf.h"
+#include "openflow/openflow.h"
+#include "packets.h"
+#include "poll-loop.h"
+#include "socket-util.h"
+#include "svec.h"
+
+/* linux/if.h defines IFF_LOWER_UP, net/if.h doesn't.
+ * net/if.h defines if_nameindex(), linux/if.h doesn't.
+ * We can't include both headers, so define IFF_LOWER_UP ourselves. */
+#ifndef IFF_LOWER_UP
+#define IFF_LOWER_UP 0x10000
+#endif
+
+/* These were introduced in Linux 2.6.14, so they might be missing if we have
+ * old headers. */
+#ifndef ADVERTISED_Pause
+#define ADVERTISED_Pause (1 << 13)
+#endif
+#ifndef ADVERTISED_Asym_Pause
+#define ADVERTISED_Asym_Pause (1 << 14)
+#endif
+
+#define THIS_MODULE VLM_netdev
+#include "vlog.h"
+
+/* An open network device.  Instances are created by do_open_netdev() and
+ * destroyed by netdev_close(). */
+struct netdev {
+ struct list node; /* Element of 'netdev_list'. */
+ char *name; /* Device name, e.g. "eth0"; an xstrdup()'d copy
+ * that netdev_close() frees. */
+
+ /* File descriptors. For ordinary network devices, the two fds below are
+ * the same; for tap devices, they differ. */
+ int netdev_fd; /* Network device. */
+ int tap_fd; /* TAP character device, if any, otherwise the
+ * network device. */
+
+ /* Cached network device information. */
+ int ifindex; /* -1 if not known. */
+ uint8_t etheraddr[ETH_ADDR_LEN]; /* MAC address read when the device was
+ * opened; updated by
+ * netdev_set_etheraddr(). */
+ struct in6_addr in6; /* IPv6 address, or in6addr_any if none found. */
+ int speed; /* NOTE(review): not assigned by do_open_netdev();
+ * confirm where, if anywhere, this is set. */
+ int mtu; /* MTU obtained via SIOCGIFMTU at open time. */
+ int txqlen; /* TX queue length obtained via SIOCGIFTXQLEN. */
+ int hwaddr_family; /* Hardware address family from get_etheraddr(). */
+
+ int save_flags; /* Initial device flags. */
+ int changed_flags; /* Flags that we changed. */
+};
+
+/* Policy for RTNLGRP_LINK messages.
+ *
+ * There are *many* more fields in these messages, but currently we only care
+ * about interface names and (optionally) link statistics. */
+static const struct nl_policy rtnlgrp_link_policy[] = {
+ [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
+ [IFLA_STATS] = { .type = NL_A_UNSPEC, .optional = true,
+ .min_len = sizeof(struct rtnl_link_stats) },
+};
+
+/* All open network devices. */
+static struct list netdev_list = LIST_INITIALIZER(&netdev_list);
+
+/* An AF_INET socket (used for ioctl operations). */
+static int af_inet_sock = -1;
+
+/* NETLINK_ROUTE socket. */
+static struct nl_sock *rtnl_sock;
+
+/* Can we use RTM_GETLINK to get network device statistics? (In pre-2.6.19
+ * kernels, this was only available if wireless extensions were enabled.) */
+static bool use_netlink_stats;
+
+/* This is set pretty low because we probably won't learn anything from the
+ * additional log messages. */
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+
+static void init_netdev(void);
+static int do_open_netdev(const char *name, int ethertype, int tap_fd,
+ struct netdev **netdev_);
+static int restore_flags(struct netdev *netdev);
+static int get_flags(const char *netdev_name, int *flagsp);
+static int set_flags(const char *netdev_name, int flags);
+static int do_get_ifindex(const char *netdev_name);
+static int get_ifindex(const struct netdev *, int *ifindexp);
+static int get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN],
+ int *hwaddr_familyp);
+static int set_etheraddr(const char *netdev_name, int hwaddr_family,
+ const uint8_t[ETH_ADDR_LEN]);
+
+/* Searches /proc/net/if_inet6 for the network device named 'name' and stores
+ * the address from the first matching line into '*in6'.  Stores in6addr_any
+ * if the file cannot be opened or contains no parsable line for 'name'. */
+static void
+get_ipv6_address(const char *name, struct in6_addr *in6)
+{
+    bool found = false;
+    char buf[128];
+    FILE *stream;
+
+    /* Failure to open the file most likely means that the host has no IPv6
+     * support, which is not really an error; it is simply reported as "no
+     * address" below. */
+    stream = fopen("/proc/net/if_inet6", "r");
+    if (stream != NULL) {
+        while (!found && fgets(buf, sizeof buf, stream)) {
+            uint8_t *octet = in6->s6_addr;
+            char ifname[16 + 1];
+            int n_fields;
+
+#define X8 "%2"SCNx8
+            /* 16 address bytes, four skipped hex fields, then the name. */
+            n_fields = sscanf(buf,
+                              " "X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8 X8
+                              "%*x %*x %*x %*x %16s\n",
+                              &octet[0], &octet[1], &octet[2], &octet[3],
+                              &octet[4], &octet[5], &octet[6], &octet[7],
+                              &octet[8], &octet[9], &octet[10], &octet[11],
+                              &octet[12], &octet[13], &octet[14], &octet[15],
+                              ifname);
+            found = n_fields == 17 && !strcmp(name, ifname);
+        }
+        fclose(stream);
+    }
+
+    if (!found) {
+        /* Also wipes out any bytes parsed from non-matching lines. */
+        *in6 = in6addr_any;
+    }
+}
+
+/* Issues ethtool command 'cmd' on 'netdev' through ioctl(SIOCETHTOOL), using
+ * 'ecmd' as the command buffer ('ecmd->cmd' is overwritten with 'cmd').
+ * 'cmd_name' is the name of the command, used only in log messages.
+ *
+ * Returns 0 if successful, otherwise a positive errno value.  EOPNOTSUPP is
+ * returned without logging. */
+static int
+do_ethtool(struct netdev *netdev, struct ethtool_cmd *ecmd,
+           int cmd, const char *cmd_name)
+{
+    struct ifreq ifr;
+    int error;
+
+    memset(&ifr, 0, sizeof ifr);
+    strncpy(ifr.ifr_name, netdev->name, sizeof ifr.ifr_name);
+    ifr.ifr_data = (caddr_t) ecmd;
+
+    ecmd->cmd = cmd;
+    COVERAGE_INC(netdev_ethtool);
+    if (ioctl(netdev->netdev_fd, SIOCETHTOOL, &ifr) == 0) {
+        return 0;
+    }
+
+    /* Capture errno right away: the logging call below may change it, and
+     * the caller must get the ioctl's own error code. */
+    error = errno;
+    if (error != EOPNOTSUPP) {
+        VLOG_WARN_RL(&rl, "ethtool command %s on network device %s "
+                     "failed: %s", cmd_name, netdev->name,
+                     strerror(error));
+    } else {
+        /* The device doesn't support this operation.  That's pretty
+         * common, so there's no point in logging anything. */
+    }
+    return error;
+}
+
+/* Queries 'netdev' with ETHTOOL_GSET and translates the answer into bitmaps
+ * of "enum ofp_port_features" bits:
+ *
+ *    - '*supported' from the link modes that the device supports,
+ *    - '*advertised' from the link modes that it advertises,
+ *    - '*current' from its current speed, duplex, port type and
+ *      autonegotiation setting.
+ *
+ * '*peer' is always set to 0.  All four pointers must be nonnull.  Returns 0
+ * if successful, otherwise the positive errno value from do_ethtool(), in
+ * which case all four outputs are 0. */
+static int
+do_get_features(struct netdev *netdev,
+                uint32_t *current, uint32_t *advertised,
+                uint32_t *supported, uint32_t *peer)
+{
+    struct feature_map {
+        uint32_t ethtool_bit;       /* SUPPORTED_* or ADVERTISED_* bit. */
+        uint32_t ofppf_bit;         /* Equivalent OFPPF_* bit. */
+    };
+    static const struct feature_map supported_map[] = {
+        { SUPPORTED_10baseT_Half, OFPPF_10MB_HD },
+        { SUPPORTED_10baseT_Full, OFPPF_10MB_FD },
+        { SUPPORTED_100baseT_Half, OFPPF_100MB_HD },
+        { SUPPORTED_100baseT_Full, OFPPF_100MB_FD },
+        { SUPPORTED_1000baseT_Half, OFPPF_1GB_HD },
+        { SUPPORTED_1000baseT_Full, OFPPF_1GB_FD },
+        { SUPPORTED_10000baseT_Full, OFPPF_10GB_FD },
+        { SUPPORTED_TP, OFPPF_COPPER },
+        { SUPPORTED_FIBRE, OFPPF_FIBER },
+        { SUPPORTED_Autoneg, OFPPF_AUTONEG },
+        { SUPPORTED_Pause, OFPPF_PAUSE },
+        { SUPPORTED_Asym_Pause, OFPPF_PAUSE_ASYM },
+    };
+    static const struct feature_map advertised_map[] = {
+        { ADVERTISED_10baseT_Half, OFPPF_10MB_HD },
+        { ADVERTISED_10baseT_Full, OFPPF_10MB_FD },
+        { ADVERTISED_100baseT_Half, OFPPF_100MB_HD },
+        { ADVERTISED_100baseT_Full, OFPPF_100MB_FD },
+        { ADVERTISED_1000baseT_Half, OFPPF_1GB_HD },
+        { ADVERTISED_1000baseT_Full, OFPPF_1GB_FD },
+        { ADVERTISED_10000baseT_Full, OFPPF_10GB_FD },
+        { ADVERTISED_TP, OFPPF_COPPER },
+        { ADVERTISED_FIBRE, OFPPF_FIBER },
+        { ADVERTISED_Autoneg, OFPPF_AUTONEG },
+        { ADVERTISED_Pause, OFPPF_PAUSE },
+        { ADVERTISED_Asym_Pause, OFPPF_PAUSE_ASYM },
+    };
+
+    struct ethtool_cmd ecmd;
+    size_t i;
+    int error;
+
+    /* Zero the outputs up front so that they are well-defined on failure. */
+    *peer = 0;
+    *advertised = 0;
+    *supported = 0;
+    *current = 0;
+
+    memset(&ecmd, 0, sizeof ecmd);
+    error = do_ethtool(netdev, &ecmd, ETHTOOL_GSET, "ETHTOOL_GSET");
+    if (error) {
+        return error;
+    }
+
+    /* Supported features. */
+    for (i = 0; i < ARRAY_SIZE(supported_map); i++) {
+        if (ecmd.supported & supported_map[i].ethtool_bit) {
+            *supported |= supported_map[i].ofppf_bit;
+        }
+    }
+
+    /* Advertised features. */
+    for (i = 0; i < ARRAY_SIZE(advertised_map); i++) {
+        if (ecmd.advertising & advertised_map[i].ethtool_bit) {
+            *advertised |= advertised_map[i].ofppf_bit;
+        }
+    }
+
+    /* Current features: speed and duplex first.  10 Gbps has no half-duplex
+     * variant here, and an unrecognized speed contributes nothing. */
+    switch (ecmd.speed) {
+    case SPEED_10:
+        *current = ecmd.duplex ? OFPPF_10MB_FD : OFPPF_10MB_HD;
+        break;
+    case SPEED_100:
+        *current = ecmd.duplex ? OFPPF_100MB_FD : OFPPF_100MB_HD;
+        break;
+    case SPEED_1000:
+        *current = ecmd.duplex ? OFPPF_1GB_FD : OFPPF_1GB_HD;
+        break;
+    case SPEED_10000:
+        *current = OFPPF_10GB_FD;
+        break;
+    default:
+        break;
+    }
+
+    /* Then the medium... */
+    switch (ecmd.port) {
+    case PORT_TP:
+        *current |= OFPPF_COPPER;
+        break;
+    case PORT_FIBRE:
+        *current |= OFPPF_FIBER;
+        break;
+    default:
+        break;
+    }
+
+    /* ...and finally autonegotiation. */
+    if (ecmd.autoneg) {
+        *current |= OFPPF_AUTONEG;
+    }
+    return 0;
+}
+
+/* Opens the network device named 'name' (e.g. "eth0").  Returns 0 if
+ * successful, otherwise a positive errno value.  On success, '*netdevp'
+ * receives the new network device.
+ *
+ * A 'name' that begins with "tap:" is handed to netdev_open_tap() with the
+ * prefix stripped, and 'ethertype' is then ignored.
+ *
+ * Otherwise, 'ethertype' may be a 16-bit Ethernet protocol value in host byte
+ * order to capture frames of that type received on the device, or one of the
+ * 'enum netdev_pseudo_ethertype' values to receive frames in one of those
+ * categories. */
+int
+netdev_open(const char *name, int ethertype, struct netdev **netdevp)
+{
+    static const char tap_prefix[] = "tap:";
+    const size_t prefix_len = sizeof tap_prefix - 1;
+
+    if (strncmp(name, tap_prefix, prefix_len) != 0) {
+        return do_open_netdev(name, ethertype, -1, netdevp);
+    }
+    return netdev_open_tap(name + prefix_len, netdevp);
+}
+
+/* Opens a TAP virtual network device.  If 'name' is a nonnull, non-empty
+ * string, attempts to assign that name to the TAP device (failing if the name
+ * is already in use); otherwise, a name is automatically assigned.  Returns
+ * zero if successful, otherwise a positive errno value.  On success, sets
+ * '*netdevp' to the new network device, otherwise to null. */
+int
+netdev_open_tap(const char *name, struct netdev **netdevp)
+{
+    static const char tap_dev[] = "/dev/net/tun";
+    struct ifreq ifr;
+    int error;
+    int tap_fd;
+
+    /* Honor the documented contract on every failure path, including the
+     * ones that return before do_open_netdev() is reached. */
+    *netdevp = NULL;
+
+    tap_fd = open(tap_dev, O_RDWR);
+    if (tap_fd < 0) {
+        /* Save errno before logging, which may modify it. */
+        error = errno;
+        ovs_error(error, "opening \"%s\" failed", tap_dev);
+        return error;
+    }
+
+    memset(&ifr, 0, sizeof ifr);
+    ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+    if (name) {
+        strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
+    }
+    if (ioctl(tap_fd, TUNSETIFF, &ifr) < 0) {
+        error = errno;
+        ovs_error(error, "ioctl(TUNSETIFF) on \"%s\" failed", tap_dev);
+        close(tap_fd);
+        return error;
+    }
+
+    error = set_nonblocking(tap_fd);
+    if (error) {
+        ovs_error(error, "set_nonblocking on \"%s\" failed", tap_dev);
+        close(tap_fd);
+        return error;
+    }
+
+    /* 'ifr.ifr_name' is passed on as the device name for the netdev. */
+    error = do_open_netdev(ifr.ifr_name, NETDEV_ETH_TYPE_NONE, tap_fd,
+                           netdevp);
+    if (error) {
+        close(tap_fd);
+    }
+    return error;
+}
+
+/* Opens the network device named 'name' and, on success, stores the new
+ * netdev in '*netdev_' and adds it to 'netdev_list'.  Returns 0 if successful,
+ * otherwise a positive errno value, in which case '*netdev_' is null.
+ *
+ * 'ethertype' selects the frames to capture, as for netdev_open().  With
+ * NETDEV_ETH_TYPE_NONE the packet socket is neither bound to the device nor
+ * made nonblocking.
+ *
+ * 'tap_fd' is the fd of an already opened TAP character device, or a negative
+ * value for an ordinary device.  On success the new netdev takes ownership of
+ * 'tap_fd'.  On failure the caller keeps ownership: this function does not
+ * close it, because netdev_open_tap() closes it itself when this function
+ * fails, and closing it here as well would be a double close. */
+static int
+do_open_netdev(const char *name, int ethertype, int tap_fd,
+               struct netdev **netdev_)
+{
+    int netdev_fd;
+    struct sockaddr_ll sll;
+    struct ifreq ifr;
+    int ifindex = -1;
+    uint8_t etheraddr[ETH_ADDR_LEN];
+    struct in6_addr in6;
+    int mtu;
+    int txqlen;
+    int hwaddr_family;
+    int save_flags;
+    int error;
+    struct netdev *netdev;
+
+    init_netdev();
+    *netdev_ = NULL;
+    COVERAGE_INC(netdev_open);
+
+    /* Create raw socket. */
+    netdev_fd = socket(PF_PACKET, SOCK_RAW,
+                       htons(ethertype == NETDEV_ETH_TYPE_NONE ? 0
+                             : ethertype == NETDEV_ETH_TYPE_ANY ? ETH_P_ALL
+                             : ethertype == NETDEV_ETH_TYPE_802_2 ? ETH_P_802_2
+                             : ethertype));
+    if (netdev_fd < 0) {
+        return errno;
+    }
+
+    if (ethertype != NETDEV_ETH_TYPE_NONE) {
+        /* Set non-blocking mode. */
+        error = set_nonblocking(netdev_fd);
+        if (error) {
+            goto error;
+        }
+
+        /* Get ethernet device index.  do_get_ifindex() reports failure as a
+         * negative errno value; go through the common error path so that
+         * 'netdev_fd' is not leaked. */
+        ifindex = do_get_ifindex(name);
+        if (ifindex < 0) {
+            error = -ifindex;
+            goto error;
+        }
+
+        /* Bind to specific ethernet device. */
+        memset(&sll, 0, sizeof sll);
+        sll.sll_family = AF_PACKET;
+        sll.sll_ifindex = ifindex;
+        if (bind(netdev_fd, (struct sockaddr *) &sll, sizeof sll) < 0) {
+            /* Save errno before logging, which may modify it. */
+            error = errno;
+            VLOG_ERR("bind to %s failed: %s", name, strerror(error));
+            goto error;
+        }
+
+        /* Between the socket() and bind() calls above, the socket receives all
+         * packets of the requested type on all system interfaces.  We do not
+         * want to receive that data, but there is no way to avoid it.  So we
+         * must now drain out the receive queue. */
+        error = drain_rcvbuf(netdev_fd);
+        if (error) {
+            goto error;
+        }
+    }
+
+    /* Get MAC address. */
+    error = get_etheraddr(name, etheraddr, &hwaddr_family);
+    if (error) {
+        goto error;
+    }
+
+    /* Get MTU. */
+    memset(&ifr, 0, sizeof ifr);
+    strncpy(ifr.ifr_name, name, sizeof ifr.ifr_name);
+    if (ioctl(netdev_fd, SIOCGIFMTU, &ifr) < 0) {
+        error = errno;
+        VLOG_ERR("ioctl(SIOCGIFMTU) on %s device failed: %s",
+                 name, strerror(error));
+        goto error;
+    }
+    mtu = ifr.ifr_mtu;
+
+    /* Get TX queue length. */
+    if (ioctl(netdev_fd, SIOCGIFTXQLEN, &ifr) < 0) {
+        error = errno;
+        VLOG_ERR("ioctl(SIOCGIFTXQLEN) on %s device failed: %s",
+                 name, strerror(error));
+        goto error;
+    }
+    txqlen = ifr.ifr_qlen;
+
+    get_ipv6_address(name, &in6);
+
+    /* Fetch the flags to restore at close or exit.  This is done before the
+     * netdev is allocated so that a failure here has nothing to free. */
+    error = get_flags(name, &save_flags);
+    if (error) {
+        goto error;
+    }
+
+    /* Allocate network device. */
+    netdev = xmalloc(sizeof *netdev);
+    netdev->name = xstrdup(name);
+    netdev->ifindex = ifindex;
+    netdev->txqlen = txqlen;
+    netdev->hwaddr_family = hwaddr_family;
+    netdev->netdev_fd = netdev_fd;
+    netdev->tap_fd = tap_fd < 0 ? netdev_fd : tap_fd;
+    memcpy(netdev->etheraddr, etheraddr, sizeof etheraddr);
+    netdev->mtu = mtu;
+    netdev->in6 = in6;
+    netdev->save_flags = save_flags;
+    netdev->changed_flags = 0;
+
+    /* Block fatal signals while 'netdev_list' is being modified. */
+    fatal_signal_block();
+    list_push_back(&netdev_list, &netdev->node);
+    fatal_signal_unblock();
+
+    /* Success! */
+    *netdev_ = netdev;
+    return 0;
+
+error:
+    /* 'error' already holds the errno value to return.  Only the socket
+     * created here is released; 'tap_fd' still belongs to the caller. */
+    close(netdev_fd);
+    return error;
+}
+
+/* Closes and destroys 'netdev': restores its device flags, unlinks it from
+ * the list of open devices, closes its file descriptors and frees its memory.
+ * Does nothing if 'netdev' is null. */
+void
+netdev_close(struct netdev *netdev)
+{
+    int restore_error;
+
+    if (!netdev) {
+        return;
+    }
+
+    /* Bring down interface and drop promiscuous mode, if we brought up
+     * the interface or enabled promiscuous mode. */
+    fatal_signal_block();
+    restore_error = restore_flags(netdev);
+    list_remove(&netdev->node);
+    fatal_signal_unblock();
+    if (restore_error) {
+        VLOG_WARN("failed to restore network device flags on %s: %s",
+                  netdev->name, strerror(restore_error));
+    }
+
+    /* Release the descriptors; the TAP fd is only distinct for tap
+     * devices. */
+    close(netdev->netdev_fd);
+    if (netdev->tap_fd != netdev->netdev_fd) {
+        close(netdev->tap_fd);
+    }
+
+    /* Release memory. */
+    free(netdev->name);
+    free(netdev);
+}
+
+/* Appends zero bytes to 'buffer' until it holds ETH_TOTAL_MIN bytes, the
+ * minimum valid length of an Ethernet packet.  Leaves 'buffer' alone if it is
+ * already that long. */
+static void
+pad_to_minimum_length(struct ofpbuf *buffer)
+{
+    if (buffer->size >= ETH_TOTAL_MIN) {
+        return;
+    }
+    ofpbuf_put_zeros(buffer, ETH_TOTAL_MIN - buffer->size);
+}
+
+/* Attempts to receive a packet from 'netdev' into 'buffer', which the caller
+ * must have initialized with sufficient room for the packet.  The space
+ * required to receive any packet is ETH_HEADER_LEN bytes, plus VLAN_HEADER_LEN
+ * bytes, plus the device's MTU (which may be retrieved via netdev_get_mtu()).
+ * (Some devices do not allow for a VLAN header, in which case VLAN_HEADER_LEN
+ * need not be included.)
+ *
+ * 'buffer' must be empty and have at least ETH_TOTAL_MIN bytes of tailroom.
+ *
+ * If a packet is successfully retrieved, returns 0.  In this case 'buffer' is
+ * guaranteed to contain at least ETH_TOTAL_MIN bytes.  Otherwise, returns a
+ * positive errno value.  Returns EAGAIN immediately if no packet is ready to
+ * be returned.
+ */
+int
+netdev_recv(struct netdev *netdev, struct ofpbuf *buffer)
+{
+    ssize_t n_bytes;
+
+    assert(buffer->size == 0);
+    assert(ofpbuf_tailroom(buffer) >= ETH_TOTAL_MIN);
+
+    /* Retry reads that were interrupted by a signal. */
+    do {
+        n_bytes = read(netdev->tap_fd,
+                       ofpbuf_tail(buffer), ofpbuf_tailroom(buffer));
+    } while (n_bytes < 0 && errno == EINTR);
+
+    if (n_bytes < 0) {
+        /* Save errno before logging, which may modify it. */
+        int error = errno;
+
+        if (error != EAGAIN) {
+            /* The device name comes first in the message, then the error. */
+            VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
+                         netdev->name, strerror(error));
+        }
+        return error;
+    } else {
+        COVERAGE_INC(netdev_received);
+        buffer->size += n_bytes;
+
+        /* When the kernel internally sends out an Ethernet frame on an
+         * interface, it gives us a copy *before* padding the frame to the
+         * minimum length.  Thus, when it sends out something like an ARP
+         * request, we see a too-short frame.  So pad it out to the minimum
+         * length. */
+        pad_to_minimum_length(buffer);
+        return 0;
+    }
+}
+
+/* Registers with the poll loop to wake up from the next call to poll_block()
+ * when a packet is ready to be received with netdev_recv() on 'netdev'. */
+void
+netdev_recv_wait(struct netdev *netdev)
+{
+ /* Wait on the same descriptor that netdev_recv() reads from. */
+ poll_fd_wait(netdev->tap_fd, POLLIN);
+}
+
+/* Discards all packets waiting to be received from 'netdev'.
+ *
+ * For an ordinary device, returns the result of draining the packet socket's
+ * receive buffer with drain_rcvbuf().  For a tap device, drains the TAP fd
+ * with drain_fd(), passing the device's TX queue length, and always
+ * returns 0. */
+int
+netdev_drain(struct netdev *netdev)
+{
+    if (netdev->tap_fd == netdev->netdev_fd) {
+        return drain_rcvbuf(netdev->netdev_fd);
+    }
+
+    drain_fd(netdev->tap_fd, netdev->txqlen);
+    return 0;
+}
+
+/* Sends 'buffer' on 'netdev'.  Returns 0 if successful, otherwise a positive
+ * errno value.  Returns EAGAIN without blocking if the packet cannot be queued
+ * immediately.  Returns EMSGSIZE if a partial packet was transmitted or if
+ * the packet is too big or too small to transmit on the device.
+ *
+ * The caller retains ownership of 'buffer' in all cases.
+ *
+ * The kernel maintains a packet transmission queue, so the caller is not
+ * expected to do additional queuing of packets. */
+int
+netdev_send(struct netdev *netdev, const struct ofpbuf *buffer)
+{
+    ssize_t n_bytes;
+
+    /* Retry writes that were interrupted by a signal. */
+    do {
+        n_bytes = write(netdev->tap_fd, buffer->data, buffer->size);
+    } while (n_bytes < 0 && errno == EINTR);
+
+    if (n_bytes < 0) {
+        /* Save errno before logging, which may modify it. */
+        int error = errno;
+
+        /* The Linux AF_PACKET implementation never blocks waiting for room
+         * for packets, instead returning ENOBUFS.  Translate this into EAGAIN
+         * for the caller. */
+        if (error == ENOBUFS) {
+            return EAGAIN;
+        } else if (error != EAGAIN) {
+            VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
+                         netdev->name, strerror(error));
+        }
+        return error;
+    } else if (n_bytes != buffer->size) {
+        VLOG_WARN_RL(&rl,
+                     "send partial Ethernet packet (%d bytes of %zu) on %s",
+                     (int) n_bytes, buffer->size, netdev->name);
+        return EMSGSIZE;
+    } else {
+        COVERAGE_INC(netdev_sent);
+        return 0;
+    }
+}
+
+/* Registers with the poll loop to wake up from the next call to poll_block()
+ * when the packet transmission queue has sufficient room to transmit a packet
+ * with netdev_send().
+ *
+ * The kernel maintains a packet transmission queue, so the client is not
+ * expected to do additional queuing of packets.  Thus, this function is
+ * unlikely to ever be used.  It is included for completeness. */
+void
+netdev_send_wait(struct netdev *netdev)
+{
+    const bool is_tap = netdev->tap_fd != netdev->netdev_fd;
+
+    if (is_tap) {
+        /* TAP device always accepts packets.*/
+        poll_immediate_wake();
+    } else {
+        poll_fd_wait(netdev->tap_fd, POLLOUT);
+    }
+}
+
+/* Attempts to change the MAC address of 'netdev' to 'mac'.  Returns 0 if
+ * successful, otherwise a positive errno value.  The copy of the address
+ * cached in 'netdev' is updated only when the change succeeds. */
+int
+netdev_set_etheraddr(struct netdev *netdev, const uint8_t mac[ETH_ADDR_LEN])
+{
+    int error;
+
+    error = set_etheraddr(netdev->name, netdev->hwaddr_family, mac);
+    if (error) {
+        return error;
+    }
+
+    memcpy(netdev->etheraddr, mac, ETH_ADDR_LEN);
+    return 0;
+}
+
+/* Attempts to set the MAC address of the network device named 'name' to
+ * 'mac', without requiring an open "struct netdev" for it. The hardware
+ * address family passed to set_etheraddr() is always ARPHRD_ETHER. Returns
+ * whatever set_etheraddr() returns (presumably 0 on success, otherwise a
+ * positive errno value, as for netdev_set_etheraddr() -- TODO confirm). */
+int
+netdev_nodev_set_etheraddr(const char *name, const uint8_t mac[ETH_ADDR_LEN])
+{
+ /* No netdev may have been opened yet, so make sure the module is set up. */
+ init_netdev();
+ return set_etheraddr(name, ARPHRD_ETHER, mac);
+}
+
+/* Returns a pointer to 'netdev''s MAC address. The caller must not modify or
+ * free the returned buffer.
+ *
+ * The pointer refers to the ETH_ADDR_LEN-byte copy cached inside 'netdev'. */
+const uint8_t *
+netdev_get_etheraddr(const struct netdev *netdev)
+{
+ return netdev->etheraddr;
+}
+
+/* Returns the name of the network device that 'netdev' represents,
+ * e.g. "eth0". The caller must not modify or free the returned string.
+ *
+ * The string is owned by 'netdev' and is freed by netdev_close(). */
+const char *
+netdev_get_name(const struct netdev *netdev)
+{
+ return netdev->name;
+}
+
+/* Returns the maximum size of transmitted (and received) packets on 'netdev',
+ * in bytes, not including the hardware header; thus, this is typically 1500
+ * bytes for Ethernet devices.
+ *
+ * This is the value cached in 'netdev', which do_open_netdev() read with
+ * ioctl(SIOCGIFMTU); the device is not queried again here. */
+int
+netdev_get_mtu(const struct netdev *netdev)
+{
+ return netdev->mtu;
+}
+
+/* Retrieves the features of 'netdev' as bitmaps of "enum ofp_port_features"
+ * bits, in host byte order, and stores them into whichever of '*current',
+ * '*advertised', '*supported', and '*peer' are non-null.  Returns 0 if
+ * successful, otherwise a positive errno value.  On failure, all of the
+ * passed-in values are set to 0. */
+int
+netdev_get_features(struct netdev *netdev,
+                    uint32_t *current, uint32_t *advertised,
+                    uint32_t *supported, uint32_t *peer)
+{
+    /* Scratch space for the outputs that the caller does not want. */
+    uint32_t discard[4];
+
+    if (!current) {
+        current = &discard[0];
+    }
+    if (!advertised) {
+        advertised = &discard[1];
+    }
+    if (!supported) {
+        supported = &discard[2];
+    }
+    if (!peer) {
+        peer = &discard[3];
+    }
+    return do_get_features(netdev, current, advertised, supported, peer);
+}
+
+/* Replaces the set of link modes that 'netdev' advertises with the ones
+ * selected by 'advertise', a bitmap of "enum ofp_port_features" bits.  The
+ * device's current ethtool settings are read with ETHTOOL_GSET, their
+ * 'advertising' field is rebuilt from 'advertise', and the result is written
+ * back with ETHTOOL_SSET.  Returns 0 if successful, otherwise the positive
+ * errno value from do_ethtool(). */
+int
+netdev_set_advertisements(struct netdev *netdev, uint32_t advertise)
+{
+    static const struct {
+        uint32_t ofppf_bit;         /* OFPPF_* bit in 'advertise'. */
+        uint32_t ethtool_bit;       /* Equivalent ADVERTISED_* bit. */
+    } map[] = {
+        { OFPPF_10MB_HD, ADVERTISED_10baseT_Half },
+        { OFPPF_10MB_FD, ADVERTISED_10baseT_Full },
+        { OFPPF_100MB_HD, ADVERTISED_100baseT_Half },
+        { OFPPF_100MB_FD, ADVERTISED_100baseT_Full },
+        { OFPPF_1GB_HD, ADVERTISED_1000baseT_Half },
+        { OFPPF_1GB_FD, ADVERTISED_1000baseT_Full },
+        { OFPPF_10GB_FD, ADVERTISED_10000baseT_Full },
+        { OFPPF_COPPER, ADVERTISED_TP },
+        { OFPPF_FIBER, ADVERTISED_FIBRE },
+        { OFPPF_AUTONEG, ADVERTISED_Autoneg },
+        { OFPPF_PAUSE, ADVERTISED_Pause },
+        { OFPPF_PAUSE_ASYM, ADVERTISED_Asym_Pause },
+    };
+
+    struct ethtool_cmd ecmd;
+    size_t i;
+    int error;
+
+    /* Start from the current settings so that only 'advertising' changes. */
+    memset(&ecmd, 0, sizeof ecmd);
+    error = do_ethtool(netdev, &ecmd, ETHTOOL_GSET, "ETHTOOL_GSET");
+    if (error) {
+        return error;
+    }
+
+    ecmd.advertising = 0;
+    for (i = 0; i < ARRAY_SIZE(map); i++) {
+        if (advertise & map[i].ofppf_bit) {
+            ecmd.advertising |= map[i].ethtool_bit;
+        }
+    }
+    return do_ethtool(netdev, &ecmd, ETHTOOL_SSET, "ETHTOOL_SSET");
+}
+
+/* Queries the IPv4 address assigned to 'netdev' with ioctl(SIOCGIFADDR).  If
+ * there is one, stores it in '*in4' (if 'in4' is non-null) and returns true.
+ * Otherwise, including when the ioctl fails, stores INADDR_ANY in '*in4' (if
+ * 'in4' is non-null) and returns false. */
+bool
+netdev_get_in4(const struct netdev *netdev, struct in_addr *in4)
+{
+    struct in_addr addr = { INADDR_ANY };
+    struct ifreq ifr;
+
+    strncpy(ifr.ifr_name, netdev->name, sizeof ifr.ifr_name);
+    ifr.ifr_addr.sa_family = AF_INET;
+    COVERAGE_INC(netdev_get_in4);
+
+    if (ioctl(af_inet_sock, SIOCGIFADDR, &ifr) != 0) {
+        VLOG_DBG_RL(&rl, "%s: ioctl(SIOCGIFADDR) failed: %s",
+                    netdev->name, strerror(errno));
+    } else {
+        addr = ((struct sockaddr_in *) &ifr.ifr_addr)->sin_addr;
+    }
+
+    if (in4) {
+        *in4 = addr;
+    }
+    return addr.s_addr != INADDR_ANY;
+}
+
+/* Initializes '*sa' as an AF_INET socket address with IPv4 address 'addr' and
+ * port 0.  All of '*sa' is zeroed first, so no byte of it is left
+ * uninitialized. */
+static void
+make_in4_sockaddr(struct sockaddr *sa, struct in_addr addr)
+{
+    struct sockaddr_in in4;
+
+    memset(&in4, 0, sizeof in4);
+    in4.sin_port = 0;
+    in4.sin_addr = addr;
+    in4.sin_family = AF_INET;
+
+    memset(sa, 0, sizeof *sa);
+    memcpy(sa, &in4, sizeof in4);
+}
+
+/* Issues address-setting ioctl 'ioctl_nr' on 'sock' for the device that
+ * 'netdev' represents, with 'addr' as the IPv4 address argument.
+ * 'ioctl_name' is the ioctl's name, used only in the log message.  Returns 0
+ * if successful, otherwise a positive errno value. */
+static int
+do_set_addr(struct netdev *netdev, int sock,
+            int ioctl_nr, const char *ioctl_name, struct in_addr addr)
+{
+    struct ifreq ifr;
+
+    strncpy(ifr.ifr_name, netdev->name, sizeof ifr.ifr_name);
+    make_in4_sockaddr(&ifr.ifr_addr, addr);
+    COVERAGE_INC(netdev_set_in4);
+
+    if (ioctl(sock, ioctl_nr, &ifr) < 0) {
+        int error = errno;
+
+        if (error) {
+            VLOG_WARN("ioctl(%s): %s", ioctl_name, strerror(error));
+        }
+        return error;
+    }
+    return 0;
+}
+
+/* Assigns 'addr' as 'netdev''s IPv4 address and 'mask' as its netmask.  If
+ * 'addr' is INADDR_ANY, 'netdev''s IPv4 address is cleared and 'mask' is not
+ * applied.  Returns 0 if successful, otherwise a positive errno value. */
+int
+netdev_set_in4(struct netdev *netdev, struct in_addr addr, struct in_addr mask)
+{
+    int error = do_set_addr(netdev, af_inet_sock,
+                            SIOCSIFADDR, "SIOCSIFADDR", addr);
+
+    /* Nothing more to do on failure or when the address was cleared. */
+    if (error || addr.s_addr == INADDR_ANY) {
+        return error;
+    }
+    return do_set_addr(netdev, af_inet_sock,
+                       SIOCSIFNETMASK, "SIOCSIFNETMASK", mask);
+}
+
+/* Adds 'router' as a default IP gateway, that is, as the gateway of a route
+ * whose destination and netmask are both INADDR_ANY.  Returns 0 if
+ * successful, otherwise a positive errno value. */
+int
+netdev_add_router(struct in_addr router)
+{
+    const struct in_addr wildcard = { INADDR_ANY };
+    struct rtentry route;
+    int error = 0;
+
+    memset(&route, 0, sizeof route);
+    make_in4_sockaddr(&route.rt_dst, wildcard);
+    make_in4_sockaddr(&route.rt_gateway, router);
+    make_in4_sockaddr(&route.rt_genmask, wildcard);
+    route.rt_flags = RTF_UP | RTF_GATEWAY;
+
+    COVERAGE_INC(netdev_add_router);
+    if (ioctl(af_inet_sock, SIOCADDRT, &route) < 0) {
+        error = errno;
+    }
+    if (error) {
+        VLOG_WARN("ioctl(SIOCADDRT): %s", strerror(error));
+    }
+    return error;
+}
+
+/* Copies the IPv6 address cached in 'netdev' into '*in6' (if 'in6' is
+ * non-null).  Returns true if that address is anything other than
+ * in6addr_any, i.e. if 'netdev' has an assigned IPv6 address, otherwise
+ * false. */
+bool
+netdev_get_in6(const struct netdev *netdev, struct in6_addr *in6)
+{
+    const struct in6_addr *cached = &netdev->in6;
+
+    if (in6) {
+        *in6 = *cached;
+    }
+    return memcmp(cached, &in6addr_any, sizeof *cached) != 0;
+}
+
+/* Obtains the current flags for 'netdev' and stores them into '*flagsp'.
+ * Returns 0 if successful, otherwise a positive errno value. On failure,
+ * stores 0 into '*flagsp'. */
+int
+netdev_get_flags(const struct netdev *netdev, enum netdev_flags *flagsp)
+{
+ /* Delegate to the by-name variant using the device's own name. */
+ return netdev_nodev_get_flags(netdev->name, flagsp);
+}
+
+/* Translates the "enum netdev_flags" bits in 'nd' into the corresponding
+ * IFF_* interface flags.  Only NETDEV_UP (IFF_UP) and NETDEV_PROMISC
+ * (IFF_PROMISC) are translated; any other bits in 'nd' are ignored. */
+static int
+nd_to_iff_flags(enum netdev_flags nd)
+{
+    const int up = nd & NETDEV_UP ? IFF_UP : 0;
+    const int promisc = nd & NETDEV_PROMISC ? IFF_PROMISC : 0;
+
+    return up | promisc;
+}
+
+/* Clears the flags in 'off' and then sets the flags in 'on' on 'netdev'.  If
+ * 'permanent' is true, the changes will persist; otherwise, the changed bits
+ * are recorded in 'netdev' so that they will be reverted when 'netdev' is
+ * closed or the program exits.  The device is written to only if the
+ * resulting flags differ from its current ones.  Returns 0 if successful,
+ * otherwise a positive errno value. */
+static int
+do_update_flags(struct netdev *netdev, enum netdev_flags off,
+                enum netdev_flags on, bool permanent)
+{
+    int current_flags;
+    int wanted_flags;
+    int error;
+
+    error = get_flags(netdev->name, &current_flags);
+    if (error) {
+        return error;
+    }
+
+    wanted_flags = current_flags & ~nd_to_iff_flags(off);
+    wanted_flags |= nd_to_iff_flags(on);
+
+    if (!permanent) {
+        /* Remember which bits have to be restored later. */
+        netdev->changed_flags |= wanted_flags ^ current_flags;
+    }
+
+    return (wanted_flags != current_flags
+            ? set_flags(netdev->name, wanted_flags)
+            : 0);
+}
+
+/* Sets the flags for 'netdev' to 'flags'.
+ * If 'permanent' is true, the changes will persist; otherwise, they
+ * will be reverted when 'netdev' is closed or the program exits.
+ * Returns 0 if successful, otherwise a positive errno value.
+ *
+ * Only the flags that nd_to_iff_flags() translates (NETDEV_UP and
+ * NETDEV_PROMISC) are affected; other device flags are left as they are. */
+int
+netdev_set_flags(struct netdev *netdev, enum netdev_flags flags,
+ bool permanent)
+{
+ /* An 'off' mask of -1 clears every translatable flag before 'flags' are
+ * turned on, so the result is exactly 'flags'. */
+ return do_update_flags(netdev, -1, flags, permanent);
+}
+
+/* Turns on the specified 'flags' on 'netdev'.
+ * If 'permanent' is true, the changes will persist; otherwise, they
+ * will be reverted when 'netdev' is closed or the program exits.
+ * Returns 0 if successful, otherwise a positive errno value. */
+int
+netdev_turn_flags_on(struct netdev *netdev, enum netdev_flags flags,
+ bool permanent)
+{
+ /* Nothing is turned off (0); 'flags' are turned on. */
+ return do_update_flags(netdev, 0, flags, permanent);
+}
+
+/* Turns off the specified 'flags' on 'netdev'.
+ * If 'permanent' is true, the changes will persist; otherwise, they
+ * will be reverted when 'netdev' is closed or the program exits.
+ * Returns 0 if successful, otherwise a positive errno value. */
+int
+netdev_turn_flags_off(struct netdev *netdev, enum netdev_flags flags,
+ bool permanent)
+{
+ /* 'flags' are turned off; nothing is turned on (0). */
+ return do_update_flags(netdev, flags, 0, permanent);
+}
+
+/* Looks up the ARP table entry for 'ip' on 'netdev'.  If one exists and can be
+ * successfully retrieved, stores the corresponding MAC address in 'mac' and
+ * returns 0.  Otherwise, returns a positive errno value; in particular, ENXIO
+ * indicates that there is no ARP table entry for 'ip' on 'netdev'.  ENXIO is
+ * not logged; other failures are.
+ *
+ * 'ip' is stored directly into a sockaddr_in, so it is presumably expected in
+ * network byte order (TODO confirm against callers). */
+int
+netdev_arp_lookup(const struct netdev *netdev,
+                  uint32_t ip, uint8_t mac[ETH_ADDR_LEN])
+{
+    struct sockaddr_in *proto_addr;
+    struct arpreq req;
+    int error;
+
+    /* Build the query: protocol address 'ip', Ethernet hardware type,
+     * restricted to this device. */
+    memset(&req, 0, sizeof req);
+    proto_addr = (struct sockaddr_in *) &req.arp_pa;
+    proto_addr->sin_family = AF_INET;
+    proto_addr->sin_addr.s_addr = ip;
+    proto_addr->sin_port = 0;
+    req.arp_ha.sa_family = ARPHRD_ETHER;
+    req.arp_flags = 0;
+    strncpy(req.arp_dev, netdev->name, sizeof req.arp_dev);
+
+    COVERAGE_INC(netdev_arp_lookup);
+    error = ioctl(af_inet_sock, SIOCGARP, &req) < 0 ? errno : 0;
+    if (error) {
+        if (error != ENXIO) {
+            VLOG_WARN_RL(&rl,
+                         "%s: could not look up ARP entry for "IP_FMT": %s",
+                         netdev->name, IP_ARGS(&ip), strerror(error));
+        }
+        return error;
+    }
+
+    memcpy(mac, req.arp_ha.sa_data, ETH_ADDR_LEN);
+    return 0;
+}
+
+/* Retrieves statistics for the network device with index 'ifindex' by sending
+ * an RTM_GETLINK request on 'rtnl_sock' and copying the IFLA_STATS attribute
+ * of the reply into '*stats'.
+ *
+ * Returns 0 if successful.  Otherwise returns a positive errno value: the
+ * error from nl_sock_transact(), or EPROTO if the reply cannot be parsed or
+ * carries no statistics.  '*stats' is modified only on success. */
+static int
+get_stats_via_netlink(int ifindex, struct netdev_stats *stats)
+{
+    struct ofpbuf request;
+    struct ofpbuf *reply;
+    struct ifinfomsg *ifi;
+    const struct rtnl_link_stats *rtnl_stats;
+    struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
+    int error;
+
+    /* Compose and send the request. */
+    ofpbuf_init(&request, 0);
+    nl_msg_put_nlmsghdr(&request, rtnl_sock, sizeof *ifi,
+                        RTM_GETLINK, NLM_F_REQUEST);
+    ifi = ofpbuf_put_zeros(&request, sizeof *ifi);
+    ifi->ifi_family = PF_UNSPEC;
+    ifi->ifi_index = ifindex;
+    error = nl_sock_transact(rtnl_sock, &request, &reply);
+    ofpbuf_uninit(&request);
+    if (error) {
+        return error;
+    }
+
+    /* From here on we own 'reply' and must delete it on every path. */
+    if (!nl_policy_parse(reply, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
+                         rtnlgrp_link_policy,
+                         attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
+        ofpbuf_delete(reply);
+        return EPROTO;
+    }
+
+    /* IFLA_STATS is optional in the policy, so it may be absent. */
+    if (!attrs[IFLA_STATS]) {
+        VLOG_WARN_RL(&rl, "RTM_GETLINK reply lacks stats");
+        ofpbuf_delete(reply);
+        return EPROTO;
+    }
+
+    rtnl_stats = nl_attr_get(attrs[IFLA_STATS]);
+    stats->rx_packets = rtnl_stats->rx_packets;
+    stats->tx_packets = rtnl_stats->tx_packets;
+    stats->rx_bytes = rtnl_stats->rx_bytes;
+    stats->tx_bytes = rtnl_stats->tx_bytes;
+    stats->rx_errors = rtnl_stats->rx_errors;
+    stats->tx_errors = rtnl_stats->tx_errors;
+    stats->rx_dropped = rtnl_stats->rx_dropped;
+    stats->tx_dropped = rtnl_stats->tx_dropped;
+    stats->multicast = rtnl_stats->multicast;
+    stats->collisions = rtnl_stats->collisions;
+    stats->rx_length_errors = rtnl_stats->rx_length_errors;
+    stats->rx_over_errors = rtnl_stats->rx_over_errors;
+    stats->rx_crc_errors = rtnl_stats->rx_crc_errors;
+    stats->rx_frame_errors = rtnl_stats->rx_frame_errors;
+    stats->rx_fifo_errors = rtnl_stats->rx_fifo_errors;
+    stats->rx_missed_errors = rtnl_stats->rx_missed_errors;
+    stats->tx_aborted_errors = rtnl_stats->tx_aborted_errors;
+    stats->tx_carrier_errors = rtnl_stats->tx_carrier_errors;
+    stats->tx_fifo_errors = rtnl_stats->tx_fifo_errors;
+    stats->tx_heartbeat_errors = rtnl_stats->tx_heartbeat_errors;
+    stats->tx_window_errors = rtnl_stats->tx_window_errors;
+
+    /* 'rtnl_stats' presumably points into 'reply', so release the reply only
+     * now that everything has been copied out of it. */
+    ofpbuf_delete(reply);
+
+    return 0;
+}
+
+/* Reads the statistics for the device named 'netdev_name' from /proc/net/dev
+ * into '*stats'.  Counters that /proc/net/dev does not report are set to
+ * UINT64_MAX.
+ *
+ * Returns 0 if successful, the errno from fopen() if the file cannot be
+ * opened, or ENODEV if no line for 'netdev_name' is found.  Every line that
+ * parses is scanned directly into '*stats', so on ENODEV '*stats' may hold
+ * another device's counters. */
+static int
+get_stats_via_proc(const char *netdev_name, struct netdev_stats *stats)
+{
+    static const char fn[] = "/proc/net/dev";
+    char line[1024];
+    FILE *stream;
+    int ln;
+
+    stream = fopen(fn, "r");
+    if (!stream) {
+        /* Capture errno before logging, which may overwrite it. */
+        int error = errno;
+        VLOG_WARN_RL(&rl, "%s: open failed: %s", fn, strerror(error));
+        return error;
+    }
+
+    ln = 0;
+    while (fgets(line, sizeof line, stream)) {
+        /* The first two lines are column headers. */
+        if (++ln >= 3) {
+            char devname[16];
+#define X64 "%"SCNu64
+            if (sscanf(line,
+                       " %15[^:]:"
+                       X64 X64 X64 X64 X64 X64 X64 "%*u"
+                       X64 X64 X64 X64 X64 X64 X64 "%*u",
+                       devname,
+                       &stats->rx_bytes,
+                       &stats->rx_packets,
+                       &stats->rx_errors,
+                       &stats->rx_dropped,
+                       &stats->rx_fifo_errors,
+                       &stats->rx_frame_errors,
+                       &stats->multicast,
+                       &stats->tx_bytes,
+                       &stats->tx_packets,
+                       &stats->tx_errors,
+                       &stats->tx_dropped,
+                       &stats->tx_fifo_errors,
+                       &stats->collisions,
+                       &stats->tx_carrier_errors) != 15) {
+                VLOG_WARN_RL(&rl, "%s:%d: parse error", fn, ln);
+            } else if (!strcmp(devname, netdev_name)) {
+                /* Not available from /proc/net/dev. */
+                stats->rx_length_errors = UINT64_MAX;
+                stats->rx_over_errors = UINT64_MAX;
+                stats->rx_crc_errors = UINT64_MAX;
+                stats->rx_missed_errors = UINT64_MAX;
+                stats->tx_aborted_errors = UINT64_MAX;
+                stats->tx_heartbeat_errors = UINT64_MAX;
+                stats->tx_window_errors = UINT64_MAX;
+                fclose(stream);
+                return 0;
+            }
+        }
+    }
+    VLOG_WARN_RL(&rl, "%s: no stats for %s", fn, netdev_name);
+    fclose(stream);
+    return ENODEV;
+}
+
+/* Reads /sys/class/net/<name>/carrier for 'netdev' and stores true in
+ * '*carrier' if its first character is '1', false if it is '0'.
+ *
+ * Returns 0 if successful, otherwise a positive errno value, in which case
+ * '*carrier' is false.  EINVAL from read() is returned without logging since
+ * it is the normal result when the device is not up. */
+int
+netdev_get_carrier(const struct netdev *netdev, bool *carrier)
+{
+    char buf[8];
+    char *path;
+    int error;
+    int fd;
+
+    *carrier = false;
+
+    path = xasprintf("/sys/class/net/%s/carrier", netdev->name);
+    fd = open(path, O_RDONLY);
+    if (fd < 0) {
+        error = errno;
+        VLOG_WARN_RL(&rl, "%s: open failed: %s", path, strerror(error));
+    } else {
+        int n_read = read(fd, buf, sizeof buf);
+
+        if (n_read < 0) {
+            error = errno;
+            /* EINVAL is the normal return value when we try to check carrier
+             * if the network device is not up, so stay quiet about it. */
+            if (error != EINVAL) {
+                VLOG_WARN_RL(&rl, "%s: read failed: %s",
+                             path, strerror(error));
+            }
+        } else if (n_read == 0) {
+            error = EPROTO;
+            VLOG_WARN_RL(&rl, "%s: unexpected end of file", path);
+        } else if (buf[0] != '0' && buf[0] != '1') {
+            error = EPROTO;
+            VLOG_WARN_RL(&rl, "%s: value is %c (expected 0 or 1)",
+                         path, buf[0]);
+        } else {
+            *carrier = buf[0] != '0';
+            error = 0;
+        }
+        close(fd);
+    }
+
+    free(path);
+    return error;
+}
+
+/* Fills '*stats' with the counters for 'netdev', using rtnetlink when
+ * 'use_netlink_stats' is set and /proc/net/dev otherwise.
+ *
+ * Returns 0 if successful, otherwise a positive errno value, in which case
+ * every byte of '*stats' is set to 0xff. */
+int
+netdev_get_stats(const struct netdev *netdev, struct netdev_stats *stats)
+{
+    int error;
+
+    COVERAGE_INC(netdev_get_stats);
+    if (!use_netlink_stats) {
+        error = get_stats_via_proc(netdev->name, stats);
+    } else {
+        int ifindex;
+
+        error = get_ifindex(netdev, &ifindex);
+        if (error == 0) {
+            error = get_stats_via_netlink(ifindex, stats);
+        }
+    }
+
+    if (error != 0) {
+        memset(stats, 0xff, sizeof *stats);
+    }
+    return error;
+}
+
+#define POLICE_ADD_CMD "/sbin/tc qdisc add dev %s handle ffff: ingress"
+/* The rate and burst arguments are uint32_t, so they are formatted with
+ * PRIu32 rather than %d. */
+#define POLICE_CONFIG_CMD "/sbin/tc filter add dev %s parent ffff: protocol ip prio 50 u32 match ip src 0.0.0.0/0 police rate %"PRIu32"kbit burst %"PRIu32"k mtu 65535 drop flowid :1"
+/* We redirect stderr to /dev/null because we often want to remove all
+ * traffic control configuration on a port so it's in a known state.  If
+ * this is done when there is no such configuration, tc complains, so we just
+ * always ignore it.
+ */
+#define POLICE_DEL_CMD "/sbin/tc qdisc del dev %s handle ffff: ingress 2>/dev/null"
+
+/* Attempts to set input rate limiting (policing) policy on the device named
+ * 'netdev_name' by running /sbin/tc through system().
+ *
+ * If 'kbits_rate' is nonzero, any existing ingress qdisc is deleted and a new
+ * one is added and configured with 'kbits_rate' and 'kbits_burst' (a zero
+ * 'kbits_burst' defaults to 10).  If 'kbits_rate' is zero, the ingress qdisc
+ * is only deleted.
+ *
+ * Returns 0 if successful, -1 if adding or configuring the qdisc fails.  A
+ * failure to delete the old qdisc is logged but not treated as an error.
+ *
+ * NOTE(review): 'netdev_name' is interpolated into a shell command line
+ * without quoting, so it must come from a trusted source. */
+int
+netdev_nodev_set_policing(const char *netdev_name, uint32_t kbits_rate,
+                          uint32_t kbits_burst)
+{
+    char command[1024];
+
+    init_netdev();
+
+    COVERAGE_INC(netdev_set_policing);
+    if (kbits_rate) {
+        if (!kbits_burst) {
+            /* Default to 10 kilobits if not specified. */
+            kbits_burst = 10;
+        }
+
+        /* xxx This should be more careful about only adding if it
+         * xxx actually exists, as opposed to always deleting it. */
+        snprintf(command, sizeof(command), POLICE_DEL_CMD, netdev_name);
+        if (system(command) == -1) {
+            VLOG_WARN_RL(&rl, "%s: problem removing policing", netdev_name);
+        }
+
+        snprintf(command, sizeof(command), POLICE_ADD_CMD, netdev_name);
+        if (system(command) != 0) {
+            VLOG_WARN_RL(&rl, "%s: problem adding policing", netdev_name);
+            return -1;
+        }
+
+        snprintf(command, sizeof(command), POLICE_CONFIG_CMD, netdev_name,
+                 kbits_rate, kbits_burst);
+        if (system(command) != 0) {
+            VLOG_WARN_RL(&rl, "%s: problem configuring policing",
+                         netdev_name);
+            return -1;
+        }
+    } else {
+        snprintf(command, sizeof(command), POLICE_DEL_CMD, netdev_name);
+        if (system(command) == -1) {
+            VLOG_WARN_RL(&rl, "%s: problem removing policing", netdev_name);
+        }
+    }
+
+    return 0;
+}
+
+/* Applies input policing to 'netdev' by passing its name, 'kbits_rate' and
+ * 'kbits_burst' to netdev_nodev_set_policing(), whose result is returned
+ * unchanged. */
+int
+netdev_set_policing(struct netdev *netdev, uint32_t kbits_rate,
+ uint32_t kbits_burst)
+{
+ return netdev_nodev_set_policing(netdev->name, kbits_rate, kbits_burst);
+}
+
+/* Initializes 'svec' and fills it with the name of every network device
+ * reported by if_nameindex().  If that call fails, a warning is logged and
+ * 'svec' is left initialized but empty. */
+void
+netdev_enumerate(struct svec *svec)
+{
+    struct if_nameindex *list;
+    struct if_nameindex *entry;
+
+    svec_init(svec);
+    list = if_nameindex();
+    if (!list) {
+        VLOG_WARN("could not obtain list of network device names: %s",
+                  strerror(errno));
+        return;
+    }
+
+    /* The array is terminated by an entry whose if_name is null. */
+    for (entry = list; entry->if_name != NULL; entry++) {
+        svec_add(svec, entry->if_name);
+    }
+    if_freenameindex(list);
+}
+
+/* Looks up the flags of the network device named 'netdev_name' and stores
+ * the NETDEV_UP and NETDEV_PROMISC bits that apply into '*flagsp'.  Returns 0
+ * if successful, otherwise a positive errno value.  On error, stores 0 into
+ * '*flagsp'.
+ *
+ * If only device flags are needed, this is more efficient than calling
+ * netdev_open(), netdev_get_flags(), netdev_close(). */
+int
+netdev_nodev_get_flags(const char *netdev_name, enum netdev_flags *flagsp)
+{
+    int ifflags;
+    int error;
+
+    init_netdev();
+
+    *flagsp = 0;
+    error = get_flags(netdev_name, &ifflags);
+    if (!error) {
+        /* Translate kernel IFF_* bits into our NETDEV_* bits. */
+        if (ifflags & IFF_UP) {
+            *flagsp |= NETDEV_UP;
+        }
+        if (ifflags & IFF_PROMISC) {
+            *flagsp |= NETDEV_PROMISC;
+        }
+    }
+    return error;
+}
+
+/* Copies the hardware address of the device named 'netdev_name' into 'mac'.
+ * Makes sure the module is initialized, then defers to get_etheraddr() with
+ * no hardware-address-family output and returns its result. */
+int
+netdev_nodev_get_etheraddr(const char *netdev_name, uint8_t mac[6])
+{
+ init_netdev();
+
+ return get_etheraddr(netdev_name, mac, NULL);
+}
+
+/* If 'netdev_name' is the name of a VLAN network device (e.g. one created with
+ * vconfig(8)), sets '*vlan_vid' to the VLAN VID associated with that device
+ * and returns 0.  Otherwise returns an errno value (specifically ENOENT if
+ * 'netdev_name' is the name of a network device that is not a VLAN device) and
+ * sets '*vlan_vid' to -1.
+ *
+ * The VID is parsed from the first line of /proc/net/vlan/<netdev_name>;
+ * EPROTO is returned if that line is missing or does not parse. */
+int
+netdev_get_vlan_vid(const char *netdev_name, int *vlan_vid)
+{
+    struct ds line = DS_EMPTY_INITIALIZER;
+    FILE *stream = NULL;
+    int error;
+    char *fn;
+
+    COVERAGE_INC(netdev_get_vlan_vid);
+    fn = xasprintf("/proc/net/vlan/%s", netdev_name);
+    stream = fopen(fn, "r");
+    if (!stream) {
+        error = errno;
+        goto done;
+    }
+
+    if (ds_get_line(&line, stream)) {
+        if (ferror(stream)) {
+            error = errno;
+            VLOG_ERR_RL(&rl, "error reading \"%s\": %s", fn, strerror(error));
+        } else {
+            error = EPROTO;
+            VLOG_ERR_RL(&rl, "unexpected end of file reading \"%s\"", fn);
+        }
+        goto done;
+    }
+
+    /* sscanf() returns EOF (a nonzero value) when the input runs out before
+     * the first conversion, so require exactly one converted field instead of
+     * merely a nonzero result. */
+    if (sscanf(ds_cstr(&line), "%*s VID: %d", vlan_vid) != 1) {
+        error = EPROTO;
+        VLOG_ERR_RL(&rl, "parse error reading \"%s\" line 1: \"%s\"",
+                    fn, ds_cstr(&line));
+        goto done;
+    }
+
+    error = 0;
+
+done:
+    free(fn);
+    if (stream) {
+        fclose(stream);
+    }
+    ds_destroy(&line);
+    if (error) {
+        *vlan_vid = -1;
+    }
+    return error;
+}
+
+static void restore_all_flags(void *aux);
+
+/* One-time initialization for this module; calls after the first do nothing.
+ *
+ * Sets up a signal hook to restore network device flags on program
+ * termination, opens the AF_INET datagram socket and the NETLINK_ROUTE
+ * socket (aborting through ovs_fatal() if either cannot be created), and
+ * chooses the netdev_get_stats() implementation by probing "lo". */
+static void
+init_netdev(void)
+{
+ static bool inited;
+ if (!inited) {
+ int ifindex;
+ int error;
+
+ /* Set the guard first so the body below runs at most once. */
+ inited = true;
+
+ fatal_signal_add_hook(restore_all_flags, NULL, true);
+
+ af_inet_sock = socket(AF_INET, SOCK_DGRAM, 0);
+ if (af_inet_sock < 0) {
+ ovs_fatal(errno, "socket(AF_INET)");
+ }
+
+ error = nl_sock_create(NETLINK_ROUTE, 0, 0, 0, &rtnl_sock);
+ if (error) {
+ ovs_fatal(error, "socket(AF_NETLINK, NETLINK_ROUTE)");
+ }
+
+ /* Decide on the netdev_get_stats() implementation to use. Netlink is
+ * preferable, so if that works, we'll use it. */
+ ifindex = do_get_ifindex("lo");
+ if (ifindex < 0) {
+ VLOG_WARN("failed to get ifindex for lo, "
+ "obtaining netdev stats from proc");
+ use_netlink_stats = false;
+ } else {
+ /* Trial query; the statistics themselves are discarded. */
+ struct netdev_stats stats;
+ error = get_stats_via_netlink(ifindex, &stats);
+ if (!error) {
+ VLOG_DBG("obtaining netdev stats via rtnetlink");
+ use_netlink_stats = true;
+ } else {
+ VLOG_INFO("RTM_GETLINK failed (%s), obtaining netdev stats "
+ "via proc (you are probably running a pre-2.6.19 "
+ "kernel)", strerror(error));
+ use_netlink_stats = false;
+ }
+ }
+ }
+}
+
+/* Puts back the IFF_PROMISC and IFF_UP bits of 'netdev' that we changed
+ * (per 'changed_flags') to the values recorded in 'save_flags', issuing
+ * SIOCSIFFLAGS only when the current flags actually differ.  Returns 0 if
+ * successful, otherwise a positive errno value.
+ *
+ * To avoid reentry, the caller must ensure that fatal signals are blocked. */
+static int
+restore_flags(struct netdev *netdev)
+{
+    struct ifreq ifr;
+    int mask;
+
+    /* Get current flags. */
+    strncpy(ifr.ifr_name, netdev->name, sizeof ifr.ifr_name);
+    COVERAGE_INC(netdev_get_flags);
+    if (ioctl(netdev->netdev_fd, SIOCGIFFLAGS, &ifr) < 0) {
+        return errno;
+    }
+
+    /* Nothing to do unless a flag we changed differs from its saved value. */
+    mask = netdev->changed_flags & (IFF_PROMISC | IFF_UP);
+    if (!((ifr.ifr_flags ^ netdev->save_flags) & mask)) {
+        return 0;
+    }
+
+    ifr.ifr_flags = (ifr.ifr_flags & ~mask) | (netdev->save_flags & mask);
+    COVERAGE_INC(netdev_set_flags);
+    return ioctl(netdev->netdev_fd, SIOCSIFFLAGS, &ifr) < 0 ? errno : 0;
+}
+
+/* Restores all the flags on all network devices that we modified, by calling
+ * restore_flags() on every entry of 'netdev_list'. Called from a signal
+ * handler, so it does not attempt to report error conditions: the return
+ * value of restore_flags() is ignored. 'aux' is unused. */
+static void
+restore_all_flags(void *aux UNUSED)
+{
+ struct netdev *netdev;
+ LIST_FOR_EACH (netdev, struct netdev, node, &netdev_list) {
+ restore_flags(netdev);
+ }
+}
+
+/* Fetches the interface flags of the device named 'netdev_name' with
+ * SIOCGIFFLAGS on 'af_inet_sock' and stores them in '*flags'.  Returns 0 if
+ * successful, otherwise a positive errno value ('*flags' is then left
+ * unmodified). */
+static int
+get_flags(const char *netdev_name, int *flags)
+{
+    struct ifreq ifr;
+    strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
+    COVERAGE_INC(netdev_get_flags);
+    if (ioctl(af_inet_sock, SIOCGIFFLAGS, &ifr) < 0) {
+        /* Capture errno before logging, which may overwrite it. */
+        int error = errno;
+        VLOG_ERR("ioctl(SIOCGIFFLAGS) on %s device failed: %s",
+                 netdev_name, strerror(error));
+        return error;
+    }
+    *flags = ifr.ifr_flags;
+    return 0;
+}
+
+/* Sets the interface flags of the device named 'netdev_name' to 'flags' with
+ * SIOCSIFFLAGS on 'af_inet_sock'.  Returns 0 if successful, otherwise a
+ * positive errno value. */
+static int
+set_flags(const char *netdev_name, int flags)
+{
+    struct ifreq ifr;
+    strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
+    ifr.ifr_flags = flags;
+    COVERAGE_INC(netdev_set_flags);
+    if (ioctl(af_inet_sock, SIOCSIFFLAGS, &ifr) < 0) {
+        /* Capture errno before logging, which may overwrite it. */
+        int error = errno;
+        VLOG_ERR("ioctl(SIOCSIFFLAGS) on %s device failed: %s",
+                 netdev_name, strerror(error));
+        return error;
+    }
+    return 0;
+}
+
+/* Looks up the interface index of the device named 'netdev_name' with
+ * SIOCGIFINDEX on 'af_inet_sock'.  Returns the index if successful,
+ * otherwise a negative errno value. */
+static int
+do_get_ifindex(const char *netdev_name)
+{
+    struct ifreq ifr;
+
+    strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
+    COVERAGE_INC(netdev_get_ifindex);
+    if (ioctl(af_inet_sock, SIOCGIFINDEX, &ifr) < 0) {
+        /* Capture errno before logging: if logging changed it, a failure
+         * could otherwise be returned as a non-negative "index". */
+        int error = errno;
+        VLOG_WARN_RL(&rl, "ioctl(SIOCGIFINDEX) on %s device failed: %s",
+                     netdev_name, strerror(error));
+        return -error;
+    }
+    return ifr.ifr_ifindex;
+}
+
+/* Stores the interface index of 'netdev' in '*ifindexp' and returns 0.  The
+ * index is looked up with do_get_ifindex() the first time (while the cached
+ * 'ifindex' member is negative) and cached in 'netdev' afterward.  On lookup
+ * failure, stores 0 in '*ifindexp' and returns a positive errno value. */
+static int
+get_ifindex(const struct netdev *netdev, int *ifindexp)
+{
+    if (netdev->ifindex < 0) {
+        int looked_up = do_get_ifindex(netdev->name);
+
+        if (looked_up < 0) {
+            *ifindexp = 0;
+            return -looked_up;
+        }
+        /* Cast away const: the cache is filled in lazily. */
+        ((struct netdev *) netdev)->ifindex = looked_up;
+    }
+    *ifindexp = netdev->ifindex;
+    return 0;
+}
+
+/* Reads the hardware address of the device named 'netdev_name' with
+ * SIOCGIFHWADDR on 'af_inet_sock' and copies its first ETH_ADDR_LEN bytes
+ * into 'ea'.  If 'hwaddr_familyp' is nonnull, also stores the address family
+ * there and warns when it is neither AF_UNSPEC nor ARPHRD_ETHER.
+ *
+ * Returns 0 if successful, otherwise a positive errno value (the outputs are
+ * then left unmodified). */
+static int
+get_etheraddr(const char *netdev_name, uint8_t ea[ETH_ADDR_LEN],
+              int *hwaddr_familyp)
+{
+    struct ifreq ifr;
+
+    memset(&ifr, 0, sizeof ifr);
+    strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
+    COVERAGE_INC(netdev_get_hwaddr);
+    if (ioctl(af_inet_sock, SIOCGIFHWADDR, &ifr) < 0) {
+        /* Capture errno before logging, which may overwrite it. */
+        int error = errno;
+        VLOG_ERR("ioctl(SIOCGIFHWADDR) on %s device failed: %s",
+                 netdev_name, strerror(error));
+        return error;
+    }
+    if (hwaddr_familyp) {
+        int hwaddr_family = ifr.ifr_hwaddr.sa_family;
+        *hwaddr_familyp = hwaddr_family;
+        if (hwaddr_family != AF_UNSPEC && hwaddr_family != ARPHRD_ETHER) {
+            VLOG_WARN("%s device has unknown hardware address family %d",
+                      netdev_name, hwaddr_family);
+        }
+    }
+    memcpy(ea, ifr.ifr_hwaddr.sa_data, ETH_ADDR_LEN);
+    return 0;
+}
+
+/* Sets the hardware address of the device named 'netdev_name' to the
+ * ETH_ADDR_LEN bytes in 'mac', with address family 'hwaddr_family', using
+ * SIOCSIFHWADDR on 'af_inet_sock'.  Returns 0 if successful, otherwise a
+ * positive errno value. */
+static int
+set_etheraddr(const char *netdev_name, int hwaddr_family,
+              const uint8_t mac[ETH_ADDR_LEN])
+{
+    struct ifreq ifr;
+
+    memset(&ifr, 0, sizeof ifr);
+    strncpy(ifr.ifr_name, netdev_name, sizeof ifr.ifr_name);
+    ifr.ifr_hwaddr.sa_family = hwaddr_family;
+    memcpy(ifr.ifr_hwaddr.sa_data, mac, ETH_ADDR_LEN);
+    COVERAGE_INC(netdev_set_hwaddr);
+    if (ioctl(af_inet_sock, SIOCSIFHWADDR, &ifr) < 0) {
+        /* Capture errno before logging, which may overwrite it. */
+        int error = errno;
+        VLOG_ERR("ioctl(SIOCSIFHWADDR) on %s device failed: %s",
+                 netdev_name, strerror(error));
+        return error;
+    }
+    return 0;
+}
diff --git a/lib/netdev.h b/lib/netdev.h
new file mode 100644
index 000000000..63462c561
--- /dev/null
+++ b/lib/netdev.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef NETDEV_H
+#define NETDEV_H 1
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Generic interface to network devices.
+ *
+ * Currently, there is a single implementation of this interface that supports
+ * Linux. The interface should be generic enough to be implementable on other
+ * operating systems as well. */
+
+struct ofpbuf;
+struct in_addr;
+struct in6_addr;
+struct svec;
+
+/* Device flag bits, usable as a bit mask (the values are distinct bits). */
+enum netdev_flags {
+ NETDEV_UP = 0x0001, /* Device enabled? */
+ NETDEV_PROMISC = 0x0002 /* Promiscuous mode? */
+};
+
+/* Special negative values (-128, -127, -126) that select a class of frames
+ * rather than one Ethernet type.
+ * NOTE(review): presumably passed as the 'ethertype' argument of
+ * netdev_open() -- confirm against its implementation. */
+enum netdev_pseudo_ethertype {
+ NETDEV_ETH_TYPE_NONE = -128, /* Receive no frames. */
+ NETDEV_ETH_TYPE_ANY, /* Receive all frames. */
+ NETDEV_ETH_TYPE_802_2 /* Receive all IEEE 802.2 frames. */
+};
+
+/* Network device statistics, as filled in by netdev_get_stats().
+ *
+ * All counters are 64 bits wide. A counter that could not be obtained is
+ * reported as UINT64_MAX (all-one bits). */
+struct netdev_stats {
+ uint64_t rx_packets; /* Total packets received. */
+ uint64_t tx_packets; /* Total packets transmitted. */
+ uint64_t rx_bytes; /* Total bytes received. */
+ uint64_t tx_bytes; /* Total bytes transmitted. */
+ uint64_t rx_errors; /* Bad packets received. */
+ uint64_t tx_errors; /* Packet transmit problems. */
+ uint64_t rx_dropped; /* No buffer space. */
+ uint64_t tx_dropped; /* No buffer space. */
+ uint64_t multicast; /* Multicast packets received. */
+ uint64_t collisions;
+
+ /* Detailed receive errors. */
+ uint64_t rx_length_errors;
+ uint64_t rx_over_errors; /* Receiver ring buff overflow. */
+ uint64_t rx_crc_errors; /* Recved pkt with crc error. */
+ uint64_t rx_frame_errors; /* Recv'd frame alignment error. */
+ uint64_t rx_fifo_errors; /* Recv'r fifo overrun . */
+ uint64_t rx_missed_errors; /* Receiver missed packet. */
+
+ /* Detailed transmit errors. */
+ uint64_t tx_aborted_errors;
+ uint64_t tx_carrier_errors;
+ uint64_t tx_fifo_errors;
+ uint64_t tx_heartbeat_errors;
+ uint64_t tx_window_errors;
+};
+
+struct netdev;
+
+int netdev_open(const char *name, int ethertype, struct netdev **);
+int netdev_open_tap(const char *name, struct netdev **);
+void netdev_close(struct netdev *);
+
+int netdev_recv(struct netdev *, struct ofpbuf *);
+void netdev_recv_wait(struct netdev *);
+int netdev_drain(struct netdev *);
+int netdev_send(struct netdev *, const struct ofpbuf *);
+void netdev_send_wait(struct netdev *);
+int netdev_set_etheraddr(struct netdev *, const uint8_t mac[6]);
+const uint8_t *netdev_get_etheraddr(const struct netdev *);
+const char *netdev_get_name(const struct netdev *);
+int netdev_get_mtu(const struct netdev *);
+int netdev_get_features(struct netdev *,
+ uint32_t *current, uint32_t *advertised,
+ uint32_t *supported, uint32_t *peer);
+int netdev_set_advertisements(struct netdev *, uint32_t advertise);
+bool netdev_get_in4(const struct netdev *, struct in_addr *);
+int netdev_set_in4(struct netdev *, struct in_addr addr, struct in_addr mask);
+int netdev_add_router(struct in_addr router);
+bool netdev_get_in6(const struct netdev *, struct in6_addr *);
+int netdev_get_flags(const struct netdev *, enum netdev_flags *);
+int netdev_set_flags(struct netdev *, enum netdev_flags, bool permanent);
+int netdev_turn_flags_on(struct netdev *, enum netdev_flags, bool permanent);
+int netdev_turn_flags_off(struct netdev *, enum netdev_flags, bool permanent);
+int netdev_arp_lookup(const struct netdev *, uint32_t ip, uint8_t mac[6]);
+int netdev_get_carrier(const struct netdev *, bool *carrier);
+int netdev_get_stats(const struct netdev *, struct netdev_stats *);
+int netdev_set_policing(struct netdev *, uint32_t kbits_rate,
+ uint32_t kbits_burst);
+
+void netdev_enumerate(struct svec *);
+int netdev_nodev_get_flags(const char *netdev_name, enum netdev_flags *);
+int netdev_nodev_set_etheraddr(const char *name, const uint8_t mac[6]);
+int netdev_nodev_get_etheraddr(const char *netdev_name, uint8_t mac[6]);
+int netdev_nodev_set_policing(const char *netdev_name, uint32_t kbits_rate,
+ uint32_t kbits_burst);
+
+int netdev_get_vlan_vid(const char *netdev_name, int *vlan_vid);
+
+#endif /* netdev.h */
diff --git a/lib/netlink-protocol.h b/lib/netlink-protocol.h
new file mode 100644
index 000000000..92694e8b3
--- /dev/null
+++ b/lib/netlink-protocol.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef NETLINK_PROTOCOL_H
+#define NETLINK_PROTOCOL_H 1
+
+/* Netlink protocol definitions.
+ *
+ * These definitions are equivalent to those in the Linux 2.6 kernel headers,
+ * without requiring those headers to be available. */
+
+#include <stdint.h>
+#include <sys/socket.h>
+#include "util.h"
+
+#define NETLINK_GENERIC 16
+
+/* Netlink socket address. The build assertion below pins its size at 12
+ * bytes. */
+struct sockaddr_nl {
+ sa_family_t nl_family; /* Address family (AF_NETLINK). */
+ unsigned short int nl_pad; /* Padding. */
+ uint32_t nl_pid; /* Netlink port ID; 0 addresses the kernel. */
+ uint32_t nl_groups; /* Bit mask of multicast groups. */
+};
+BUILD_ASSERT_DECL(sizeof(struct sockaddr_nl) == 12);
+
+/* nlmsg_flags bits. */
+#define NLM_F_REQUEST 0x001
+#define NLM_F_MULTI 0x002
+#define NLM_F_ACK 0x004
+#define NLM_F_ECHO 0x008
+
+#define NLM_F_ROOT 0x100
+#define NLM_F_MATCH 0x200
+#define NLM_F_ATOMIC 0x400
+#define NLM_F_DUMP (NLM_F_ROOT | NLM_F_MATCH)
+
+/* nlmsg_type values. */
+#define NLMSG_NOOP 1
+#define NLMSG_ERROR 2
+#define NLMSG_DONE 3
+#define NLMSG_OVERRUN 4
+
+#define NLMSG_MIN_TYPE 0x10
+
+/* Header at the start of every Netlink message; 16 bytes, as asserted
+ * below. */
+struct nlmsghdr {
+ uint32_t nlmsg_len; /* Message length in bytes, header included. */
+ uint16_t nlmsg_type; /* One of the nlmsg_type values above, or >= NLMSG_MIN_TYPE. */
+ uint16_t nlmsg_flags; /* NLM_F_* bits. */
+ uint32_t nlmsg_seq; /* Sequence number. */
+ uint32_t nlmsg_pid; /* NOTE(review): presumably the sender's port ID -- confirm. */
+};
+BUILD_ASSERT_DECL(sizeof(struct nlmsghdr) == 16);
+
+/* Netlink messages are padded to a multiple of NLMSG_ALIGNTO (4) bytes.
+ * NLMSG_HDRLEN is the header size rounded up to that alignment. */
+#define NLMSG_ALIGNTO 4
+#define NLMSG_ALIGN(SIZE) ROUND_UP(SIZE, NLMSG_ALIGNTO)
+#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr)))
+
+/* An error code followed by a Netlink message header; 20 bytes, as asserted
+ * below.
+ * NOTE(review): presumably the payload of an NLMSG_ERROR message, with 'msg'
+ * the header of the request that failed -- confirm against the kernel
+ * headers. */
+struct nlmsgerr
+{
+ int error;
+ struct nlmsghdr msg;
+};
+BUILD_ASSERT_DECL(sizeof(struct nlmsgerr) == 20);
+
+#define NETLINK_ADD_MEMBERSHIP 1
+#define NETLINK_DROP_MEMBERSHIP 2
+#define NETLINK_PKTINFO 3
+
+/* Generic Netlink header; 4 bytes, as asserted below.
+ * NOTE(review): presumably follows struct nlmsghdr in NETLINK_GENERIC
+ * messages -- confirm against the kernel headers. */
+struct genlmsghdr {
+ uint8_t cmd;
+ uint8_t version;
+ uint16_t reserved;
+};
+BUILD_ASSERT_DECL(sizeof(struct genlmsghdr) == 4);
+
+/* Size of struct genlmsghdr rounded up to Netlink message alignment. */
+#define GENL_HDRLEN NLMSG_ALIGN(sizeof(struct genlmsghdr))
+
+/* Header of a Netlink attribute; 4 bytes, as asserted below.
+ * NOTE(review): 'nla_len' presumably counts this header plus the payload that
+ * follows it -- confirm against the kernel headers. */
+struct nlattr {
+ uint16_t nla_len;
+ uint16_t nla_type;
+};
+BUILD_ASSERT_DECL(sizeof(struct nlattr) == 4);
+
+/* Attributes are padded to a multiple of NLA_ALIGNTO (4) bytes. NLA_HDRLEN
+ * is the attribute header size rounded up to that alignment. */
+#define NLA_ALIGNTO 4
+#define NLA_ALIGN(SIZE) ROUND_UP(SIZE, NLA_ALIGNTO)
+#define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr)))
+
+#define GENL_MIN_ID NLMSG_MIN_TYPE
+#define GENL_MAX_ID 1023
+
+#define GENL_ID_CTRL NLMSG_MIN_TYPE
+
+enum {
+ CTRL_CMD_UNSPEC,
+ CTRL_CMD_NEWFAMILY,
+ CTRL_CMD_DELFAMILY,
+ CTRL_CMD_GETFAMILY,
+ CTRL_CMD_NEWOPS,
+ CTRL_CMD_DELOPS,
+ CTRL_CMD_GETOPS,
+ __CTRL_CMD_MAX,
+};
+
+#define CTRL_CMD_MAX (__CTRL_CMD_MAX - 1)
+
+enum {
+ CTRL_ATTR_UNSPEC,
+ CTRL_ATTR_FAMILY_ID,
+ CTRL_ATTR_FAMILY_NAME,
+ CTRL_ATTR_VERSION,
+ CTRL_ATTR_HDRSIZE,
+ CTRL_ATTR_MAXATTR,
+ CTRL_ATTR_OPS,
+ __CTRL_ATTR_MAX,
+};
+
+#define CTRL_ATTR_MAX (__CTRL_ATTR_MAX - 1)
+
+enum {
+ CTRL_ATTR_OP_UNSPEC,
+ CTRL_ATTR_OP_ID,
+ CTRL_ATTR_OP_FLAGS,
+ __CTRL_ATTR_OP_MAX,
+};
+
+#define CTRL_ATTR_OP_MAX (__CTRL_ATTR_OP_MAX - 1)
+
+#endif /* netlink-protocol.h */
diff --git a/lib/netlink.c b/lib/netlink.c
new file mode 100644
index 000000000..bc1956a66
--- /dev/null
+++ b/lib/netlink.c
@@ -0,0 +1,1077 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "netlink.h"
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+#include "coverage.h"
+#include "dynamic-string.h"
+#include "netlink-protocol.h"
+#include "ofpbuf.h"
+#include "poll-loop.h"
+#include "timeval.h"
+#include "util.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_netlink
+
+/* Linux header file confusion causes this to be undefined. */
+#ifndef SOL_NETLINK
+#define SOL_NETLINK 270
+#endif
+
+/* A single (bad) Netlink message can in theory dump out many, many log
+ * messages, so the burst size is set quite high here to avoid missing useful
+ * information. Also, at high logging levels we log *all* Netlink messages. */
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 600);
+
+static void log_nlmsg(const char *function, int error,
+ const void *message, size_t size);
+
+/* Netlink sockets. */
+
+/* A Netlink socket, created by nl_sock_create() and released by
+ * nl_sock_destroy(). */
+struct nl_sock
+{
+ int fd; /* File descriptor from socket(AF_NETLINK, SOCK_RAW, ...). */
+ uint32_t pid; /* Value from alloc_pid(), bound as the local nl_pid. */
+};
+
+/* Next nlmsghdr sequence number.
+ *
+ * This implementation uses sequence numbers that are unique process-wide, to
+ * avoid a hypothetical race: send request, close socket, open new socket that
+ * reuses the old socket's PID value, send request on new socket, receive reply
+ * from kernel to old socket but with same PID and sequence number. (This race
+ * could be avoided other ways, e.g. by preventing PIDs from being quickly
+ * reused). */
+static uint32_t next_seq;
+
+static int alloc_pid(uint32_t *);
+static void free_pid(uint32_t);
+
+/* Creates a new netlink socket for the given netlink 'protocol'
+ * (NETLINK_ROUTE, NETLINK_GENERIC, ...).  Returns 0 and sets '*sockp' to the
+ * new socket if successful, otherwise returns a positive errno value and sets
+ * '*sockp' to null.
+ *
+ * If 'multicast_group' is nonzero, the new socket subscribes to the specified
+ * netlink multicast group.  (A netlink socket may listen to an arbitrary
+ * number of multicast groups, but so far we only need one at a time.)
+ *
+ * Nonzero 'so_sndbuf' or 'so_rcvbuf' override the kernel default send or
+ * receive buffer size, respectively.
+ */
+int
+nl_sock_create(int protocol, int multicast_group,
+               size_t so_sndbuf, size_t so_rcvbuf, struct nl_sock **sockp)
+{
+    struct nl_sock *sock;
+    struct sockaddr_nl local, remote;
+    int retval = 0;
+
+    if (next_seq == 0) {
+        /* Pick initial sequence number. */
+        next_seq = getpid() ^ time_now();
+    }
+
+    *sockp = NULL;
+    sock = malloc(sizeof *sock);
+    if (sock == NULL) {
+        return ENOMEM;
+    }
+
+    /* On every failure below, errno is saved in 'retval' before logging,
+     * because the logging call may overwrite errno. */
+    sock->fd = socket(AF_NETLINK, SOCK_RAW, protocol);
+    if (sock->fd < 0) {
+        retval = errno;
+        VLOG_ERR("socket(AF_NETLINK): %s", strerror(retval));
+        goto error;
+    }
+
+    retval = alloc_pid(&sock->pid);
+    if (retval) {
+        goto error;
+    }
+
+    if (so_sndbuf != 0
+        && setsockopt(sock->fd, SOL_SOCKET, SO_SNDBUF,
+                      &so_sndbuf, sizeof so_sndbuf) < 0) {
+        retval = errno;
+        VLOG_ERR("setsockopt(SO_SNDBUF,%zu): %s", so_sndbuf, strerror(retval));
+        goto error_free_pid;
+    }
+
+    if (so_rcvbuf != 0
+        && setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
+                      &so_rcvbuf, sizeof so_rcvbuf) < 0) {
+        retval = errno;
+        VLOG_ERR("setsockopt(SO_RCVBUF,%zu): %s", so_rcvbuf, strerror(retval));
+        goto error_free_pid;
+    }
+
+    /* Bind local address as our selected pid. */
+    memset(&local, 0, sizeof local);
+    local.nl_family = AF_NETLINK;
+    local.nl_pid = sock->pid;
+    if (multicast_group > 0 && multicast_group <= 32) {
+        /* This method of joining multicast groups is supported by old kernels,
+         * but it only allows 32 multicast groups per protocol. */
+        local.nl_groups |= 1ul << (multicast_group - 1);
+    }
+    if (bind(sock->fd, (struct sockaddr *) &local, sizeof local) < 0) {
+        retval = errno;
+        VLOG_ERR("bind(%"PRIu32"): %s", sock->pid, strerror(retval));
+        goto error_free_pid;
+    }
+
+    /* Bind remote address as the kernel (pid 0). */
+    memset(&remote, 0, sizeof remote);
+    remote.nl_family = AF_NETLINK;
+    remote.nl_pid = 0;
+    if (connect(sock->fd, (struct sockaddr *) &remote, sizeof remote) < 0) {
+        retval = errno;
+        VLOG_ERR("connect(0): %s", strerror(retval));
+        goto error_free_pid;
+    }
+
+    /* Older kernel headers failed to define this macro.  We want our programs
+     * to support the newer kernel features even if compiled with older
+     * headers, so define it ourselves in such a case. */
+#ifndef NETLINK_ADD_MEMBERSHIP
+#define NETLINK_ADD_MEMBERSHIP 1
+#endif
+
+    /* This method of joining multicast groups is only supported by newish
+     * kernels, but it allows for an arbitrary number of multicast groups. */
+    if (multicast_group > 32
+        && setsockopt(sock->fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP,
+                      &multicast_group, sizeof multicast_group) < 0) {
+        retval = errno;
+        VLOG_ERR("setsockopt(NETLINK_ADD_MEMBERSHIP,%d): %s",
+                 multicast_group, strerror(retval));
+        goto error_free_pid;
+    }
+
+    *sockp = sock;
+    return 0;
+
+error_free_pid:
+    free_pid(sock->pid);
+error:
+    if (retval == 0) {
+        /* Defensive fallback: never report success from the error path. */
+        retval = errno;
+        if (retval == 0) {
+            retval = EINVAL;
+        }
+    }
+    if (sock->fd >= 0) {
+        close(sock->fd);
+    }
+    free(sock);
+    return retval;
+}
+
+/* Closes netlink socket 'sock', releases its pid, and frees it.  Does
+ * nothing if 'sock' is null. */
+void
+nl_sock_destroy(struct nl_sock *sock)
+{
+    if (!sock) {
+        return;
+    }
+
+    close(sock->fd);
+    free_pid(sock->pid);
+    free(sock);
+}
+
+/* Sends the Netlink message in 'msg' to the kernel on 'sock', first setting
+ * the message's nlmsg_len to msg->size.  A send interrupted by a signal
+ * (EINTR) is retried.
+ *
+ * Returns 0 if successful, otherwise a positive errno value.  If
+ * 'wait' is true, then the send will wait until buffer space is ready;
+ * otherwise, returns EAGAIN if the 'sock' send buffer is full. */
+int
+nl_sock_send(struct nl_sock *sock, const struct ofpbuf *msg, bool wait)
+{
+    const int send_flags = wait ? 0 : MSG_DONTWAIT;
+    int error;
+
+    nl_msg_nlmsghdr(msg)->nlmsg_len = msg->size;
+    for (;;) {
+        if (send(sock->fd, msg->data, msg->size, send_flags) >= 0) {
+            error = 0;
+            break;
+        }
+        error = errno;
+        if (error != EINTR) {
+            break;
+        }
+    }
+
+    log_nlmsg(__func__, error, msg->data, msg->size);
+    if (error == 0) {
+        COVERAGE_INC(netlink_sent);
+    }
+    return error;
+}
+
+/* Sends the 'n_iov' chunks of data in 'iov' to the kernel on 'sock' as one
+ * Netlink message, using sendmsg() and retrying on EINTR.  (The message must
+ * be fully formed and not require finalization of its nlmsg_len field.)
+ *
+ * Returns 0 if successful, otherwise a positive errno value.  If 'wait' is
+ * true, then the send will wait until buffer space is ready; otherwise,
+ * returns EAGAIN if the 'sock' send buffer is full.  An EAGAIN result is not
+ * logged. */
+int
+nl_sock_sendv(struct nl_sock *sock, const struct iovec iov[], size_t n_iov,
+              bool wait)
+{
+    const int send_flags = wait ? 0 : MSG_DONTWAIT;
+    struct msghdr hdr;
+    int error;
+
+    COVERAGE_INC(netlink_send);
+    memset(&hdr, 0, sizeof hdr);
+    hdr.msg_iov = (struct iovec *) iov;
+    hdr.msg_iovlen = n_iov;
+    for (;;) {
+        if (sendmsg(sock->fd, &hdr, send_flags) >= 0) {
+            error = 0;
+            break;
+        }
+        error = errno;
+        if (error != EINTR) {
+            break;
+        }
+    }
+
+    if (error == EAGAIN) {
+        return error;
+    }
+    log_nlmsg(__func__, error, iov[0].iov_base, iov[0].iov_len);
+    if (error == 0) {
+        COVERAGE_INC(netlink_sent);
+    }
+    return error;
+}
+
+/* Tries to receive a netlink message from the kernel on 'sock'.  If
+ * successful, stores the received message into '*bufp' and returns 0.  The
+ * caller is responsible for destroying the message with ofpbuf_delete().  On
+ * failure, returns a positive errno value and stores a null pointer into
+ * '*bufp'.
+ *
+ * Returns ENOBUFS if the kernel reports that a message to us was dropped, and
+ * EPROTO if the received data is not a well-formed Netlink message.
+ *
+ * If 'wait' is true, nl_sock_recv waits for a message to be ready; otherwise,
+ * returns EAGAIN if the 'sock' receive buffer is empty. */
+int
+nl_sock_recv(struct nl_sock *sock, struct ofpbuf **bufp, bool wait)
+{
+    uint8_t tmp;
+    ssize_t bufsize = 2048;
+    ssize_t nbytes, nbytes2;
+    struct ofpbuf *buf;
+    struct nlmsghdr *nlmsghdr;
+    struct iovec iov;
+    struct msghdr msg = {
+        .msg_name = NULL,
+        .msg_namelen = 0,
+        .msg_iov = &iov,
+        .msg_iovlen = 1,
+        .msg_control = NULL,
+        .msg_controllen = 0,
+        .msg_flags = 0
+    };
+
+    buf = ofpbuf_new(bufsize);
+    *bufp = NULL;
+
+try_again:
+    /* Attempt to read the message.  We don't know the size of the data
+     * yet, so we take a guess at 2048.  If we're wrong, we keep trying
+     * and doubling the buffer size each time.  MSG_PEEK leaves the message
+     * queued so that a truncated read can be retried.
+     */
+    nlmsghdr = ofpbuf_put_uninit(buf, bufsize);
+    iov.iov_base = nlmsghdr;
+    iov.iov_len = bufsize;
+    do {
+        nbytes = recvmsg(sock->fd, &msg, (wait ? 0 : MSG_DONTWAIT) | MSG_PEEK);
+    } while (nbytes < 0 && errno == EINTR);
+    if (nbytes < 0) {
+        /* Save errno before freeing, which may overwrite it. */
+        int error = errno;
+        ofpbuf_delete(buf);
+        return error;
+    }
+    if (msg.msg_flags & MSG_TRUNC) {
+        COVERAGE_INC(netlink_recv_retry);
+        bufsize *= 2;
+        ofpbuf_reinit(buf, bufsize);
+        goto try_again;
+    }
+    buf->size = nbytes;
+
+    /* We successfully read the message, so recv again to clear the queue */
+    iov.iov_base = &tmp;
+    iov.iov_len = 1;
+    do {
+        nbytes2 = recvmsg(sock->fd, &msg, MSG_DONTWAIT);
+    } while (nbytes2 < 0 && errno == EINTR);
+    if (nbytes2 < 0) {
+        if (errno == ENOBUFS) {
+            /* The kernel is notifying us that a message it tried to send to us
+             * was dropped.  We have to pass this along to the caller in case
+             * it wants to retry a request.  So kill the buffer, which we can
+             * re-read next time. */
+            COVERAGE_INC(netlink_overflow);
+            ofpbuf_delete(buf);
+            return ENOBUFS;
+        } else {
+            VLOG_ERR_RL(&rl, "failed to remove nlmsg from socket: %s",
+                        strerror(errno));
+        }
+    }
+    if (nbytes < sizeof *nlmsghdr
+        || nlmsghdr->nlmsg_len < sizeof *nlmsghdr
+        || nlmsghdr->nlmsg_len > nbytes) {
+        /* Report the number of bytes actually received, not the size of our
+         * receive buffer. */
+        VLOG_ERR_RL(&rl, "received invalid nlmsg (%zd bytes < %d)",
+                    nbytes, NLMSG_HDRLEN);
+        ofpbuf_delete(buf);
+        return EPROTO;
+    }
+    *bufp = buf;
+    log_nlmsg(__func__, 0, buf->data, buf->size);
+    COVERAGE_INC(netlink_received);
+    return 0;
+}
+
+/* Performs one request/reply exchange with the kernel over 'sock': sends
+ * 'request', then blocks until the matching reply arrives.  On success,
+ * stores the reply in '*replyp' and returns 0; the caller must eventually
+ * free it with ofpbuf_delete().  On failure, returns a positive errno value
+ * and leaves a null pointer in '*replyp'.
+ *
+ * 'request' remains owned by the caller.  Note that NLM_F_ACK is OR'd into
+ * its header flags so that a reply is always generated.
+ *
+ * Bare Netlink is an unreliable transport protocol.  This function layers
+ * reliable delivery and reply semantics on top of bare Netlink.
+ *
+ * Sending is reliable enough on its own, because the kernel reports when a
+ * message cannot be queued (in which case it is put on the transmit queue
+ * and we wait until it can be delivered).
+ *
+ * Receiving is the real problem: if the socket buffer is full when the
+ * kernel tries to send the reply, the reply is dropped.  The kernel does,
+ * however, set a flag recording the drop, and the next receive then fails
+ * with ENOBUFS.  When that happens the request is sent again.
+ *
+ * Caveats:
+ *
+ *    1. Netlink depends on sequence numbers to match up requests and
+ *       replies.  The sender of a request supplies a sequence number, and
+ *       the reply echoes back that sequence number.
+ *
+ *       This is fine, but (1) some kernel netlink implementations are
+ *       broken, in that they fail to echo sequence numbers and (2) this
+ *       function will drop packets with non-matching sequence numbers, so
+ *       that only a single request can be usefully transacted at a time.
+ *
+ *    2. Resending the request causes it to be re-executed, so the request
+ *       needs to be idempotent.
+ */
+int
+nl_sock_transact(struct nl_sock *sock,
+                 const struct ofpbuf *request, struct ofpbuf **replyp)
+{
+    const uint32_t seq = nl_msg_nlmsghdr(request)->nlmsg_seq;
+    struct ofpbuf *reply;
+    int error;
+
+    *replyp = NULL;
+
+    /* Force an acknowledgment so that there is always something to wait
+     * for, even for requests that would not normally produce a reply. */
+    nl_msg_nlmsghdr(request)->nlmsg_flags |= NLM_F_ACK;
+
+    /* Outer loop: one iteration per (re)transmission of the request. */
+    for (;;) {
+        error = nl_sock_send(sock, request, true);
+        if (error) {
+            return error;
+        }
+
+        /* Inner loop: one iteration per datagram received. */
+        for (;;) {
+            uint32_t reply_seq;
+
+            error = nl_sock_recv(sock, &reply, true);
+            if (error == ENOBUFS) {
+                /* A reply was dropped; go around and resend. */
+                COVERAGE_INC(netlink_overflow);
+                VLOG_DBG_RL(&rl, "receive buffer overflow, resending request");
+                break;
+            } else if (error) {
+                return error;
+            }
+
+            reply_seq = nl_msg_nlmsghdr(reply)->nlmsg_seq;
+            if (reply_seq != seq) {
+                /* Not ours: discard it and keep waiting. */
+                VLOG_DBG_RL(&rl, "ignoring seq %"PRIu32" != expected %"PRIu32,
+                            reply_seq, seq);
+                ofpbuf_delete(reply);
+                continue;
+            }
+
+            if (nl_msg_nlmsgerr(reply, &error)) {
+                /* NLMSG_ERROR: either a plain ACK (error == 0) or a NAK. */
+                ofpbuf_delete(reply);
+                if (error) {
+                    VLOG_DBG_RL(&rl, "received NAK error=%d (%s)",
+                                error, strerror(error));
+                }
+                /* EAGAIN from the kernel is reported as EPROTO instead. */
+                return error == EAGAIN ? EPROTO : error;
+            }
+
+            *replyp = reply;
+            return 0;
+        }
+    }
+}
+
+/* Registers 'sock' with the poll loop so that poll_block() wakes up when any
+ * of 'events' (an OR'd combination of POLLIN, POLLOUT, etc.) occurs on it. */
+void
+nl_sock_wait(const struct nl_sock *sock, short int events)
+{
+    const int fd = sock->fd;
+
+    poll_fd_wait(fd, events);
+}
+
+/* Netlink messages. */
+
+/* Returns a pointer to the nlmsghdr located at offset 0 of 'msg'.
+ *
+ * 'msg' must hold at least NLMSG_HDRLEN bytes; the lookup is done with
+ * ofpbuf_at_assert(). */
+struct nlmsghdr *
+nl_msg_nlmsghdr(const struct ofpbuf *msg)
+{
+    struct nlmsghdr *hdr = ofpbuf_at_assert(msg, 0, NLMSG_HDRLEN);
+
+    return hdr;
+}
+
+/* Returns a pointer to the genlmsghdr that immediately follows the nlmsghdr
+ * in 'msg', or a null pointer if 'msg' is too short to contain both
+ * headers. */
+struct genlmsghdr *
+nl_msg_genlmsghdr(const struct ofpbuf *msg)
+{
+    struct genlmsghdr *genl = ofpbuf_at(msg, NLMSG_HDRLEN, GENL_HDRLEN);
+
+    return genl;
+}
+
+/* If 'msg' is an NLMSG_ERROR message, stores 0 in '*errorp' if it is an ACK
+ * message, otherwise a positive errno value, and returns true.  If 'msg' is
+ * not an NLMSG_ERROR message, returns false and leaves '*errorp' untouched.
+ *
+ * A truncated error message, or one whose error code is positive or INT_MIN
+ * (and so cannot be negated into a valid errno), is reported as EPROTO.
+ *
+ * 'errorp' may be null if the caller does not need the code.
+ *
+ * 'msg' must be at least as large as a nlmsghdr. */
+bool
+nl_msg_nlmsgerr(const struct ofpbuf *msg, int *errorp)
+{
+    if (nl_msg_nlmsghdr(msg)->nlmsg_type == NLMSG_ERROR) {
+        struct nlmsgerr *err = ofpbuf_at(msg, NLMSG_HDRLEN, sizeof *err);
+        int code = EPROTO;
+        if (!err) {
+            /* Both arguments are size_t, hence %zu. */
+            VLOG_ERR_RL(&rl, "received invalid nlmsgerr (%zu bytes < %zu)",
+                        msg->size, NLMSG_HDRLEN + sizeof *err);
+        } else if (err->error <= 0 && err->error > INT_MIN) {
+            /* The kernel reports errors as negative errno values. */
+            code = -err->error;
+        }
+        if (errorp) {
+            *errorp = code;
+        }
+        return true;
+    } else {
+        return false;
+    }
+}
+
+/* Makes sure that 'msg' has tailroom for at least 'size' bytes rounded up to
+ * Netlink alignment, reallocating and copying its data if necessary. */
+void
+nl_msg_reserve(struct ofpbuf *msg, size_t size)
+{
+    size_t aligned_size = NLMSG_ALIGN(size);
+
+    ofpbuf_prealloc_tailroom(msg, aligned_size);
+}
+
+/* Starts a Netlink message in 'msg', which must be empty on entry, by
+ * appending a nlmsghdr with the given 'type' and 'flags'.  The header's PID
+ * is taken from 'sock' and its sequence number from a file-wide counter, so
+ * that replies can be routed back and matched.  'expected_payload' is a hint
+ * for how many payload bytes will follow; pass 0 if unknown.
+ *
+ * The nlmsg_len field is initialized to 0 here.
+ *
+ * 'type' is ordinarily an enumerated value specific to the Netlink protocol
+ * (e.g. RTM_NEWLINK, for NETLINK_ROUTE protocol).  For Generic Netlink, 'type'
+ * is the family number obtained via nl_lookup_genl_family().
+ *
+ * 'flags' is a bit-mask that indicates what kind of request is being made.  It
+ * is often NLM_F_REQUEST indicating that a request is being made, commonly
+ * or'd with NLM_F_ACK to request an acknowledgement.
+ *
+ * For Generic Netlink messages nl_msg_put_genlmsghdr is more convenient. */
+void
+nl_msg_put_nlmsghdr(struct ofpbuf *msg, struct nl_sock *sock,
+                    size_t expected_payload, uint32_t type, uint32_t flags)
+{
+    struct nlmsghdr *hdr;
+
+    assert(msg->size == 0);
+
+    /* Allocate room for the header and expected payload in one go. */
+    nl_msg_reserve(msg, NLMSG_HDRLEN + expected_payload);
+
+    hdr = nl_msg_put_uninit(msg, NLMSG_HDRLEN);
+    hdr->nlmsg_pid = sock->pid;
+    hdr->nlmsg_seq = ++next_seq;
+    hdr->nlmsg_flags = flags;
+    hdr->nlmsg_type = type;
+    hdr->nlmsg_len = 0;
+}
+
+/* Starts a Generic Netlink message in 'msg', which must be empty on entry, by
+ * appending a nlmsghdr followed by a genlmsghdr.  'sock' supplies the PID
+ * used for routing replies.  'expected_payload' is a hint for how many
+ * payload bytes will follow the two headers; pass 0 if unknown.
+ *
+ * 'family' is the family number obtained via nl_lookup_genl_family().
+ *
+ * 'flags' is a bit-mask that indicates what kind of request is being made.  It
+ * is often NLM_F_REQUEST indicating that a request is being made, commonly
+ * or'd with NLM_F_ACK to request an acknowledgement.
+ *
+ * 'cmd' is an enumerated value specific to the Generic Netlink family
+ * (e.g. CTRL_CMD_NEWFAMILY for the GENL_ID_CTRL family).
+ *
+ * 'version' is a version number specific to the family and command (often 1).
+ *
+ * Use nl_msg_put_nlmsghdr instead for Netlink messages that are not Generic
+ * Netlink messages. */
+void
+nl_msg_put_genlmsghdr(struct ofpbuf *msg, struct nl_sock *sock,
+                      size_t expected_payload, int family, uint32_t flags,
+                      uint8_t cmd, uint8_t version)
+{
+    const size_t payload_hint = GENL_HDRLEN + expected_payload;
+    struct genlmsghdr *genl;
+
+    nl_msg_put_nlmsghdr(msg, sock, payload_hint, family, flags);
+    assert(msg->size == NLMSG_HDRLEN);
+
+    genl = nl_msg_put_uninit(msg, GENL_HDRLEN);
+    genl->reserved = 0;
+    genl->version = version;
+    genl->cmd = cmd;
+}
+
+/* Copies the 'size' bytes at 'data' onto the tail end of 'msg', followed by
+ * any Netlink padding that is needed.  Data in 'msg' is reallocated and
+ * copied if necessary. */
+void
+nl_msg_put(struct ofpbuf *msg, const void *data, size_t size)
+{
+    void *dst = nl_msg_put_uninit(msg, size);
+
+    memcpy(dst, data, size);
+}
+
+/* Grows the tail end of 'msg' by 'size' bytes plus whatever Netlink padding is
+ * needed, reallocating and copying its data if necessary.  The padding bytes
+ * are zeroed; the 'size' data bytes are left uninitialized.  Returns a
+ * pointer to the first of the new data bytes. */
+void *
+nl_msg_put_uninit(struct ofpbuf *msg, size_t size)
+{
+    const size_t padded_size = NLMSG_ALIGN(size);
+    char *start = ofpbuf_put_uninit(msg, padded_size);
+
+    if (padded_size != size) {
+        memset(start + size, 0, padded_size - size);
+    }
+    return start;
+}
+
+/* Appends to the tail end of 'msg' a Netlink attribute header of the given
+ * 'type' followed by room for a 'size'-byte payload and any needed Netlink
+ * padding, reallocating and copying 'msg''s data if necessary.  Returns a
+ * pointer to the first payload byte; the payload is left uninitialized.
+ *
+ * Asserts that the padded attribute length fits in 16 bits. */
+void *
+nl_msg_put_unspec_uninit(struct ofpbuf *msg, uint16_t type, size_t size)
+{
+    const size_t attr_len = NLA_HDRLEN + size;
+    struct nlattr *attr = nl_msg_put_uninit(msg, attr_len);
+
+    assert(NLA_ALIGN(attr_len) <= UINT16_MAX);
+    attr->nla_type = type;
+    attr->nla_len = attr_len;
+    return &attr[1];
+}
+
+/* Appends a Netlink attribute of the given 'type' and the 'size' bytes of
+ * 'data' as its payload, to the tail end of 'msg', reallocating and copying
+ * its data if necessary.
+ *
+ * 'data' may be a null pointer when 'size' is 0, in which case an attribute
+ * with an empty payload is appended. */
+void
+nl_msg_put_unspec(struct ofpbuf *msg, uint16_t type,
+                  const void *data, size_t size)
+{
+    void *payload = nl_msg_put_unspec_uninit(msg, type, size);
+
+    /* Passing a null pointer to memcpy() is undefined behavior even for a
+     * zero-length copy, so skip the call for empty payloads. */
+    if (size) {
+        memcpy(payload, data, size);
+    }
+}
+
+/* Appends to 'msg' a Netlink attribute of the given 'type' with an empty
+ * payload.  (Some Netlink protocols treat the mere presence or absence of an
+ * attribute as a Boolean flag.) */
+void
+nl_msg_put_flag(struct ofpbuf *msg, uint16_t type)
+{
+    /* Header only: there is no payload to fill in. */
+    nl_msg_put_unspec_uninit(msg, type, 0);
+}
+
+/* Appends to 'msg' a Netlink attribute of the given 'type' whose payload is
+ * the single byte 'value'. */
+void
+nl_msg_put_u8(struct ofpbuf *msg, uint16_t type, uint8_t value)
+{
+    void *payload = nl_msg_put_unspec_uninit(msg, type, sizeof value);
+
+    memcpy(payload, &value, sizeof value);
+}
+
+/* Appends to 'msg' a Netlink attribute of the given 'type' whose payload is
+ * the two bytes of 'value', copied as stored in memory. */
+void
+nl_msg_put_u16(struct ofpbuf *msg, uint16_t type, uint16_t value)
+{
+    void *payload = nl_msg_put_unspec_uninit(msg, type, sizeof value);
+
+    memcpy(payload, &value, sizeof value);
+}
+
+/* Appends to 'msg' a Netlink attribute of the given 'type' whose payload is
+ * the four bytes of 'value', copied as stored in memory. */
+void
+nl_msg_put_u32(struct ofpbuf *msg, uint16_t type, uint32_t value)
+{
+    void *payload = nl_msg_put_unspec_uninit(msg, type, sizeof value);
+
+    memcpy(payload, &value, sizeof value);
+}
+
+/* Appends to 'msg' a Netlink attribute of the given 'type' whose payload is
+ * the eight bytes of 'value', copied as stored in memory. */
+void
+nl_msg_put_u64(struct ofpbuf *msg, uint16_t type, uint64_t value)
+{
+    void *payload = nl_msg_put_unspec_uninit(msg, type, sizeof value);
+
+    memcpy(payload, &value, sizeof value);
+}
+
+/* Appends to 'msg' a Netlink attribute of the given 'type' whose payload is
+ * the null-terminated string 'value', including its terminator. */
+void
+nl_msg_put_string(struct ofpbuf *msg, uint16_t type, const char *value)
+{
+    const size_t n_bytes = strlen(value) + 1;   /* +1 for the terminator. */
+
+    nl_msg_put_unspec(msg, type, value, n_bytes);
+}
+
+/* Appends to 'msg' a Netlink attribute of the given 'type' whose payload is
+ * the whole buffered Netlink message 'nested_msg'.  Before copying, the
+ * nlmsg_len field of 'nested_msg' is set to 'nested_msg->size'. */
+void
+nl_msg_put_nested(struct ofpbuf *msg,
+                  uint16_t type, struct ofpbuf *nested_msg)
+{
+    struct nlmsghdr *inner = nl_msg_nlmsghdr(nested_msg);
+
+    inner->nlmsg_len = nested_msg->size;
+    nl_msg_put_unspec(msg, type, nested_msg->data, nested_msg->size);
+}
+
+/* Returns a pointer to the start of attribute 'nla''s payload, that is, the
+ * byte just past its header.  Asserts that 'nla''s length covers at least
+ * the header. */
+const void *
+nl_attr_get(const struct nlattr *nla)
+{
+    assert(nla->nla_len >= NLA_HDRLEN);
+    return &nla[1];
+}
+
+/* Returns the payload length of attribute 'nla' in bytes, i.e. its nla_len
+ * minus the attribute header.  Asserts that nla_len covers the header. */
+size_t
+nl_attr_get_size(const struct nlattr *nla)
+{
+    const uint16_t total_len = nla->nla_len;
+
+    assert(total_len >= NLA_HDRLEN);
+    return total_len - NLA_HDRLEN;
+}
+
+/* Returns a pointer to the first payload byte of 'nla', after asserting that
+ * the payload is at least 'size' bytes long. */
+const void *
+nl_attr_get_unspec(const struct nlattr *nla, size_t size)
+{
+    const size_t min_total = NLA_HDRLEN + size;
+
+    assert(nla->nla_len >= min_total);
+    return &nla[1];
+}
+
+/* Returns true if 'nla' is nonnull, false if it is null.  (Some Netlink
+ * protocols treat the mere presence or absence of an attribute as a Boolean
+ * flag.) */
+bool
+nl_attr_get_flag(const struct nlattr *nla)
+{
+    return nla ? true : false;
+}
+
+/* Evaluates to the payload of attribute NLA read as a value of type TYPE,
+ * after asserting (via nl_attr_get_unspec()) that the payload is at least
+ * sizeof(TYPE) bytes long.
+ *
+ * The expansion uses the NLA argument itself instead of silently capturing
+ * a variable named 'nla' from the caller, and keeps the payload const. */
+#define NL_ATTR_GET_AS(NLA, TYPE) \
+    (*(const TYPE *) nl_attr_get_unspec(NLA, sizeof(TYPE)))
+
+/* Reads and returns the first payload byte of 'nla'.
+ *
+ * Asserts that 'nla''s payload is at least 1 byte long. */
+uint8_t
+nl_attr_get_u8(const struct nlattr *nla)
+{
+    const uint8_t *payload = nl_attr_get_unspec(nla, sizeof *payload);
+
+    return *payload;
+}
+
+/* Reads and returns the first two payload bytes of 'nla' as a uint16_t.
+ *
+ * Asserts that 'nla''s payload is at least 2 bytes long. */
+uint16_t
+nl_attr_get_u16(const struct nlattr *nla)
+{
+    const uint16_t *payload = nl_attr_get_unspec(nla, sizeof *payload);
+
+    return *payload;
+}
+
+/* Reads and returns the first four payload bytes of 'nla' as a uint32_t.
+ *
+ * Asserts that 'nla''s payload is at least 4 bytes long. */
+uint32_t
+nl_attr_get_u32(const struct nlattr *nla)
+{
+    const uint32_t *payload = nl_attr_get_unspec(nla, sizeof *payload);
+
+    return *payload;
+}
+
+/* Reads and returns the first eight payload bytes of 'nla' as a uint64_t.
+ *
+ * Asserts that 'nla''s payload is at least 8 bytes long. */
+uint64_t
+nl_attr_get_u64(const struct nlattr *nla)
+{
+    const uint64_t *payload = nl_attr_get_unspec(nla, sizeof *payload);
+
+    return *payload;
+}
+
+/* Returns 'nla''s payload as a C string.
+ *
+ * Asserts that the payload is nonempty and contains a null terminator
+ * somewhere within its length. */
+const char *
+nl_attr_get_string(const struct nlattr *nla)
+{
+    const char *string;
+    size_t payload_len;
+
+    assert(nla->nla_len > NLA_HDRLEN);
+    string = nl_attr_get(nla);
+    payload_len = nla->nla_len - NLA_HDRLEN;
+    assert(memchr(string, '\0', payload_len) != NULL);
+    return string;
+}
+
+/* Default minimum and maximum payload sizes for each type of attribute.
+ *
+ * Indexed by enum nl_attr_type; element [0] of each pair is the minimum
+ * payload length in bytes and element [1] the maximum.  nl_policy_parse()
+ * falls back on these values when a policy entry leaves min_len or max_len
+ * as 0. */
+static const size_t attr_len_range[][2] = {
+ /* Every type is unconstrained unless overridden below. */
+ [0 ... N_NL_ATTR_TYPES - 1] = { 0, SIZE_MAX },
+ /* Fixed-width integers must match their width exactly. */
+ [NL_A_U8] = { 1, 1 },
+ [NL_A_U16] = { 2, 2 },
+ [NL_A_U32] = { 4, 4 },
+ [NL_A_U64] = { 8, 8 },
+ /* A string needs at least one byte, for its null terminator. */
+ [NL_A_STRING] = { 1, SIZE_MAX },
+ [NL_A_FLAG] = { 0, SIZE_MAX },
+ /* A nested message must be at least large enough for a nlmsghdr. */
+ [NL_A_NESTED] = { NLMSG_HDRLEN, SIZE_MAX },
+};
+
+/* Parses the 'msg' starting at the given 'nla_offset' as a sequence of Netlink
+ * attributes.  'policy[i]', for 0 <= i < n_attrs, specifies how the attribute
+ * with nla_type == i is parsed; a pointer to attribute i is stored in
+ * attrs[i] (or a null pointer if the attribute is absent).  If an attribute
+ * type occurs more than once, the last occurrence wins.  Attribute types
+ * that are >= n_attrs, or whose policy type is NL_A_NO_ATTR, are skipped.
+ *
+ * Returns true if successful.  Returns false if 'msg' is shorter than
+ * 'nla_offset', if any attribute is malformed or violates its policy, or if
+ * an attribute that is neither optional nor of type NL_A_FLAG is missing.
+ *
+ * If the Netlink attributes in 'msg' follow a Netlink header and a Generic
+ * Netlink header, then 'nla_offset' should be NLMSG_HDRLEN + GENL_HDRLEN. */
+bool
+nl_policy_parse(const struct ofpbuf *msg, size_t nla_offset,
+                const struct nl_policy policy[],
+                struct nlattr *attrs[], size_t n_attrs)
+{
+    char *p, *tail;
+    size_t n_required;
+    size_t i;
+
+    /* Count the attributes that must be present.  Flags are never counted
+     * because their absence is itself meaningful. */
+    n_required = 0;
+    for (i = 0; i < n_attrs; i++) {
+        attrs[i] = NULL;
+
+        assert(policy[i].type < N_NL_ATTR_TYPES);
+        if (policy[i].type != NL_A_NO_ATTR
+            && policy[i].type != NL_A_FLAG
+            && !policy[i].optional) {
+            n_required++;
+        }
+    }
+
+    p = ofpbuf_at(msg, nla_offset, 0);
+    if (p == NULL) {
+        VLOG_DBG_RL(&rl, "missing headers in nl_policy_parse");
+        return false;
+    }
+    tail = ofpbuf_tail(msg);
+
+    while (p < tail) {
+        size_t offset = p - (char *) msg->data;
+        size_t left = tail - p;
+        struct nlattr *nla = (struct nlattr *) p;
+        size_t len, step;
+        uint16_t type;
+
+        /* The header itself must fit before nla_len may be read. */
+        if (left < NLA_HDRLEN) {
+            VLOG_DBG_RL(&rl, "%zu: attr header truncated (%zu bytes left)",
+                        offset, left);
+            return false;
+        }
+
+        /* Make sure its claimed length is plausible. */
+        if (nla->nla_len < NLA_HDRLEN) {
+            VLOG_DBG_RL(&rl, "%zu: attr shorter than NLA_HDRLEN (%"PRIu16")",
+                        offset, nla->nla_len);
+            return false;
+        }
+        /* nla_len covers header plus payload, so it is what must be
+         * compared against the bytes remaining (which also include the
+         * header).  Trailing padding after the final attribute is not
+         * required. */
+        if (nla->nla_len > left) {
+            VLOG_DBG_RL(&rl, "%zu: attr %"PRIu16" length (%"PRIu16") "
+                        "> bytes left (%zu)",
+                        offset, nla->nla_type, nla->nla_len, left);
+            return false;
+        }
+        len = nla->nla_len - NLA_HDRLEN;
+
+        type = nla->nla_type;
+        if (type < n_attrs && policy[type].type != NL_A_NO_ATTR) {
+            const struct nl_policy *pol = &policy[type];
+            size_t min_len, max_len;
+
+            /* Validate length and content.  A zero min_len or max_len in the
+             * policy selects the default for the attribute type. */
+            min_len = pol->min_len ? pol->min_len
+                                   : attr_len_range[pol->type][0];
+            max_len = pol->max_len ? pol->max_len
+                                   : attr_len_range[pol->type][1];
+            if (len < min_len || len > max_len) {
+                VLOG_DBG_RL(&rl, "%zu: attr %"PRIu16" length %zu not in "
+                            "allowed range %zu...%zu",
+                            offset, type, len, min_len, max_len);
+                return false;
+            }
+            if (pol->type == NL_A_STRING) {
+                /* The last payload byte must be the only null byte. */
+                if (((char *) nla)[nla->nla_len - 1]) {
+                    VLOG_DBG_RL(&rl, "%zu: attr %"PRIu16" lacks null at end",
+                                offset, type);
+                    return false;
+                }
+                if (memchr(nla + 1, '\0', len - 1) != NULL) {
+                    VLOG_DBG_RL(&rl, "%zu: attr %"PRIu16" has bad length",
+                                offset, type);
+                    return false;
+                }
+            }
+            /* Only attributes that were counted as required above may be
+             * subtracted here, and only on their first occurrence. */
+            if (pol->type != NL_A_FLAG && !pol->optional
+                && attrs[type] == NULL) {
+                assert(n_required > 0);
+                --n_required;
+            }
+            attrs[type] = nla;
+        } else {
+            /* Skip attribute type that we don't care about. */
+        }
+
+        /* Advance to the next attribute, without stepping past 'tail' when
+         * the final attribute lacks padding. */
+        step = NLA_ALIGN(nla->nla_len);
+        p += step < left ? step : left;
+    }
+    if (n_required) {
+        VLOG_DBG_RL(&rl, "%zu required attrs missing", n_required);
+        return false;
+    }
+    return true;
+}
+
+/* Miscellaneous. */
+
+/* Policy for parsing the reply to a CTRL_CMD_GETFAMILY request: only the
+ * 16-bit family ID attribute is extracted, and it is required (not marked
+ * optional).  All other entries are zero-initialized, i.e. NL_A_NO_ATTR. */
+static const struct nl_policy family_policy[CTRL_ATTR_MAX + 1] = {
+ [CTRL_ATTR_FAMILY_ID] = {.type = NL_A_U16},
+};
+
+/* Asks the kernel's Generic Netlink controller for the family number of the
+ * family called 'name', using a temporary socket.  Returns the positive
+ * family number on success, otherwise a negative errno value (-EPROTO if the
+ * reply cannot be parsed or reports family number 0). */
+static int do_lookup_genl_family(const char *name)
+{
+    struct nlattr *attrs[ARRAY_SIZE(family_policy)];
+    struct ofpbuf request, *reply;
+    struct nl_sock *sock;
+    int error;
+    int family;
+
+    error = nl_sock_create(NETLINK_GENERIC, 0, 0, 0, &sock);
+    if (error) {
+        return -error;
+    }
+
+    /* Compose and transact a CTRL_CMD_GETFAMILY request for 'name'. */
+    ofpbuf_init(&request, 0);
+    nl_msg_put_genlmsghdr(&request, sock, 0, GENL_ID_CTRL, NLM_F_REQUEST,
+                          CTRL_CMD_GETFAMILY, 1);
+    nl_msg_put_string(&request, CTRL_ATTR_FAMILY_NAME, name);
+    error = nl_sock_transact(sock, &request, &reply);
+    ofpbuf_uninit(&request);
+    if (error) {
+        nl_sock_destroy(sock);
+        return -error;
+    }
+
+    /* Extract the family ID; anything unparseable or zero is an error. */
+    if (nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
+                        family_policy, attrs, ARRAY_SIZE(family_policy))) {
+        family = nl_attr_get_u16(attrs[CTRL_ATTR_FAMILY_ID]);
+        if (!family) {
+            family = -EPROTO;
+        }
+    } else {
+        family = -EPROTO;
+    }
+
+    nl_sock_destroy(sock);
+    ofpbuf_delete(reply);
+    return family;
+}
+
+/* Resolves the Generic Netlink family 'name' to its number, caching the
+ * outcome in '*number'.
+ *
+ * A '*number' of 0 means "not looked up yet": the lookup is performed and its
+ * result stored, a positive family number on success or a negative errno
+ * value on failure.  Any other value is taken as a cached result and no
+ * lookup is done.
+ *
+ * Returns 0 if '*number' now holds a usable family number, otherwise a
+ * positive errno value. */
+int
+nl_lookup_genl_family(const char *name, int *number)
+{
+    if (!*number) {
+        *number = do_lookup_genl_family(name);
+        assert(*number != 0);
+    }
+
+    if (*number > 0) {
+        return 0;
+    }
+    return -*number;
+}
+
+/* Netlink PID.
+ *
+ * Every Netlink socket must be bound to a unique 32-bit PID. By convention,
+ * programs that have a single Netlink socket use their Unix process ID as PID,
+ * and programs with multiple Netlink sockets add a unique per-socket
+ * identifier in the bits above the Unix process ID.
+ *
+ * The kernel has Netlink PID 0.
+ */
+
+/* Parameters for how many bits in the PID should come from the Unix process ID
+ * and how many unique per-socket. */
+#define SOCKET_BITS 10
+#define MAX_SOCKETS (1u << SOCKET_BITS)
+
+#define PROCESS_BITS (32 - SOCKET_BITS)
+#define MAX_PROCESSES (1u << PROCESS_BITS)
+#define PROCESS_MASK ((uint32_t) (MAX_PROCESSES - 1))
+
+/* Bit vector of socket identifiers, one bit per identifier, 32 to a word.  A
+ * set bit marks an identifier that is currently allocated; a clear bit marks
+ * one that is free.
+ *
+ * The array needs one word per 32 identifiers, i.e. MAX_SOCKETS / 32 words
+ * rounded up, not MAX_SOCKETS rounded up to a multiple of 32. */
+static uint32_t avail_sockets[(MAX_SOCKETS + 31) / 32];
+
+/* Allocates a new Netlink PID.  On success, stores it in '*pid' and returns
+ * 0.  If all MAX_SOCKETS per-socket identifiers are in use, logs an error and
+ * returns ENOBUFS, leaving '*pid' untouched.
+ *
+ * The PID combines the low PROCESS_BITS bits of the Unix process ID with the
+ * lowest free socket identifier in the bits above them. */
+static int
+alloc_pid(uint32_t *pid)
+{
+    uint32_t i;
+
+    for (i = 0; i < MAX_SOCKETS; i++) {
+        uint32_t *word = &avail_sockets[i / 32];
+        uint32_t bit = 1u << (i % 32);
+
+        if (!(*word & bit)) {
+            *word |= bit;
+            /* 'i' is unsigned so that shifting identifiers >= 512 into the
+             * top bits cannot overflow a signed int. */
+            *pid = ((uint32_t) getpid() & PROCESS_MASK) | (i << PROCESS_BITS);
+            return 0;
+        }
+    }
+    VLOG_ERR("netlink pid space exhausted");
+    return ENOBUFS;
+}
+
+/* Releases the per-socket identifier encoded in the upper bits of 'pid' so
+ * that alloc_pid() can hand it out again.  Asserts that the identifier is
+ * currently marked as allocated. */
+static void
+free_pid(uint32_t pid)
+{
+    int sock = pid >> PROCESS_BITS;
+    uint32_t *word = &avail_sockets[sock / 32];
+    uint32_t bit = 1u << (sock % 32);
+
+    assert(*word & bit);
+    *word &= ~bit;
+}
+
+/* Appends to 'ds' a human-readable rendering of Netlink header 'h': length,
+ * type (with a class label), flags (numerically and by name), sequence
+ * number, and PID (also split into its process and socket parts). */
+static void
+nlmsghdr_to_string(const struct nlmsghdr *h, struct ds *ds)
+{
+    /* NLM_F_DUMP is listed ahead of NLM_F_ROOT and NLM_F_MATCH so that, when
+     * all of its bits are set, it is the name that gets reported. */
+    static const struct {
+        unsigned int bits;
+        const char *name;
+    } known_flags[] = {
+        { NLM_F_REQUEST, "REQUEST" },
+        { NLM_F_MULTI, "MULTI" },
+        { NLM_F_ACK, "ACK" },
+        { NLM_F_ECHO, "ECHO" },
+        { NLM_F_DUMP, "DUMP" },
+        { NLM_F_ROOT, "ROOT" },
+        { NLM_F_MATCH, "MATCH" },
+        { NLM_F_ATOMIC, "ATOMIC" },
+    };
+    const char *type_label;
+    uint16_t unnamed;
+    size_t i;
+
+    ds_put_format(ds, "nl(len:%"PRIu32", type=%"PRIu16,
+                  h->nlmsg_len, h->nlmsg_type);
+    switch (h->nlmsg_type) {
+    case NLMSG_NOOP:
+        type_label = "(no-op)";
+        break;
+    case NLMSG_ERROR:
+        type_label = "(error)";
+        break;
+    case NLMSG_DONE:
+        type_label = "(done)";
+        break;
+    case NLMSG_OVERRUN:
+        type_label = "(overrun)";
+        break;
+    default:
+        type_label = (h->nlmsg_type < NLMSG_MIN_TYPE
+                      ? "(reserved)"
+                      : "(family-defined)");
+        break;
+    }
+    ds_put_cstr(ds, type_label);
+
+    ds_put_format(ds, ", flags=%"PRIx16, h->nlmsg_flags);
+    unnamed = h->nlmsg_flags;
+    for (i = 0; i < ARRAY_SIZE(known_flags); i++) {
+        const unsigned int bits = known_flags[i].bits;
+
+        if ((unnamed & bits) == bits) {
+            ds_put_format(ds, "[%s]", known_flags[i].name);
+            unnamed &= ~bits;
+        }
+    }
+    if (unnamed) {
+        /* Whatever is left has no name in the table above. */
+        ds_put_format(ds, "[OTHER:%"PRIx16"]", unnamed);
+    }
+
+    ds_put_format(ds, ", seq=%"PRIx32", pid=%"PRIu32"(%d:%d))",
+                  h->nlmsg_seq, h->nlmsg_pid,
+                  (int) (h->nlmsg_pid & PROCESS_MASK),
+                  (int) (h->nlmsg_pid >> PROCESS_BITS));
+}
+
+/* Renders the Netlink message in 'buffer' as a string and returns it (the
+ * 'string' member of a local struct ds; log_nlmsg() releases it with
+ * free()).  NLMSG_ERROR and NLMSG_DONE messages also get their payload
+ * decoded; anything too short to decode is reported as "truncated". */
+static char *
+nlmsg_to_string(const struct ofpbuf *buffer)
+{
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    const struct nlmsghdr *h = ofpbuf_at(buffer, 0, NLMSG_HDRLEN);
+
+    if (!h) {
+        ds_put_cstr(&ds, "nl(truncated)");
+        return ds.string;
+    }
+
+    nlmsghdr_to_string(h, &ds);
+    if (h->nlmsg_type == NLMSG_ERROR) {
+        const struct nlmsgerr *e = ofpbuf_at(
+            buffer, NLMSG_HDRLEN, NLMSG_ALIGN(sizeof(struct nlmsgerr)));
+
+        if (!e) {
+            ds_put_cstr(&ds, " error(truncated)");
+        } else {
+            ds_put_format(&ds, " error(%d", e->error);
+            if (e->error < 0) {
+                ds_put_format(&ds, "(%s)", strerror(-e->error));
+            }
+            /* The error payload embeds the header of the offending
+             * request. */
+            ds_put_cstr(&ds, ", in-reply-to(");
+            nlmsghdr_to_string(&e->msg, &ds);
+            ds_put_cstr(&ds, "))");
+        }
+    } else if (h->nlmsg_type == NLMSG_DONE) {
+        int *status = ofpbuf_at(buffer, NLMSG_HDRLEN, sizeof *status);
+
+        if (!status) {
+            ds_put_cstr(&ds, " done(truncated)");
+        } else {
+            ds_put_format(&ds, " done(%d", *status);
+            if (*status < 0) {
+                ds_put_format(&ds, "(%s)", strerror(-*status));
+            }
+            ds_put_cstr(&ds, ")");
+        }
+    }
+    return ds.string;
+}
+
+/* Logs, at debug level and rate-limited, the 'size'-byte Netlink message at
+ * 'message' on behalf of 'function', together with the text for errno value
+ * 'error'.  Does nothing when debug logging is disabled. */
+static void
+log_nlmsg(const char *function, int error,
+          const void *message, size_t size)
+{
+    if (VLOG_IS_DBG_ENABLED()) {
+        struct ofpbuf wrapper;
+        char *text;
+
+        /* Only the 'data' and 'size' members of the wrapper are set up. */
+        wrapper.data = (void *) message;
+        wrapper.size = size;
+
+        text = nlmsg_to_string(&wrapper);
+        VLOG_DBG_RL(&rl, "%s (%s): %s", function, strerror(error), text);
+        free(text);
+    }
+}
+
diff --git a/lib/netlink.h b/lib/netlink.h
new file mode 100644
index 000000000..64452e9f4
--- /dev/null
+++ b/lib/netlink.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef NETLINK_H
+#define NETLINK_H 1
+
+/* Netlink interface.
+ *
+ * Netlink is a datagram-based network protocol primarily for communication
+ * between user processes and the kernel, and mainly on Linux. Netlink is
+ * specified in RFC 3549, "Linux Netlink as an IP Services Protocol".
+ *
+ * Netlink is not suitable for use in physical networks of heterogeneous
+ * machines because host byte order is used throughout. */
+
+#include <stdbool.h>
+#include <sys/uio.h>
+#include <stdint.h>
+
+struct ofpbuf;
+struct nl_sock;
+struct nlattr;
+
+/* Netlink sockets. */
+
+int nl_sock_create(int protocol, int multicast_group,
+ size_t so_sndbuf, size_t so_rcvbuf,
+ struct nl_sock **);
+void nl_sock_destroy(struct nl_sock *);
+
+int nl_sock_send(struct nl_sock *, const struct ofpbuf *, bool wait);
+int nl_sock_sendv(struct nl_sock *sock, const struct iovec iov[], size_t n_iov,
+ bool wait);
+int nl_sock_recv(struct nl_sock *, struct ofpbuf **, bool wait);
+int nl_sock_transact(struct nl_sock *, const struct ofpbuf *request,
+ struct ofpbuf **reply);
+
+void nl_sock_wait(const struct nl_sock *, short int events);
+
+/* Netlink messages. */
+
+/* Accessing headers and data. */
+struct nlmsghdr *nl_msg_nlmsghdr(const struct ofpbuf *);
+struct genlmsghdr *nl_msg_genlmsghdr(const struct ofpbuf *);
+bool nl_msg_nlmsgerr(const struct ofpbuf *, int *error);
+void nl_msg_reserve(struct ofpbuf *, size_t);
+
+/* Appending headers and raw data. */
+void nl_msg_put_nlmsghdr(struct ofpbuf *, struct nl_sock *,
+ size_t expected_payload,
+ uint32_t type, uint32_t flags);
+void nl_msg_put_genlmsghdr(struct ofpbuf *, struct nl_sock *,
+ size_t expected_payload, int family, uint32_t flags,
+ uint8_t cmd, uint8_t version);
+void nl_msg_put(struct ofpbuf *, const void *, size_t);
+void *nl_msg_put_uninit(struct ofpbuf *, size_t);
+
+/* Appending attributes. */
+void *nl_msg_put_unspec_uninit(struct ofpbuf *, uint16_t type, size_t);
+void nl_msg_put_unspec(struct ofpbuf *, uint16_t type, const void *, size_t);
+void nl_msg_put_flag(struct ofpbuf *, uint16_t type);
+void nl_msg_put_u8(struct ofpbuf *, uint16_t type, uint8_t value);
+void nl_msg_put_u16(struct ofpbuf *, uint16_t type, uint16_t value);
+void nl_msg_put_u32(struct ofpbuf *, uint16_t type, uint32_t value);
+void nl_msg_put_u64(struct ofpbuf *, uint16_t type, uint64_t value);
+void nl_msg_put_string(struct ofpbuf *, uint16_t type, const char *value);
+void nl_msg_put_nested(struct ofpbuf *, uint16_t type, struct ofpbuf *);
+
+/* Netlink attribute types.
+ *
+ * These are the payload kinds that a struct nl_policy entry can name; they
+ * select how nl_policy_parse() validates an attribute's payload. */
+enum nl_attr_type
+{
+ NL_A_NO_ATTR = 0, /* Attribute is ignored by nl_policy_parse(). */
+ NL_A_UNSPEC, /* No default length constraint. */
+ NL_A_U8, /* Payload is exactly 1 byte by default. */
+ NL_A_U16, /* Payload is exactly 2 bytes by default. */
+ NL_A_U32, /* Payload is exactly 4 bytes by default. */
+ NL_A_U64, /* Payload is exactly 8 bytes by default. */
+ NL_A_STRING, /* Null-terminated string, no embedded nulls. */
+ NL_A_FLAG, /* Presence or absence is the value; never required. */
+ NL_A_NESTED, /* Payload is at least NLMSG_HDRLEN bytes by default. */
+ N_NL_ATTR_TYPES /* Number of attribute types; not a type itself. */
+};
+
+/* Netlink attribute parsing. */
+const void *nl_attr_get(const struct nlattr *);
+size_t nl_attr_get_size(const struct nlattr *);
+const void *nl_attr_get_unspec(const struct nlattr *, size_t size);
+bool nl_attr_get_flag(const struct nlattr *);
+uint8_t nl_attr_get_u8(const struct nlattr *);
+uint16_t nl_attr_get_u16(const struct nlattr *);
+uint32_t nl_attr_get_u32(const struct nlattr *);
+uint64_t nl_attr_get_u64(const struct nlattr *);
+const char *nl_attr_get_string(const struct nlattr *);
+
+/* Netlink attribute policy.
+ *
+ * Specifies how to parse a single attribute from a Netlink message payload.
+ * nl_policy_parse() takes an array of these indexed by nla_type.
+ */
+struct nl_policy
+{
+ /* Expected payload kind; NL_A_NO_ATTR means the attribute is skipped. */
+ enum nl_attr_type type;
+ /* Allowed payload length range in bytes.  A value of 0 in either member
+ * selects the default for 'type'. */
+ size_t min_len, max_len;
+ /* If false, parsing fails when the attribute is absent (this does not
+ * apply to NL_A_NO_ATTR and NL_A_FLAG entries). */
+ bool optional;
+};
+
+bool nl_policy_parse(const struct ofpbuf *, size_t offset,
+ const struct nl_policy[],
+ struct nlattr *[], size_t n_attrs);
+
+/* Miscellaneous. */
+
+int nl_lookup_genl_family(const char *name, int *number);
+
+#endif /* netlink.h */
diff --git a/lib/odp-util.c b/lib/odp-util.c
new file mode 100644
index 000000000..d32697b7e
--- /dev/null
+++ b/lib/odp-util.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "odp-util.h"
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+#include "coverage.h"
+#include "dynamic-string.h"
+#include "flow.h"
+#include "packets.h"
+#include "timeval.h"
+#include "util.h"
+
+/* Appends a zeroed action of the given 'type' to 'actions' and returns a
+ * pointer to it so that the caller can fill in the remaining fields.
+ *
+ * If 'actions' is already full, no slot is added: n_actions is set to
+ * MAX_ODP_ACTIONS + 1 to record the overflow and the last slot is cleared
+ * and returned instead. */
+union odp_action *
+odp_actions_add(struct odp_actions *actions, uint16_t type)
+{
+    union odp_action *slot;
+    size_t index;
+
+    if (actions->n_actions >= MAX_ODP_ACTIONS) {
+        COVERAGE_INC(odp_overflow);
+        actions->n_actions = MAX_ODP_ACTIONS + 1;
+        index = MAX_ODP_ACTIONS - 1;
+    } else {
+        index = actions->n_actions++;
+    }
+
+    slot = &actions->actions[index];
+    memset(slot, 0, sizeof *slot);
+    slot->type = type;
+    return slot;
+}
+
+/* Appends a short textual form of datapath action 'a' to 'ds'.  An
+ * unrecognized action type is rendered as "***bad action N***". */
+void
+format_odp_action(struct ds *ds, const union odp_action *a)
+{
+    switch (a->type) {
+    /* Output-style actions. */
+    case ODPAT_OUTPUT:
+        ds_put_format(ds, "%"PRIu16, a->output.port);
+        break;
+    case ODPAT_OUTPUT_GROUP:
+        ds_put_format(ds, "g%"PRIu16, a->output_group.group);
+        break;
+    case ODPAT_CONTROLLER:
+        ds_put_format(ds, "ctl(%"PRIu32")", a->controller.arg);
+        break;
+
+    /* VLAN header modifications. */
+    case ODPAT_SET_VLAN_VID:
+        ds_put_format(ds, "set_vlan(%"PRIu16")", ntohs(a->vlan_vid.vlan_vid));
+        break;
+    case ODPAT_SET_VLAN_PCP:
+        ds_put_format(ds, "set_vlan_pcp(%"PRIu8")", a->vlan_pcp.vlan_pcp);
+        break;
+    case ODPAT_STRIP_VLAN:
+        ds_put_cstr(ds, "strip_vlan");
+        break;
+
+    /* Ethernet address rewrites. */
+    case ODPAT_SET_DL_SRC:
+        ds_put_format(ds, "set_dl_src("ETH_ADDR_FMT")",
+                      ETH_ADDR_ARGS(a->dl_addr.dl_addr));
+        break;
+    case ODPAT_SET_DL_DST:
+        ds_put_format(ds, "set_dl_dst("ETH_ADDR_FMT")",
+                      ETH_ADDR_ARGS(a->dl_addr.dl_addr));
+        break;
+
+    /* Network address rewrites. */
+    case ODPAT_SET_NW_SRC:
+        ds_put_format(ds, "set_nw_src("IP_FMT")",
+                      IP_ARGS(&a->nw_addr.nw_addr));
+        break;
+    case ODPAT_SET_NW_DST:
+        ds_put_format(ds, "set_nw_dst("IP_FMT")",
+                      IP_ARGS(&a->nw_addr.nw_addr));
+        break;
+
+    /* Transport port rewrites. */
+    case ODPAT_SET_TP_SRC:
+        ds_put_format(ds, "set_tp_src(%"PRIu16")", ntohs(a->tp_port.tp_port));
+        break;
+    case ODPAT_SET_TP_DST:
+        ds_put_format(ds, "set_tp_dst(%"PRIu16")", ntohs(a->tp_port.tp_port));
+        break;
+
+    default:
+        ds_put_format(ds, "***bad action %"PRIu16"***", a->type);
+        break;
+    }
+}
+
+/* Appends to 'ds' the 'n_actions' actions in 'actions', formatted with
+ * format_odp_action() and separated by commas.  An empty list is rendered
+ * as "drop". */
+void
+format_odp_actions(struct ds *ds, const union odp_action *actions,
+                   size_t n_actions)
+{
+    const union odp_action *a;
+
+    if (!n_actions) {
+        ds_put_cstr(ds, "drop");
+        return;
+    }
+
+    for (a = actions; a < &actions[n_actions]; a++) {
+        if (a != actions) {
+            ds_put_char(ds, ',');
+        }
+        format_odp_action(ds, a);
+    }
+}
+
+/* Appends to 'ds' the packet and byte counters from 's', followed by how long
+ * ago the flow was last used, in seconds relative to time_msec(), or "never"
+ * if 's->used_sec' is zero. */
+void
+format_odp_flow_stats(struct ds *ds, const struct odp_flow_stats *s)
+{
+    ds_put_format(ds, "packets:%"PRIu64", bytes:%"PRIu64", used:",
+                  s->n_packets, s->n_bytes);
+    if (!s->used_sec) {
+        ds_put_cstr(ds, "never");
+    } else {
+        /* Last-used timestamp converted to milliseconds. */
+        long long int used_ms = s->used_sec * 1000 + s->used_nsec / 1000000;
+        ds_put_format(ds, "%.3fs", (time_msec() - used_ms) / 1000.0);
+    }
+}
+
+/* Appends to 'ds' a one-line description of flow 'f': its key, then its
+ * statistics, then its actions, separated by ", ". */
+void
+format_odp_flow(struct ds *ds, const struct odp_flow *f)
+{
+    /* Flow key. */
+    flow_format(ds, &f->key);
+
+    /* Statistics. */
+    ds_put_cstr(ds, ", ");
+    format_odp_flow_stats(ds, &f->stats);
+
+    /* Actions. */
+    ds_put_cstr(ds, ", actions:");
+    format_odp_actions(ds, f->actions, f->n_actions);
+}
+
diff --git a/lib/odp-util.h b/lib/odp-util.h
new file mode 100644
index 000000000..77d7f9a53
--- /dev/null
+++ b/lib/odp-util.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef ODP_UTIL_H
+#define ODP_UTIL_H 1
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "openflow/openflow.h"
+#include "openvswitch/datapath-protocol.h"
+
+struct ds;
+
+/* The kernel datapath limits actions to those that fit in a single page of
+ * memory, so there is no point in allocating more than that. */
+enum { MAX_ODP_ACTIONS = 4096 / sizeof(union odp_action) };
+
+/* A fixed-capacity list of datapath actions. */
+struct odp_actions {
+ /* Number of actions recorded. odp_actions_overflow() treats a value
+ * greater than MAX_ODP_ACTIONS as meaning the list overflowed. */
+ size_t n_actions;
+ /* Storage for the actions themselves. */
+ union odp_action actions[MAX_ODP_ACTIONS];
+};
+
+/* Makes 'actions' an empty list by setting its action count to zero. The
+ * array contents are not touched. */
+static inline void
+odp_actions_init(struct odp_actions *actions)
+{
+ actions->n_actions = 0;
+}
+
+union odp_action *odp_actions_add(struct odp_actions *actions, uint16_t type);
+
+/* Returns true if the action count in 'actions' is greater than
+ * MAX_ODP_ACTIONS, false otherwise. */
+static inline bool
+odp_actions_overflow(const struct odp_actions *actions)
+{
+ return actions->n_actions > MAX_ODP_ACTIONS;
+}
+
+/* Translates OpenFlow port number 'ofp_port' into a datapath port number.
+ * OFPP_LOCAL and OFPP_NONE map to ODPP_LOCAL and ODPP_NONE respectively;
+ * every other value is returned unchanged. */
+static inline uint16_t
+ofp_port_to_odp_port(uint16_t ofp_port)
+{
+    if (ofp_port == OFPP_LOCAL) {
+        return ODPP_LOCAL;
+    }
+    if (ofp_port == OFPP_NONE) {
+        return ODPP_NONE;
+    }
+    return ofp_port;
+}
+
+/* Translates datapath port number 'odp_port' into an OpenFlow port number.
+ * ODPP_LOCAL and ODPP_NONE map to OFPP_LOCAL and OFPP_NONE respectively;
+ * every other value is returned unchanged. */
+static inline uint16_t
+odp_port_to_ofp_port(uint16_t odp_port)
+{
+    if (odp_port == ODPP_LOCAL) {
+        return OFPP_LOCAL;
+    }
+    if (odp_port == ODPP_NONE) {
+        return OFPP_NONE;
+    }
+    return odp_port;
+}
+
+void format_odp_action(struct ds *, const union odp_action *);
+void format_odp_actions(struct ds *, const union odp_action *actions,
+ size_t n_actions);
+void format_odp_flow_stats(struct ds *, const struct odp_flow_stats *);
+void format_odp_flow(struct ds *, const struct odp_flow *);
+
+#endif /* odp-util.h */
diff --git a/lib/ofp-print.c b/lib/ofp-print.c
new file mode 100644
index 000000000..0c7980e84
--- /dev/null
+++ b/lib/ofp-print.c
@@ -0,0 +1,1473 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "ofp-print.h"
+#include "xtoxll.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <netinet/in.h>
+#include <sys/wait.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <ctype.h>
+
+#include "compiler.h"
+#include "dynamic-string.h"
+#include "flow.h"
+#include "ofpbuf.h"
+#include "openflow/openflow.h"
+#include "openflow/nicira-ext.h"
+#include "packets.h"
+#include "pcap.h"
+#include "util.h"
+
+static void ofp_print_port_name(struct ds *string, uint16_t port);
+static void ofp_print_match(struct ds *, const struct ofp_match *,
+ int verbosity);
+
+/* Returns a string that represents the contents of the Ethernet frame in the
+ * 'len' bytes starting at 'data', as output by tcpdump. 'total_len' is meant
+ * to specify the full length of the Ethernet frame (of which 'len' bytes were
+ * captured), but it is currently unused.
+ *
+ * If the temporary file cannot be created or tcpdump cannot be started, logs
+ * an error and returns a copy of the string "<error>" instead.
+ *
+ * The caller must free the returned string.
+ *
+ * This starts a tcpdump subprocess and waits for it to finish, so it's quite
+ * expensive. */
+char *
+ofp_packet_to_string(const void *data, size_t len, size_t total_len UNUSED)
+{
+ struct ds ds = DS_EMPTY_INITIALIZER;
+ struct ofpbuf buf;
+
+ char command[128];
+ FILE *pcap;
+ FILE *tcpdump;
+ int status;
+ int c;
+
+ /* Only the 'data' and 'size' members of 'buf' are set up here. */
+ buf.data = (void *) data;
+ buf.size = len;
+
+ /* Write the frame to an anonymous temporary file in pcap format. */
+ pcap = tmpfile();
+ if (!pcap) {
+ ovs_error(errno, "tmpfile");
+ return xstrdup("<error>");
+ }
+ pcap_write_header(pcap);
+ pcap_write(pcap, &buf);
+ fflush(pcap);
+ if (ferror(pcap)) {
+ /* A write error is only logged; tcpdump is still run on whatever was
+ * written. */
+ ovs_error(errno, "error writing temporary file");
+ }
+ rewind(pcap);
+
+ /* tcpdump reads the temporary file through its descriptor number, so the
+ * stream is closed only after popen() has started the child. */
+ snprintf(command, sizeof command, "/usr/sbin/tcpdump -e -n -r /dev/fd/%d 2>/dev/null",
+ fileno(pcap));
+ tcpdump = popen(command, "r");
+ fclose(pcap);
+ if (!tcpdump) {
+ ovs_error(errno, "exec(\"%s\")", command);
+ return xstrdup("<error>");
+ }
+
+ /* Collect all of tcpdump's standard output. */
+ while ((c = getc(tcpdump)) != EOF) {
+ ds_put_char(&ds, c);
+ }
+
+ /* Report abnormal termination, but still return what was collected. */
+ status = pclose(tcpdump);
+ if (WIFEXITED(status)) {
+ if (WEXITSTATUS(status))
+ ovs_error(0, "tcpdump exited with status %d", WEXITSTATUS(status));
+ } else if (WIFSIGNALED(status)) {
+ ovs_error(0, "tcpdump exited with signal %d", WTERMSIG(status));
+ }
+ return ds_cstr(&ds);
+}
+
+/* Pretty-prints the OFPT_PACKET_IN message of 'len' bytes at 'oh' to 'string'
+ * at the given 'verbosity' level.
+ *
+ * Always prints the total length, input port, reason, and buffer information.
+ * With 'verbosity' > 0 also prints the flow extracted from the packet data,
+ * and with 'verbosity' > 1 additionally prints the packet as decoded by
+ * ofp_packet_to_string(). */
+static void
+ofp_packet_in(struct ds *string, const void *oh, size_t len, int verbosity)
+{
+    const struct ofp_packet_in *op = oh;
+    size_t data_len;
+
+    ds_put_format(string, " total_len=%"PRIu16" in_port=",
+                  ntohs(op->total_len));
+    ofp_print_port_name(string, ntohs(op->in_port));
+
+    if (op->reason == OFPR_ACTION)
+        ds_put_cstr(string, " (via action)");
+    else if (op->reason != OFPR_NO_MATCH)
+        ds_put_format(string, " (***reason %"PRIu8"***)", op->reason);
+
+    /* Everything after the fixed header is packet data. */
+    data_len = len - offsetof(struct ofp_packet_in, data);
+    ds_put_format(string, " data_len=%zu", data_len);
+    /* 'buffer_id' is in network byte order, so convert with ntohl() (the
+     * original used htonl(), which only worked because the all-ones value is
+     * the same in either byte order). */
+    if (ntohl(op->buffer_id) == UINT32_MAX) {
+        ds_put_format(string, " (unbuffered)");
+        if (ntohs(op->total_len) != data_len)
+            ds_put_format(string, " (***total_len != data_len***)");
+    } else {
+        ds_put_format(string, " buffer=0x%08"PRIx32, ntohl(op->buffer_id));
+        if (ntohs(op->total_len) < data_len)
+            ds_put_format(string, " (***total_len < data_len***)");
+    }
+    ds_put_char(string, '\n');
+
+    if (verbosity > 0) {
+        flow_t flow;
+        struct ofpbuf packet;
+        struct ofp_match match;
+        packet.data = (void *) op->data;
+        packet.size = data_len;
+        flow_extract(&packet, ntohs(op->in_port), &flow);
+        flow_to_match(&flow, 0, &match);
+        ofp_print_match(string, &match, verbosity);
+        ds_put_char(string, '\n');
+    }
+    if (verbosity > 1) {
+        char *packet = ofp_packet_to_string(op->data, data_len,
+                                            ntohs(op->total_len));
+        ds_put_cstr(string, packet);
+        free(packet);
+    }
+}
+
+/* Appends to 'string' the symbolic name of OpenFlow port 'port' if it is one
+ * of the reserved ports handled below, otherwise its decimal number. */
+static void ofp_print_port_name(struct ds *string, uint16_t port)
+{
+    switch (port) {
+    case OFPP_IN_PORT:
+        ds_put_cstr(string, "IN_PORT");
+        return;
+    case OFPP_TABLE:
+        ds_put_cstr(string, "TABLE");
+        return;
+    case OFPP_NORMAL:
+        ds_put_cstr(string, "NORMAL");
+        return;
+    case OFPP_FLOOD:
+        ds_put_cstr(string, "FLOOD");
+        return;
+    case OFPP_ALL:
+        ds_put_cstr(string, "ALL");
+        return;
+    case OFPP_CONTROLLER:
+        ds_put_cstr(string, "CONTROLLER");
+        return;
+    case OFPP_LOCAL:
+        ds_put_cstr(string, "LOCAL");
+        return;
+    case OFPP_NONE:
+        ds_put_cstr(string, "NONE");
+        return;
+    default:
+        ds_put_format(string, "%"PRIu16, port);
+        return;
+    }
+}
+
+/* Appends to 'string' a description of the Nicira vendor action at 'nah'.
+ * NXAST_RESUBMIT is the only subtype decoded; any other subtype yields an
+ * "unknown Nicira action" marker. */
+static void
+ofp_print_nx_action(struct ds *string, const struct nx_action_header *nah)
+{
+    uint16_t subtype = ntohs(nah->subtype);
+
+    if (subtype == NXAST_RESUBMIT) {
+        const struct nx_action_resubmit *nar
+            = (struct nx_action_resubmit *) nah;
+
+        ds_put_format(string, "resubmit:");
+        ofp_print_port_name(string, ntohs(nar->in_port));
+    } else {
+        ds_put_format(string, "***unknown Nicira action:%d***\n", subtype);
+    }
+}
+
+/* Appends to 'string' a description of the single OpenFlow action at 'ah'.
+ * 'actions_len' is the number of bytes remaining in the action array starting
+ * at 'ah'.
+ *
+ * Returns the length of the action in bytes, which is at least the size of an
+ * action header and a multiple of 8, so that the caller can step to the next
+ * action. Returns -1, after appending an error marker, if the action is
+ * malformed. */
+static int
+ofp_print_action(struct ds *string, const struct ofp_action_header *ah,
+                 size_t actions_len)
+{
+    uint16_t type;
+    size_t len;
+
+    struct openflow_action {
+        size_t min_size;
+        size_t max_size;
+    };
+
+    /* Permitted length range for each standard action, indexed by type. */
+    static const struct openflow_action of_actions[] = {
+        [OFPAT_OUTPUT] = {
+            sizeof(struct ofp_action_output),
+            sizeof(struct ofp_action_output),
+        },
+        [OFPAT_SET_VLAN_VID] = {
+            sizeof(struct ofp_action_vlan_vid),
+            sizeof(struct ofp_action_vlan_vid),
+        },
+        [OFPAT_SET_VLAN_PCP] = {
+            sizeof(struct ofp_action_vlan_pcp),
+            sizeof(struct ofp_action_vlan_pcp),
+        },
+        [OFPAT_STRIP_VLAN] = {
+            sizeof(struct ofp_action_header),
+            sizeof(struct ofp_action_header),
+        },
+        [OFPAT_SET_DL_SRC] = {
+            sizeof(struct ofp_action_dl_addr),
+            sizeof(struct ofp_action_dl_addr),
+        },
+        [OFPAT_SET_DL_DST] = {
+            sizeof(struct ofp_action_dl_addr),
+            sizeof(struct ofp_action_dl_addr),
+        },
+        [OFPAT_SET_NW_SRC] = {
+            sizeof(struct ofp_action_nw_addr),
+            sizeof(struct ofp_action_nw_addr),
+        },
+        [OFPAT_SET_NW_DST] = {
+            sizeof(struct ofp_action_nw_addr),
+            sizeof(struct ofp_action_nw_addr),
+        },
+        [OFPAT_SET_TP_SRC] = {
+            sizeof(struct ofp_action_tp_port),
+            sizeof(struct ofp_action_tp_port),
+        },
+        [OFPAT_SET_TP_DST] = {
+            sizeof(struct ofp_action_tp_port),
+            sizeof(struct ofp_action_tp_port),
+        }
+        /* OFPAT_VENDOR is not here, since it would blow up the array size. */
+    };
+
+    if (actions_len < sizeof *ah) {
+        ds_put_format(string, "***action array too short for next action***\n");
+        return -1;
+    }
+
+    type = ntohs(ah->type);
+    len = ntohs(ah->len);
+    if (actions_len < len) {
+        ds_put_format(string, "***truncated action %"PRIu16"***\n", type);
+        return -1;
+    }
+
+    /* An action can never be shorter than its own header.  Without this
+     * check, an unknown action type with a length of 0 would fall through to
+     * the default case below and return 0, and a caller that advances by the
+     * returned length would never make progress. */
+    if (len < sizeof *ah) {
+        ds_put_format(string,
+                      "***action %"PRIu16" length %zu too short***\n",
+                      type, len);
+        return -1;
+    }
+
+    if ((len % 8) != 0) {
+        ds_put_format(string,
+                      "***action %"PRIu16" length not a multiple of 8***\n",
+                      type);
+        return -1;
+    }
+
+    if (type < ARRAY_SIZE(of_actions)) {
+        const struct openflow_action *act = &of_actions[type];
+        if ((len < act->min_size) || (len > act->max_size)) {
+            ds_put_format(string,
+                          "***action %"PRIu16" wrong length: %zu***\n",
+                          type, len);
+            return -1;
+        }
+    }
+
+    switch (type) {
+    case OFPAT_OUTPUT: {
+        struct ofp_action_output *oa = (struct ofp_action_output *)ah;
+        uint16_t port = ntohs(oa->port);
+        if (port < OFPP_MAX) {
+            ds_put_format(string, "output:%"PRIu16, port);
+        } else {
+            ofp_print_port_name(string, port);
+            if (port == OFPP_CONTROLLER) {
+                /* A 'max_len' of zero is shown as "all". */
+                if (oa->max_len) {
+                    ds_put_format(string, ":%"PRIu16, ntohs(oa->max_len));
+                } else {
+                    ds_put_cstr(string, ":all");
+                }
+            }
+        }
+        break;
+    }
+
+    case OFPAT_SET_VLAN_VID: {
+        struct ofp_action_vlan_vid *va = (struct ofp_action_vlan_vid *)ah;
+        ds_put_format(string, "mod_vlan_vid:%"PRIu16, ntohs(va->vlan_vid));
+        break;
+    }
+
+    case OFPAT_SET_VLAN_PCP: {
+        struct ofp_action_vlan_pcp *va = (struct ofp_action_vlan_pcp *)ah;
+        ds_put_format(string, "mod_vlan_pcp:%"PRIu8, va->vlan_pcp);
+        break;
+    }
+
+    case OFPAT_STRIP_VLAN:
+        ds_put_cstr(string, "strip_vlan");
+        break;
+
+    case OFPAT_SET_DL_SRC: {
+        struct ofp_action_dl_addr *da = (struct ofp_action_dl_addr *)ah;
+        ds_put_format(string, "mod_dl_src:"ETH_ADDR_FMT,
+                      ETH_ADDR_ARGS(da->dl_addr));
+        break;
+    }
+
+    case OFPAT_SET_DL_DST: {
+        struct ofp_action_dl_addr *da = (struct ofp_action_dl_addr *)ah;
+        ds_put_format(string, "mod_dl_dst:"ETH_ADDR_FMT,
+                      ETH_ADDR_ARGS(da->dl_addr));
+        break;
+    }
+
+    case OFPAT_SET_NW_SRC: {
+        struct ofp_action_nw_addr *na = (struct ofp_action_nw_addr *)ah;
+        ds_put_format(string, "mod_nw_src:"IP_FMT, IP_ARGS(&na->nw_addr));
+        break;
+    }
+
+    case OFPAT_SET_NW_DST: {
+        struct ofp_action_nw_addr *na = (struct ofp_action_nw_addr *)ah;
+        ds_put_format(string, "mod_nw_dst:"IP_FMT, IP_ARGS(&na->nw_addr));
+        break;
+    }
+
+    case OFPAT_SET_TP_SRC: {
+        struct ofp_action_tp_port *ta = (struct ofp_action_tp_port *)ah;
+        ds_put_format(string, "mod_tp_src:%d", ntohs(ta->tp_port));
+        break;
+    }
+
+    case OFPAT_SET_TP_DST: {
+        struct ofp_action_tp_port *ta = (struct ofp_action_tp_port *)ah;
+        ds_put_format(string, "mod_tp_dst:%d", ntohs(ta->tp_port));
+        break;
+    }
+
+    case OFPAT_VENDOR: {
+        struct ofp_action_vendor_header *avh
+            = (struct ofp_action_vendor_header *)ah;
+        if (len < sizeof *avh) {
+            ds_put_format(string, "***ofpat_vendor truncated***\n");
+            return -1;
+        }
+        if (avh->vendor == htonl(NX_VENDOR_ID)) {
+            ofp_print_nx_action(string, (struct nx_action_header *)avh);
+        } else {
+            ds_put_format(string, "vendor action:0x%x", ntohl(avh->vendor));
+        }
+        break;
+    }
+
+    default:
+        ds_put_format(string, "(decoder %"PRIu16" not implemented)", type);
+        break;
+    }
+
+    return len;
+}
+
+/* Appends to 'string' the text "actions=" followed by a comma-separated
+ * description of the 'actions_len' bytes of OpenFlow actions starting at
+ * 'action', or "drop" if 'actions_len' is zero.
+ *
+ * Printing stops at the first action that ofp_print_action() reports as
+ * malformed. */
+static void
+ofp_print_actions(struct ds *string, const struct ofp_action_header *action,
+                  size_t actions_len)
+{
+    const uint8_t *p = (const uint8_t *) action;
+    int len = 0;
+
+    ds_put_cstr(string, "actions=");
+    if (!actions_len) {
+        ds_put_cstr(string, "drop");
+    }
+    while (actions_len > 0) {
+        /* 'len' is nonzero once at least one action has been printed. */
+        if (len) {
+            ds_put_cstr(string, ",");
+        }
+        len = ofp_print_action(string, (const struct ofp_action_header *) p,
+                               actions_len);
+        /* Stop on error (negative) and also on a zero length: advancing by
+         * zero bytes would otherwise repeat the same action forever. */
+        if (len <= 0) {
+            return;
+        }
+        p += len;
+        actions_len -= len;
+    }
+}
+
+/* Pretty-prints the OFPT_PACKET_OUT message of 'len' bytes at 'oh' to
+ * 'string' at the given 'verbosity' level.
+ *
+ * Prints the input port and the action list.  For an unbuffered packet
+ * (buffer_id of all-ones) also prints the length of the trailing packet data
+ * and, with 'verbosity' > 0, the packet as decoded by
+ * ofp_packet_to_string(); otherwise prints the buffer id. */
+static void ofp_packet_out(struct ds *string, const void *oh, size_t len,
+                           int verbosity)
+{
+    const struct ofp_packet_out *opo = oh;
+    size_t actions_len = ntohs(opo->actions_len);
+
+    ds_put_cstr(string, " in_port=");
+    ofp_print_port_name(string, ntohs(opo->in_port));
+
+    ds_put_format(string, " actions_len=%zu ", actions_len);
+    /* The actions must fit both in the length claimed by the header and in
+     * the 'len' bytes actually supplied.  The two can differ, and checking
+     * only the header would let the action printer and the 'data_len'
+     * computation below run past the end of the supplied bytes. */
+    if (len < sizeof *opo
+        || actions_len > len - sizeof *opo
+        || actions_len > (ntohs(opo->header.length) - sizeof *opo)) {
+        ds_put_format(string, "***packet too short for action length***\n");
+        return;
+    }
+    ofp_print_actions(string, opo->actions, actions_len);
+
+    if (ntohl(opo->buffer_id) == UINT32_MAX) {
+        /* Non-negative thanks to the bounds check above. */
+        int data_len = len - sizeof *opo - actions_len;
+        ds_put_format(string, " data_len=%d", data_len);
+        if (verbosity > 0 && len > sizeof *opo) {
+            char *packet = ofp_packet_to_string(
+                (uint8_t *)opo->actions + actions_len, data_len, data_len);
+            ds_put_char(string, '\n');
+            ds_put_cstr(string, packet);
+            free(packet);
+        }
+    } else {
+        ds_put_format(string, " buffer=0x%08"PRIx32, ntohl(opo->buffer_id));
+    }
+    ds_put_char(string, '\n');
+}
+
+/* qsort() comparison function that orders two struct ofp_phy_port elements
+ * by ascending port number (converted from network byte order). */
+static int
+compare_ports(const void *a_, const void *b_)
+{
+    const struct ofp_phy_port *lhs = a_;
+    const struct ofp_phy_port *rhs = b_;
+    uint16_t lhs_no = ntohs(lhs->port_no);
+    uint16_t rhs_no = ntohs(rhs->port_no);
+
+    /* Yields -1, 0, or 1. */
+    return (lhs_no > rhs_no) - (lhs_no < rhs_no);
+}
+
+/* Appends to 'string' the names of the OFPPF_* bits set in 'features' (host
+ * byte order), each followed by a space, and then a new-line.  If no bits at
+ * all are set, appends "Unsupported" and a new-line instead. */
+static void ofp_print_port_features(struct ds *string, uint32_t features)
+{
+    /* Feature bits in the order in which they are printed. */
+    static const struct {
+        uint32_t bit;
+        const char *label;
+    } feature_names[] = {
+        { OFPPF_10MB_HD, "10MB-HD " },
+        { OFPPF_10MB_FD, "10MB-FD " },
+        { OFPPF_100MB_HD, "100MB-HD " },
+        { OFPPF_100MB_FD, "100MB-FD " },
+        { OFPPF_1GB_HD, "1GB-HD " },
+        { OFPPF_1GB_FD, "1GB-FD " },
+        { OFPPF_10GB_FD, "10GB-FD " },
+        { OFPPF_COPPER, "COPPER " },
+        { OFPPF_FIBER, "FIBER " },
+        { OFPPF_AUTONEG, "AUTO_NEG " },
+        { OFPPF_PAUSE, "AUTO_PAUSE " },
+        { OFPPF_PAUSE_ASYM, "AUTO_PAUSE_ASYM " },
+    };
+    size_t i;
+
+    if (!features) {
+        ds_put_cstr(string, "Unsupported\n");
+        return;
+    }
+
+    for (i = 0; i < ARRAY_SIZE(feature_names); i++) {
+        if (features & feature_names[i].bit) {
+            ds_put_cstr(string, feature_names[i].label);
+        }
+    }
+    ds_put_char(string, '\n');
+}
+
+/* Appends to 'string' a description of physical port 'port': its number or
+ * reserved name, its name string, hardware address, config and state words,
+ * and then one line for each of the current, advertised, supported and peer
+ * feature sets that is nonzero. */
+static void
+ofp_print_phy_port(struct ds *string, const struct ofp_phy_port *port)
+{
+    uint8_t name[OFP_MAX_PORT_NAME_LEN];
+    size_t n;
+
+    /* Copy the name and cut it off at the first non-printable byte, or at
+     * the last byte of the buffer, whichever comes first. */
+    memcpy(name, port->name, sizeof name);
+    n = 0;
+    while (n < sizeof name - 1 && isprint(name[n])) {
+        n++;
+    }
+    name[n] = '\0';
+
+    ds_put_char(string, ' ');
+    ofp_print_port_name(string, ntohs(port->port_no));
+    ds_put_format(string, "(%s): addr:"ETH_ADDR_FMT", config: %#x, state:%#x\n",
+                  name, ETH_ADDR_ARGS(port->hw_addr), ntohl(port->config),
+                  ntohl(port->state));
+
+    if (port->curr != 0) {
+        ds_put_format(string, " current: ");
+        ofp_print_port_features(string, ntohl(port->curr));
+    }
+    if (port->advertised != 0) {
+        ds_put_format(string, " advertised: ");
+        ofp_print_port_features(string, ntohl(port->advertised));
+    }
+    if (port->supported != 0) {
+        ds_put_format(string, " supported: ");
+        ofp_print_port_features(string, ntohl(port->supported));
+    }
+    if (port->peer != 0) {
+        ds_put_format(string, " peer: ");
+        ofp_print_port_features(string, ntohl(port->peer));
+    }
+}
+
+/* Pretty-prints the struct ofp_switch_features of 'len' bytes at 'oh' to
+ * 'string'.  'verbosity' is not used.
+ *
+ * After the switch-wide fields, prints every port that follows the fixed
+ * part of the message, sorted by port number. */
+static void
+ofp_print_switch_features(struct ds *string, const void *oh, size_t len,
+                          int verbosity UNUSED)
+{
+    const struct ofp_switch_features *osf = oh;
+    struct ofp_phy_port *sorted_ports;
+    size_t ports_bytes;
+    int n_ports;
+    int idx;
+
+    ds_put_format(string, " ver:0x%x, dpid:%"PRIx64"\n",
+                  osf->header.version, ntohll(osf->datapath_id));
+    ds_put_format(string, "n_tables:%d, n_buffers:%d\n", osf->n_tables,
+                  ntohl(osf->n_buffers));
+    ds_put_format(string, "features: capabilities:%#x, actions:%#x\n",
+                  ntohl(osf->capabilities), ntohl(osf->actions));
+
+    /* Honor the length in the header when it is plausible and smaller than
+     * the number of bytes supplied. */
+    if (ntohs(osf->header.length) >= sizeof *osf) {
+        len = MIN(len, ntohs(osf->header.length));
+    }
+    ports_bytes = len - sizeof *osf;
+    n_ports = ports_bytes / sizeof *osf->ports;
+
+    /* Sort a private copy so that the message itself is not modified. */
+    sorted_ports = xmemdup(osf->ports, ports_bytes);
+    qsort(sorted_ports, n_ports, sizeof *sorted_ports, compare_ports);
+    for (idx = 0; idx < n_ports; idx++) {
+        ofp_print_phy_port(string, &sorted_ports[idx]);
+    }
+    free(sorted_ports);
+}
+
+/* Pretty-prints the struct ofp_switch_config at 'oh' to 'string'.  'len' and
+ * 'verbosity' are not used.
+ *
+ * Notes whether flow expirations are being sent, flags any other flag bits
+ * as unknown, and prints miss_send_len. */
+static void
+ofp_print_switch_config(struct ds *string, const void *oh, size_t len UNUSED,
+                        int verbosity UNUSED)
+{
+    const struct ofp_switch_config *osc = oh;
+    uint16_t flags = ntohs(osc->flags);
+    uint16_t unknown = flags & ~OFPC_SEND_FLOW_EXP;
+
+    if (flags & OFPC_SEND_FLOW_EXP) {
+        ds_put_format(string, " (sending flow expirations)");
+    }
+    if (unknown) {
+        ds_put_format(string, " ***unknown flags 0x%04"PRIx16"***", unknown);
+    }
+
+    ds_put_format(string, " miss_send_len=%"PRIu16"\n", ntohs(osc->miss_send_len));
+}
+
+static void print_wild(struct ds *string, const char *leader, int is_wild,
+                       int verbosity, const char *format, ...)
+    __attribute__((format(printf, 5, 6)));
+
+/* Appends one match field to 'string'.
+ *
+ * If 'is_wild' is zero, appends 'leader', then the printf-style 'format'
+ * expanded with the remaining arguments, then a comma.  If 'is_wild' is
+ * nonzero, appends 'leader', "*" and a comma when 'verbosity' is at least 2,
+ * and nothing at all otherwise. */
+static void print_wild(struct ds *string, const char *leader, int is_wild,
+                       int verbosity, const char *format, ...)
+{
+    if (is_wild) {
+        if (verbosity < 2) {
+            return;
+        }
+        ds_put_cstr(string, leader);
+        ds_put_char(string, '*');
+    } else {
+        va_list ap;
+
+        ds_put_cstr(string, leader);
+        va_start(ap, format);
+        ds_put_format_valist(string, format, ap);
+        va_end(ap);
+    }
+    ds_put_char(string, ',');
+}
+
+/* Appends an IP address match field to 'string'.
+ *
+ * 'wild_bits' is the number of wildcarded low-order bits.  When it is below
+ * 32, appends 'leader', the address 'ip', a "/N" suffix with N = 32 -
+ * 'wild_bits' if any bits are wildcarded, and a comma.  When it is 32 or
+ * more, appends 'leader', "*" and a comma if 'verbosity' is at least 2, and
+ * nothing otherwise. */
+static void
+print_ip_netmask(struct ds *string, const char *leader, uint32_t ip,
+                 uint32_t wild_bits, int verbosity)
+{
+    if (wild_bits >= 32) {
+        if (verbosity < 2) {
+            return;
+        }
+        ds_put_cstr(string, leader);
+        ds_put_char(string, '*');
+    } else {
+        ds_put_cstr(string, leader);
+        ds_put_format(string, IP_FMT, IP_ARGS(&ip));
+        if (wild_bits != 0) {
+            ds_put_format(string, "/%d", 32 - wild_bits);
+        }
+    }
+    ds_put_char(string, ',');
+}
+
+/* Appends to 'f' the description of match 'om' produced by
+ * ofp_match_to_string() at the given 'verbosity', and frees the intermediate
+ * string. */
+static void
+ofp_print_match(struct ds *f, const struct ofp_match *om, int verbosity)
+{
+ char *s = ofp_match_to_string(om, verbosity);
+ ds_put_cstr(f, s);
+ free(s);
+}
+
+/* Returns a string describing OpenFlow match 'om' as a sequence of
+ * comma-terminated "field=value" items.  ofp_print_match() releases the
+ * result with free(), so the caller owns the returned string.
+ *
+ * A match on a known Ethernet type and IP protocol is abbreviated with a
+ * leading keyword ("ip", "icmp", "tcp", "udp" or "arp"), in which case the
+ * corresponding dl_type and/or nw_proto item is left out.  Wildcarded fields
+ * are printed (as "*") only when 'verbosity' is at least 2; see print_wild()
+ * and print_ip_netmask(). */
+char *
+ofp_match_to_string(const struct ofp_match *om, int verbosity)
+{
+ struct ds f = DS_EMPTY_INITIALIZER;
+ uint32_t w = ntohl(om->wildcards);
+ bool skip_type = false; /* Omit the dl_type= item? */
+ bool skip_proto = false; /* Omit the nw_proto= item? */
+
+ /* Choose a keyword prefix when dl_type (and possibly nw_proto) is matched
+ * exactly and has a well-known value. */
+ if (!(w & OFPFW_DL_TYPE)) {
+ skip_type = true;
+ if (om->dl_type == htons(ETH_TYPE_IP)) {
+ if (!(w & OFPFW_NW_PROTO)) {
+ skip_proto = true;
+ if (om->nw_proto == IP_TYPE_ICMP) {
+ ds_put_cstr(&f, "icmp,");
+ } else if (om->nw_proto == IP_TYPE_TCP) {
+ ds_put_cstr(&f, "tcp,");
+ } else if (om->nw_proto == IP_TYPE_UDP) {
+ ds_put_cstr(&f, "udp,");
+ } else {
+ /* Unrecognized IP protocol: still print nw_proto=. */
+ ds_put_cstr(&f, "ip,");
+ skip_proto = false;
+ }
+ } else {
+ ds_put_cstr(&f, "ip,");
+ }
+ } else if (om->dl_type == htons(ETH_TYPE_ARP)) {
+ ds_put_cstr(&f, "arp,");
+ } else {
+ /* Unrecognized Ethernet type: still print dl_type=. */
+ skip_type = false;
+ }
+ }
+ print_wild(&f, "in_port=", w & OFPFW_IN_PORT, verbosity,
+ "%d", ntohs(om->in_port));
+ print_wild(&f, "dl_vlan=", w & OFPFW_DL_VLAN, verbosity,
+ "0x%04x", ntohs(om->dl_vlan));
+ print_wild(&f, "dl_src=", w & OFPFW_DL_SRC, verbosity,
+ ETH_ADDR_FMT, ETH_ADDR_ARGS(om->dl_src));
+ print_wild(&f, "dl_dst=", w & OFPFW_DL_DST, verbosity,
+ ETH_ADDR_FMT, ETH_ADDR_ARGS(om->dl_dst));
+ if (!skip_type) {
+ print_wild(&f, "dl_type=", w & OFPFW_DL_TYPE, verbosity,
+ "0x%04x", ntohs(om->dl_type));
+ }
+ /* The wildcard word carries a bit count for each of nw_src and nw_dst. */
+ print_ip_netmask(&f, "nw_src=", om->nw_src,
+ (w & OFPFW_NW_SRC_MASK) >> OFPFW_NW_SRC_SHIFT, verbosity);
+ print_ip_netmask(&f, "nw_dst=", om->nw_dst,
+ (w & OFPFW_NW_DST_MASK) >> OFPFW_NW_DST_SHIFT, verbosity);
+ if (!skip_proto) {
+ print_wild(&f, "nw_proto=", w & OFPFW_NW_PROTO, verbosity,
+ "%u", om->nw_proto);
+ }
+ /* NOTE(review): this test looks only at nw_proto; it does not check that
+ * dl_type is IP or that nw_proto is not wildcarded -- confirm that this
+ * is intended. */
+ if (om->nw_proto == IP_TYPE_ICMP) {
+ print_wild(&f, "icmp_type=", w & OFPFW_ICMP_TYPE, verbosity,
+ "%d", ntohs(om->icmp_type));
+ print_wild(&f, "icmp_code=", w & OFPFW_ICMP_CODE, verbosity,
+ "%d", ntohs(om->icmp_code));
+ } else {
+ print_wild(&f, "tp_src=", w & OFPFW_TP_SRC, verbosity,
+ "%d", ntohs(om->tp_src));
+ print_wild(&f, "tp_dst=", w & OFPFW_TP_DST, verbosity,
+ "%d", ntohs(om->tp_dst));
+ }
+ return ds_cstr(&f);
+}
+
+/* Pretty-prints the OFPT_FLOW_MOD message of 'len' bytes at 'oh' to 'string'
+ * at the given 'verbosity' level: the match, the command, the timeouts,
+ * priority and buffer id, and finally the action list. */
+static void
+ofp_print_flow_mod(struct ds *string, const void *oh, size_t len,
+                   int verbosity)
+{
+    const struct ofp_flow_mod *ofm = oh;
+    uint16_t command = ntohs(ofm->command);
+    const char *verb = NULL;
+
+    ofp_print_match(string, &ofm->match, verbosity);
+
+    switch (command) {
+    case OFPFC_ADD:
+        verb = " ADD: ";
+        break;
+    case OFPFC_MODIFY:
+        verb = " MOD: ";
+        break;
+    case OFPFC_MODIFY_STRICT:
+        verb = " MOD_STRICT: ";
+        break;
+    case OFPFC_DELETE:
+        verb = " DEL: ";
+        break;
+    case OFPFC_DELETE_STRICT:
+        verb = " DEL_STRICT: ";
+        break;
+    }
+    if (verb) {
+        ds_put_cstr(string, verb);
+    } else {
+        /* Unrecognized command: show its numeric value. */
+        ds_put_format(string, " cmd:%d ", command);
+    }
+
+    /* When the wildcards word is zero, the priority is shown as the largest
+     * 16-bit value instead of the value carried in the message. */
+    ds_put_format(string, "idle:%d hard:%d pri:%d buf:%#x",
+                  ntohs(ofm->idle_timeout), ntohs(ofm->hard_timeout),
+                  ofm->match.wildcards ? ntohs(ofm->priority) : (uint16_t)-1,
+                  ntohl(ofm->buffer_id));
+    ofp_print_actions(string, ofm->actions,
+                      len - offsetof(struct ofp_flow_mod, actions));
+    ds_put_char(string, '\n');
+}
+
+/* Pretty-prints the OFPT_FLOW_EXPIRED message at 'oh' to 'string' at the
+ * given 'verbosity' level: the match, the expiration reason, and then the
+ * priority, duration, packet count and byte count.  'len' is not used. */
+static void
+ofp_print_flow_expired(struct ds *string, const void *oh, size_t len UNUSED,
+                       int verbosity)
+{
+    const struct ofp_flow_expired *ofe = oh;
+
+    ofp_print_match(string, &ofe->match, verbosity);
+    ds_put_cstr(string, " reason=");
+    if (ofe->reason == OFPER_IDLE_TIMEOUT) {
+        ds_put_cstr(string, "idle");
+    } else if (ofe->reason == OFPER_HARD_TIMEOUT) {
+        ds_put_cstr(string, "hard");
+    } else {
+        ds_put_format(string, "**%"PRIu8"**", ofe->reason);
+    }
+
+    /* When the wildcards word is zero, the priority is shown as the largest
+     * 16-bit value instead of the value carried in the message. */
+    ds_put_format(string,
+                  " pri%"PRIu16" secs%"PRIu32" pkts%"PRIu64" bytes%"PRIu64"\n",
+                  ofe->match.wildcards ? ntohs(ofe->priority) : (uint16_t)-1,
+                  ntohl(ofe->duration), ntohll(ofe->packet_count),
+                  ntohll(ofe->byte_count));
+}
+
+/* Pretty-prints the struct ofp_port_mod at 'oh' to 'string': the port number,
+ * hardware address, config and mask words, and the advertised features (or
+ * "UNCHANGED" when the advertise word is zero).  'len' and 'verbosity' are
+ * not used. */
+static void
+ofp_print_port_mod(struct ds *string, const void *oh, size_t len UNUSED,
+                   int verbosity UNUSED)
+{
+    const struct ofp_port_mod *opm = oh;
+
+    ds_put_format(string, "port: %d: addr:"ETH_ADDR_FMT", config: %#x, mask:%#x\n",
+                  ntohs(opm->port_no), ETH_ADDR_ARGS(opm->hw_addr),
+                  ntohl(opm->config), ntohl(opm->mask));
+    ds_put_format(string, " advertise: ");
+    if (!opm->advertise) {
+        ds_put_format(string, "UNCHANGED\n");
+        return;
+    }
+    ofp_print_port_features(string, ntohl(opm->advertise));
+}
+
+/* One entry in the table that maps OpenFlow error types and codes to their
+ * symbolic names. */
+struct error_type {
+ int type; /* Error type value. */
+ int code; /* Error code within 'type', or -1 if this entry names the
+ * type itself (see lookup_error_type()). */
+ const char *name; /* Symbolic name, as a string. */
+};
+
+static const struct error_type error_types[] = {
+#define ERROR_TYPE(TYPE) {TYPE, -1, #TYPE}
+#define ERROR_CODE(TYPE, CODE) {TYPE, CODE, #CODE}
+ ERROR_TYPE(OFPET_HELLO_FAILED),
+ ERROR_CODE(OFPET_HELLO_FAILED, OFPHFC_INCOMPATIBLE),
+
+ ERROR_TYPE(OFPET_BAD_REQUEST),
+ ERROR_CODE(OFPET_BAD_REQUEST, OFPBRC_BAD_VERSION),
+ ERROR_CODE(OFPET_BAD_REQUEST, OFPBRC_BAD_TYPE),
+ ERROR_CODE(OFPET_BAD_REQUEST, OFPBRC_BAD_STAT),
+ ERROR_CODE(OFPET_BAD_REQUEST, OFPBRC_BAD_VERSION),
+
+ ERROR_TYPE(OFPET_BAD_ACTION),
+ ERROR_CODE(OFPET_BAD_ACTION, OFPBAC_BAD_TYPE),
+ ERROR_CODE(OFPET_BAD_ACTION, OFPBAC_BAD_LEN),
+ ERROR_CODE(OFPET_BAD_ACTION, OFPBAC_BAD_VENDOR),
+ ERROR_CODE(OFPET_BAD_ACTION, OFPBAC_BAD_VENDOR_TYPE),
+ ERROR_CODE(OFPET_BAD_ACTION, OFPBAC_BAD_OUT_PORT),
+
+ ERROR_TYPE(OFPET_FLOW_MOD_FAILED),
+ ERROR_CODE(OFPET_FLOW_MOD_FAILED, OFPFMFC_ALL_TABLES_FULL)
+};
+#define N_ERROR_TYPES ARRAY_SIZE(error_types)
+
+static const char *
+lookup_error_type(int type)
+{
+ const struct error_type *t;
+
+ for (t = error_types; t < &error_types[N_ERROR_TYPES]; t++) {
+ if (t->type == type && t->code == -1) {
+ return t->name;
+ }
+ }
+ return "?";
+}
+
+static const char *
+lookup_error_code(int type, int code)
+{
+ const struct error_type *t;
+
+ for (t = error_types; t < &error_types[N_ERROR_TYPES]; t++) {
+ if (t->type == type && t->code == code) {
+ return t->name;
+ }
+ }
+ return "?";
+}
+
+/* Pretty-prints the OFPT_ERROR message of 'len' bytes at 'oh' to 'string'.
+ * 'verbosity' is not used.
+ *
+ * Prints the numeric and symbolic error type and code, then the payload:
+ * as printable text for OFPET_HELLO_FAILED, as a nested OpenFlow message for
+ * OFPET_BAD_REQUEST, and as a hex dump for anything else. */
+static void
+ofp_print_error_msg(struct ds *string, const void *oh, size_t len,
+                    int verbosity UNUSED)
+{
+    const struct ofp_error_msg *oem = oh;
+    size_t payload_len = len - sizeof *oem;
+    int type = ntohs(oem->type);
+    int code = ntohs(oem->code);
+
+    ds_put_format(string, " type%d(%s) code%d(%s) payload:\n",
+                  type, lookup_error_type(type),
+                  code, lookup_error_code(type, code));
+
+    if (type == OFPET_HELLO_FAILED) {
+        ds_put_printable(string, (char *) oem->data, payload_len);
+    } else if (type == OFPET_BAD_REQUEST) {
+        char *nested = ofp_to_string(oem->data, payload_len, 1);
+
+        ds_put_cstr(string, nested);
+        free(nested);
+    } else {
+        ds_put_hex_dump(string, oem->data, payload_len, 0, true);
+    }
+}
+
+/* Pretty-prints the OFPT_PORT_STATUS message at 'oh' to 'string': a prefix
+ * for the add, delete or modify reason (nothing for any other reason)
+ * followed by the port description.  'len' and 'verbosity' are not used. */
+static void
+ofp_print_port_status(struct ds *string, const void *oh, size_t len UNUSED,
+                      int verbosity UNUSED)
+{
+    const struct ofp_port_status *ops = oh;
+
+    switch (ops->reason) {
+    case OFPPR_ADD:
+        ds_put_format(string, " ADD:");
+        break;
+    case OFPPR_DELETE:
+        ds_put_format(string, " DEL:");
+        break;
+    case OFPPR_MODIFY:
+        ds_put_format(string, " MOD:");
+        break;
+    default:
+        break;
+    }
+
+    ofp_print_phy_port(string, &ops->desc);
+}
+
+/* Prints the body of an OFPST_DESC stats reply, 'len' bytes at 'body', to
+ * 'string': the manufacturer, hardware, software and serial number strings,
+ * one per line.  'verbosity' is not used.
+ *
+ * A body shorter than struct ofp_desc_stats is reported as an error rather
+ * than read.  Each string is printed with a precision equal to the size of
+ * its field, so a field that lacks a terminating null byte cannot cause a
+ * read past its end. */
+static void
+ofp_desc_stats_reply(struct ds *string, const void *body, size_t len,
+                     int verbosity UNUSED)
+{
+    const struct ofp_desc_stats *ods = body;
+
+    if (len < sizeof *ods) {
+        ds_put_format(string, " ***length=%zu shorter than minimum %zu***",
+                      len, sizeof *ods);
+        return;
+    }
+
+    ds_put_format(string, "Manufacturer: %.*s\n",
+                  (int) sizeof ods->mfr_desc, ods->mfr_desc);
+    ds_put_format(string, "Hardware: %.*s\n",
+                  (int) sizeof ods->hw_desc, ods->hw_desc);
+    ds_put_format(string, "Software: %.*s\n",
+                  (int) sizeof ods->sw_desc, ods->sw_desc);
+    ds_put_format(string, "Serial Num: %.*s\n",
+                  (int) sizeof ods->serial_num, ods->serial_num);
+}
+
+/* Prints the body of an OFPST_FLOW stats request at 'oh' to 'string': the
+ * table id ("any" for 0xff) followed by the match at the given 'verbosity'.
+ * 'len' is not used. */
+static void
+ofp_flow_stats_request(struct ds *string, const void *oh, size_t len UNUSED,
+                       int verbosity)
+{
+    const struct ofp_flow_stats_request *fsr = oh;
+
+    if (fsr->table_id != 0xff) {
+        ds_put_format(string, " table_id=%"PRIu8", ", fsr->table_id);
+    } else {
+        ds_put_format(string, " table_id=any, ");
+    }
+
+    ofp_print_match(string, &fsr->match, verbosity);
+}
+
+/* Prints the body of an OFPST_FLOW stats reply, 'len' bytes at 'body_', to
+ * 'string'.  The body is a sequence of variable-length struct ofp_flow_stats
+ * records, each of which is printed on its own line with its match (at the
+ * given 'verbosity') and actions.
+ *
+ * Stops, after appending an error marker, at the first record whose length
+ * field is inconsistent or when leftover bytes too short to hold a record
+ * remain. */
+static void
+ofp_flow_stats_reply(struct ds *string, const void *body_, size_t len,
+ int verbosity)
+{
+ const char *body = body_;
+ const char *pos = body; /* Start of the record being examined. */
+ for (;;) {
+ const struct ofp_flow_stats *fs;
+ ptrdiff_t bytes_left = body + len - pos;
+ size_t length;
+
+ /* Done when fewer bytes remain than the fixed part of a record. */
+ if (bytes_left < sizeof *fs) {
+ if (bytes_left != 0) {
+ ds_put_format(string, " ***%td leftover bytes at end***",
+ bytes_left);
+ }
+ break;
+ }
+
+ /* Validate the record's own length before trusting it. */
+ fs = (const void *) pos;
+ length = ntohs(fs->length);
+ if (length < sizeof *fs) {
+ ds_put_format(string, " ***length=%zu shorter than minimum %zu***",
+ length, sizeof *fs);
+ break;
+ } else if (length > bytes_left) {
+ ds_put_format(string,
+ " ***length=%zu but only %td bytes left***",
+ length, bytes_left);
+ break;
+ } else if ((length - sizeof *fs) % sizeof fs->actions[0]) {
+ ds_put_format(string,
+ " ***length=%zu has %zu bytes leftover in "
+ "final action***",
+ length,
+ (length - sizeof *fs) % sizeof fs->actions[0]);
+ break;
+ }
+
+ ds_put_format(string, " duration=%"PRIu32"s, ", ntohl(fs->duration));
+ ds_put_format(string, "table_id=%"PRIu8", ", fs->table_id);
+ /* When the wildcards word is zero, the priority is shown as the
+ * largest 16-bit value instead of the value in the record. */
+ ds_put_format(string, "priority=%"PRIu16", ",
+ fs->match.wildcards ? ntohs(fs->priority) : (uint16_t)-1);
+ ds_put_format(string, "n_packets=%"PRIu64", ",
+ ntohll(fs->packet_count));
+ ds_put_format(string, "n_bytes=%"PRIu64", ", ntohll(fs->byte_count));
+ /* Timeouts equal to OFP_FLOW_PERMANENT are not printed. */
+ if (fs->idle_timeout != htons(OFP_FLOW_PERMANENT)) {
+ ds_put_format(string, "idle_timeout=%"PRIu16",",
+ ntohs(fs->idle_timeout));
+ }
+ if (fs->hard_timeout != htons(OFP_FLOW_PERMANENT)) {
+ ds_put_format(string, "hard_timeout=%"PRIu16",",
+ ntohs(fs->hard_timeout));
+ }
+ ofp_print_match(string, &fs->match, verbosity);
+ /* Whatever follows the fixed part of the record is its actions. */
+ ofp_print_actions(string, fs->actions, length - sizeof *fs);
+ ds_put_char(string, '\n');
+
+ pos += length;
+ }
+}
+
+/* Prints the body of an OFPST_AGGREGATE stats request at 'oh' to 'string':
+ * the table id ("any" for 0xff) followed by the match at the given
+ * 'verbosity'.  'len' is not used. */
+static void
+ofp_aggregate_stats_request(struct ds *string, const void *oh,
+                            size_t len UNUSED, int verbosity)
+{
+    const struct ofp_aggregate_stats_request *asr = oh;
+
+    if (asr->table_id != 0xff) {
+        ds_put_format(string, " table_id=%"PRIu8", ", asr->table_id);
+    } else {
+        ds_put_format(string, " table_id=any, ");
+    }
+
+    ofp_print_match(string, &asr->match, verbosity);
+}
+
+/* Prints the body of an OFPST_AGGREGATE stats reply at 'body_' to 'string':
+ * the packet, byte and flow counts, converted from network byte order.
+ * 'len' and 'verbosity' are not used. */
+static void
+ofp_aggregate_stats_reply(struct ds *string, const void *body_,
+ size_t len UNUSED, int verbosity UNUSED)
+{
+ const struct ofp_aggregate_stats_reply *asr = body_;
+
+ ds_put_format(string, " packet_count=%"PRIu64, ntohll(asr->packet_count));
+ ds_put_format(string, " byte_count=%"PRIu64, ntohll(asr->byte_count));
+ ds_put_format(string, " flow_count=%"PRIu32, ntohl(asr->flow_count));
+}
+
+/* Appends to 'string' the text 'leader' followed by the counter 'stat' in
+ * decimal, or by "?" if 'stat' has all bits set.  The item is terminated by
+ * ", " if 'more' is nonzero and by a new-line otherwise. */
+static void print_port_stat(struct ds *string, const char *leader,
+                            uint64_t stat, int more)
+{
+    ds_put_cstr(string, leader);
+    if (stat == UINT64_MAX) {
+        ds_put_char(string, '?');
+    } else {
+        ds_put_format(string, "%"PRIu64, stat);
+    }
+    ds_put_cstr(string, more ? ", " : "\n");
+}
+
+/* Prints the body of an OFPST_PORT stats reply, 'len' bytes at 'body', to
+ * 'string'.  Always prints the number of whole struct ofp_port_stats records
+ * in the body; with 'verbosity' of at least 1 also prints the receive and
+ * transmit counters of each port. */
+static void
+ofp_port_stats_reply(struct ds *string, const void *body, size_t len,
+                     int verbosity)
+{
+    const struct ofp_port_stats *ports = body;
+    size_t n_ports = len / sizeof *ports;
+    size_t i;
+
+    ds_put_format(string, " %zu ports\n", n_ports);
+    if (verbosity < 1) {
+        return;
+    }
+
+    for (i = 0; i < n_ports; i++) {
+        const struct ofp_port_stats *ps = &ports[i];
+
+        ds_put_format(string, " port %2"PRIu16": ", ntohs(ps->port_no));
+
+        ds_put_cstr(string, "rx ");
+        print_port_stat(string, "pkts=", ntohll(ps->rx_packets), 1);
+        print_port_stat(string, "bytes=", ntohll(ps->rx_bytes), 1);
+        print_port_stat(string, "drop=", ntohll(ps->rx_dropped), 1);
+        print_port_stat(string, "errs=", ntohll(ps->rx_errors), 1);
+        print_port_stat(string, "frame=", ntohll(ps->rx_frame_err), 1);
+        print_port_stat(string, "over=", ntohll(ps->rx_over_err), 1);
+        print_port_stat(string, "crc=", ntohll(ps->rx_crc_err), 0);
+
+        ds_put_cstr(string, " tx ");
+        print_port_stat(string, "pkts=", ntohll(ps->tx_packets), 1);
+        print_port_stat(string, "bytes=", ntohll(ps->tx_bytes), 1);
+        print_port_stat(string, "drop=", ntohll(ps->tx_dropped), 1);
+        print_port_stat(string, "errs=", ntohll(ps->tx_errors), 1);
+        print_port_stat(string, "coll=", ntohll(ps->collisions), 0);
+    }
+}
+
+/* Prints the body of an OFPST_TABLE stats reply, 'len' bytes at 'body', to
+ * 'string'.  Always prints the number of whole struct ofp_table_stats records
+ * in the body; with 'verbosity' of at least 1 also prints each table's id,
+ * name, wildcards, capacity, active entry count and lookup/match counters. */
+static void
+ofp_table_stats_reply(struct ds *string, const void *body, size_t len,
+                      int verbosity)
+{
+    const struct ofp_table_stats *ts = body;
+    size_t n = len / sizeof *ts;
+    ds_put_format(string, " %zu tables\n", n);
+    if (verbosity < 1) {
+        return;
+    }
+
+    for (; n--; ts++) {
+        char name[OFP_MAX_TABLE_NAME_LEN + 1];
+
+        /* Copy at most OFP_MAX_TABLE_NAME_LEN bytes and terminate the copy
+         * ourselves.  The original passed 'sizeof name', one byte more,
+         * which lets strncpy() read one byte past 'ts->name' when the field
+         * has no terminator (assumes the field is OFP_MAX_TABLE_NAME_LEN
+         * bytes long -- TODO confirm against the struct definition). */
+        strncpy(name, ts->name, OFP_MAX_TABLE_NAME_LEN);
+        name[OFP_MAX_TABLE_NAME_LEN] = '\0';
+
+        ds_put_format(string, " %d: %-8s: ", ts->table_id, name);
+        ds_put_format(string, "wild=0x%05"PRIx32", ", ntohl(ts->wildcards));
+        ds_put_format(string, "max=%6"PRIu32", ", ntohl(ts->max_entries));
+        ds_put_format(string, "active=%"PRIu32"\n", ntohl(ts->active_count));
+        ds_put_cstr(string, " ");
+        ds_put_format(string, "lookup=%"PRIu64", ",
+                      ntohll(ts->lookup_count));
+        ds_put_format(string, "matched=%"PRIu64"\n",
+                      ntohll(ts->matched_count));
+    }
+}
+
+/* Prints the vendor id read from the first 32 bits of 'body', followed by the
+ * number of bytes in 'body' beyond the vendor id.  'len' is the length of
+ * 'body' in bytes. */
+static void
+vendor_stat(struct ds *string, const void *body, size_t len,
+            int verbosity UNUSED)
+{
+    uint32_t vendor = ntohl(*(uint32_t *) body);
+    size_t n_extra = len - sizeof(uint32_t);
+
+    ds_put_format(string, " vendor=%08"PRIx32, vendor);
+    ds_put_format(string, " %zu bytes additional data", n_extra);
+}
+
+/* Selects whether print_stats() decodes a statistics body as a request or as
+ * a reply. */
+enum stats_direction {
+ REQUEST,
+ REPLY
+};
+
+/* Prints the statistics message body of 'body_len' bytes at 'body' to
+ * 'string'.  'type' is the statistics type in host byte order and 'direction'
+ * selects whether the body is decoded as a request or as a reply.
+ *
+ * The type is looked up by value in a table, so types whose numeric value
+ * does not coincide with their table position (such as OFPST_VENDOR) are
+ * recognized.  A body whose length falls outside the range allowed for the
+ * type and direction is reported instead of being decoded. */
+static void
+print_stats(struct ds *string, int type, const void *body, size_t body_len,
+            int verbosity, enum stats_direction direction)
+{
+    struct stats_msg {
+        size_t min_body, max_body;      /* Allowed body length, inclusive. */
+        void (*printer)(struct ds *, const void *, size_t len, int verbosity);
+    };
+
+    struct stats_type {
+        int type;                       /* OFPST_* value, or -1 at the end. */
+        const char *name;
+        struct stats_msg request;
+        struct stats_msg reply;
+    };
+
+    static const struct stats_type stats_types[] = {
+        {
+            OFPST_DESC,
+            "description",
+            { 0, 0, NULL },
+            { 0, SIZE_MAX, ofp_desc_stats_reply },
+        },
+        {
+            OFPST_FLOW,
+            "flow",
+            { sizeof(struct ofp_flow_stats_request),
+              sizeof(struct ofp_flow_stats_request),
+              ofp_flow_stats_request },
+            { 0, SIZE_MAX, ofp_flow_stats_reply },
+        },
+        {
+            OFPST_AGGREGATE,
+            "aggregate",
+            { sizeof(struct ofp_aggregate_stats_request),
+              sizeof(struct ofp_aggregate_stats_request),
+              ofp_aggregate_stats_request },
+            { sizeof(struct ofp_aggregate_stats_reply),
+              sizeof(struct ofp_aggregate_stats_reply),
+              ofp_aggregate_stats_reply },
+        },
+        {
+            OFPST_TABLE,
+            "table",
+            { 0, 0, NULL },
+            { 0, SIZE_MAX, ofp_table_stats_reply },
+        },
+        {
+            OFPST_PORT,
+            "port",
+            { 0, 0, NULL, },
+            { 0, SIZE_MAX, ofp_port_stats_reply },
+        },
+        {
+            OFPST_VENDOR,
+            "vendor-specific",
+            { sizeof(uint32_t), SIZE_MAX, vendor_stat },
+            { sizeof(uint32_t), SIZE_MAX, vendor_stat },
+        },
+        {
+            /* Sentinel that terminates the search below. */
+            -1,
+            "unknown",
+            { 0, 0, NULL, },
+            { 0, 0, NULL, },
+        },
+    };
+
+    const struct stats_type *s;
+    const struct stats_msg *m;
+
+    /* Search by value rather than indexing by 'type': the table position of
+     * an entry is not guaranteed to equal its type code. */
+    for (s = stats_types; s->type >= 0; s++) {
+        if (s->type == type) {
+            break;
+        }
+    }
+    if (s->type < 0) {
+        ds_put_format(string, " ***unknown type %d***", type);
+        return;
+    }
+    ds_put_format(string, " type=%d(%s)\n", type, s->name);
+
+    m = direction == REQUEST ? &s->request : &s->reply;
+    if (body_len < m->min_body || body_len > m->max_body) {
+        ds_put_format(string, " ***body_len=%zu not in %zu...%zu***",
+                      body_len, m->min_body, m->max_body);
+        return;
+    }
+    if (m->printer) {
+        m->printer(string, body, body_len, verbosity);
+    }
+}
+
+/* Prints the statistics request of 'len' bytes at 'oh'.  Any nonzero flags
+ * are reported as unknown, then the body is handed to print_stats(). */
+static void
+ofp_stats_request(struct ds *string, const void *oh, size_t len, int verbosity)
+{
+    const struct ofp_stats_request *srq = oh;
+    size_t body_len = len - offsetof(struct ofp_stats_request, body);
+
+    if (srq->flags != 0) {
+        ds_put_format(string, " ***unknown flags 0x%04"PRIx16"***",
+                      ntohs(srq->flags));
+    }
+
+    print_stats(string, ntohs(srq->type), srq->body, body_len,
+                verbosity, REQUEST);
+}
+
+/* Prints the statistics reply of 'len' bytes at 'oh': first its flags
+ * ("none", "[more]", and/or any unrecognized bits), then its body via
+ * print_stats(). */
+static void
+ofp_stats_reply(struct ds *string, const void *oh, size_t len, int verbosity)
+{
+    const struct ofp_stats_reply *srp = oh;
+    size_t body_len = len - offsetof(struct ofp_stats_reply, body);
+    uint16_t flags = ntohs(srp->flags);
+
+    ds_put_cstr(string, " flags=");
+    if (flags == 0) {
+        ds_put_cstr(string, "none");
+    } else {
+        if (flags & OFPSF_REPLY_MORE) {
+            ds_put_cstr(string, "[more]");
+            flags &= ~OFPSF_REPLY_MORE;
+        }
+        /* Whatever is left over is not a flag this printer knows about. */
+        if (flags != 0) {
+            ds_put_format(string, "[***unknown flags 0x%04"PRIx16"***]", flags);
+        }
+    }
+
+    print_stats(string, ntohs(srp->type), srp->body, body_len,
+                verbosity, REPLY);
+}
+
+/* Prints the echo request or reply of 'len' bytes at 'oh': the number of
+ * payload bytes following the OpenFlow header and, if 'verbosity' exceeds 1,
+ * a hex dump of that payload. */
+static void
+ofp_echo(struct ds *string, const void *oh, size_t len, int verbosity)
+{
+    const struct ofp_header *hdr = oh;
+
+    ds_put_format(string, " %zu bytes of payload\n", len - sizeof *hdr);
+    if (verbosity > 1) {
+        /* The payload begins immediately after the header, so start the dump
+         * at 'hdr + 1' rather than at the header itself. */
+        ds_put_hex_dump(string, hdr + 1, len - sizeof *hdr, 0, true);
+    }
+}
+
+/* Describes how to print one OpenFlow message type. */
+struct openflow_packet {
+ /* Message type code, compared against the 'type' field of the header. */
+ uint8_t type;
+ /* Lowercase message name used in the printed output. */
+ const char *name;
+ /* Shortest message length, in bytes, that 'printer' will be given. */
+ size_t min_size;
+ /* Decoder for the message, or NULL if none is implemented. */
+ void (*printer)(struct ds *, const void *, size_t len, int verbosity);
+};
+
+/* Table of the OpenFlow message types that this file can name and print.
+ * It is searched linearly by ofp_to_string() and
+ * ofp_message_type_to_string(), so the order of entries is not significant
+ * to those lookups. */
+static const struct openflow_packet packets[] = {
+ {
+ OFPT_HELLO,
+ "hello",
+ sizeof (struct ofp_header),
+ NULL,
+ },
+ {
+ OFPT_FEATURES_REQUEST,
+ "features_request",
+ sizeof (struct ofp_header),
+ NULL,
+ },
+ {
+ OFPT_FEATURES_REPLY,
+ "features_reply",
+ sizeof (struct ofp_switch_features),
+ ofp_print_switch_features,
+ },
+ {
+ OFPT_GET_CONFIG_REQUEST,
+ "get_config_request",
+ sizeof (struct ofp_header),
+ NULL,
+ },
+ {
+ OFPT_GET_CONFIG_REPLY,
+ "get_config_reply",
+ sizeof (struct ofp_switch_config),
+ ofp_print_switch_config,
+ },
+ {
+ OFPT_SET_CONFIG,
+ "set_config",
+ sizeof (struct ofp_switch_config),
+ ofp_print_switch_config,
+ },
+ {
+ OFPT_PACKET_IN,
+ "packet_in",
+ offsetof(struct ofp_packet_in, data),
+ ofp_packet_in,
+ },
+ {
+ OFPT_PACKET_OUT,
+ "packet_out",
+ sizeof (struct ofp_packet_out),
+ ofp_packet_out,
+ },
+ {
+ OFPT_FLOW_MOD,
+ "flow_mod",
+ sizeof (struct ofp_flow_mod),
+ ofp_print_flow_mod,
+ },
+ {
+ OFPT_FLOW_EXPIRED,
+ "flow_expired",
+ sizeof (struct ofp_flow_expired),
+ ofp_print_flow_expired,
+ },
+ {
+ OFPT_PORT_MOD,
+ "port_mod",
+ sizeof (struct ofp_port_mod),
+ ofp_print_port_mod,
+ },
+ {
+ OFPT_PORT_STATUS,
+ "port_status",
+ sizeof (struct ofp_port_status),
+ ofp_print_port_status
+ },
+ {
+ OFPT_ERROR,
+ "error_msg",
+ sizeof (struct ofp_error_msg),
+ ofp_print_error_msg,
+ },
+ {
+ OFPT_STATS_REQUEST,
+ "stats_request",
+ sizeof (struct ofp_stats_request),
+ ofp_stats_request,
+ },
+ {
+ OFPT_STATS_REPLY,
+ "stats_reply",
+ sizeof (struct ofp_stats_reply),
+ ofp_stats_reply,
+ },
+ {
+ OFPT_ECHO_REQUEST,
+ "echo_request",
+ sizeof (struct ofp_header),
+ ofp_echo,
+ },
+ {
+ OFPT_ECHO_REPLY,
+ "echo_reply",
+ sizeof (struct ofp_header),
+ ofp_echo,
+ },
+ {
+ OFPT_VENDOR,
+ "vendor",
+ sizeof (struct ofp_vendor_header),
+ NULL,
+ },
+};
+
+/* Builds and returns a human-readable description of the 'len' bytes at 'oh_',
+ * which are interpreted as an OpenFlow message.  'verbosity' controls the
+ * level of detail: 0 is the least verbose, larger values print more, and 3 or
+ * more appends a hex dump of the message.
+ *
+ * Messages that are too short for a header, carry an unexpected version, or
+ * have an unrecognized type are described by a one-line diagnostic plus a hex
+ * dump.  The caller is responsible for freeing the returned string. */
+char *
+ofp_to_string(const void *oh_, size_t len, int verbosity)
+{
+    struct ds string = DS_EMPTY_INITIALIZER;
+    const struct ofp_header *oh = oh_;
+    const struct openflow_packet *pkt = NULL;
+    uint16_t wire_len;
+    size_t i;
+
+    if (len < sizeof(struct ofp_header)) {
+        ds_put_cstr(&string, "OpenFlow packet too short:\n");
+        ds_put_hex_dump(&string, oh, len, 0, true);
+        return ds_cstr(&string);
+    }
+    if (oh->version != OFP_VERSION) {
+        ds_put_format(&string, "Bad OpenFlow version %"PRIu8":\n", oh->version);
+        ds_put_hex_dump(&string, oh, len, 0, true);
+        return ds_cstr(&string);
+    }
+
+    /* Find the table entry that describes this message type. */
+    for (i = 0; i < ARRAY_SIZE(packets); i++) {
+        if (packets[i].type == oh->type) {
+            pkt = &packets[i];
+            break;
+        }
+    }
+    if (!pkt) {
+        ds_put_format(&string, "Unknown OpenFlow packet type %"PRIu8":\n",
+                      oh->type);
+        ds_put_hex_dump(&string, oh, len, 0, true);
+        return ds_cstr(&string);
+    }
+
+    ds_put_format(&string, "%s (xid=0x%"PRIx32"):", pkt->name, oh->xid);
+
+    /* Reconcile the length in the header with the number of bytes supplied.
+     * If fewer bytes are claimed than supplied, decode only the claimed
+     * part. */
+    wire_len = ntohs(oh->length);
+    if (wire_len > len) {
+        ds_put_format(&string, " (***truncated to %zu bytes from %"PRIu16"***)",
+                      len, wire_len);
+    } else if (wire_len < len) {
+        ds_put_format(&string, " (***only uses %"PRIu16" bytes out of %zu***)\n",
+                      wire_len, len);
+        len = wire_len;
+    }
+
+    if (len < pkt->min_size) {
+        ds_put_format(&string, " (***length=%zu < min_size=%zu***)\n",
+                      len, pkt->min_size);
+    } else if (pkt->printer) {
+        pkt->printer(&string, oh, len, verbosity);
+    } else if (len > sizeof *oh) {
+        ds_put_format(&string, " length=%"PRIu16" (decoder not implemented)\n",
+                      ntohs(oh->length));
+    }
+
+    if (verbosity >= 3) {
+        ds_put_hex_dump(&string, oh, len, 0, true);
+    }
+    /* Make sure the output ends with a new-line. */
+    if (string.string[string.length - 1] != '\n') {
+        ds_put_char(&string, '\n');
+    }
+    return ds_cstr(&string);
+}
+
+/* Returns a string naming OpenFlow message type 'type': "OFPT_" followed by
+ * the upper-cased name from the 'packets' table, e.g. "OFPT_FEATURES_REPLY".
+ * If 'type' is not in the table, the string is the type as a two-digit hex
+ * number instead, e.g. "0x55".
+ *
+ * The caller must free the returned string when it is no longer needed. */
+char *
+ofp_message_type_to_string(uint8_t type)
+{
+    struct ds s = DS_EMPTY_INITIALIZER;
+    size_t i;
+
+    for (i = 0; i < ARRAY_SIZE(packets); i++) {
+        if (packets[i].type == type) {
+            const char *c;
+
+            ds_put_cstr(&s, "OFPT_");
+            for (c = packets[i].name; *c != '\0'; c++) {
+                ds_put_char(&s, toupper((unsigned char) *c));
+            }
+            return ds_cstr(&s);
+        }
+    }
+
+    /* No table entry matched. */
+    ds_put_format(&s, "0x%02"PRIx8, type);
+    return ds_cstr(&s);
+}
+
+/* Writes 'string' to 'stream' and then frees 'string', which must therefore
+ * have been obtained from an allocation that free() can release. */
+static void
+print_and_free(FILE *stream, char *string)
+{
+ fputs(string, stream);
+ free(string);
+}
+
+/* Writes to 'stream' a human-readable description of the OpenFlow message of
+ * 'len' bytes at 'oh', as produced by ofp_to_string() at the given
+ * 'verbosity' level.  0 is the least verbose level and higher numbers print
+ * more. */
+void
+ofp_print(FILE *stream, const void *oh, size_t len, int verbosity)
+{
+    char *description = ofp_to_string(oh, len, verbosity);
+
+    print_and_free(stream, description);
+}
+
+/* Writes to 'stream' a description of the Ethernet frame whose first 'len'
+ * bytes are at 'data', as produced by ofp_packet_to_string() using tcpdump.
+ * 'total_len' is the full length of the frame, of which only 'len' bytes were
+ * captured.
+ *
+ * This starts and kills a tcpdump subprocess so it's quite expensive. */
+void
+ofp_print_packet(FILE *stream, const void *data, size_t len, size_t total_len)
+{
+    char *description = ofp_packet_to_string(data, len, total_len);
+
+    print_and_free(stream, description);
+}
diff --git a/lib/ofp-print.h b/lib/ofp-print.h
new file mode 100644
index 000000000..21fb749bb
--- /dev/null
+++ b/lib/ofp-print.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* OpenFlow protocol pretty-printer. */
+
+#ifndef OFP_PRINT_H
+#define OFP_PRINT_H 1
+
+#include <stdint.h>
+#include <stdio.h>
+
+struct ofp_flow_mod;
+struct ofp_match;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Functions that write a description directly to a stdio stream. */
+void ofp_print(FILE *, const void *, size_t, int verbosity);
+void ofp_print_packet(FILE *stream, const void *data, size_t len, size_t total_len);
+
+/* Functions that return a description as a string.  ofp_to_string() and
+ * ofp_message_type_to_string() document that the caller must free the
+ * result; presumably the same holds for the other two -- verify against
+ * their definitions. */
+char *ofp_to_string(const void *, size_t, int verbosity);
+char *ofp_match_to_string(const struct ofp_match *, int verbosity);
+char *ofp_packet_to_string(const void *data, size_t len, size_t total_len);
+char *ofp_message_type_to_string(uint8_t type);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ofp-print.h */
diff --git a/lib/ofpbuf.c b/lib/ofpbuf.c
new file mode 100644
index 000000000..ed326fd5c
--- /dev/null
+++ b/lib/ofpbuf.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "ofpbuf.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+#include "util.h"
+
+/* Initializes 'b' as an empty ofpbuf backed by the 'allocated' bytes of
+ * memory that start at 'base'.  All layer pointers, the list link, and the
+ * private pointer are cleared.
+ *
+ * 'base' should ordinarily be the first byte of a region obtained from
+ * malloc(), but in circumstances where it can be guaranteed that 'b' will
+ * never need to be expanded or freed, it can be a pointer into arbitrary
+ * memory. */
+void
+ofpbuf_use(struct ofpbuf *b, void *base, size_t allocated)
+{
+    b->base = base;
+    b->allocated = allocated;
+
+    /* No data in use yet: the data region starts at the base and is empty. */
+    b->data = base;
+    b->size = 0;
+
+    b->l2 = NULL;
+    b->l3 = NULL;
+    b->l4 = NULL;
+    b->l7 = NULL;
+
+    b->next = NULL;
+    b->private = NULL;
+}
+
+/* Initializes 'b' as an empty ofpbuf with room for 'size' bytes.  No memory
+ * is allocated when 'size' is 0; the buffer's base is then a null pointer. */
+void
+ofpbuf_init(struct ofpbuf *b, size_t size)
+{
+    void *base = NULL;
+
+    if (size) {
+        base = xmalloc(size);
+    }
+    ofpbuf_use(b, base, size);
+}
+
+/* Frees the memory that 'b' uses for its contents, but not 'b' itself.  Does
+ * nothing if 'b' is null. */
+void
+ofpbuf_uninit(struct ofpbuf *b)
+{
+    if (!b) {
+        return;
+    }
+    free(b->base);
+}
+
+/* Frees the memory that 'b' uses for its contents and then reinitializes 'b'
+ * in place as an empty ofpbuf with an initial capacity of 'size' bytes.  'b'
+ * itself is reused, not reallocated. */
+void
+ofpbuf_reinit(struct ofpbuf *b, size_t size)
+{
+ ofpbuf_uninit(b);
+ ofpbuf_init(b, size);
+}
+
+/* Allocates a new ofpbuf, initializes it as empty with room for 'size'
+ * bytes, and returns it. */
+struct ofpbuf *
+ofpbuf_new(size_t size)
+{
+    struct ofpbuf *new_buf;
+
+    new_buf = xmalloc(sizeof *new_buf);
+    ofpbuf_init(new_buf, size);
+    return new_buf;
+}
+
+/* Creates and returns a new ofpbuf that holds a copy of the data in use in
+ * 'buffer'.  Only the bytes from 'buffer->data' for 'buffer->size' bytes are
+ * copied; headroom and the layer pointers are not carried over. */
+struct ofpbuf *
+ofpbuf_clone(const struct ofpbuf *buffer)
+{
+ return ofpbuf_clone_data(buffer->data, buffer->size);
+}
+
+/* Creates and returns a new ofpbuf whose contents are a copy of the 'size'
+ * bytes at 'data'. */
+struct ofpbuf *
+ofpbuf_clone_data(const void *data, size_t size)
+{
+    struct ofpbuf *copy;
+
+    copy = ofpbuf_new(size);
+    ofpbuf_put(copy, data, size);
+    return copy;
+}
+
+/* Frees the memory that 'b' uses for its contents and then 'b' itself.  Does
+ * nothing if 'b' is null. */
+void
+ofpbuf_delete(struct ofpbuf *b)
+{
+    if (!b) {
+        return;
+    }
+    ofpbuf_uninit(b);
+    free(b);
+}
+
+/* Returns the headroom of 'b': the number of unused bytes between the start
+ * of its allocated memory and the first byte of data in use.  (Most commonly
+ * the data in an ofpbuf begins at the start of its memory, so the headroom is
+ * 0.) */
+size_t
+ofpbuf_headroom(struct ofpbuf *b)
+{
+    char *first_allocated = b->base;
+    char *first_used = b->data;
+
+    return first_used - first_allocated;
+}
+
+/* Returns the tailroom of 'b': the number of bytes that can still be appended
+ * after its data before the ofpbuf has to be reallocated. */
+size_t
+ofpbuf_tailroom(struct ofpbuf *b)
+{
+    char *end_of_memory = ofpbuf_end(b);
+    char *end_of_data = ofpbuf_tail(b);
+
+    return end_of_memory - end_of_data;
+}
+
+/* Ensures that 'b' has room for at least 'size' bytes at its tail end,
+ * reallocating and copying its data if necessary.
+ *
+ * When a reallocation happens, the buffer grows by the larger of 'size' and
+ * 64 bytes, and 'b->data' and any non-null layer pointers are moved so that
+ * they keep the same offset from the start of the buffer. */
+void
+ofpbuf_prealloc_tailroom(struct ofpbuf *b, size_t size)
+{
+    if (size > ofpbuf_tailroom(b)) {
+        size_t new_allocated = b->allocated + MAX(size, 64);
+        char *new_base = xmalloc(new_allocated);
+        char *old_base = b->base;
+
+        if (old_base) {
+            /* Relocate every pointer by its offset from the old base, and do
+             * so before the old memory is freed so that no arithmetic is
+             * performed on pointers to released storage. */
+            memcpy(new_base, old_base, b->allocated);
+            b->data = new_base + ((char *) b->data - old_base);
+            if (b->l2) {
+                b->l2 = new_base + ((char *) b->l2 - old_base);
+            }
+            if (b->l3) {
+                b->l3 = new_base + ((char *) b->l3 - old_base);
+            }
+            if (b->l4) {
+                b->l4 = new_base + ((char *) b->l4 - old_base);
+            }
+            if (b->l7) {
+                b->l7 = new_base + ((char *) b->l7 - old_base);
+            }
+            free(old_base);
+        } else {
+            /* A buffer initialized with no memory has nothing to copy or
+             * relocate, and arithmetic on its null base would be undefined;
+             * its data simply starts at the new base. */
+            b->data = new_base;
+        }
+        b->base = new_base;
+        b->allocated = new_allocated;
+    }
+}
+
+/* Asserts that 'b' already has at least 'size' bytes of headroom.  Unlike
+ * ofpbuf_prealloc_tailroom(), this never reallocates: headroom must have been
+ * set aside beforehand. */
+void
+ofpbuf_prealloc_headroom(struct ofpbuf *b, size_t size)
+{
+ assert(size <= ofpbuf_headroom(b));
+}
+
+/* Shrinks the memory allocated for 'b' so that it is exactly as large as the
+ * data in use.  'b' must have no headroom and no layer pointers set. */
+void
+ofpbuf_trim(struct ofpbuf *b)
+{
+    /* XXX These could be supported, but the current client doesn't care. */
+    assert(b->data == b->base);
+    assert(b->l2 == NULL && b->l3 == NULL && b->l4 == NULL && b->l7 == NULL);
+
+    if (b->allocated <= b->size) {
+        return;
+    }
+
+    b->base = xrealloc(b->base, b->size);
+    b->data = b->base;
+    b->allocated = b->size;
+}
+
+/* Extends the data in 'b' by 'size' bytes at its tail end, reallocating and
+ * copying the existing data if there is not enough room.  Returns a pointer
+ * to the first of the new bytes, whose contents are left uninitialized. */
+void *
+ofpbuf_put_uninit(struct ofpbuf *b, size_t size)
+{
+    void *new_bytes;
+
+    ofpbuf_prealloc_tailroom(b, size);
+
+    /* The tail must be read after any reallocation has taken place. */
+    new_bytes = ofpbuf_tail(b);
+    b->size += size;
+    return new_bytes;
+}
+
+/* Extends the data in 'b' by 'size' zero bytes at its tail end, reallocating
+ * and copying the existing data if necessary.  Returns a pointer to the first
+ * of the new bytes. */
+void *
+ofpbuf_put_zeros(struct ofpbuf *b, size_t size)
+{
+    void *new_bytes = ofpbuf_put_uninit(b, size);
+
+    return memset(new_bytes, 0, size);
+}
+
+/* Copies the 'size' bytes at 'p' onto the tail end of the data in 'b',
+ * reallocating and copying the existing data if necessary.  Returns a pointer
+ * to the first byte of the copy inside the ofpbuf. */
+void *
+ofpbuf_put(struct ofpbuf *b, const void *p, size_t size)
+{
+    void *new_bytes = ofpbuf_put_uninit(b, size);
+
+    return memcpy(new_bytes, p, size);
+}
+
+/* Sets aside 'size' bytes of headroom in 'b', which must not contain any
+ * data, so that they can later be claimed with ofpbuf_push_uninit() without
+ * reallocating the ofpbuf. */
+void
+ofpbuf_reserve(struct ofpbuf *b, size_t size)
+{
+    char *old_data;
+
+    assert(!b->size);
+
+    /* Make sure the space exists, then move the start of the data past it. */
+    ofpbuf_prealloc_tailroom(b, size);
+    old_data = b->data;
+    b->data = old_data + size;
+}
+
+/* Prepends 'size' bytes to the head end of the data in 'b' by moving the
+ * start of the data back into the headroom.  'b' must already have at least
+ * 'size' bytes of headroom (this is asserted; no reallocation takes place).
+ * Returns a pointer to the first of the new bytes, which are left
+ * uninitialized. */
+void *
+ofpbuf_push_uninit(struct ofpbuf *b, size_t size)
+{
+ ofpbuf_prealloc_headroom(b, size);
+ b->data = (char*)b->data - size;
+ b->size += size;
+ return b->data;
+}
+
+/* Copies the 'size' bytes at 'p' onto the head end of the data in 'b', which
+ * must already have at least 'size' bytes of headroom.  Returns a pointer to
+ * the first byte of the copy inside the ofpbuf. */
+void *
+ofpbuf_push(struct ofpbuf *b, const void *p, size_t size)
+{
+    void *new_bytes = ofpbuf_push_uninit(b, size);
+
+    return memcpy(new_bytes, p, size);
+}
+
+/* If 'b' contains at least 'offset + size' bytes of data, returns a pointer to
+ * byte 'offset'.  Otherwise, returns a null pointer.
+ *
+ * The bounds check is written so that it cannot be defeated by 'offset +
+ * size' wrapping around the range of size_t. */
+void *
+ofpbuf_at(const struct ofpbuf *b, size_t offset, size_t size)
+{
+    if (offset > b->size || size > b->size - offset) {
+        return NULL;
+    }
+    return (char *) b->data + offset;
+}
+
+/* Returns a pointer to byte 'offset' in 'b', which must contain at least
+ * 'offset + size' bytes of data.
+ *
+ * The assertion is written so that it cannot be satisfied by 'offset + size'
+ * wrapping around the range of size_t. */
+void *
+ofpbuf_at_assert(const struct ofpbuf *b, size_t offset, size_t size)
+{
+    assert(offset <= b->size && size <= b->size - offset);
+    return ((char *) b->data) + offset;
+}
+
+/* Returns a pointer to the byte just past the last byte of data in use in
+ * 'b'. */
+void *
+ofpbuf_tail(const struct ofpbuf *b)
+{
+    char *first_used = b->data;
+
+    return first_used + b->size;
+}
+
+/* Returns a pointer to the byte just past the last byte of memory allocated
+ * for 'b', whether or not that memory is in use. */
+void *
+ofpbuf_end(const struct ofpbuf *b)
+{
+    char *first_allocated = b->base;
+
+    return first_allocated + b->allocated;
+}
+
+/* Discards all data from 'b' and removes any headroom, so that the data
+ * region is empty and starts at the beginning of the allocated memory.  The
+ * allocated memory is kept and the layer pointers are left as they are. */
+void
+ofpbuf_clear(struct ofpbuf *b)
+{
+    b->size = 0;
+    b->data = b->base;
+}
+
+/* Removes 'size' bytes from the head end of the data in 'b', which must
+ * contain at least that many bytes.  Returns a pointer to the first byte
+ * removed. */
+void *
+ofpbuf_pull(struct ofpbuf *b, size_t size)
+{
+    char *removed = b->data;
+
+    assert(b->size >= size);
+    b->size -= size;
+    b->data = removed + size;
+    return removed;
+}
+
+/* Like ofpbuf_pull(), but tolerates a short buffer: if 'b' holds fewer than
+ * 'size' bytes of data, returns a null pointer and leaves 'b' unchanged.
+ * Otherwise removes 'size' bytes from the head end of 'b' and returns a
+ * pointer to the first byte removed. */
+void *
+ofpbuf_try_pull(struct ofpbuf *b, size_t size)
+{
+    if (b->size < size) {
+        return NULL;
+    }
+    return ofpbuf_pull(b, size);
+}
diff --git a/lib/ofpbuf.h b/lib/ofpbuf.h
new file mode 100644
index 000000000..a68c2800a
--- /dev/null
+++ b/lib/ofpbuf.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef OFPBUF_H
+#define OFPBUF_H 1
+
+#include <stddef.h>
+
+/* Buffer for holding arbitrary data. An ofpbuf is automatically reallocated
+ * as necessary if it grows too large for the available memory.
+ *
+ * The bytes in use are the 'size' bytes starting at 'data', which lie within
+ * the 'allocated' bytes starting at 'base'. */
+struct ofpbuf {
+ void *base; /* First byte of malloc()'d area. */
+ size_t allocated; /* Number of bytes allocated. */
+
+ void *data; /* First byte actually in use. */
+ size_t size; /* Number of bytes in use. */
+
+ void *l2; /* Link-level header. */
+ void *l3; /* Network-level header. */
+ void *l4; /* Transport-level header. */
+ void *l7; /* Application data. */
+
+ struct ofpbuf *next; /* Next in a list of ofpbufs. */
+ void *private; /* Private pointer for use by owner. */
+};
+
+/* Initializing an ofpbuf over caller-supplied memory. */
+void ofpbuf_use(struct ofpbuf *, void *, size_t);
+
+/* Initializing and releasing the contents of a caller-owned ofpbuf. */
+void ofpbuf_init(struct ofpbuf *, size_t);
+void ofpbuf_uninit(struct ofpbuf *);
+void ofpbuf_reinit(struct ofpbuf *, size_t);
+
+/* Creating and destroying heap-allocated ofpbufs. */
+struct ofpbuf *ofpbuf_new(size_t);
+struct ofpbuf *ofpbuf_clone(const struct ofpbuf *);
+struct ofpbuf *ofpbuf_clone_data(const void *, size_t);
+void ofpbuf_delete(struct ofpbuf *);
+
+/* Obtaining pointers into the buffer. */
+void *ofpbuf_at(const struct ofpbuf *, size_t offset, size_t size);
+void *ofpbuf_at_assert(const struct ofpbuf *, size_t offset, size_t size);
+void *ofpbuf_tail(const struct ofpbuf *);
+void *ofpbuf_end(const struct ofpbuf *);
+
+/* Adding data at the tail ("put") or at the head ("push"). */
+void *ofpbuf_put_uninit(struct ofpbuf *, size_t);
+void *ofpbuf_put_zeros(struct ofpbuf *, size_t);
+void *ofpbuf_put(struct ofpbuf *, const void *, size_t);
+void ofpbuf_reserve(struct ofpbuf *, size_t);
+void *ofpbuf_push_uninit(struct ofpbuf *b, size_t);
+void *ofpbuf_push(struct ofpbuf *b, const void *, size_t);
+
+/* Querying and adjusting spare capacity. */
+size_t ofpbuf_headroom(struct ofpbuf *);
+size_t ofpbuf_tailroom(struct ofpbuf *);
+void ofpbuf_prealloc_headroom(struct ofpbuf *, size_t);
+void ofpbuf_prealloc_tailroom(struct ofpbuf *, size_t);
+void ofpbuf_trim(struct ofpbuf *);
+
+/* Removing data. */
+void ofpbuf_clear(struct ofpbuf *);
+void *ofpbuf_pull(struct ofpbuf *, size_t);
+void *ofpbuf_try_pull(struct ofpbuf *, size_t);
+
+#endif /* ofpbuf.h */
diff --git a/lib/packets.h b/lib/packets.h
new file mode 100644
index 000000000..c1ae621ea
--- /dev/null
+++ b/lib/packets.h
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifndef PACKETS_H
+#define PACKETS_H 1
+
+#include <stdint.h>
+#include <string.h>
+#include "compiler.h"
+#include "random.h"
+#include "util.h"
+
+/* Length of an Ethernet address, in bytes. */
+#define ETH_ADDR_LEN 6
+
+/* The all-ones Ethernet address.  Marked UNUSED because, as a static object
+ * in a header, it is defined in every file that includes this one whether or
+ * not that file refers to it. */
+static const uint8_t eth_addr_broadcast[ETH_ADDR_LEN] UNUSED
+ = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+
+/* Returns true if every byte of 'ea' is 0xff, false otherwise. */
+static inline bool eth_addr_is_broadcast(const uint8_t ea[6])
+{
+    /* ANDing the bytes together yields 0xff only if all of them are 0xff. */
+    uint8_t all = ea[0] & ea[1] & ea[2] & ea[3] & ea[4] & ea[5];
+
+    return all == 0xff;
+}
+
+/* Returns true if 'ea' is an Ethernet address used for virtual interfaces
+ * under XenServer. Generally the actual Ethernet address is FE:FF:FF:FF:FF:FF
+ * but it can be FE:FE:FE:FE:FE:FE in some cases.
+ *
+ * The test accepts any address whose first byte is 0xfe and whose remaining
+ * bytes, ANDed together, are at least 0xfe. */
+static inline bool eth_addr_is_vif(const uint8_t ea[6])
+{
+    uint8_t rest = ea[1] & ea[2] & ea[3] & ea[4] & ea[5];
+
+    return ea[0] == 0xfe && rest >= 0xfe;
+}
+
+/* Returns true if the low-order bit of the first byte of 'ea' is set, false
+ * otherwise. */
+static inline bool eth_addr_is_multicast(const uint8_t ea[6])
+{
+    return (ea[0] & 1) != 0;
+}
+/* Returns true if bit 1 (value 2) of the first byte of 'ea' is set, false
+ * otherwise. */
+static inline bool eth_addr_is_local(const uint8_t ea[6])
+{
+    return (ea[0] & 2) != 0;
+}
+/* Returns true if every byte of 'ea' is zero, false otherwise. */
+static inline bool eth_addr_is_zero(const uint8_t ea[6])
+{
+    /* ORing the bytes together yields 0 only if all of them are 0. */
+    return (ea[0] | ea[1] | ea[2] | ea[3] | ea[4] | ea[5]) == 0;
+}
+/* Returns true if Ethernet addresses 'a' and 'b' are byte-for-byte equal,
+ * false otherwise. */
+static inline bool eth_addr_equals(const uint8_t a[ETH_ADDR_LEN],
+                                   const uint8_t b[ETH_ADDR_LEN])
+{
+    return memcmp(a, b, ETH_ADDR_LEN) == 0;
+}
+/* Returns the bytes of 'ea' packed into the low 48 bits of a 64-bit integer,
+ * with ea[0] as the most significant of those bytes and ea[5] as the
+ * least. */
+static inline uint64_t eth_addr_to_uint64(const uint8_t ea[ETH_ADDR_LEN])
+{
+    uint64_t x = 0;
+    int i;
+
+    for (i = 0; i < ETH_ADDR_LEN; i++) {
+        x = (x << 8) | ea[i];
+    }
+    return x;
+}
+/* Stores the low 48 bits of 'x' into 'ea', most significant byte first, so
+ * that ea[0] receives bits 40-47 and ea[5] receives bits 0-7. */
+static inline void eth_addr_from_uint64(uint64_t x, uint8_t ea[ETH_ADDR_LEN])
+{
+    int i;
+
+    /* Fill from the last byte backward, consuming 'x' a byte at a time. */
+    for (i = ETH_ADDR_LEN - 1; i >= 0; i--) {
+        ea[i] = x;
+        x >>= 8;
+    }
+}
+/* Adjusts the first byte of 'ea' in place: clears its low-order bit, making
+ * the address unicast, and sets bit 1, marking it private. */
+static inline void eth_addr_mark_random(uint8_t ea[ETH_ADDR_LEN])
+{
+    ea[0] = (ea[0] & ~1) | 2;   /* Unicast and private. */
+}
+/* Fills 'ea' with bytes from random_bytes() and then adjusts it with
+ * eth_addr_mark_random() so that it is a unicast, private address. */
+static inline void eth_addr_random(uint8_t ea[ETH_ADDR_LEN])
+{
+ random_bytes(ea, ETH_ADDR_LEN);
+ eth_addr_mark_random(ea);
+}
+/* Returns true if 'ea' is a reserved multicast address, that a bridge must
+ * never forward, false otherwise.
+ *
+ * The addresses matched are 01:80:c2:00:00:00 through 01:80:c2:00:00:0f. */
+static inline bool eth_addr_is_reserved(const uint8_t ea[ETH_ADDR_LEN])
+{
+    static const uint8_t prefix[5] = { 0x01, 0x80, 0xc2, 0x00, 0x00 };
+
+    return !memcmp(ea, prefix, sizeof prefix) && !(ea[5] & 0xf0);
+}
+
+/* printf()-style format and matching argument list for printing Ethernet
+ * address 'ea' as six colon-separated two-digit hex bytes.  ETH_ADDR_ARGS
+ * evaluates 'ea' six times, so 'ea' should have no side effects. */
+#define ETH_ADDR_FMT \
+ "%02"PRIx8":%02"PRIx8":%02"PRIx8":%02"PRIx8":%02"PRIx8":%02"PRIx8
+#define ETH_ADDR_ARGS(ea) \
+ (ea)[0], (ea)[1], (ea)[2], (ea)[3], (ea)[4], (ea)[5]
+
+/* Ethernet type values, in host byte order. */
+#define ETH_TYPE_IP 0x0800
+#define ETH_TYPE_ARP 0x0806
+#define ETH_TYPE_VLAN 0x8100
+
+/* Ethernet frame sizes, in bytes.  The "TOTAL" values are header plus
+ * payload. */
+#define ETH_HEADER_LEN 14
+#define ETH_PAYLOAD_MIN 46
+#define ETH_PAYLOAD_MAX 1500
+#define ETH_TOTAL_MIN (ETH_HEADER_LEN + ETH_PAYLOAD_MIN)
+#define ETH_TOTAL_MAX (ETH_HEADER_LEN + ETH_PAYLOAD_MAX)
+#define ETH_VLAN_TOTAL_MAX (ETH_HEADER_LEN + VLAN_HEADER_LEN + ETH_PAYLOAD_MAX)
+/* Ethernet header.  The build-time assertion below checks that its size is
+ * exactly ETH_HEADER_LEN. */
+struct eth_header {
+ uint8_t eth_dst[ETH_ADDR_LEN];
+ uint8_t eth_src[ETH_ADDR_LEN];
+ uint16_t eth_type;
+} __attribute__((packed));
+BUILD_ASSERT_DECL(ETH_HEADER_LEN == sizeof(struct eth_header));
+
+/* Values of the LLC header fields that indicate a SNAP header follows
+ * (presumably per IEEE 802.2 -- the names suggest so; verify if relied on). */
+#define LLC_DSAP_SNAP 0xaa
+#define LLC_SSAP_SNAP 0xaa
+#define LLC_CNTL_SNAP 3
+
+/* LLC header.  The build-time assertion below checks that its size is
+ * exactly LLC_HEADER_LEN. */
+#define LLC_HEADER_LEN 3
+struct llc_header {
+ uint8_t llc_dsap;
+ uint8_t llc_ssap;
+ uint8_t llc_cntl;
+} __attribute__((packed));
+BUILD_ASSERT_DECL(LLC_HEADER_LEN == sizeof(struct llc_header));
+
+/* Three zero bytes, for comparison against or copying into 'snap_org'. */
+#define SNAP_ORG_ETHERNET "\0\0" /* The compiler adds a null byte, so
+ sizeof(SNAP_ORG_ETHERNET) == 3. */
+/* SNAP header.  The build-time assertion below checks that its size is
+ * exactly SNAP_HEADER_LEN. */
+#define SNAP_HEADER_LEN 5
+struct snap_header {
+ uint8_t snap_org[3];
+ uint16_t snap_type;
+} __attribute__((packed));
+BUILD_ASSERT_DECL(SNAP_HEADER_LEN == sizeof(struct snap_header));
+
+/* An LLC header immediately followed by a SNAP header.  The build-time
+ * assertion below checks that its size is exactly LLC_SNAP_HEADER_LEN. */
+#define LLC_SNAP_HEADER_LEN (LLC_HEADER_LEN + SNAP_HEADER_LEN)
+struct llc_snap_header {
+ struct llc_header llc;
+ struct snap_header snap;
+} __attribute__((packed));
+BUILD_ASSERT_DECL(LLC_SNAP_HEADER_LEN == sizeof(struct llc_snap_header));
+
+/* Masks for the fields within a VLAN TCI value: the low 12 bits and the high
+ * 3 bits, respectively. */
+#define VLAN_VID_MASK 0x0fff
+#define VLAN_PCP_MASK 0xe000
+
+/* VLAN header.  The build-time assertion below checks that its size is
+ * exactly VLAN_HEADER_LEN. */
+#define VLAN_HEADER_LEN 4
+struct vlan_header {
+ uint16_t vlan_tci; /* Lowest 12 bits are VLAN ID. */
+ uint16_t vlan_next_type;
+};
+BUILD_ASSERT_DECL(VLAN_HEADER_LEN == sizeof(struct vlan_header));
+
+/* Ethernet header with a VLAN header folded in after the addresses.  The
+ * build-time assertion below checks that its size is exactly
+ * VLAN_ETH_HEADER_LEN. */
+#define VLAN_ETH_HEADER_LEN (ETH_HEADER_LEN + VLAN_HEADER_LEN)
+struct vlan_eth_header {
+ uint8_t veth_dst[ETH_ADDR_LEN];
+ uint8_t veth_src[ETH_ADDR_LEN];
+ uint16_t veth_type; /* Always htons(ETH_TYPE_VLAN). */
+ uint16_t veth_tci; /* Lowest 12 bits are VLAN ID. */
+ uint16_t veth_next_type;
+} __attribute__((packed));
+BUILD_ASSERT_DECL(VLAN_ETH_HEADER_LEN == sizeof(struct vlan_eth_header));
+
+/* printf()-style format and matching argument list for printing the four
+ * bytes at 'ip' as a dotted quad, in the order in which they are stored in
+ * memory.
+ *
+ * The "(void) (ip)[0]" below has no effect on the value, since it's the first
+ * argument of a comma expression, but it makes sure that 'ip' is a pointer.
+ * This is useful since a common mistake is to pass an integer instead of a
+ * pointer to IP_ARGS.
+ *
+ * 'ip' is parenthesized inside each cast so that an argument that is an
+ * expression, such as "p + 1", is cast as a whole.  It is evaluated several
+ * times, so it should have no side effects. */
+#define IP_FMT "%"PRIu8".%"PRIu8".%"PRIu8".%"PRIu8
+#define IP_ARGS(ip)                                 \
+        ((void) (ip)[0], ((uint8_t *) (ip))[0]),    \
+        ((uint8_t *) (ip))[1],                      \
+        ((uint8_t *) (ip))[2],                      \
+        ((uint8_t *) (ip))[3]
+
+/* Accessors for the combined 'ip_ihl_ver' byte: the version is its high 4
+ * bits and the header length field its low 4 bits.  IP_IHL_VER builds the
+ * byte from the two fields. */
+#define IP_VER(ip_ihl_ver) ((ip_ihl_ver) >> 4)
+#define IP_IHL(ip_ihl_ver) ((ip_ihl_ver) & 15)
+#define IP_IHL_VER(ihl, ver) (((ver) << 4) | (ihl))
+
+/* Values for 'ip_proto'. */
+#define IP_TYPE_ICMP 1
+#define IP_TYPE_TCP 6
+#define IP_TYPE_UDP 17
+
+#define IP_VERSION 4
+
+/* Fields within 'ip_frag_off', in host byte order.  IP_IS_FRAGMENT applies
+ * htons() to the mask, so its argument is the field as stored, in network
+ * byte order; it is nonzero if the "more fragments" bit or any fragment
+ * offset bit is set. */
+#define IP_DONT_FRAGMENT 0x4000 /* Don't fragment. */
+#define IP_MORE_FRAGMENTS 0x2000 /* More fragments. */
+#define IP_FRAG_OFF_MASK 0x1fff /* Fragment offset. */
+#define IP_IS_FRAGMENT(ip_frag_off) \
+ ((ip_frag_off) & htons(IP_MORE_FRAGMENTS | IP_FRAG_OFF_MASK))
+
+/* IP header without options.  The build-time assertion below checks that its
+ * size is exactly IP_HEADER_LEN. */
+#define IP_HEADER_LEN 20
+struct ip_header {
+ uint8_t ip_ihl_ver;
+ uint8_t ip_tos;
+ uint16_t ip_tot_len;
+ uint16_t ip_id;
+ uint16_t ip_frag_off;
+ uint8_t ip_ttl;
+ uint8_t ip_proto;
+ uint16_t ip_csum;
+ uint32_t ip_src;
+ uint32_t ip_dst;
+};
+BUILD_ASSERT_DECL(IP_HEADER_LEN == sizeof(struct ip_header));
+
+/* ICMP header.  The build-time assertion below checks that its size is
+ * exactly ICMP_HEADER_LEN. */
+#define ICMP_HEADER_LEN 4
+struct icmp_header {
+ uint8_t icmp_type;
+ uint8_t icmp_code;
+ uint16_t icmp_csum;
+};
+BUILD_ASSERT_DECL(ICMP_HEADER_LEN == sizeof(struct icmp_header));
+
+/* UDP header.  The build-time assertion below checks that its size is
+ * exactly UDP_HEADER_LEN. */
+#define UDP_HEADER_LEN 8
+struct udp_header {
+ uint16_t udp_src;
+ uint16_t udp_dst;
+ uint16_t udp_len;
+ uint16_t udp_csum;
+};
+BUILD_ASSERT_DECL(UDP_HEADER_LEN == sizeof(struct udp_header));
+
+/* Bits within the value that TCP_FLAGS() yields. */
+#define TCP_FIN 0x01
+#define TCP_SYN 0x02
+#define TCP_RST 0x04
+#define TCP_PSH 0x08
+#define TCP_ACK 0x10
+#define TCP_URG 0x20
+
+/* Extract the flags (low 6 bits) and the data offset (high 4 bits) from a
+ * 'tcp_ctl' value.  The argument is converted with ntohs(), so it is expected
+ * in network byte order, as stored in the header. */
+#define TCP_FLAGS(tcp_ctl) (ntohs(tcp_ctl) & 0x003f)
+#define TCP_OFFSET(tcp_ctl) (ntohs(tcp_ctl) >> 12)
+
+/* TCP header without options.  The build-time assertion below checks that its
+ * size is exactly TCP_HEADER_LEN. */
+#define TCP_HEADER_LEN 20
+struct tcp_header {
+    uint16_t tcp_src;
+    uint16_t tcp_dst;
+    uint32_t tcp_seq;
+    uint32_t tcp_ack;
+    uint16_t tcp_ctl;
+    uint16_t tcp_winsz;
+    uint16_t tcp_csum;
+    uint16_t tcp_urg;
+};
+BUILD_ASSERT_DECL(TCP_HEADER_LEN == sizeof(struct tcp_header));
+
+/* Values for the 'ar_hrd', 'ar_pro', and 'ar_op' fields, in host byte
+ * order. */
+#define ARP_HRD_ETHERNET 1
+#define ARP_PRO_IP 0x0800
+#define ARP_OP_REQUEST 1
+#define ARP_OP_REPLY 2
+
+/* ARP message for Ethernet hardware addresses and IPv4 protocol addresses.
+ * The structure is packed because 'ar_spa' and 'ar_tpa' would otherwise not
+ * fall at their natural alignment; the build-time assertion below checks that
+ * its size is exactly ARP_ETH_HEADER_LEN. */
+#define ARP_ETH_HEADER_LEN 28
+struct arp_eth_header {
+ /* Generic members. */
+ uint16_t ar_hrd; /* Hardware type. */
+ uint16_t ar_pro; /* Protocol type. */
+ uint8_t ar_hln; /* Hardware address length. */
+ uint8_t ar_pln; /* Protocol address length. */
+ uint16_t ar_op; /* Opcode. */
+
+ /* Ethernet+IPv4 specific members. */
+ uint8_t ar_sha[ETH_ADDR_LEN]; /* Sender hardware address. */
+ uint32_t ar_spa; /* Sender protocol address. */
+ uint8_t ar_tha[ETH_ADDR_LEN]; /* Target hardware address. */
+ uint32_t ar_tpa; /* Target protocol address. */
+} __attribute__((packed));
+BUILD_ASSERT_DECL(ARP_ETH_HEADER_LEN == sizeof(struct arp_eth_header));
+
+#endif /* packets.h */
diff --git a/lib/pcap.c b/lib/pcap.c
new file mode 100644
index 000000000..b2cca7611
--- /dev/null
+++ b/lib/pcap.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "pcap.h"
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <string.h>
+#include "compiler.h"
+#include "ofpbuf.h"
+
+#define THIS_MODULE VLM_pcap
+#include "vlog.h"
+
+/* Global header found once at the start of a pcap file.  It is read whole by
+ * pcap_read_header() and written whole by pcap_write_header(), so the
+ * structure is packed to match the on-disk layout byte for byte. */
+struct pcap_hdr {
+ uint32_t magic_number; /* magic number */
+ uint16_t version_major; /* major version number */
+ uint16_t version_minor; /* minor version number */
+ int32_t thiszone; /* GMT to local correction */
+ uint32_t sigfigs; /* accuracy of timestamps */
+ uint32_t snaplen; /* max length of captured packets */
+ uint32_t network; /* data link type */
+} PACKED;
+
+/* Per-packet record header that precedes each packet's data in a pcap file.
+ * It is read whole by pcap_read() and written whole by pcap_write(). */
+struct pcaprec_hdr {
+ uint32_t ts_sec; /* timestamp seconds */
+ uint32_t ts_usec; /* timestamp microseconds */
+ uint32_t incl_len; /* number of octets of packet saved in file */
+ uint32_t orig_len; /* actual length of packet */
+} PACKED;
+
+/* Opens 'file_name' as a pcap file.  'mode' must be "rb" (to read) or "wb"
+ * (to write); anything else fails an assertion.
+ *
+ * When reading, the pcap file header is consumed and validated.  When
+ * writing, a fresh pcap file header is emitted.
+ *
+ * Returns the open stream if successful.  Returns a null pointer if the file
+ * cannot be opened or, when reading, if its header is missing or invalid; a
+ * warning is logged in either case. */
+FILE *
+pcap_open(const char *file_name, const char *mode)
+{
+    FILE *file;
+
+    assert(!strcmp(mode, "rb") || !strcmp(mode, "wb"));
+
+    file = fopen(file_name, mode);
+    if (file == NULL) {
+        VLOG_WARN("%s: failed to open pcap file for %s",
+                  file_name, mode[0] == 'r' ? "reading" : "writing");
+        return NULL;
+    }
+
+    if (mode[0] == 'r') {
+        /* pcap_read_header() returns 0 on success and a nonzero error
+         * otherwise, so a nonzero result is the failure case. */
+        if (pcap_read_header(file)) {
+            fclose(file);
+            return NULL;
+        }
+    } else {
+        pcap_write_header(file);
+    }
+    return file;
+}
+
+/* Reads the pcap file header from 'file' and checks its magic number.
+ *
+ * Returns 0 if a header with a recognized magic number (in either byte
+ * order) was read.  Otherwise logs a warning and returns a positive errno
+ * value for a read error, EOF if the file ended before a full header was
+ * read, or EPROTO if the magic number is not recognized. */
+int
+pcap_read_header(FILE *file)
+{
+    struct pcap_hdr header;
+    size_t n_items;
+
+    n_items = fread(&header, sizeof header, 1, file);
+    if (n_items != 1) {
+        int error;
+
+        if (ferror(file)) {
+            error = errno;
+        } else {
+            error = EOF;
+        }
+        VLOG_WARN("failed to read pcap header: %s",
+                  error > 0 ? strerror(error) : "end of file");
+        return error;
+    }
+
+    switch (header.magic_number) {
+    case 0xa1b2c3d4:
+    case 0xd4c3b2a1:
+        return 0;
+
+    default:
+        VLOG_WARN("bad magic 0x%08"PRIx32" reading pcap file "
+                  "(expected 0xa1b2c3d4 or 0xd4c3b2a1)", header.magic_number);
+        return EPROTO;
+    }
+}
+
+/* Writes a pcap file header to 'file', describing an Ethernet capture in pcap
+ * format version 2.4 with a snap length of 1518 bytes.
+ *
+ * All members are written in host byte order on purpose: a pcap reader
+ * deduces the writer's endianness from the magic number, so no htonX calls
+ * are wanted here.  The result of fwrite() is not checked. */
+void
+pcap_write_header(FILE *file)
+{
+    struct pcap_hdr header;
+
+    /* Identification. */
+    header.magic_number = 0xa1b2c3d4;
+    header.version_major = 2;
+    header.version_minor = 4;
+
+    /* Timestamp metadata. */
+    header.thiszone = 0;
+    header.sigfigs = 0;
+
+    /* Capture parameters. */
+    header.snaplen = 1518;
+    header.network = 1;         /* Ethernet */
+
+    fwrite(&header, sizeof header, 1, file);
+}
+
+/* Reads one packet record from pcap 'file'.
+ *
+ * On success, returns 0 and stores in '*bufp' a newly allocated ofpbuf that
+ * holds the packet's data; the caller owns the buffer.  On failure, logs a
+ * warning, stores a null pointer in '*bufp', and returns a positive errno
+ * value for a read error, EOF if the file ended, or EPROTO if the record's
+ * length is implausible in both byte orders. */
+int
+pcap_read(FILE *file, struct ofpbuf **bufp)
+{
+    struct pcaprec_hdr prh;
+    struct ofpbuf *buf;
+    void *data;
+    size_t len;
+
+    *bufp = NULL;
+
+    /* Read header. */
+    if (fread(&prh, sizeof prh, 1, file) != 1) {
+        int error = ferror(file) ? errno : EOF;
+        VLOG_WARN("failed to read pcap record header: %s",
+                  error > 0 ? strerror(error) : "end of file");
+        return error;
+    }
+
+    /* Calculate length.  A length above 0xffff is taken as a sign that the
+     * file was written with the opposite byte order, so the byte-swapped
+     * value is tried before giving up. */
+    len = prh.incl_len;
+    if (len > 0xffff) {
+        uint32_t swapped_len = (((len & 0xff000000) >> 24) |
+                                ((len & 0x00ff0000) >> 8) |
+                                ((len & 0x0000ff00) << 8) |
+                                ((len & 0x000000ff) << 24));
+        if (swapped_len > 0xffff) {
+            /* Print the 32-bit header member rather than 'len': 'len' is a
+             * size_t, which does not match the PRIu32 conversion. */
+            VLOG_WARN("bad packet length %"PRIu32" or %"PRIu32" "
+                      "reading pcap file",
+                      prh.incl_len, swapped_len);
+            return EPROTO;
+        }
+        len = swapped_len;
+    }
+
+    /* Read packet.  fread() with a zero item size always returns 0, so a
+     * zero-length record must not be treated as a failed read. */
+    buf = ofpbuf_new(len);
+    data = ofpbuf_put_uninit(buf, len);
+    if (len > 0 && fread(data, len, 1, file) != 1) {
+        int error = ferror(file) ? errno : EOF;
+        VLOG_WARN("failed to read pcap packet: %s",
+                  error > 0 ? strerror(error) : "end of file");
+        ofpbuf_delete(buf);
+        return error;
+    }
+    *bufp = buf;
+    return 0;
+}
+
+/* Appends the packet in 'buf' to pcap 'file' as one record: a record header
+ * followed by the 'buf->size' bytes at 'buf->data'.  The timestamp is written
+ * as zero, and the captured and original lengths are both 'buf->size'.  The
+ * results of the fwrite() calls are not checked. */
+void
+pcap_write(FILE *file, struct ofpbuf *buf)
+{
+    struct pcaprec_hdr record;
+
+    /* No timestamp is available, so record zero. */
+    record.ts_sec = 0;
+    record.ts_usec = 0;
+
+    /* The whole packet is saved, so both lengths agree. */
+    record.orig_len = buf->size;
+    record.incl_len = buf->size;
+
+    fwrite(&record, sizeof record, 1, file);
+    fwrite(buf->data, buf->size, 1, file);
+}
diff --git a/lib/pcap.h b/lib/pcap.h
new file mode 100644
index 000000000..dd7deb454
--- /dev/null
+++ b/lib/pcap.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef PCAP_H
+#define PCAP_H 1
+
+#include <stdio.h>
+
+struct ofpbuf;
+
+FILE *pcap_open(const char *file_name, const char *mode);
+int pcap_read_header(FILE *);
+void pcap_write_header(FILE *);
+int pcap_read(FILE *, struct ofpbuf **);
+void pcap_write(FILE *, struct ofpbuf *);
+
+#endif /* pcap.h */
diff --git a/lib/poll-loop.c b/lib/poll-loop.c
new file mode 100644
index 000000000..d6dbdb854
--- /dev/null
+++ b/lib/poll-loop.c
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "poll-loop.h"
+#include <assert.h>
+#include <errno.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <string.h>
+#include "backtrace.h"
+#include "coverage.h"
+#include "dynamic-string.h"
+#include "list.h"
+#include "timeval.h"
+
+#define THIS_MODULE VLM_poll_loop
+#include "vlog.h"
+
+/* An event that will wake the following call to poll_block().
+ *
+ * Waiters are created zero-filled by new_waiter(), so 'function', 'aux',
+ * 'backtrace' and 'pollfd' are all null until something sets them.  A waiter
+ * is destroyed by poll_cancel(), which also frees 'backtrace'. */
+struct poll_waiter {
+ /* Set when the waiter is created. */
+ struct list node; /* Element in global waiters list. */
+ int fd; /* File descriptor. */
+ short int events; /* Events to wait for (POLLIN, POLLOUT). */
+ poll_fd_func *function; /* Callback function, if any, or null. */
+ void *aux; /* Argument to callback function. */
+ struct backtrace *backtrace; /* Optionally, event that created waiter. */
+
+ /* Set only when poll_block() is called. */
+ struct pollfd *pollfd; /* Pointer to element of the pollfds array
+ (null if added from a callback). */
+};
+
+/* All active poll waiters. */
+static struct list waiters = LIST_INITIALIZER(&waiters);
+
+/* Number of elements in the waiters list. */
+static size_t n_waiters;
+
+/* Max time to wait in next call to poll_block(), in milliseconds, or -1 to
+ * wait forever. */
+static int timeout = -1;
+
+/* Backtrace of 'timeout''s registration, if debugging is enabled. */
+static struct backtrace timeout_backtrace;
+
+/* Callback currently running, to allow verifying that poll_cancel() is not
+ * being called on a running callback. */
+#ifndef NDEBUG
+static struct poll_waiter *running_cb;
+#endif
+
+static struct poll_waiter *new_waiter(int fd, short int events);
+
+/* Asks the next poll_block() to wake up once 'fd' is ready for any of
+ * 'events', which should be POLLIN, POLLOUT, or POLLIN | POLLOUT.
+ *
+ * This is a one-shot registration: it applies only to the next call to
+ * poll_block() and must be repeated afterward if it is still wanted.
+ *
+ * Returns the new waiter, which may be passed to poll_cancel(). */
+struct poll_waiter *
+poll_fd_wait(int fd, short int events)
+{
+    struct poll_waiter *waiter;
+
+    COVERAGE_INC(poll_fd_wait);
+    waiter = new_waiter(fd, events);
+    return waiter;
+}
+
+/* Limits the next poll_block() to blocking for at most 'msec' milliseconds.
+ * A nonpositive 'msec' means that poll_block() will not block at all.  When
+ * several timers are registered, the shortest one wins.
+ *
+ * This is a one-shot registration: it applies only to the next call to
+ * poll_block() and must be repeated afterward if it is still wanted. */
+void
+poll_timer_wait(int msec)
+{
+    /* An existing timeout that is already at least as short stays. */
+    if (timeout >= 0 && msec >= timeout) {
+        return;
+    }
+
+    timeout = MAX(0, msec);
+    if (VLOG_IS_DBG_ENABLED()) {
+        /* Remember who asked for the winning timeout, for wakeup logging. */
+        backtrace_capture(&timeout_backtrace);
+    }
+}
+
+/* Causes the following call to poll_block() to wake up immediately, without
+ * blocking.
+ *
+ * This is simply a zero-millisecond timer: poll_timer_wait(0) leaves the
+ * pending timeout at 0, which no later timer registration can lengthen. */
+void
+poll_immediate_wake(void)
+{
+ poll_timer_wait(0);
+}
+
+/* Logs, at debug level, a message built from printf-style 'format' and its
+ * arguments.  If 'backtrace' is nonnull, the message is followed by a colon
+ * and each of the backtrace's frames in hexadecimal.
+ *
+ * NOTE(review): the frames are printed with "%x"; confirm that this matches
+ * the element type of 'backtrace->frames', which is declared elsewhere. */
+static void PRINTF_FORMAT(2, 3)
+log_wakeup(const struct backtrace *backtrace, const char *format, ...)
+{
+    struct ds msg;
+    va_list ap;
+
+    ds_init(&msg);
+
+    va_start(ap, format);
+    ds_put_format_valist(&msg, format, ap);
+    va_end(ap);
+
+    if (backtrace != NULL) {
+        int frame = 0;
+
+        ds_put_char(&msg, ':');
+        while (frame < backtrace->n_frames) {
+            ds_put_format(&msg, " 0x%x", backtrace->frames[frame]);
+            frame++;
+        }
+    }
+
+    VLOG_DBG("%s", ds_cstr(&msg));
+    ds_destroy(&msg);
+}
+
+/* Blocks until one or more of the events registered with poll_fd_wait()
+ * occurs, or until the minimum duration registered with poll_timer_wait()
+ * elapses, or not at all if poll_immediate_wake() has been called.
+ *
+ * Also executes any autonomous subroutines registered with poll_fd_callback(),
+ * if their file descriptors have become ready.
+ *
+ * On return, every waiter registered with poll_fd_wait() has been destroyed,
+ * as has every callback waiter whose callback ran, and the timeout has been
+ * reset to "wait forever".  Must not be called from within a callback. */
+void
+poll_block(void)
+{
+ /* Scratch array handed to time_poll().  It is kept across calls and only
+ * ever grown, so that it need not be reallocated each time. */
+ static struct pollfd *pollfds;
+ static size_t max_pollfds;
+
+ struct poll_waiter *pw;
+ struct list *node;
+ int n_pollfds;
+ int retval;
+
+ assert(!running_cb);
+ if (max_pollfds < n_waiters) {
+ max_pollfds = n_waiters;
+ pollfds = xrealloc(pollfds, max_pollfds * sizeof *pollfds);
+ }
+
+ /* Give each waiter its own slot in 'pollfds' and remember the slot in the
+ * waiter, so that the results can be matched up afterward. */
+ n_pollfds = 0;
+ LIST_FOR_EACH (pw, struct poll_waiter, node, &waiters) {
+ pw->pollfd = &pollfds[n_pollfds];
+ pollfds[n_pollfds].fd = pw->fd;
+ pollfds[n_pollfds].events = pw->events;
+ pollfds[n_pollfds].revents = 0;
+ n_pollfds++;
+ }
+
+ if (!timeout) {
+ COVERAGE_INC(poll_zero_timeout);
+ }
+ /* A negative return value is a negated errno value; zero means that the
+ * timeout expired without any fd becoming ready. */
+ retval = time_poll(pollfds, n_pollfds, timeout);
+ if (retval < 0) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+ VLOG_ERR_RL(&rl, "poll: %s", strerror(-retval));
+ } else if (!retval && VLOG_IS_DBG_ENABLED()) {
+ log_wakeup(&timeout_backtrace, "%d-ms timeout", timeout);
+ }
+
+ /* Walk the waiters by hand rather than with LIST_FOR_EACH, because waiters
+ * are destroyed during the walk and callbacks may append new ones. */
+ for (node = waiters.next; node != &waiters; ) {
+ pw = CONTAINER_OF(node, struct poll_waiter, node);
+ if (!pw->pollfd || !pw->pollfd->revents) {
+ /* Nothing happened on this waiter (or it was added by a callback
+ * during this walk, so that it has no pollfd slot).  A callback
+ * waiter persists until its event occurs, so skip it; a plain
+ * waiter is one-shot and falls through to be destroyed. */
+ if (pw->function) {
+ node = node->next;
+ continue;
+ }
+ } else {
+ if (VLOG_IS_DBG_ENABLED()) {
+ log_wakeup(pw->backtrace, "%s%s%s%s%s on fd %d",
+ pw->pollfd->revents & POLLIN ? "[POLLIN]" : "",
+ pw->pollfd->revents & POLLOUT ? "[POLLOUT]" : "",
+ pw->pollfd->revents & POLLERR ? "[POLLERR]" : "",
+ pw->pollfd->revents & POLLHUP ? "[POLLHUP]" : "",
+ pw->pollfd->revents & POLLNVAL ? "[POLLNVAL]" : "",
+ pw->fd);
+ }
+
+ if (pw->function) {
+ /* 'running_cb' lets poll_cancel() assert that a callback is not
+ * canceling its own, still-running waiter. */
+#ifndef NDEBUG
+ running_cb = pw;
+#endif
+ pw->function(pw->fd, pw->pollfd->revents, pw->aux);
+#ifndef NDEBUG
+ running_cb = NULL;
+#endif
+ }
+ }
+ /* Step past 'pw' before poll_cancel() unlinks and frees it. */
+ node = node->next;
+ poll_cancel(pw);
+ }
+
+ /* Timer registrations are one-shot: go back to waiting forever. */
+ timeout = -1;
+ timeout_backtrace.n_frames = 0;
+}
+
+/* Arranges for poll_block() to call 'function', passing 'aux' as its last
+ * argument, once 'fd' is ready for any of 'events', which should be POLLIN,
+ * POLLOUT, or POLLIN | POLLOUT.
+ *
+ * The registration lasts until the event actually occurs, at which point it
+ * is removed automatically.  A callback that wants to run again later must
+ * call poll_fd_callback() again from within the callback.
+ *
+ * Returns the new waiter, which may be passed to poll_cancel(). */
+struct poll_waiter *
+poll_fd_callback(int fd, short int events, poll_fd_func *function, void *aux)
+{
+    struct poll_waiter *waiter;
+
+    waiter = new_waiter(fd, events);
+    waiter->aux = aux;
+    waiter->function = function;
+    return waiter;
+}
+
+/* Cancels and frees 'pw', which must be null or a waiter returned by
+ * poll_fd_wait() or poll_fd_callback().  A null 'pw' is a no-op.
+ *
+ * A poll_fd_wait() waiter may be canceled only until the next call to
+ * poll_block(), which cancels and frees it automatically.
+ *
+ * A poll_fd_callback() waiter may be canceled only until its callback is
+ * invoked, at which point it too is canceled and freed automatically.  In
+ * particular, a callback must not cancel its own waiter. */
+void
+poll_cancel(struct poll_waiter *pw)
+{
+    if (!pw) {
+        return;
+    }
+
+    assert(pw != running_cb);
+    list_remove(&pw->node);
+    free(pw->backtrace);
+    free(pw);
+    n_waiters--;
+}
+
+/* Allocates a zero-filled poll_waiter for 'fd' (which must be nonnegative) and
+ * 'events', appends it to the global list of waiters, and returns it.  When
+ * debug logging is enabled, the caller's backtrace is captured too, so that a
+ * later wakeup can be attributed to whoever registered the waiter. */
+static struct poll_waiter *
+new_waiter(int fd, short int events)
+{
+    struct poll_waiter *pw = xcalloc(1, sizeof *pw);
+
+    assert(fd >= 0);
+    pw->fd = fd;
+    pw->events = events;
+
+    if (VLOG_IS_DBG_ENABLED()) {
+        pw->backtrace = xmalloc(sizeof *pw->backtrace);
+        backtrace_capture(pw->backtrace);
+    }
+
+    list_push_back(&waiters, &pw->node);
+    n_waiters++;
+
+    return pw;
+}
diff --git a/lib/poll-loop.h b/lib/poll-loop.h
new file mode 100644
index 000000000..188b4f198
--- /dev/null
+++ b/lib/poll-loop.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* High-level wrapper around the "poll" system call.
+ *
+ * Intended usage is for the program's main loop to go about its business
+ * servicing whatever events it needs to. Then, when it runs out of immediate
+ * tasks, it calls each subordinate module's "wait" function, which in turn
+ * calls one (or more) of the functions poll_fd_wait(), poll_immediate_wake(),
+ * and poll_timer_wait() to register to be awakened when the appropriate event
+ * occurs. Then the main loop calls poll_block(), which blocks until one of
+ * the registered events happens.
+ *
+ * There is also some support for autonomous subroutines that are executed by
+ * poll_block() when a file descriptor becomes ready. To prevent these
+ * routines from starving if events are continuously ready, the application
+ * should bound the amount of work it does between poll_block() calls. */
+
+#ifndef POLL_LOOP_H
+#define POLL_LOOP_H 1
+
+#include <poll.h>
+
+struct poll_waiter;
+
+/* Schedule events to wake up the following poll_block(). */
+struct poll_waiter *poll_fd_wait(int fd, short int events);
+void poll_timer_wait(int msec);
+void poll_immediate_wake(void);
+
+/* Wait until an event occurs. */
+void poll_block(void);
+
+/* Autonomous function callbacks. */
+typedef void poll_fd_func(int fd, short int revents, void *aux);
+struct poll_waiter *poll_fd_callback(int fd, short int events,
+ poll_fd_func *, void *aux);
+
+/* Cancel a file descriptor callback or event. */
+void poll_cancel(struct poll_waiter *);
+
+#endif /* poll-loop.h */
diff --git a/lib/port-array.c b/lib/port-array.c
new file mode 100644
index 000000000..87bb21687
--- /dev/null
+++ b/lib/port-array.c
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "port-array.h"
+#include <stdlib.h>
+
+static struct port_array_l2 l2_sentinel;
+static struct port_array_l3 l3_sentinel;
+static bool inited;
+
+/* Initializes 'pa' as an empty port_array.
+ *
+ * An empty array allocates nothing: every level-1 slot points to a shared
+ * level-2 sentinel, each of whose slots points to a shared (all-null) level-3
+ * sentinel.  The level-2 sentinel is wired up on the first call. */
+void
+port_array_init(struct port_array *pa)
+{
+    size_t l1_idx;
+
+    if (!inited) {
+        size_t l2_idx;
+
+        inited = true;
+        for (l2_idx = 0; l2_idx < PORT_ARRAY_L2_SIZE; l2_idx++) {
+            l2_sentinel.l2[l2_idx] = &l3_sentinel;
+        }
+    }
+
+    for (l1_idx = 0; l1_idx < PORT_ARRAY_L1_SIZE; l1_idx++) {
+        pa->l1[l1_idx] = &l2_sentinel;
+    }
+}
+
+/* Frees every level-2 and level-3 table that was allocated for 'pa'.  The
+ * shared sentinel tables are never freed.  Whatever the elements of 'pa'
+ * point to is the client's to free.
+ *
+ * 'pa' itself is left holding stale pointers, so it must be reinitialized
+ * with port_array_init() before it is used again. */
+void
+port_array_destroy(struct port_array *pa)
+{
+    unsigned int i;
+
+    for (i = 0; i < PORT_ARRAY_L1_SIZE; i++) {
+        struct port_array_l2 *level2 = pa->l1[i];
+        unsigned int j;
+
+        if (level2 == &l2_sentinel) {
+            continue;
+        }
+
+        for (j = 0; j < PORT_ARRAY_L2_SIZE; j++) {
+            struct port_array_l3 *level3 = level2->l2[j];
+
+            if (level3 == &l3_sentinel) {
+                continue;
+            }
+            free(level3);
+        }
+        free(level2);
+    }
+}
+
+/* Clears all elements of 'pa' to null pointers, by freeing the tables
+ * allocated for 'pa' and then reinitializing it as empty.  As with
+ * port_array_destroy(), whatever the elements pointed to is not freed. */
+void
+port_array_clear(struct port_array *pa)
+{
+ port_array_destroy(pa);
+ port_array_init(pa);
+}
+
+/* Sets 'pa' element numbered 'idx' to 'p'.
+ *
+ * Wherever the path to 'idx' still runs through a shared sentinel table, a
+ * private copy of that table is allocated first, so that the sentinels
+ * themselves are never written to. */
+void
+port_array_set(struct port_array *pa, uint16_t idx, void *p)
+{
+    struct port_array_l2 *level2;
+    struct port_array_l3 *level3;
+
+    /* Level 1 selects a level-2 table. */
+    level2 = pa->l1[PORT_ARRAY_L1(idx)];
+    if (level2 == &l2_sentinel) {
+        level2 = xmemdup(&l2_sentinel, sizeof l2_sentinel);
+        pa->l1[PORT_ARRAY_L1(idx)] = level2;
+    }
+
+    /* Level 2 selects a level-3 table. */
+    level3 = level2->l2[PORT_ARRAY_L2(idx)];
+    if (level3 == &l3_sentinel) {
+        level3 = xmemdup(&l3_sentinel, sizeof l3_sentinel);
+        level2->l2[PORT_ARRAY_L2(idx)] = level3;
+    }
+
+    /* Level 3 holds the element itself. */
+    level3->l3[PORT_ARRAY_L3(idx)] = p;
+}
+
+/* Searches 'pa' for the lowest-numbered non-null element whose index is at
+ * least the initial value of '*idxp'.  If one is found, stores its index in
+ * '*idxp' and returns its value.  Otherwise, stores PORT_ARRAY_SIZE in
+ * '*idxp' and returns a null pointer.
+ *
+ * Level-2 and level-3 tables that are still the shared sentinels are skipped
+ * without being scanned. */
+static void *
+next(const struct port_array *pa, unsigned int *idxp)
+{
+ unsigned int idx = *idxp;
+
+ /* Using shift-right directly here, instead of PORT_ARRAY_L1(idx), ensures
+ * that with an initially too-big value of '*idxp' we will skip the outer
+ * loop and return NULL. */
+ unsigned int l1_idx = idx >> PORT_ARRAY_L1_SHIFT;
+ unsigned int l2_idx = PORT_ARRAY_L2(idx);
+ unsigned int l3_idx = PORT_ARRAY_L3(idx);
+ while (l1_idx < PORT_ARRAY_L1_SIZE) {
+ struct port_array_l2 *l2 = pa->l1[l1_idx];
+ if (l2 != &l2_sentinel) {
+ while (l2_idx < PORT_ARRAY_L2_SIZE) {
+ struct port_array_l3 *l3 = l2->l2[l2_idx];
+ if (l3 != &l3_sentinel) {
+ while (l3_idx < PORT_ARRAY_L3_SIZE) {
+ void *p = l3->l3[l3_idx];
+ if (p) {
+ /* Reassemble the full index from its three parts. */
+ *idxp = ((l1_idx << PORT_ARRAY_L1_SHIFT)
+ | (l2_idx << PORT_ARRAY_L2_SHIFT)
+ | (l3_idx << PORT_ARRAY_L3_SHIFT));
+ return p;
+ }
+ l3_idx++;
+ }
+ }
+ /* Moving to the next level-3 table: start at its first slot. */
+ l2_idx++;
+ l3_idx = 0;
+ }
+ }
+ /* Moving to the next level-2 table: start at its first slot. */
+ l1_idx++;
+ l2_idx = 0;
+ l3_idx = 0;
+ }
+ *idxp = PORT_ARRAY_SIZE;
+ return NULL;
+}
+
+/* Returns the value of the lowest-numbered non-empty element of 'pa', and sets
+ * '*idxp' to that element's index. If 'pa' is entirely empty, returns a null
+ * pointer and sets '*idxp' to 65536.
+ *
+ * The initial value of '*idxp' is ignored: the search always starts at index
+ * 0, so element 0 itself is a candidate. */
+void *
+port_array_first(const struct port_array *pa, unsigned int *idxp)
+{
+ *idxp = 0;
+ return next(pa, idxp);
+}
+
+/* Returns the value of the lowest-numbered non-empty element of 'pa' greater
+ * than the initial value of '*idxp', and sets '*idxp' to that element's index.
+ * If 'pa' contains no non-empty elements with indexes greater than the initial
+ * value of '*idxp', returns a null pointer and sets '*idxp' to 65536.
+ *
+ * '*idxp' is incremented before the search, so calling this again with the
+ * 65536 left behind by an exhausted iteration still returns a null pointer.
+ * An initial value of UINT_MAX, however, wraps around to 0 and restarts the
+ * iteration from the beginning. */
+void *
+port_array_next(const struct port_array *pa, unsigned int *idxp)
+{
+ ++*idxp;
+ return next(pa, idxp);
+}
+
+/* Counts and returns the non-null elements of 'pa'.  Tables that are still
+ * the shared sentinels hold no elements and are skipped without being
+ * scanned. */
+unsigned int
+port_array_count(const struct port_array *pa)
+{
+    unsigned int n_set = 0;
+    unsigned int i;
+
+    for (i = 0; i < PORT_ARRAY_L1_SIZE; i++) {
+        const struct port_array_l2 *level2 = pa->l1[i];
+        unsigned int j;
+
+        if (level2 == &l2_sentinel) {
+            continue;
+        }
+
+        for (j = 0; j < PORT_ARRAY_L2_SIZE; j++) {
+            const struct port_array_l3 *level3 = level2->l2[j];
+            unsigned int k;
+
+            if (level3 == &l3_sentinel) {
+                continue;
+            }
+
+            for (k = 0; k < PORT_ARRAY_L3_SIZE; k++) {
+                if (level3->l3[k] != NULL) {
+                    n_set++;
+                }
+            }
+        }
+    }
+    return n_set;
+}
diff --git a/lib/port-array.h b/lib/port-array.h
new file mode 100644
index 000000000..e9b3cf11c
--- /dev/null
+++ b/lib/port-array.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef PORT_ARRAY_H
+#define PORT_ARRAY_H 1
+
+#include <assert.h>
+#include "openflow/openflow.h"
+#include "util.h"
+
+/* Returns the 'count'-bit field of 'data' whose least-significant bit is bit
+ * number 'start' (bit 0 being the least-significant bit of 'data').  Helper
+ * for the PORT_ARRAY_L1(), PORT_ARRAY_L2() and PORT_ARRAY_L3() macros.
+ *
+ * Because 'data' is a uint16_t, an index argument wider than 16 bits is
+ * truncated before any bits are extracted. */
+static inline uint16_t
+port_array_extract_bits__(uint16_t data, int start, int count)
+{
+ return (data >> start) & ((1u << count) - 1);
+}
+
+/* Level 1: most-significant bits. */
+#define PORT_ARRAY_L1_BITS 5
+#define PORT_ARRAY_L1_SHIFT (PORT_ARRAY_L3_BITS + PORT_ARRAY_L2_BITS)
+#define PORT_ARRAY_L1_SIZE (1u << PORT_ARRAY_L1_BITS)
+#define PORT_ARRAY_L1(IDX) \
+ port_array_extract_bits__(IDX, PORT_ARRAY_L1_SHIFT, PORT_ARRAY_L1_BITS)
+
+/* Level 2: middle bits. */
+#define PORT_ARRAY_L2_BITS 5
+#define PORT_ARRAY_L2_SHIFT PORT_ARRAY_L3_BITS
+#define PORT_ARRAY_L2_SIZE (1u << PORT_ARRAY_L2_BITS)
+#define PORT_ARRAY_L2(IDX) \
+ port_array_extract_bits__(IDX, PORT_ARRAY_L2_SHIFT, PORT_ARRAY_L2_BITS)
+
+/* Level 3: least-significant bits. */
+#define PORT_ARRAY_L3_BITS 6
+#define PORT_ARRAY_L3_SHIFT 0
+#define PORT_ARRAY_L3_SIZE (1u << PORT_ARRAY_L3_BITS)
+#define PORT_ARRAY_L3(IDX) \
+ port_array_extract_bits__(IDX, PORT_ARRAY_L3_SHIFT, PORT_ARRAY_L3_BITS)
+
+#define PORT_ARRAY_SIZE (1u << (PORT_ARRAY_L1_BITS \
+ + PORT_ARRAY_L2_BITS \
+ + PORT_ARRAY_L3_BITS))
+
+BUILD_ASSERT_DECL(PORT_ARRAY_SIZE > 0xffff);
+
+/* A "sparse array" of up to 65536 elements (numbered 0...65535), implemented
+ * as a 3-level trie.  Most efficient when the elements are clustered
+ * together.
+ *
+ * Level 1, the root, is indexed by the most-significant bits of an element's
+ * number and points to level-2 tables. */
+struct port_array {
+    struct port_array_l2 *l1[PORT_ARRAY_L1_SIZE];
+};
+
+/* Level 2: indexed by the middle bits; points to level-3 tables. */
+struct port_array_l2 {
+    struct port_array_l3 *l2[PORT_ARRAY_L2_SIZE];
+};
+
+/* Level 3: indexed by the least-significant bits; holds the elements. */
+struct port_array_l3 {
+    void *l3[PORT_ARRAY_L3_SIZE];
+};
+
+/* Looks up element number 'idx' in 'pa' and returns its value, which is a null
+ * pointer if that element has never been set.
+ *
+ * No null checks are needed along the way: unused slots at levels 1 and 2
+ * point to shared sentinel tables rather than being null. */
+static inline void *
+port_array_get(const struct port_array *pa, uint16_t idx)
+{
+    const struct port_array_l2 *level2 = pa->l1[PORT_ARRAY_L1(idx)];
+    const struct port_array_l3 *level3 = level2->l2[PORT_ARRAY_L2(idx)];
+
+    return level3->l3[PORT_ARRAY_L3(idx)];
+}
+
+void port_array_init(struct port_array *);
+void port_array_destroy(struct port_array *);
+void port_array_clear(struct port_array *);
+void port_array_set(struct port_array *, uint16_t idx, void *);
+void *port_array_first(const struct port_array *, unsigned int *);
+void *port_array_next(const struct port_array *, unsigned int *);
+unsigned int port_array_count(const struct port_array *);
+
+#define PORT_ARRAY_FOR_EACH(DATA, ARRAY, PORT_NO) \
+ for ((DATA) = port_array_first(ARRAY, &(PORT_NO)); (DATA) != NULL; \
+ (DATA) = port_array_next(ARRAY, &(PORT_NO)))
+
+#endif /* port-array.h */
diff --git a/lib/process.c b/lib/process.c
new file mode 100644
index 000000000..79b5659f4
--- /dev/null
+++ b/lib/process.c
@@ -0,0 +1,417 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "process.h"
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include "coverage.h"
+#include "dynamic-string.h"
+#include "list.h"
+#include "poll-loop.h"
+#include "socket-util.h"
+#include "util.h"
+
+#define THIS_MODULE VLM_process
+#include "vlog.h"
+
+/* A subprocess started by process_start(). */
+struct process {
+ struct list node; /* Element in 'all_processes' list. */
+ char *name; /* argv[0] without leading directories; owned by this struct. */
+ pid_t pid; /* Child's process ID, as returned by fork(). */
+
+ /* Modified by signal handler. */
+ /* NOTE(review): these are written from the SIGCHLD handler and read from
+ * normal code; 'volatile' alone is relied on here -- confirm that this is
+ * sufficient on the platforms of interest. */
+ volatile bool exited; /* Set once waitpid() has reported on the child. */
+ volatile int status; /* waitpid() status, or -1 if waitpid() failed. */
+};
+
+/* Pipe used to signal child termination. */
+static int fds[2];
+
+/* All processes. */
+static struct list all_processes = LIST_INITIALIZER(&all_processes);
+
+static void block_sigchld(sigset_t *);
+static void unblock_sigchld(const sigset_t *);
+static void sigchld_handler(int signr UNUSED);
+static bool is_member(int x, const int *array, size_t);
+
+/* Sets up the process subsystem, unless that has already been done: creates
+ * the nonblocking pipe used to announce child termination and installs the
+ * SIGCHLD handler.  Terminates the program, via ovs_fatal(), if either step
+ * fails.
+ *
+ * Calling this function is optional, because process_start() calls it when
+ * necessary.  Calling it explicitly lets the client decide when any such
+ * fatal failure can happen. */
+void
+process_init(void)
+{
+    static bool inited;
+
+    if (!inited) {
+        struct sigaction action;
+
+        inited = true;
+
+        /* Create notification pipe. */
+        if (pipe(fds)) {
+            ovs_fatal(errno, "could not create pipe");
+        }
+        set_nonblocking(fds[0]);
+        set_nonblocking(fds[1]);
+
+        /* Set up child termination signal handler. */
+        memset(&action, 0, sizeof action);
+        action.sa_handler = sigchld_handler;
+        sigemptyset(&action.sa_mask);
+        action.sa_flags = SA_NOCLDSTOP | SA_RESTART;
+        if (sigaction(SIGCHLD, &action, NULL)) {
+            ovs_fatal(errno, "sigaction(SIGCHLD) failed");
+        }
+    }
+}
+
+/* Joins the null-terminated 'argv' array into a single space-separated
+ * string suitable for logging.  An argument that contains white space, a
+ * backslash, or a single or double quote is enclosed in double quotes, with
+ * each backslash and double quote inside it preceded by a backslash.
+ *
+ * The caller is responsible for freeing the returned string. */
+char *
+process_escape_args(char **argv)
+{
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    char **argp;
+    for (argp = argv; *argp; argp++) {
+        const char *arg = *argp;
+        const char *p;
+        if (argp != argv) {
+            ds_put_char(&ds, ' ');
+        }
+        /* Quote characters belong in this set too: otherwise an argument
+         * such as a"b would be emitted verbatim and could not be told apart
+         * from the quoting added here. */
+        if (arg[strcspn(arg, " \t\r\n\v\\\'\"")]) {
+            ds_put_char(&ds, '"');
+            for (p = arg; *p; p++) {
+                if (*p == '\\' || *p == '\"') {
+                    ds_put_char(&ds, '\\');
+                }
+                ds_put_char(&ds, *p);
+            }
+            ds_put_char(&ds, '"');
+        } else {
+            ds_put_cstr(&ds, arg);
+        }
+    }
+    return ds_cstr(&ds);
+}
+
+/* Starts a subprocess with the arguments in the null-terminated argv[] array.
+ * argv[0] is used as the name of the process.  Searches the PATH environment
+ * variable to find the program to execute.
+ *
+ * All file descriptors are closed before executing the subprocess, except for
+ * fds 0, 1, and 2 and the 'n_keep_fds' fds listed in 'keep_fds'.  Also, any of
+ * the 'n_null_fds' fds listed in 'null_fds' are replaced by /dev/null.
+ *
+ * Returns 0 if successful, otherwise a positive errno value indicating the
+ * error.  If successful, '*pp' is assigned a new struct process that may be
+ * used to query the process's status.  On failure, '*pp' is set to NULL. */
+int
+process_start(char **argv,
+              const int keep_fds[], size_t n_keep_fds,
+              const int null_fds[], size_t n_null_fds,
+              struct process **pp)
+{
+    sigset_t oldsigs;
+    char *binary;
+    pid_t pid;
+
+    *pp = NULL;
+    process_init();
+    COVERAGE_INC(process_start);
+
+    if (VLOG_IS_DBG_ENABLED()) {
+        char *args = process_escape_args(argv);
+        VLOG_DBG("starting subprocess: %s", args);
+        free(args);
+    }
+
+    /* execvp() will search PATH too, but the error in that case is more
+     * obscure, since it is only reported post-fork. */
+    binary = process_search_path(argv[0]);
+    if (!binary) {
+        VLOG_ERR("%s not found in PATH", argv[0]);
+        return ENOENT;
+    }
+    free(binary);
+
+    /* SIGCHLD stays blocked until the child is on 'all_processes', so that
+     * the signal handler cannot run before it can find the new process. */
+    block_sigchld(&oldsigs);
+    pid = fork();
+    if (pid < 0) {
+        /* Save errno at once: restoring the signal mask and logging may both
+         * overwrite it before it is reported and returned. */
+        int error = errno;
+
+        unblock_sigchld(&oldsigs);
+        VLOG_WARN("fork failed: %s", strerror(error));
+        return error;
+    } else if (pid) {
+        /* Running in parent process. */
+        struct process *p;
+        const char *slash;
+
+        p = xcalloc(1, sizeof *p);
+        p->pid = pid;
+        slash = strrchr(argv[0], '/');
+        p->name = xstrdup(slash ? slash + 1 : argv[0]);
+        p->exited = false;
+
+        list_push_back(&all_processes, &p->node);
+        unblock_sigchld(&oldsigs);
+
+        *pp = p;
+        return 0;
+    } else {
+        /* Running in child process. */
+        int fd_max = get_max_fds();
+        int fd;
+
+        unblock_sigchld(&oldsigs);
+        for (fd = 0; fd < fd_max; fd++) {
+            if (is_member(fd, null_fds, n_null_fds)) {
+                int nullfd = open("/dev/null", O_RDWR);
+
+                /* If 'fd' was not open, open() may have returned 'fd'
+                 * itself.  Then it already refers to /dev/null, and closing
+                 * "the temporary" would undo the work.  If open() failed,
+                 * there is nothing to duplicate or close. */
+                if (nullfd >= 0 && nullfd != fd) {
+                    dup2(nullfd, fd);
+                    close(nullfd);
+                }
+            } else if (fd >= 3 && !is_member(fd, keep_fds, n_keep_fds)) {
+                close(fd);
+            }
+        }
+        execvp(argv[0], argv);
+        fprintf(stderr, "execvp(\"%s\") failed: %s\n",
+                argv[0], strerror(errno));
+        _exit(1);
+    }
+}
+
+/* Frees 'p' and its name, after unlinking it from the list of all processes.
+ * A null 'p' is a no-op.  The subprocess itself is not signaled or waited
+ * for.
+ *
+ * SIGCHLD is blocked while the list is modified because the signal handler
+ * walks the same list. */
+void
+process_destroy(struct process *p)
+{
+    sigset_t saved_mask;
+
+    if (!p) {
+        return;
+    }
+
+    block_sigchld(&saved_mask);
+    list_remove(&p->node);
+    unblock_sigchld(&saved_mask);
+
+    free(p->name);
+    free(p);
+}
+
+/* Sends signal 'signr' to process 'p'.  Returns 0 if successful, ESRCH if 'p'
+ * is already known to have exited (in which case no signal is sent), and
+ * otherwise the positive errno value from kill(). */
+int
+process_kill(const struct process *p, int signr)
+{
+    if (p->exited) {
+        return ESRCH;
+    }
+    if (kill(p->pid, signr)) {
+        return errno;
+    }
+    return 0;
+}
+
+/* Returns the pid of process 'p', as obtained from fork() when 'p' was
+ * started. */
+pid_t
+process_pid(const struct process *p)
+{
+ return p->pid;
+}
+
+/* Returns the name of process 'p' (the name passed to process_start() with any
+ * leading directories stripped).  The string is owned by 'p': the caller must
+ * not free it, and it becomes invalid when 'p' is destroyed. */
+const char *
+process_name(const struct process *p)
+{
+ return p->name;
+}
+
+/* Reports whether process 'p' has exited: true if so, false if not.
+ *
+ * When 'p' has not exited, this also drains the notification pipe that the
+ * SIGCHLD handler writes to, so that a later process_wait() followed by
+ * poll_block() really sleeps rather than waking for a stale notification. */
+bool
+process_exited(struct process *p)
+{
+    char discard[_POSIX_PIPE_BUF];
+
+    if (p->exited) {
+        return true;
+    }
+
+    read(fds[0], discard, sizeof discard);
+    return false;
+}
+
+/* Returns process 'p''s exit status, as reported by waitpid(2).
+ * process_status(p) may be called only after process_exited(p) has returned
+ * true.
+ *
+ * The value is -1 if waitpid() itself failed for 'p'.  process_status_msg()
+ * converts a status into a human-readable string. */
+int
+process_status(const struct process *p)
+{
+ assert(p->exited);
+ return p->status;
+}
+
+/* Starts a subprocess exactly as process_start() would, with the same
+ * arguments, and then blocks, by calling poll_block() repeatedly, until the
+ * subprocess exits.
+ *
+ * Returns 0 if the subprocess was started, storing its exit status, as
+ * reported by waitpid(2), in '*status'.  Otherwise returns the positive errno
+ * value from process_start() and stores 0 in '*status'. */
+int
+process_run(char **argv,
+            const int keep_fds[], size_t n_keep_fds,
+            const int null_fds[], size_t n_null_fds,
+            int *status)
+{
+    struct process *child;
+    int error;
+
+    COVERAGE_INC(process_run);
+    error = process_start(argv, keep_fds, n_keep_fds, null_fds, n_null_fds,
+                          &child);
+    if (error) {
+        *status = 0;
+        return error;
+    }
+
+    for (;;) {
+        if (process_exited(child)) {
+            break;
+        }
+        process_wait(child);
+        poll_block();
+    }
+
+    *status = process_status(child);
+    process_destroy(child);
+    return 0;
+}
+
+/* Describes, in a newly allocated string, how a process terminated, given its
+ * 'status' in the form reported by waitpid(2) and returned by
+ * process_status().  The description names the exit status, or the signal
+ * that killed or stopped the process, and notes any core dump.
+ *
+ * The caller is responsible for freeing the string when it is no longer
+ * needed. */
+char *
+process_status_msg(int status)
+{
+    struct ds msg = DS_EMPTY_INITIALIZER;
+
+    if (WIFEXITED(status)) {
+        ds_put_format(&msg, "exit status %d", WEXITSTATUS(status));
+    } else if (WIFSIGNALED(status) || WIFSTOPPED(status)) {
+        bool killed = WIFSIGNALED(status);
+        int signr = killed ? WTERMSIG(status) : WSTOPSIG(status);
+        const char *signame = NULL;
+
+#ifdef HAVE_STRSIGNAL
+        signame = strsignal(signr);
+#endif
+        ds_put_format(&msg, "%s by signal %d",
+                      killed ? "killed" : "stopped", signr);
+        if (signame) {
+            ds_put_format(&msg, " (%s)", signame);
+        }
+    } else {
+        ds_put_format(&msg, "terminated abnormally (%x)", status);
+    }
+
+    if (WCOREDUMP(status)) {
+        ds_put_cstr(&msg, ", core dumped");
+    }
+
+    return ds_cstr(&msg);
+}
+
+/* Arranges for the next call to poll_block() to wake up once process 'p' has
+ * exited: at once if it already has, and otherwise when the SIGCHLD handler
+ * writes to the notification pipe. */
+void
+process_wait(struct process *p)
+{
+    if (!p->exited) {
+        poll_fd_wait(fds[0], POLLIN);
+    } else {
+        poll_immediate_wake();
+    }
+}
+
+/* Finds the file that 'name' refers to, in the manner of a PATH search.
+ *
+ * If 'name' contains a slash, or if PATH is not set, 'name' itself is the
+ * only candidate.  Otherwise each colon-separated directory in PATH is tried
+ * in order.  A candidate is accepted if stat() succeeds on it; it is not
+ * checked for being executable.
+ *
+ * Returns the accepted file name, which the caller must free, or a null
+ * pointer if there is none. */
+char *
+process_search_path(const char *name)
+{
+    const char *env_path = getenv("PATH");
+    char *save_ptr = NULL;
+    char *found = NULL;
+    char *dirs, *dir;
+    struct stat s;
+
+    if (strchr(name, '/') || !env_path) {
+        if (stat(name, &s) != 0) {
+            return NULL;
+        }
+        return xstrdup(name);
+    }
+
+    /* strtok_r() modifies the string it scans, so work on a copy. */
+    dirs = xstrdup(env_path);
+    for (dir = strtok_r(dirs, ":", &save_ptr); dir;
+         dir = strtok_r(NULL, ":", &save_ptr)) {
+        char *candidate = xasprintf("%s/%s", dir, name);
+
+        if (stat(candidate, &s) == 0) {
+            found = candidate;
+            break;
+        }
+        free(candidate);
+    }
+    free(dirs);
+    return found;
+}
+
+/* SIGCHLD handler.  Polls every process on 'all_processes' that has not yet
+ * been seen to exit, records the status of those that have, and then writes
+ * a byte to the notification pipe to wake up anyone blocked in poll_block()
+ * on behalf of process_wait().
+ *
+ * The handler can interrupt arbitrary code, including code that is between a
+ * failed call and its inspection of errno, so errno is saved on entry and
+ * restored on exit: waitpid() and write() below may both change it. */
+static void
+sigchld_handler(int signr UNUSED)
+{
+    int saved_errno = errno;
+    struct process *p;
+
+    COVERAGE_INC(process_sigchld);
+    LIST_FOR_EACH (p, struct process, node, &all_processes) {
+        if (!p->exited) {
+            int retval, status;
+            do {
+                retval = waitpid(p->pid, &status, WNOHANG);
+            } while (retval == -1 && errno == EINTR);
+            if (retval == p->pid) {
+                p->exited = true;
+                p->status = status;
+            } else if (retval < 0) {
+                /* XXX We want to log something but we're in a signal
+                 * handler. */
+                p->exited = true;
+                p->status = -1;
+            }
+        }
+    }
+    /* The pipe is nonblocking, so this cannot hang if the pipe is full; a
+     * full pipe already guarantees a wakeup. */
+    write(fds[1], "", 1);
+
+    errno = saved_errno;
+}
+
+/* Returns true if 'x' equals any of the first 'n' elements of 'array', false
+ * otherwise.  'array' is not examined at all when 'n' is 0. */
+static bool
+is_member(int x, const int *array, size_t n)
+{
+    while (n-- > 0) {
+        if (*array++ == x) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Adds SIGCHLD to the set of blocked signals and stores the previous signal
+ * mask in '*oldsigs', for a later call to unblock_sigchld().  Terminates the
+ * program, via ovs_fatal(), if the mask cannot be changed. */
+static void
+block_sigchld(sigset_t *oldsigs)
+{
+    sigset_t mask;
+
+    sigemptyset(&mask);
+    sigaddset(&mask, SIGCHLD);
+
+    if (sigprocmask(SIG_BLOCK, &mask, oldsigs) != 0) {
+        ovs_fatal(errno, "sigprocmask");
+    }
+}
+
+/* Restores the signal mask saved in '*oldsigs' by block_sigchld().  This
+ * replaces the whole mask rather than just unblocking SIGCHLD, so SIGCHLD
+ * stays blocked if it was already blocked before block_sigchld() was called.
+ * Terminates the program, via ovs_fatal(), if the mask cannot be changed. */
+static void
+unblock_sigchld(const sigset_t *oldsigs)
+{
+ if (sigprocmask(SIG_SETMASK, oldsigs, NULL)) {
+ ovs_fatal(errno, "sigprocmask");
+ }
+}
diff --git a/lib/process.h b/lib/process.h
new file mode 100644
index 000000000..d4aba3ae7
--- /dev/null
+++ b/lib/process.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef PROCESS_H
+#define PROCESS_H 1
+
+#include <stdbool.h>
+#include <sys/types.h>
+
+/* Handle for a child process; the structure is defined in process.c. */
+struct process;
+void process_init(void);
+char *process_escape_args(char **argv);
+int process_start(char **argv,
+ const int *keep_fds, size_t n_keep_fds,
+ const int *null_fds, size_t n_null_fds,
+ struct process **);
+void process_destroy(struct process *);
+int process_kill(const struct process *, int signr);
+
+int process_run(char **argv,
+ const int *keep_fds, size_t n_keep_fds,
+ const int *null_fds, size_t n_null_fds,
+ int *status);
+
+/* Accessors.  NOTE(review): semantics presumed from the names; confirm
+ * against the definitions in process.c. */
+pid_t process_pid(const struct process *);
+const char *process_name(const struct process *);
+bool process_exited(struct process *);
+int process_status(const struct process *);
+char *process_status_msg(int);
+
+/* Causes the next call to poll_block() to wake up when the process has
+ * exited. */
+void process_wait(struct process *);
+
+/* Looks up a file name, searching $PATH when the name has no slash.  Returns
+ * a string that the caller must free(), or NULL if nothing was found. */
+char *process_search_path(const char *);
+
+#endif /* process.h */
diff --git a/lib/queue.c b/lib/queue.c
new file mode 100644
index 000000000..2e4c7ca62
--- /dev/null
+++ b/lib/queue.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "queue.h"
+#include <assert.h>
+#include "compiler.h"
+#include "leak-checker.h"
+#include "ofpbuf.h"
+
+/* Debugging consistency check; defined at the end of this file. */
+static void check_queue(struct ovs_queue *q);
+
+/* Sets up 'q' as an empty packet queue: no head, no tail, zero packets. */
+void
+queue_init(struct ovs_queue *q)
+{
+    q->head = q->tail = NULL;
+    q->n = 0;
+}
+
+/* Destroys 'q' by deleting every packet that it contains.
+ *
+ * The members of 'q' are not reset, so 'q' must be reinitialized (see
+ * queue_clear()) before it is used again. */
+void
+queue_destroy(struct ovs_queue *q)
+{
+    struct ofpbuf *cur = q->head;
+
+    while (cur != NULL) {
+        /* Fetch the successor before 'cur' is deleted. */
+        struct ofpbuf *next = cur->next;
+        ofpbuf_delete(cur);
+        cur = next;
+    }
+}
+
+/* Empties 'q': deletes all of its packets, then resets it to the empty
+ * state so that it may be reused. */
+void
+queue_clear(struct ovs_queue *q)
+{
+    queue_destroy(q);
+    queue_init(q);
+}
+
+/* Drops the first packet from 'q' by making 'next' the new head.  'next'
+ * should be the second packet in the queue (null if there is none).  The old
+ * head is not freed here.
+ *
+ * This odd, unsafe interface exists so that the head packet can be handed to
+ * a function that may consume (and destroy) it, and removed from the queue
+ * only if that function actually accepted it.  In that situation the caller
+ * has to read the head's 'next' pointer before handing the packet over.
+ *
+ * 'q' must not be empty. */
+void
+queue_advance_head(struct ovs_queue *q, struct ofpbuf *next)
+{
+    assert(q->n);
+    assert(q->head);
+    q->n--;
+    q->head = next;
+    if (next == NULL) {
+        /* The queue just became empty. */
+        q->tail = NULL;
+    }
+}
+
+/* Adds 'b' as the last packet in 'q'. */
+void
+queue_push_tail(struct ovs_queue *q, struct ofpbuf *b)
+{
+    check_queue(q);
+    leak_checker_claim(b);
+
+    b->next = NULL;
+    if (q->n == 0) {
+        /* First packet: it is the head as well as the tail. */
+        q->head = b;
+    } else {
+        q->tail->next = b;
+    }
+    q->tail = b;
+    q->n++;
+
+    check_queue(q);
+}
+
+/* Detaches and returns the first buffer in 'q'.  'q' must not be empty.
+ * Ownership passes to the caller, who must eventually free the buffer with
+ * ofpbuf_delete(). */
+struct ofpbuf *
+queue_pop_head(struct ovs_queue *q)
+{
+    struct ofpbuf *first = q->head;
+    struct ofpbuf *second = first->next;
+
+    queue_advance_head(q, second);
+    return first;
+}
+
+/* Checks the internal integrity of 'q'. For use in debugging.
+ *
+ * The body is compiled out with "#if 0", so as written this function does
+ * nothing; change the condition to enable the checks.  When enabled it
+ * asserts that 'head' and 'tail' are both null exactly when 'n' is 0, that
+ * only the tail packet has a null 'next', and that the number of packets
+ * reachable from 'head' equals 'n'. */
+static void
+check_queue(struct ovs_queue *q UNUSED)
+{
+#if 0
+ struct ofpbuf *iter;
+ size_t n;
+
+ /* Empty and non-empty queues have different head/tail invariants. */
+ assert(q->n == 0
+ ? q->head == NULL && q->tail == NULL
+ : q->head != NULL && q->tail != NULL);
+
+ /* Walk the chain, counting packets and verifying where it ends. */
+ n = 0;
+ for (iter = q->head; iter != NULL; iter = iter->next) {
+ n++;
+ assert((iter->next != NULL) == (iter != q->tail));
+ }
+ assert(n == q->n);
+#endif
+}
diff --git a/lib/queue.h b/lib/queue.h
new file mode 100644
index 000000000..597d53945
--- /dev/null
+++ b/lib/queue.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef QUEUE_H
+#define QUEUE_H 1
+
+/* Packet queue: a FIFO of ofpbufs.  queue.c links the packets through their
+ * 'next' members, appending at 'tail' and removing from 'head'. */
+struct ovs_queue {
+ int n; /* Number of queued packets. */
+ struct ofpbuf *head; /* First queued packet, null if n == 0. */
+ struct ofpbuf *tail; /* Last queued packet, null if n == 0. */
+};
+
+/* Initialization and teardown.  queue_destroy() deletes the queued packets;
+ * queue_clear() additionally leaves the queue empty and reusable. */
+void queue_init(struct ovs_queue *);
+void queue_destroy(struct ovs_queue *);
+void queue_clear(struct ovs_queue *);
+/* Removes the head packet without freeing it; 'next' becomes the new head. */
+void queue_advance_head(struct ovs_queue *, struct ofpbuf *next);
+/* Appends a packet at the tail. */
+void queue_push_tail(struct ovs_queue *, struct ofpbuf *);
+/* Removes and returns the head packet; the queue must not be empty. */
+struct ofpbuf *queue_pop_head(struct ovs_queue *);
+
+#endif /* queue.h */
diff --git a/lib/random.c b/lib/random.c
new file mode 100644
index 000000000..96713c505
--- /dev/null
+++ b/lib/random.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "random.h"
+
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/time.h>
+
+#include "util.h"
+
+/* Seeds the C library's rand() generator from the current time of day.  Only
+ * the first call has any effect; later calls return immediately.  Aborts via
+ * ovs_fatal() if gettimeofday() fails. */
+void
+random_init(void)
+{
+    static bool inited = false;
+    struct timeval tv;
+
+    if (inited) {
+        return;
+    }
+    inited = true;
+
+    if (gettimeofday(&tv, NULL) < 0) {
+        ovs_fatal(errno, "gettimeofday");
+    }
+    srand(tv.tv_sec ^ tv.tv_usec);
+}
+
+/* Fills the 'n' bytes starting at 'p_' with pseudo-random data, one rand()
+ * call per byte (each result truncated to 8 bits). */
+void
+random_bytes(void *p_, size_t n)
+{
+    uint8_t *p = p_;
+    size_t i;
+
+    random_init();
+    for (i = 0; i < n; i++) {
+        p[i] = rand();
+    }
+}
+
+/* Returns a pseudo-random 8-bit value: the low 8 bits of one rand() call. */
+uint8_t
+random_uint8(void)
+{
+    random_init();
+    return rand();
+}
+
+/* Returns a pseudo-random 16-bit value.  When rand() yields at least 16 bits
+ * per call a single call is used; otherwise the value is assembled byte by
+ * byte with random_bytes(). */
+uint16_t
+random_uint16(void)
+{
+    uint16_t x;
+
+    if (RAND_MAX >= UINT16_MAX) {
+        random_init();
+        return rand();
+    }
+
+    random_bytes(&x, sizeof x);
+    return x;
+}
+
+/* Returns a pseudo-random 32-bit value, choosing a strategy from RAND_MAX:
+ *
+ *   - rand() covers 32 bits: one call.
+ *   - rand() covers exactly 31 bits: one call for the low 31 bits plus the
+ *     low bit of a second call for bit 31.
+ *   - otherwise: four bytes from random_bytes(). */
+uint32_t
+random_uint32(void)
+{
+    uint32_t x;
+
+    if (RAND_MAX >= UINT32_MAX) {
+        random_init();
+        return rand();
+    }
+    if (RAND_MAX == INT32_MAX) {
+        random_init();
+        return rand() | ((rand() & 1u) << 31);
+    }
+
+    random_bytes(&x, sizeof x);
+    return x;
+}
+
+/* Returns random_uint32() reduced modulo 'max', i.e. a value in the range
+ * [0, max) when 'max' is positive.  'max' must not be 0, since that would
+ * divide by zero. */
+int
+random_range(int max)
+{
+    uint32_t r = random_uint32();
+
+    return r % max;
+}
diff --git a/lib/random.h b/lib/random.h
new file mode 100644
index 000000000..bba76933d
--- /dev/null
+++ b/lib/random.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef RANDOM_H
+#define RANDOM_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+/* Seeds the generator on first use; the other functions call it themselves. */
+void random_init(void);
+/* Fills a buffer with the given number of pseudo-random bytes. */
+void random_bytes(void *, size_t);
+/* Pseudo-random values of fixed widths. */
+uint8_t random_uint8(void);
+uint16_t random_uint16(void);
+uint32_t random_uint32(void);
+/* Pseudo-random value reduced modulo 'max'; 'max' must not be 0. */
+int random_range(int max);
+
+#endif /* random.h */
diff --git a/lib/rconn.c b/lib/rconn.c
new file mode 100644
index 000000000..ead8d2d6a
--- /dev/null
+++ b/lib/rconn.c
@@ -0,0 +1,959 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "rconn.h"
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#include "coverage.h"
+#include "ofpbuf.h"
+#include "openflow/openflow.h"
+#include "poll-loop.h"
+#include "sat-math.h"
+#include "timeval.h"
+#include "util.h"
+#include "vconn.h"
+
+#define THIS_MODULE VLM_rconn
+#include "vlog.h"
+
+/* List of connection states, written as an "X-macro": each user defines
+ * STATE(NAME, VALUE) to generate what it needs, expands STATES, and then
+ * undefines STATE again.  Every state has its own bit so that code in this
+ * file can test for several states at once with a mask, e.g.
+ * "state & (S_ACTIVE | S_IDLE)". */
+#define STATES \
+ STATE(VOID, 1 << 0) \
+ STATE(BACKOFF, 1 << 1) \
+ STATE(CONNECTING, 1 << 2) \
+ STATE(ACTIVE, 1 << 3) \
+ STATE(IDLE, 1 << 4)
+/* The S_* enumerators, one per entry in STATES. */
+enum state {
+#define STATE(NAME, VALUE) S_##NAME = VALUE,
+ STATES
+#undef STATE
+};
+
+/* Returns the name of 'state' without its "S_" prefix (e.g. "ACTIVE"), or
+ * "***ERROR***" if 'state' is not one of the values listed in STATES. */
+static const char *
+state_name(enum state state)
+{
+    const char *name = "***ERROR***";
+
+    switch (state) {
+#define STATE(NAME, VALUE) case S_##NAME: name = #NAME; break;
+        STATES
+#undef STATE
+    }
+    return name;
+}
+
+/* A reliable connection to an OpenFlow switch or controller.
+ *
+ * See the large comment in rconn.h for more information. */
+struct rconn {
+    enum state state;           /* Current state, one of S_*. */
+    time_t state_entered;       /* Time at which 'state' was entered. */
+
+    struct vconn *vconn;        /* Underlying connection, null if none. */
+    char *name;                 /* Connection name, "void" when unset. */
+    bool reliable;              /* Reconnect automatically on failure? */
+
+    struct ovs_queue txq;       /* Packets waiting to be sent. */
+
+    int backoff;                /* Current reconnect delay, in seconds. */
+    int max_backoff;            /* Upper limit for 'backoff', in seconds. */
+    time_t backoff_deadline;
+    time_t last_received;       /* Time the last packet was received. */
+    time_t last_connected;      /* Time the last connection was made. */
+    unsigned int packets_sent;
+    unsigned int seqno;         /* Changes on entering or leaving S_ACTIVE. */
+
+    /* In S_ACTIVE and S_IDLE, probably_admitted reports whether we believe
+     * that the peer has made a (positive) admission control decision on our
+     * connection.  If we have not yet been (probably) admitted, then the
+     * connection does not reset the timer used for deciding whether the switch
+     * should go into fail-open mode.
+     *
+     * last_admitted reports the last time we believe such a positive admission
+     * control decision was made. */
+    bool probably_admitted;
+    time_t last_admitted;
+
+    /* These values are simply for statistics reporting, not used directly by
+     * anything internal to the rconn (or the secchan for that matter). */
+    unsigned int packets_received;
+    unsigned int n_attempted_connections, n_successful_connections;
+    time_t creation_time;
+    unsigned long int total_time_connected;
+
+    /* If we can't connect to the peer, it could be for any number of reasons.
+     * Usually, one would assume it is because the peer is not running or
+     * because the network is partitioned.  But it could also be because the
+     * network topology has changed, in which case the upper layer will need to
+     * reassess it (in particular, obtain a new IP address via DHCP and find
+     * the new location of the controller).  We set this flag when we suspect
+     * that this could be the case. */
+    bool questionable_connectivity;
+    time_t last_questioned;
+
+    /* Throughout this file, "probe" is shorthand for "inactivity probe".
+     * When nothing has been received from the peer for a while, we send out
+     * an echo request as an inactivity probe packet.  We should receive back
+     * a response. */
+    int probe_interval;         /* Secs of inactivity before sending probe. */
+
+    /* Messages sent or received are copied to the monitor connections.
+     *
+     * The array is sized with MAX_MONITORS (rather than a repeated literal)
+     * so that the limit is defined in exactly one place. */
+#define MAX_MONITORS 8
+    struct vconn *monitors[MAX_MONITORS];
+    size_t n_monitors;          /* Number of entries in use in 'monitors'. */
+};
+
+/* Forward declarations of file-local helpers that are defined further down
+ * in this file but used before their definitions. */
+static unsigned int elapsed_in_this_state(const struct rconn *);
+static unsigned int timeout(const struct rconn *);
+static bool timed_out(const struct rconn *);
+static void state_transition(struct rconn *, enum state);
+static int try_send(struct rconn *);
+static int reconnect(struct rconn *);
+static void disconnect(struct rconn *, int error);
+static void flush_queue(struct rconn *);
+static void question_connectivity(struct rconn *);
+static void copy_to_monitor(struct rconn *, const struct ofpbuf *);
+static bool is_connected_state(enum state);
+static bool is_admitted_msg(const struct ofpbuf *);
+
+/* Convenience constructor: creates an rconn with the given inactivity probe
+ * interval and maximum backoff (see rconn_create()), starts a reliable
+ * connection to 'name', and returns the rconn.  The result of the initial
+ * connection attempt is not reported to the caller. */
+struct rconn *
+rconn_new(const char *name, int inactivity_probe_interval, int max_backoff)
+{
+    struct rconn *rc;
+
+    rc = rconn_create(inactivity_probe_interval, max_backoff);
+    rconn_connect(rc, name);
+    return rc;
+}
+
+/* Convenience constructor: creates an rconn with a 60-second probe interval
+ * and the default maximum backoff, attaches the already-open 'vconn' to it
+ * as an unreliable connection called 'name', and returns the rconn. */
+struct rconn *
+rconn_new_from_vconn(const char *name, struct vconn *vconn)
+{
+    struct rconn *rc;
+
+    rc = rconn_create(60, 0);
+    rconn_connect_unreliably(rc, name, vconn);
+    return rc;
+}
+
+/* Allocates and returns a new rconn in the S_VOID (unconnected) state.
+ *
+ * 'probe_interval' is a number of seconds.  If that much time passes without
+ * an OpenFlow message arriving from the peer, the rconn sends an "echo
+ * request".  If the interval then passes a second time without any message,
+ * the rconn disconnects and reconnects.  A 'probe_interval' of 0 turns this
+ * off.
+ *
+ * 'max_backoff' caps the number of seconds between connection attempts.  The
+ * delay starts at 1 second and doubles after every failure until it reaches
+ * 'max_backoff'.  Passing 0 selects the default of 60 seconds. */
+struct rconn *
+rconn_create(int probe_interval, int max_backoff)
+{
+    struct rconn *rc = xcalloc(1, sizeof *rc);
+
+    /* State machine. */
+    rc->state = S_VOID;
+    rc->state_entered = time_now();
+
+    /* Connection identity. */
+    rc->vconn = NULL;
+    rc->name = xstrdup("void");
+    rc->reliable = false;
+
+    queue_init(&rc->txq);
+
+    /* Backoff and activity tracking. */
+    if (!max_backoff) {
+        max_backoff = 60;
+    }
+    rc->backoff = 0;
+    rc->max_backoff = max_backoff;
+    rc->backoff_deadline = TIME_MIN;
+    rc->last_received = time_now();
+    rc->last_connected = time_now();
+    rc->seqno = 0;
+
+    rc->packets_sent = 0;
+
+    /* Admission control. */
+    rc->probably_admitted = false;
+    rc->last_admitted = time_now();
+
+    /* Statistics. */
+    rc->packets_received = 0;
+    rc->n_attempted_connections = 0;
+    rc->n_successful_connections = 0;
+    rc->creation_time = time_now();
+    rc->total_time_connected = 0;
+
+    rc->questionable_connectivity = false;
+    rc->last_questioned = time_now();
+
+    rconn_set_probe_interval(rc, probe_interval);
+
+    rc->n_monitors = 0;
+
+    return rc;
+}
+
+/* Sets the maximum number of seconds between connection attempts on 'rc' to
+ * 'max_backoff', clamped to at least 1 second.
+ *
+ * If 'rc' is currently backing off for longer than the new maximum, the
+ * current backoff and its deadline are shortened to match.  The clamped
+ * value is used throughout, so a 'max_backoff' of 0 or less can never yield a
+ * zero or negative backoff. */
+void
+rconn_set_max_backoff(struct rconn *rc, int max_backoff)
+{
+    rc->max_backoff = MAX(1, max_backoff);
+    if (rc->state == S_BACKOFF && rc->backoff > rc->max_backoff) {
+        rc->backoff = rc->max_backoff;
+        if (rc->backoff_deadline > time_now() + rc->backoff) {
+            rc->backoff_deadline = time_now() + rc->backoff;
+        }
+    }
+}
+
+/* Returns the maximum backoff configured on 'rc', in seconds. */
+int
+rconn_get_max_backoff(const struct rconn *rc)
+{
+    return rc->max_backoff;
+}
+
+/* Sets the inactivity probe interval of 'rc'.  A 'probe_interval' of 0
+ * disables probing; any other value is raised to at least 5 seconds. */
+void
+rconn_set_probe_interval(struct rconn *rc, int probe_interval)
+{
+    if (probe_interval) {
+        rc->probe_interval = MAX(5, probe_interval);
+    } else {
+        rc->probe_interval = 0;
+    }
+}
+
+/* Returns the inactivity probe interval of 'rc', in seconds (0 if probing is
+ * disabled). */
+int
+rconn_get_probe_interval(const struct rconn *rc)
+{
+    return rc->probe_interval;
+}
+
+/* Drops any existing connection on 'rc', then starts a reliable connection
+ * to 'name' (a copy of 'name' is kept).  Returns the result of the first
+ * connection attempt: 0 if it was started, otherwise the error reported by
+ * vconn_open().  Because the connection is reliable, 'rc' keeps retrying
+ * with backoff even if the first attempt fails. */
+int
+rconn_connect(struct rconn *rc, const char *name)
+{
+    rconn_disconnect(rc);
+
+    free(rc->name);
+    rc->name = xstrdup(name);
+    rc->reliable = true;
+
+    return reconnect(rc);
+}
+
+/* Drops any existing connection on 'rc', then adopts the already-open
+ * 'vconn' (which must not be null) as its connection under the name 'name'
+ * and enters S_ACTIVE immediately.  The connection is unreliable: 'rc' will
+ * not try to re-establish it after a failure. */
+void
+rconn_connect_unreliably(struct rconn *rc,
+                         const char *name, struct vconn *vconn)
+{
+    assert(vconn != NULL);
+    rconn_disconnect(rc);
+
+    free(rc->name);
+    rc->name = xstrdup(name);
+    rc->reliable = false;
+
+    rc->vconn = vconn;
+    rc->last_connected = time_now();
+    state_transition(rc, S_ACTIVE);
+}
+
+/* Forces 'rc' to drop its connection if it currently has one (state S_ACTIVE
+ * or S_IDLE); a reliable rconn then reconnects through the usual backoff
+ * path.  Has no effect in any other state. */
+void
+rconn_reconnect(struct rconn *rc)
+{
+    if (is_connected_state(rc->state)) {
+        disconnect(rc, 0);
+    }
+}
+
+/* Disconnects 'rc' for good: closes its vconn (if any), renames it to
+ * "void", makes it unreliable, clears the backoff state, and enters S_VOID.
+ * Does nothing if 'rc' is already in S_VOID.  The send queue is left
+ * untouched. */
+void
+rconn_disconnect(struct rconn *rc)
+{
+    if (rc->state == S_VOID) {
+        return;
+    }
+
+    if (rc->vconn) {
+        vconn_close(rc->vconn);
+        rc->vconn = NULL;
+    }
+
+    free(rc->name);
+    rc->name = xstrdup("void");
+    rc->reliable = false;
+
+    rc->backoff = 0;
+    rc->backoff_deadline = TIME_MIN;
+
+    state_transition(rc, S_VOID);
+}
+
+/* Tears down 'rc' and releases everything it owns: its name, its vconn, any
+ * packets still queued for sending, and all of its monitor connections.
+ * Passing a null 'rc' is a no-op. */
+void
+rconn_destroy(struct rconn *rc)
+{
+    size_t i;
+
+    if (!rc) {
+        return;
+    }
+
+    free(rc->name);
+    vconn_close(rc->vconn);
+    flush_queue(rc);
+    queue_destroy(&rc->txq);
+    for (i = 0; i < rc->n_monitors; i++) {
+        vconn_close(rc->monitors[i]);
+    }
+    free(rc);
+}
+
+/* S_VOID never times out; UINT_MAX is the "no timeout" value. */
+static unsigned int
+timeout_VOID(const struct rconn *rc UNUSED)
+{
+    return UINT_MAX;
+}
+
+/* Periodic work for S_VOID: none, since nothing is connected or pending. */
+static void
+run_VOID(struct rconn *rc UNUSED)
+{
+    /* Nothing to do. */
+}
+
+/* Starts a new connection attempt to 'rc->name'.
+ *
+ * If the vconn opens, enters S_CONNECTING.  If it does not, logs a warning
+ * and goes through disconnect() so that the backoff keeps growing.
+ *
+ * Returns 0 if the attempt was started, otherwise the error from
+ * vconn_open(). */
+static int
+reconnect(struct rconn *rc)
+{
+    int error;
+
+    VLOG_INFO("%s: connecting...", rc->name);
+    rc->n_attempted_connections++;
+
+    error = vconn_open(rc->name, OFP_VERSION, &rc->vconn);
+    if (error) {
+        VLOG_WARN("%s: connection failed (%s)", rc->name, strerror(error));
+        rc->backoff_deadline = TIME_MAX; /* Prevent resetting backoff. */
+        disconnect(rc, 0);
+        return error;
+    }
+
+    rc->backoff_deadline = time_now() + rc->backoff;
+    state_transition(rc, S_CONNECTING);
+    return 0;
+}
+
+/* S_BACKOFF lasts for the current backoff, in seconds. */
+static unsigned int
+timeout_BACKOFF(const struct rconn *rc)
+{
+    return rc->backoff;
+}
+
+/* Periodic work for S_BACKOFF: once the backoff has elapsed, try to connect
+ * again. */
+static void
+run_BACKOFF(struct rconn *rc)
+{
+    if (!timed_out(rc)) {
+        return;
+    }
+    reconnect(rc);
+}
+
+/* A connection attempt may take as long as the current backoff, but never
+ * less than 1 second. */
+static unsigned int
+timeout_CONNECTING(const struct rconn *rc)
+{
+    return MAX(1, rc->backoff);
+}
+
+/* Periodic work for S_CONNECTING: polls the connection attempt.
+ *
+ *   - Completed: enter S_ACTIVE and record the connection time.
+ *   - Still in progress (EAGAIN): give up only once the attempt has timed
+ *     out, preserving the backoff so that it keeps growing.
+ *   - Any other error: disconnect, reporting the error. */
+static void
+run_CONNECTING(struct rconn *rc)
+{
+    int error = vconn_connect(rc->vconn);
+
+    if (error == EAGAIN) {
+        if (timed_out(rc)) {
+            VLOG_INFO("%s: connection timed out", rc->name);
+            rc->backoff_deadline = TIME_MAX; /* Prevent resetting backoff. */
+            disconnect(rc, 0);
+        }
+    } else if (error) {
+        VLOG_INFO("%s: connection failed (%s)", rc->name, strerror(error));
+        disconnect(rc, error);
+    } else {
+        VLOG_INFO("%s: connected", rc->name);
+        rc->n_successful_connections++;
+        state_transition(rc, S_ACTIVE);
+        rc->last_connected = rc->state_entered;
+    }
+}
+
+/* Sends as many queued packets as the connection will take, stopping at the
+ * first one that try_send() cannot send.  If the queue was non-empty on
+ * entry and is empty afterward, wakes up the poll loop immediately.  Does
+ * nothing if the queue is empty on entry. */
+static void
+do_tx_work(struct rconn *rc)
+{
+    if (!rc->txq.n) {
+        return;
+    }
+
+    while (rc->txq.n > 0 && !try_send(rc)) {
+        continue;
+    }
+
+    if (!rc->txq.n) {
+        poll_immediate_wake();
+    }
+}
+
+/* Returns how long S_ACTIVE may last, measured from the time the state was
+ * entered: until 'probe_interval' seconds after the later of the last
+ * received packet and the state entry.  With probing disabled S_ACTIVE never
+ * times out (UINT_MAX). */
+static unsigned int
+timeout_ACTIVE(const struct rconn *rc)
+{
+    unsigned int base, arg;
+
+    if (!rc->probe_interval) {
+        return UINT_MAX;
+    }
+
+    base = MAX(rc->last_received, rc->state_entered);
+    arg = base + rc->probe_interval - rc->state_entered;
+    return arg;
+}
+
+/* Periodic work for S_ACTIVE: normally just pushes out queued packets.  If
+ * the peer has been silent for the probe interval, enters S_IDLE and sends
+ * an echo request as an inactivity probe instead. */
+static void
+run_ACTIVE(struct rconn *rc)
+{
+    unsigned int base;
+
+    if (!timed_out(rc)) {
+        do_tx_work(rc);
+        return;
+    }
+
+    base = MAX(rc->last_received, rc->state_entered);
+    VLOG_DBG("%s: idle %u seconds, sending inactivity probe",
+             rc->name, (unsigned int) (time_now() - base));
+
+    /* Ordering is important here: rconn_send() can transition to BACKOFF,
+     * and we don't want to transition back to IDLE if so, because then we
+     * can end up queuing a packet with vconn == NULL and then *boom*. */
+    state_transition(rc, S_IDLE);
+    rconn_send(rc, make_echo_request(), NULL);
+}
+
+/* The peer gets one probe interval to answer the inactivity probe. */
+static unsigned int
+timeout_IDLE(const struct rconn *rc)
+{
+    return rc->probe_interval;
+}
+
+/* Periodic work for S_IDLE: keeps sending queued packets while waiting for
+ * the peer to respond to the inactivity probe.  If the wait times out, flags
+ * connectivity as questionable, logs an error, and disconnects. */
+static void
+run_IDLE(struct rconn *rc)
+{
+    if (!timed_out(rc)) {
+        do_tx_work(rc);
+        return;
+    }
+
+    question_connectivity(rc);
+    VLOG_ERR("%s: no response to inactivity probe after %u "
+             "seconds, disconnecting",
+             rc->name, elapsed_in_this_state(rc));
+    disconnect(rc, 0);
+}
+
+/* Does the periodic maintenance for 'rc': when 'rc' is disconnected, tries
+ * to (re)connect, backing off as needed; when it is connected, tries to send
+ * whatever is in the send queue.
+ *
+ * The handler for the current state is run repeatedly until a pass leaves
+ * the state unchanged, so that a state entered during this call gets its
+ * handler run right away. */
+void
+rconn_run(struct rconn *rc)
+{
+    for (;;) {
+        int old_state = rc->state;
+
+        switch (rc->state) {
+#define STATE(NAME, VALUE) case S_##NAME: run_##NAME(rc); break;
+            STATES
+#undef STATE
+        default:
+            NOT_REACHED();
+        }
+
+        if (rc->state == old_state) {
+            break;
+        }
+    }
+}
+
+/* Arranges for the next poll_block() call to wake up when rconn_run() needs
+ * to be called on 'rc': when the current state's timeout expires (if it has
+ * one), and, if 'rc' is connected with packets queued, when the vconn can
+ * accept more data. */
+void
+rconn_run_wait(struct rconn *rc)
+{
+    unsigned int timeo = timeout(rc);
+
+    if (timeo != UINT_MAX) {
+        /* Saturating arithmetic keeps the deadline from wrapping around. */
+        unsigned int expires = sat_add(rc->state_entered, timeo);
+        unsigned int remaining = sat_sub(expires, time_now());
+        poll_timer_wait(sat_mul(remaining, 1000));
+    }
+
+    if (rconn_is_connected(rc) && rc->txq.n) {
+        vconn_wait(rc->vconn, WAIT_SEND);
+    }
+}
+
+/* Tries to receive one packet from 'rc'.  Returns the packet on success; the
+ * caller then owns it and must free it with ofpbuf_delete().  Returns a null
+ * pointer if 'rc' is not connected or no packet could be received.  A
+ * receive error other than EAGAIN disconnects 'rc'.
+ *
+ * Each received packet is copied to the monitors, refreshes the inactivity
+ * bookkeeping, and moves 'rc' from S_IDLE back to S_ACTIVE.  The peer is
+ * considered to have admitted us once a message that passes
+ * is_admitted_msg() arrives or the connection has lasted 30 seconds. */
+struct ofpbuf *
+rconn_recv(struct rconn *rc)
+{
+    struct ofpbuf *buffer;
+    int error;
+
+    if (!(rc->state & (S_ACTIVE | S_IDLE))) {
+        return NULL;
+    }
+
+    error = vconn_recv(rc->vconn, &buffer);
+    if (error) {
+        if (error != EAGAIN) {
+            disconnect(rc, error);
+        }
+        return NULL;
+    }
+
+    copy_to_monitor(rc, buffer);
+    if (is_admitted_msg(buffer)
+        || time_now() - rc->last_connected >= 30) {
+        rc->probably_admitted = true;
+        rc->last_admitted = time_now();
+    }
+    rc->last_received = time_now();
+    rc->packets_received++;
+    if (rc->state == S_IDLE) {
+        state_transition(rc, S_ACTIVE);
+    }
+    return buffer;
+}
+
+/* Arranges for the next poll_block() call to wake up when rconn_recv() on
+ * 'rc' may have a packet to return.  Does nothing if 'rc' has no vconn. */
+void
+rconn_recv_wait(struct rconn *rc)
+{
+    if (!rc->vconn) {
+        return;
+    }
+    vconn_wait(rc->vconn, WAIT_RECV);
+}
+
+/* Queues 'b' for transmission on 'rc'.  Returns 0 on success, in which case
+ * 'rc' takes ownership of 'b', or ENOTCONN if 'rc' is not currently
+ * connected, in which case the caller keeps ownership of 'b'.
+ *
+ * If 'counter' is non-null it is incremented for as long as the packet is in
+ * flight and decremented once the packet has been sent (or discarded because
+ * of a disconnection).  Since 'b' may be sent (or discarded) before this
+ * function returns, the caller may not observe any change in 'counter'.
+ *
+ * There is no rconn_send_wait() function: the rconn owns a send queue that
+ * it drains whenever rconn_run() is called, which in turn has the side
+ * effect of waking up poll_block(). */
+int
+rconn_send(struct rconn *rc, struct ofpbuf *b,
+           struct rconn_packet_counter *counter)
+{
+    if (!rconn_is_connected(rc)) {
+        return ENOTCONN;
+    }
+
+    COVERAGE_INC(rconn_queued);
+    copy_to_monitor(rc, b);
+    b->private = counter;
+    if (counter) {
+        rconn_packet_counter_inc(counter);
+    }
+    queue_push_tail(&rc->txq, b);
+
+    /* Only try to send right away if 'b' is the sole packet in the queue.
+     * If others were already waiting, the vconn is backlogged and stuffing
+     * more into it now is pointless; rconn_run() will get to it later. */
+    if (rc->txq.n == 1) {
+        try_send(rc);
+    }
+    return 0;
+}
+
+/* Like rconn_send(), but refuses to queue 'b' if 'counter->n' has already
+ * reached 'queue_limit'.  'counter' must not be null.
+ *
+ * Returns 0 if 'b' was queued, EAGAIN if the limit was reached, or ENOTCONN
+ * if 'rc' is not currently connected.  'b' is always consumed: on failure it
+ * is freed here.
+ *
+ * 'counter' is incremented while the packet is in flight and decremented
+ * once it has been sent (or discarded because of a disconnection).  Since
+ * that may happen before this function returns, the caller may not observe
+ * any change in 'counter'.
+ *
+ * There is no rconn_send_wait() function: the rconn owns a send queue that
+ * it drains whenever rconn_run() is called, which in turn has the side
+ * effect of waking up poll_block(). */
+int
+rconn_send_with_limit(struct rconn *rc, struct ofpbuf *b,
+                      struct rconn_packet_counter *counter, int queue_limit)
+{
+    int error;
+
+    if (counter->n >= queue_limit) {
+        error = EAGAIN;
+    } else {
+        error = rconn_send(rc, b, counter);
+    }
+
+    if (error) {
+        COVERAGE_INC(rconn_overflow);
+        ofpbuf_delete(b);
+    }
+    return error;
+}
+
+/* Returns the number of packets that 'rc' has successfully handed to the
+ * underlying vconn.  Packets that are still waiting in the rconn's send
+ * queue are not counted. */
+unsigned int
+rconn_packets_sent(const struct rconn *rc)
+{
+    return rc->packets_sent;
+}
+
+/* Registers 'vconn' as a monitoring connection on 'rc': every message sent
+ * or received on 'rc' is copied to it.  'rc' takes ownership of 'vconn'.  If
+ * the monitor table is already full, 'vconn' is closed instead. */
+void
+rconn_add_monitor(struct rconn *rc, struct vconn *vconn)
+{
+    if (rc->n_monitors >= ARRAY_SIZE(rc->monitors)) {
+        VLOG_DBG("too many monitor connections, discarding %s",
+                 vconn_get_name(vconn));
+        vconn_close(vconn);
+        return;
+    }
+
+    VLOG_INFO("new monitor connection from %s", vconn_get_name(vconn));
+    rc->monitors[rc->n_monitors++] = vconn;
+}
+
+/* Returns the name of 'rc': the 'name' last given to rconn_connect() or
+ * rconn_connect_unreliably(), or "void" while 'rc' is unconnected.  The
+ * string is owned by 'rc'. */
+const char *
+rconn_get_name(const struct rconn *rc)
+{
+    return rc->name;
+}
+
+/* Returns true if 'rconn' is connected or working on (re)connecting, false
+ * if it is in S_VOID, that is, disconnected with no plan to reconnect on its
+ * own. */
+bool
+rconn_is_alive(const struct rconn *rconn)
+{
+    return rconn->state != S_VOID;
+}
+
+/* Returns true if 'rconn' currently has an established connection (state
+ * S_ACTIVE or S_IDLE), false otherwise. */
+bool
+rconn_is_connected(const struct rconn *rconn)
+{
+    return is_connected_state(rconn->state);
+}
+
+/* Returns 0 if 'rconn' is connected.  Otherwise 'rconn' is in "failure
+ * mode", and the return value is the number of seconds since it was last
+ * (probably) admitted by the peer.  Measuring from the last admission means
+ * that brief connections which the controller's admission control policy
+ * quickly dropped do not reset the clock. */
+int
+rconn_failure_duration(const struct rconn *rconn)
+{
+    if (rconn_is_connected(rconn)) {
+        return 0;
+    }
+    return time_now() - rconn->last_admitted;
+}
+
+/* Returns the peer's IP address as reported by the underlying vconn, or 0 if
+ * 'rconn' has no vconn.  (vconn_get_ip() presumably also yields 0 for
+ * transports that are not IP-based or whose address is unknown.) */
+uint32_t
+rconn_get_ip(const struct rconn *rconn)
+{
+    if (!rconn->vconn) {
+        return 0;
+    }
+    return vconn_get_ip(rconn->vconn);
+}
+
+/* If 'rconn' can't connect to the peer, it could be for any number of
+ * reasons.  Usually one would assume that the peer is not running or that
+ * the network is partitioned.  But the network topology may also have
+ * changed, in which case the upper layer needs to reassess it (in
+ * particular, obtain a new IP address via DHCP and find the new location of
+ * the controller).
+ *
+ * Returns true when it appears that this might be the case.  The
+ * questionability flag is cleared by this call, and question_connectivity()
+ * will not set it again until some time has passed. */
+bool
+rconn_is_connectivity_questionable(struct rconn *rconn)
+{
+    bool was_questionable = rconn->questionable_connectivity;
+
+    rconn->questionable_connectivity = false;
+    return was_questionable;
+}
+
+/* Returns the number of packets that rconn_recv() has successfully received
+ * from the underlying vconn on behalf of 'rc'. */
+unsigned int
+rconn_packets_received(const struct rconn *rc)
+{
+    return rc->packets_received;
+}
+
+/* Returns the name of the internal state of 'rc' (e.g. "ACTIVE") as a
+ * constant string, which the caller must neither modify nor free. */
+const char *
+rconn_get_state(const struct rconn *rc)
+{
+    return state_name(rc->state);
+}
+
+/* Returns how many connection attempts 'rc' has made.  An attempt that is
+ * still in progress, neither succeeded nor failed yet, is included. */
+unsigned int
+rconn_get_attempted_connections(const struct rconn *rc)
+{
+    return rc->n_attempted_connections;
+}
+
+/* Returns how many of the connection attempts made by 'rc' succeeded. */
+unsigned int
+rconn_get_successful_connections(const struct rconn *rc)
+{
+    return rc->n_successful_connections;
+}
+
+/* Returns the time at which 'rc' last established a connection (initially
+ * the time at which 'rc' was created). */
+time_t
+rconn_get_last_connection(const struct rconn *rc)
+{
+    return rc->last_connected;
+}
+
+/* Returns the time at which rconn_create() created 'rc'. */
+time_t
+rconn_get_creation_time(const struct rconn *rc)
+{
+    return rc->creation_time;
+}
+
+/* Returns approximately how many seconds 'rc' has spent connected: the time
+ * accumulated over connected states already left, plus, if 'rc' is connected
+ * right now, the time spent so far in the current state. */
+unsigned long int
+rconn_get_total_time_connected(const struct rconn *rc)
+{
+    unsigned long int total = rc->total_time_connected;
+
+    if (rconn_is_connected(rc)) {
+        total += elapsed_in_this_state(rc);
+    }
+    return total;
+}
+
+/* Returns the current backoff of 'rc' in seconds, i.e. how long 'rc' stays
+ * in BACKOFF before it makes its next connection attempt. */
+int
+rconn_get_backoff(const struct rconn *rc)
+{
+    return rc->backoff;
+}
+
+/* Returns how many seconds 'rc' has spent in its current state so far. */
+unsigned int
+rconn_get_state_elapsed(const struct rconn *rc)
+{
+    return elapsed_in_this_state(rc);
+}
+
+/* Returns the connection sequence number of 'rc', which is incremented
+ * every time 'rc' enters or leaves S_ACTIVE.  Callers can compare two
+ * readings to detect that the connection went up or down in between. */
+unsigned int
+rconn_get_connection_seqno(const struct rconn *rc)
+{
+    return rc->seqno;
+}
+
+/* Allocates and returns a packet counter with no packets in flight and a
+ * single reference, held by the caller.  Release that reference with
+ * rconn_packet_counter_destroy(). */
+struct rconn_packet_counter *
+rconn_packet_counter_create(void)
+{
+    struct rconn_packet_counter *counter = xmalloc(sizeof *counter);
+
+    counter->ref_cnt = 1;
+    counter->n = 0;
+    return counter;
+}
+
+/* Drops one reference to 'c'.  The counter is freed once it has no
+ * references left and no packets in flight; if packets are still in flight,
+ * the final rconn_packet_counter_dec() frees it instead.  A null 'c' is a
+ * no-op. */
+void
+rconn_packet_counter_destroy(struct rconn_packet_counter *c)
+{
+    if (!c) {
+        return;
+    }
+
+    assert(c->ref_cnt > 0);
+    c->ref_cnt--;
+    if (!c->ref_cnt && !c->n) {
+        free(c);
+    }
+}
+
+/* Records that one more packet tracked by 'c' is in flight. */
+void
+rconn_packet_counter_inc(struct rconn_packet_counter *c)
+{
+    c->n++;
+}
+
+/* Records that a packet tracked by 'c' is no longer in flight.  If that was
+ * the last such packet and every reference to 'c' has already been dropped,
+ * frees 'c'. */
+void
+rconn_packet_counter_dec(struct rconn_packet_counter *c)
+{
+    assert(c->n > 0);
+    c->n--;
+    if (!c->n && !c->ref_cnt) {
+        free(c);
+    }
+}
+
+/* Attempts to send the packet at the head of the send queue of 'rc', which
+ * must not be empty.  Returns 0 if the packet was sent, otherwise a positive
+ * errno value.  The packet stays queued on EAGAIN; any other error
+ * disconnects 'rc'. */
+static int
+try_send(struct rconn *rc)
+{
+    struct ofpbuf *head = rc->txq.head;
+    /* Read everything we need from the packet up front: these values are
+     * used after vconn_send() has been handed the packet. */
+    struct ofpbuf *next = head->next;
+    struct rconn_packet_counter *counter = head->private;
+    int error;
+
+    error = vconn_send(rc->vconn, head);
+    if (error) {
+        if (error != EAGAIN) {
+            disconnect(rc, error);
+        }
+        return error;
+    }
+
+    COVERAGE_INC(rconn_sent);
+    rc->packets_sent++;
+    if (counter) {
+        rconn_packet_counter_dec(counter);
+    }
+    queue_advance_head(&rc->txq, next);
+    return 0;
+}
+
+/* Disconnects 'rc'.  'error' is used only for logging purposes.  If it is
+ * nonzero, then it should be EOF to indicate the connection was closed by the
+ * peer in a normal fashion or a positive errno value.
+ *
+ * An unreliable rconn is simply disconnected for good (see
+ * rconn_disconnect()).  A reliable rconn closes its vconn and drops its send
+ * queue if it had a connection or an attempt in progress, then enters
+ * S_BACKOFF.  The backoff restarts at 1 second if the previous backoff
+ * deadline has passed; otherwise it doubles, up to 'max_backoff'.  If the
+ * last successful connection was more than 60 seconds ago, connectivity is
+ * flagged as questionable. */
+static void
+disconnect(struct rconn *rc, int error)
+{
+    time_t now;
+
+    if (!rc->reliable) {
+        rconn_disconnect(rc);
+        return;
+    }
+
+    now = time_now();
+
+    if (rc->state & (S_CONNECTING | S_ACTIVE | S_IDLE)) {
+        if (error > 0) {
+            VLOG_WARN("%s: connection dropped (%s)",
+                      rc->name, strerror(error));
+        } else if (error == EOF) {
+            VLOG_INFO("%s: connection closed by peer", rc->name);
+        } else {
+            VLOG_INFO("%s: connection dropped", rc->name);
+        }
+        vconn_close(rc->vconn);
+        rc->vconn = NULL;
+        flush_queue(rc);
+    }
+
+    if (now >= rc->backoff_deadline) {
+        rc->backoff = 1;
+    } else {
+        rc->backoff = MIN(rc->max_backoff, MAX(1, 2 * rc->backoff));
+        /* No trailing newline here, matching every other log call in this
+         * file. */
+        VLOG_INFO("%s: waiting %d seconds before reconnect",
+                  rc->name, rc->backoff);
+    }
+    rc->backoff_deadline = now + rc->backoff;
+    state_transition(rc, S_BACKOFF);
+    if (now - rc->last_connected > 60) {
+        question_connectivity(rc);
+    }
+}
+
+/* Discards every packet in the send queue of 'rc', decrementing the packet
+ * counter attached to each one (if any).  If anything was discarded, wakes
+ * up the poll loop immediately.  Does nothing if the queue is already
+ * empty. */
+static void
+flush_queue(struct rconn *rc)
+{
+    if (!rc->txq.n) {
+        return;
+    }
+
+    while (rc->txq.n > 0) {
+        struct ofpbuf *packet = queue_pop_head(&rc->txq);
+        struct rconn_packet_counter *counter = packet->private;
+
+        if (counter) {
+            rconn_packet_counter_dec(counter);
+        }
+        COVERAGE_INC(rconn_discarded);
+        ofpbuf_delete(packet);
+    }
+    poll_immediate_wake();
+}
+
+/* Returns the number of seconds since 'rc' entered its current state. */
+static unsigned int
+elapsed_in_this_state(const struct rconn *rc)
+{
+    return time_now() - rc->state_entered;
+}
+
+/* Returns how many seconds 'rc' may stay in its current state, counted from
+ * the moment it entered that state, by dispatching to the matching
+ * timeout_<STATE>() function.  UINT_MAX means "no timeout". */
+static unsigned int
+timeout(const struct rconn *rc)
+{
+    switch (rc->state) {
+#define STATE(NAME, VALUE) case S_##NAME: return timeout_##NAME(rc);
+        STATES
+#undef STATE
+    default:
+        NOT_REACHED();
+    }
+}
+
+/* Returns true if 'rc' has been in its current state for at least that
+ * state's timeout.  The deadline is computed with a saturating addition so
+ * that a UINT_MAX ("never") timeout cannot wrap around. */
+static bool
+timed_out(const struct rconn *rc)
+{
+    return time_now() >= sat_add(rc->state_entered, timeout(rc));
+}
+
+/* Moves 'rc' into 'state' and updates the bookkeeping that depends on state
+ * changes:
+ *
+ *   - The connection sequence number is bumped when entering or leaving
+ *     S_ACTIVE.
+ *   - 'probably_admitted' is cleared when going from an unconnected to a
+ *     connected state.
+ *   - If the state being left is a connected one, the time spent in it is
+ *     added to 'total_time_connected'.
+ *
+ * Finally the new state and its entry time are recorded. */
+static void
+state_transition(struct rconn *rc, enum state state)
+{
+    if ((rc->state == S_ACTIVE) != (state == S_ACTIVE)) {
+        rc->seqno++;
+    }
+
+    if (is_connected_state(state) && !is_connected_state(rc->state)) {
+        rc->probably_admitted = false;
+    }
+
+    /* 'rc->state' is still the old state here. */
+    if (rconn_is_connected(rc)) {
+        rc->total_time_connected += elapsed_in_this_state(rc);
+    }
+
+    VLOG_DBG("%s: entering %s", rc->name, state_name(state));
+    rc->state = state;
+    rc->state_entered = time_now();
+}
+
+/* Flags the connectivity of 'rc' as questionable, unless it was already
+ * questioned within the last 60 seconds (which keeps the flag from being
+ * raised too often). */
+static void
+question_connectivity(struct rconn *rc)
+{
+    time_t now = time_now();
+
+    if (now - rc->last_questioned <= 60) {
+        return;
+    }
+    rc->questionable_connectivity = true;
+    rc->last_questioned = now;
+}
+
+/* Sends a copy of 'b' to each monitor connection of 'rc'.
+ *
+ * A clone of 'b' is made lazily and handed to vconn_send().  If the send
+ * succeeds the vconn has consumed the clone, so a fresh one is made for the
+ * next monitor; if it fails, the same clone is reused.  A monitor that
+ * returns EAGAIN merely misses this message.  A monitor that fails with any
+ * other error is closed and removed from the table. */
+static void
+copy_to_monitor(struct rconn *rc, const struct ofpbuf *b)
+{
+    struct ofpbuf *clone = NULL;
+    int retval;
+    size_t i;
+
+    for (i = 0; i < rc->n_monitors; ) {
+        struct vconn *vconn = rc->monitors[i];
+
+        if (!clone) {
+            clone = ofpbuf_clone(b);
+        }
+        retval = vconn_send(vconn, clone);
+        if (!retval) {
+            clone = NULL;
+        } else if (retval != EAGAIN) {
+            VLOG_DBG("%s: closing monitor connection to %s: %s",
+                     rconn_get_name(rc), vconn_get_name(vconn),
+                     strerror(retval));
+            /* 'rc' owns its monitors, and the table entry is the only
+             * reference to this one, so close it before dropping the entry
+             * instead of leaking it. */
+            vconn_close(vconn);
+            /* Fill the hole with the last entry and revisit index 'i'. */
+            rc->monitors[i] = rc->monitors[--rc->n_monitors];
+            continue;
+        }
+        i++;
+    }
+    ofpbuf_delete(clone);
+}
+
+/* Returns true if 'state' is one in which a connection is established
+ * (S_ACTIVE or S_IDLE), false otherwise. */
+static bool
+is_connected_state(enum state state)
+{
+    const unsigned int connected_states = S_ACTIVE | S_IDLE;
+
+    return (state & connected_states) != 0;
+}
+
+/* Returns true if receiving the OpenFlow message in 'b' suggests that the
+ * peer has admitted our connection.
+ *
+ * The message types listed below do not count, so the result is false for
+ * them.  Every other type, including any type numbered 32 or above, counts
+ * as evidence of admission. */
+static bool
+is_admitted_msg(const struct ofpbuf *b)
+{
+    const unsigned int not_admitting = ((1u << OFPT_HELLO) |
+                                        (1u << OFPT_ERROR) |
+                                        (1u << OFPT_ECHO_REQUEST) |
+                                        (1u << OFPT_ECHO_REPLY) |
+                                        (1u << OFPT_VENDOR) |
+                                        (1u << OFPT_FEATURES_REQUEST) |
+                                        (1u << OFPT_FEATURES_REPLY) |
+                                        (1u << OFPT_GET_CONFIG_REQUEST) |
+                                        (1u << OFPT_GET_CONFIG_REPLY) |
+                                        (1u << OFPT_SET_CONFIG));
+    struct ofp_header *oh = b->data;
+    uint8_t type = oh->type;
+
+    if (type >= 32) {
+        /* Too large for the bitmap, hence not one of the excluded types. */
+        return true;
+    }
+    return !((1u << type) & not_admitting);
+}
diff --git a/lib/rconn.h b/lib/rconn.h
new file mode 100644
index 000000000..837bc538e
--- /dev/null
+++ b/lib/rconn.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef RCONN_H
+#define RCONN_H 1
+
+#include "queue.h"
+#include <stdbool.h>
+#include <stdint.h>
+#include <time.h>
+
+/* A wrapper around vconn that provides queuing and optionally reliability.
+ *
+ * An rconn maintains a message transmission queue of bounded length specified
+ * by the caller. The rconn does not guarantee reliable delivery of
+ * queued messages: all queued messages are dropped when reconnection becomes
+ * necessary.
+ *
+ * An rconn optionally provides reliable communication, in this sense: the
+ * rconn will re-connect, with exponential backoff, when the underlying vconn
+ * disconnects.
+ */
+
+struct vconn;
+struct rconn_packet_counter;
+
+struct rconn *rconn_new(const char *name,
+ int inactivity_probe_interval, int max_backoff);
+struct rconn *rconn_new_from_vconn(const char *name, struct vconn *);
+struct rconn *rconn_create(int inactivity_probe_interval, int max_backoff);
+
+void rconn_set_max_backoff(struct rconn *, int max_backoff);
+int rconn_get_max_backoff(const struct rconn *);
+void rconn_set_probe_interval(struct rconn *, int inactivity_probe_interval);
+int rconn_get_probe_interval(const struct rconn *);
+
+int rconn_connect(struct rconn *, const char *name);
+void rconn_connect_unreliably(struct rconn *,
+ const char *name, struct vconn *vconn);
+void rconn_reconnect(struct rconn *);
+void rconn_disconnect(struct rconn *);
+void rconn_destroy(struct rconn *);
+
+void rconn_run(struct rconn *);
+void rconn_run_wait(struct rconn *);
+struct ofpbuf *rconn_recv(struct rconn *);
+void rconn_recv_wait(struct rconn *);
+int rconn_send(struct rconn *, struct ofpbuf *, struct rconn_packet_counter *);
+int rconn_send_with_limit(struct rconn *, struct ofpbuf *,
+ struct rconn_packet_counter *, int queue_limit);
+unsigned int rconn_packets_sent(const struct rconn *);
+unsigned int rconn_packets_received(const struct rconn *);
+
+void rconn_add_monitor(struct rconn *, struct vconn *);
+
+const char *rconn_get_name(const struct rconn *);
+bool rconn_is_alive(const struct rconn *);
+bool rconn_is_connected(const struct rconn *);
+int rconn_failure_duration(const struct rconn *);
+bool rconn_is_connectivity_questionable(struct rconn *);
+
+uint32_t rconn_get_ip(const struct rconn *);
+
+const char *rconn_get_state(const struct rconn *);
+unsigned int rconn_get_attempted_connections(const struct rconn *);
+unsigned int rconn_get_successful_connections(const struct rconn *);
+time_t rconn_get_last_connection(const struct rconn *);
+time_t rconn_get_creation_time(const struct rconn *);
+unsigned long int rconn_get_total_time_connected(const struct rconn *);
+int rconn_get_backoff(const struct rconn *);
+unsigned int rconn_get_state_elapsed(const struct rconn *);
+unsigned int rconn_get_connection_seqno(const struct rconn *);
+
+/* Counts the number of packets queued into an rconn by a given source. */
+struct rconn_packet_counter {
+ int n; /* Number of packets queued. */
+ int ref_cnt; /* Number of owners. */
+};
+
+struct rconn_packet_counter *rconn_packet_counter_create(void);
+void rconn_packet_counter_destroy(struct rconn_packet_counter *);
+void rconn_packet_counter_inc(struct rconn_packet_counter *);
+void rconn_packet_counter_dec(struct rconn_packet_counter *);
+
+/* Returns the current packet count ('n') of 'counter'. 'counter' must not be
+ * null. */
+static inline int
+rconn_packet_counter_read(const struct rconn_packet_counter *counter)
+{
+ return counter->n;
+}
+
+#endif /* rconn.h */
diff --git a/lib/sat-math.h b/lib/sat-math.h
new file mode 100644
index 000000000..84ff51ac3
--- /dev/null
+++ b/lib/sat-math.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef SAT_MATH_H
+#define SAT_MATH_H 1
+
+#include <assert.h>
+#include <limits.h>
+
+/* Returns x + y, clamped to UINT_MAX if the sum does not fit in an
+ * unsigned int. */
+static inline unsigned int
+sat_add(unsigned int x, unsigned int y)
+{
+    const unsigned int sum = x + y;
+
+    /* Unsigned addition wraps, so a sum smaller than 'x' means overflow. */
+    if (sum < x) {
+        return UINT_MAX;
+    }
+    return sum;
+}
+
+/* Returns x - y, clamped to 0 if 'y' is greater than 'x'. */
+static inline unsigned int
+sat_sub(unsigned int x, unsigned int y)
+{
+    if (x < y) {
+        return 0;
+    }
+    return x - y;
+}
+
+/* Returns x * y, clamped to UINT_MAX if the product does not fit in an
+ * unsigned int. */
+static inline unsigned int
+sat_mul(unsigned int x, unsigned int y)
+{
+    if (y == 0) {
+        /* Also avoids dividing by zero below. */
+        return 0;
+    }
+    if (x > UINT_MAX / y) {
+        return UINT_MAX;
+    }
+    return x * y;
+}
+
+#endif /* sat-math.h */
diff --git a/lib/sha1.c b/lib/sha1.c
new file mode 100644
index 000000000..5fc763f88
--- /dev/null
+++ b/lib/sha1.c
@@ -0,0 +1,394 @@
+/*
+ * sha1.c
+ *
+ * Description:
+ * This file implements the Secure Hashing Algorithm 1 as
+ * defined in FIPS PUB 180-1 published April 17, 1995.
+ *
+ * The SHA-1, produces a 160-bit message digest for a given
+ * data stream. It should take about 2**n steps to find a
+ * message with the same digest as a given message and
+ * 2**(n/2) to find any two messages with the same digest,
+ * when n is the digest size in bits. Therefore, this
+ * algorithm can serve as a means of providing a
+ * "fingerprint" for a message.
+ *
+ * Portability Issues:
+ * SHA-1 is defined in terms of 32-bit "words". This code
+ * uses <stdint.h> (included via "sha1.h") to define 32 and 8
+ * bit unsigned integer types. If your C compiler does not
+ * support 32 bit unsigned integers, this code is not
+ * appropriate.
+ *
+ * Caveats:
+ * SHA-1 is designed to work with messages less than 2^64 bits
+ * long. Although SHA-1 allows a message digest to be generated
+ * for messages of any number of bits less than 2^64, this
+ * implementation only works with messages with a length that is
+ * a multiple of the size of an 8-bit character.
+ *
+ */
+
+#include "sha1.h"
+
+/*
+ * Define the SHA1 circular left shift macro
+ */
+#define SHA1CircularShift(bits,word) \
+ (((word) << (bits)) | ((word) >> (32-(bits))))
+
+/* Local Function Prototypes */
+void SHA1PadMessage(SHA1Context *);
+void SHA1ProcessMessageBlock(SHA1Context *);
+
+/*
+ * SHA1Reset
+ *
+ * Description:
+ * This function will initialize the SHA1Context in preparation
+ * for computing a new SHA1 message digest.
+ *
+ * Parameters:
+ * context: [in/out]
+ * The context to reset.
+ *
+ * Returns:
+ * sha Error Code.
+ *
+ */
+int SHA1Reset(SHA1Context *context)
+{
+    /* Initial hash value H0..H4 assigned to a freshly reset context. */
+    static const uint32_t initial_hash[SHA1HashSize / 4] = {
+        0x67452301,
+        0xEFCDAB89,
+        0x98BADCFE,
+        0x10325476,
+        0xC3D2E1F0
+    };
+    int word;
+
+    if (context == 0) {
+        return shaNull;
+    }
+
+    for (word = 0; word < SHA1HashSize / 4; word++) {
+        context->Intermediate_Hash[word] = initial_hash[word];
+    }
+
+    /* No message bytes seen yet, nothing buffered. */
+    context->Length_Low = 0;
+    context->Length_High = 0;
+    context->Message_Block_Index = 0;
+
+    context->Computed = 0;
+    context->Corrupted = 0;
+
+    return shaSuccess;
+}
+
+/*
+ * SHA1Result
+ *
+ * Description:
+ * This function will return the 160-bit message digest into the
+ * Message_Digest array provided by the caller.
+ * NOTE: The first octet of hash is stored in the 0th element,
+ * the last octet of hash in the 19th element.
+ *
+ * Parameters:
+ * context: [in/out]
+ * The context to use to calculate the SHA-1 hash.
+ * Message_Digest: [out]
+ * Where the digest is returned.
+ *
+ * Returns:
+ * sha Error Code.
+ *
+ */
+int SHA1Result(SHA1Context *context,
+               uint8_t Message_Digest[SHA1HashSize])
+{
+    int octet;
+
+    if (context == 0 || Message_Digest == 0) {
+        return shaNull;
+    }
+    if (context->Corrupted != 0) {
+        return context->Corrupted;
+    }
+
+    /* Finalize only once; later calls just copy the digest out again. */
+    if (context->Computed == 0) {
+        SHA1PadMessage(context);
+
+        /* The message may be sensitive, so wipe the buffered block and the
+         * recorded length. */
+        for (octet = 0; octet < 64; octet++) {
+            context->Message_Block[octet] = 0;
+        }
+        context->Length_Low = 0;
+        context->Length_High = 0;
+        context->Computed = 1;
+    }
+
+    /* Emit each 32-bit hash word most significant octet first. */
+    for (octet = 0; octet < SHA1HashSize; octet++) {
+        const uint32_t word = context->Intermediate_Hash[octet / 4];
+        const int shift = 8 * (3 - (octet % 4));
+
+        Message_Digest[octet] = word >> shift;
+    }
+
+    return shaSuccess;
+}
+
+/*
+ *  SHA1Input
+ *
+ *  Description:
+ *      This function accepts an array of octets as the next portion
+ *      of the message.
+ *
+ *  Parameters:
+ *      context: [in/out]
+ *          The SHA context to update
+ *      message_array: [in]
+ *          An array of characters representing the next portion of
+ *          the message.
+ *      length: [in]
+ *          The length of the message in message_array
+ *
+ *  Returns:
+ *      shaSuccess if 'length' is 0 or all octets were accepted,
+ *      shaNull if 'context' or 'message_array' is null,
+ *      shaStateError if the digest has already been computed,
+ *      shaInputTooLong if the total message length overflows 64 bits,
+ *      or the error previously recorded in context->Corrupted.
+ *
+ */
+int SHA1Input(SHA1Context *context,
+              const uint8_t *message_array,
+              unsigned length)
+{
+    if (!length)
+    {
+        return shaSuccess;
+    }
+
+    if (!context || !message_array)
+    {
+        return shaNull;
+    }
+
+    if (context->Computed)
+    {
+        context->Corrupted = shaStateError;
+        return shaStateError;
+    }
+
+    if (context->Corrupted)
+    {
+        return context->Corrupted;
+    }
+
+    while (length-- && !context->Corrupted)
+    {
+        context->Message_Block[context->Message_Block_Index++] =
+            (*message_array & 0xFF);
+
+        /* The length is kept in bits as a 64-bit value split over two
+         * 32-bit words; carry from the low word into the high word. */
+        context->Length_Low += 8;
+        if (context->Length_Low == 0)
+        {
+            context->Length_High++;
+            if (context->Length_High == 0)
+            {
+                /* Message is too long.  Record the dedicated error code
+                 * rather than a bare 1, which is the value of shaNull. */
+                context->Corrupted = shaInputTooLong;
+            }
+        }
+
+        if (context->Message_Block_Index == 64)
+        {
+            SHA1ProcessMessageBlock(context);
+        }
+
+        message_array++;
+    }
+
+    /* Still 0 (shaSuccess) unless the loop above detected an overflow, in
+     * which case the caller must be told that the digest is unusable. */
+    return context->Corrupted;
+}
+
+/*
+ *  SHA1ProcessMessageBlock
+ *
+ *  Description:
+ *      This function will process the next 512 bits of the message
+ *      stored in the Message_Block array, fold the result into
+ *      Intermediate_Hash, and reset Message_Block_Index to 0.
+ *
+ *  Parameters:
+ *      context: [in/out]
+ *          The SHA context whose message block is processed.
+ *
+ *  Returns:
+ *      Nothing.
+ *
+ *  Comments:
+ *      Many of the variable names in this code, especially the
+ *      single character names, were used because those were the
+ *      names used in the publication.
+ *
+ */
+void SHA1ProcessMessageBlock(SHA1Context *context)
+{
+    static const uint32_t K[] = {       /* Constants defined in SHA-1   */
+        0x5A827999,
+        0x6ED9EBA1,
+        0x8F1BBCDC,
+        0xCA62C1D6
+    };
+    int t;                              /* Loop counter                 */
+    uint32_t temp;                      /* Temporary word value         */
+    uint32_t W[80];                     /* Word sequence                */
+    uint32_t A, B, C, D, E;             /* Word buffers                 */
+
+    /*
+     *  Initialize the first 16 words in the array W.
+     *
+     *  Each octet is widened to uint32_t before it is shifted: a uint8_t
+     *  operand would be promoted to (signed) int, and shifting a value
+     *  >= 0x80 left by 24 bits overflows int, which is undefined behavior.
+     */
+    for (t = 0; t < 16; t++)
+    {
+        W[t] = ((uint32_t) context->Message_Block[t * 4] << 24)
+             | ((uint32_t) context->Message_Block[t * 4 + 1] << 16)
+             | ((uint32_t) context->Message_Block[t * 4 + 2] << 8)
+             | ((uint32_t) context->Message_Block[t * 4 + 3]);
+    }
+
+    for (t = 16; t < 80; t++)
+    {
+        W[t] = SHA1CircularShift(1, W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16]);
+    }
+
+    A = context->Intermediate_Hash[0];
+    B = context->Intermediate_Hash[1];
+    C = context->Intermediate_Hash[2];
+    D = context->Intermediate_Hash[3];
+    E = context->Intermediate_Hash[4];
+
+    for (t = 0; t < 20; t++)
+    {
+        temp = SHA1CircularShift(5, A) +
+               ((B & C) | ((~B) & D)) + E + W[t] + K[0];
+        E = D;
+        D = C;
+        C = SHA1CircularShift(30, B);
+        B = A;
+        A = temp;
+    }
+
+    for (t = 20; t < 40; t++)
+    {
+        temp = SHA1CircularShift(5, A) + (B ^ C ^ D) + E + W[t] + K[1];
+        E = D;
+        D = C;
+        C = SHA1CircularShift(30, B);
+        B = A;
+        A = temp;
+    }
+
+    for (t = 40; t < 60; t++)
+    {
+        temp = SHA1CircularShift(5, A) +
+               ((B & C) | (B & D) | (C & D)) + E + W[t] + K[2];
+        E = D;
+        D = C;
+        C = SHA1CircularShift(30, B);
+        B = A;
+        A = temp;
+    }
+
+    for (t = 60; t < 80; t++)
+    {
+        temp = SHA1CircularShift(5, A) + (B ^ C ^ D) + E + W[t] + K[3];
+        E = D;
+        D = C;
+        C = SHA1CircularShift(30, B);
+        B = A;
+        A = temp;
+    }
+
+    context->Intermediate_Hash[0] += A;
+    context->Intermediate_Hash[1] += B;
+    context->Intermediate_Hash[2] += C;
+    context->Intermediate_Hash[3] += D;
+    context->Intermediate_Hash[4] += E;
+
+    /* The block has been consumed; start filling it from the beginning. */
+    context->Message_Block_Index = 0;
+}
+
+
+/*
+ * SHA1PadMessage
+ *
+ * Description:
+ * According to the standard, the message must be padded to an even
+ * 512 bits. The first padding bit must be a '1'. The last 64
+ * bits represent the length of the original message. All bits in
+ * between should be 0. This function will pad the message
+ * according to those rules by filling the Message_Block array
+ * accordingly. It will also call SHA1ProcessMessageBlock as
+ * needed (once or twice). When it returns, it can be assumed that
+ * the message digest has been computed.
+ *
+ * Parameters:
+ * context: [in/out]
+ * The context to pad
+ *
+ * Returns:
+ * Nothing.
+ *
+ */
+
+void SHA1PadMessage(SHA1Context *context)
+{
+ /*
+ * Check to see if the current message block is too small to hold
+ * the initial padding bits and length. If so, we will pad the
+ * block, process it, and then continue padding into a second
+ * block.
+ */
+ if (context->Message_Block_Index > 55)
+ {
+ context->Message_Block[context->Message_Block_Index++] = 0x80;
+ while(context->Message_Block_Index < 64)
+ {
+ context->Message_Block[context->Message_Block_Index++] = 0;
+ }
+
+ /* Processing the block resets Message_Block_Index to 0. */
+ SHA1ProcessMessageBlock(context);
+
+ while(context->Message_Block_Index < 56)
+ {
+ context->Message_Block[context->Message_Block_Index++] = 0;
+ }
+ }
+ else
+ {
+ context->Message_Block[context->Message_Block_Index++] = 0x80;
+ while(context->Message_Block_Index < 56)
+ {
+ context->Message_Block[context->Message_Block_Index++] = 0;
+ }
+ }
+
+ /*
+ * Store the message length as the last 8 octets, most significant
+ * octet first (each assignment keeps only the low 8 bits).
+ */
+ context->Message_Block[56] = context->Length_High >> 24;
+ context->Message_Block[57] = context->Length_High >> 16;
+ context->Message_Block[58] = context->Length_High >> 8;
+ context->Message_Block[59] = context->Length_High;
+ context->Message_Block[60] = context->Length_Low >> 24;
+ context->Message_Block[61] = context->Length_Low >> 16;
+ context->Message_Block[62] = context->Length_Low >> 8;
+ context->Message_Block[63] = context->Length_Low;
+
+ SHA1ProcessMessageBlock(context);
+}
+
+/* Convenience wrapper: computes the SHA-1 digest of the 'n' bytes starting at
+ * 'data' in one call and stores it into 'Message_Digest'.  The status codes
+ * of the individual SHA1*() steps are not reported to the caller. */
+void
+SHA1Bytes(const void *data, unsigned int n,
+          uint8_t Message_Digest[SHA1HashSize])
+{
+    SHA1Context context;
+
+    SHA1Reset(&context);
+    SHA1Input(&context, data, n);
+    SHA1Result(&context, Message_Digest);
+}
diff --git a/lib/sha1.h b/lib/sha1.h
new file mode 100644
index 000000000..382cf3204
--- /dev/null
+++ b/lib/sha1.h
@@ -0,0 +1,74 @@
+/*
+ * sha1.h
+ *
+ * Description:
+ * This is the header file for code which implements the Secure
+ * Hashing Algorithm 1 as defined in FIPS PUB 180-1 published
+ * April 17, 1995.
+ *
+ * Many of the variable names in this code, especially the
+ * single character names, were used because those were the names
+ * used in the publication.
+ *
+ * Please read the file sha1.c for more information.
+ *
+ */
+#ifndef _SHA1_H_
+#define _SHA1_H_
+
+#include <stdint.h>
+/*
+ * If you do not have the ISO standard stdint.h header file, then you
+ * must typedef the following:
+ * name meaning
+ * uint32_t unsigned 32 bit integer
+ * uint8_t unsigned 8 bit integer (i.e., unsigned char)
+ * int_least16_t integer of >= 16 bits
+ *
+ */
+
+#ifndef _SHA_enum_
+#define _SHA_enum_
+enum
+{
+ shaSuccess = 0,
+ shaNull, /* Null pointer parameter */
+ shaInputTooLong, /* input data too long */
+ shaStateError /* called Input after Result */
+};
+#endif
+#define SHA1HashSize 20
+
+/*
+ * This structure will hold context information for the SHA-1
+ * hashing operation
+ */
+typedef struct SHA1Context
+{
+ uint32_t Intermediate_Hash[SHA1HashSize/4]; /* Message Digest */
+
+ uint32_t Length_Low; /* Message length in bits */
+ uint32_t Length_High; /* Message length in bits */
+
+ /* Index into message block array */
+ int_least16_t Message_Block_Index;
+ uint8_t Message_Block[64]; /* 512-bit message blocks */
+
+ int Computed; /* Is the digest computed? */
+ int Corrupted; /* Is the message digest corrupted? */
+} SHA1Context;
+
+/*
+ * Function Prototypes
+ */
+int SHA1Reset( SHA1Context *);
+int SHA1Input( SHA1Context *,
+ const uint8_t *,
+ unsigned int);
+int SHA1Result( SHA1Context *,
+ uint8_t Message_Digest[SHA1HashSize]);
+
+void SHA1Bytes(const void *data, unsigned int n,
+ uint8_t Message_Digest[SHA1HashSize]);
+
+#endif
diff --git a/lib/shash.c b/lib/shash.c
new file mode 100644
index 000000000..8f97f7761
--- /dev/null
+++ b/lib/shash.c
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "shash.h"
+#include <assert.h>
+#include "hash.h"
+
+/* Returns the hash of the null-terminated string 'name', computed with
+ * hash_string() and a basis of 0. Used for every insertion and lookup so
+ * that both agree on the bucket. */
+static size_t
+hash_name(const char *name)
+{
+ return hash_string(name, 0);
+}
+
+/* Initializes 'sh' by initializing its underlying hmap. */
+void
+shash_init(struct shash *sh)
+{
+ hmap_init(&sh->map);
+}
+
+/* Destroys 'sh': frees every node (and its name) and then releases the
+ * memory owned by the underlying hmap.  The 'data' pointers stored in the
+ * nodes are not freed, and neither is 'sh' itself.
+ *
+ * Does nothing if 'sh' is null. */
+void
+shash_destroy(struct shash *sh)
+{
+    if (sh) {
+        shash_clear(sh);
+        /* Clearing only removes the nodes; the hmap's own storage has to be
+         * released separately or it is leaked. */
+        hmap_destroy(&sh->map);
+    }
+}
+
+/* Removes and frees every node in 'sh', leaving it empty.  The 'data'
+ * pointers stored in the nodes are not freed. */
+void
+shash_clear(struct shash *sh)
+{
+    struct shash_node *cur, *following;
+
+    /* The _SAFE iterator is required because each node is freed while the
+     * map is being walked. */
+    HMAP_FOR_EACH_SAFE (cur, following, struct shash_node, node, &sh->map) {
+        shash_delete(sh, cur);
+    }
+}
+
+/* Adds a new node to 'sh' that maps a private copy of 'name' to 'data'.
+ * 'data' is stored as given; it is not copied.
+ *
+ * It is the caller's responsibility to avoid duplicate names, if that is
+ * desirable. */
+void
+shash_add(struct shash *sh, const char *name, void *data)
+{
+ struct shash_node *node = xmalloc(sizeof *node);
+ node->name = xstrdup(name);
+ node->data = data;
+ hmap_insert(&sh->map, &node->node, hash_name(name));
+}
+
+/* Removes 'node' from 'sh' and frees the node and its name. The node's
+ * 'data' pointer is not freed. */
+void
+shash_delete(struct shash *sh, struct shash_node *node)
+{
+ hmap_remove(&sh->map, &node->node);
+ free(node->name);
+ free(node);
+}
+
+/* Returns the node in 'sh' whose name is 'name', or a null pointer if there
+ * is none.  If there are duplicates, returns an arbitrary one of them. */
+struct shash_node *
+shash_find(const struct shash *sh, const char *name)
+{
+    const size_t name_hash = hash_name(name);
+    struct shash_node *candidate;
+
+    HMAP_FOR_EACH_WITH_HASH (candidate, struct shash_node, node,
+                             name_hash, &sh->map) {
+        /* Different names can share a hash, so compare the strings too. */
+        if (strcmp(candidate->name, name) == 0) {
+            return candidate;
+        }
+    }
+    return NULL;
+}
+
+/* Returns the 'data' of the node in 'sh' named 'name', or a null pointer if
+ * there is no such node.  A node whose data is itself null cannot be told
+ * apart from a missing node through this function. */
+void *
+shash_find_data(const struct shash *sh, const char *name)
+{
+    struct shash_node *found = shash_find(sh, name);
+
+    if (!found) {
+        return NULL;
+    }
+    return found->data;
+}
diff --git a/lib/shash.h b/lib/shash.h
new file mode 100644
index 000000000..ee3fb5f57
--- /dev/null
+++ b/lib/shash.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef SHASH_H
+#define SHASH_H 1
+
+#include "hmap.h"
+
+struct shash_node {
+ struct hmap_node node;
+ char *name;
+ void *data;
+};
+
+struct shash {
+ struct hmap map;
+};
+
+#define SHASH_INITIALIZER(SHASH) { HMAP_INITIALIZER(&(SHASH)->map) }
+
+void shash_init(struct shash *);
+void shash_destroy(struct shash *);
+void shash_clear(struct shash *);
+void shash_add(struct shash *, const char *, void *);
+void shash_delete(struct shash *, struct shash_node *);
+struct shash_node *shash_find(const struct shash *, const char *);
+void *shash_find_data(const struct shash *, const char *);
+
+#endif /* shash.h */
diff --git a/lib/signals.c b/lib/signals.c
new file mode 100644
index 000000000..26eebcdb6
--- /dev/null
+++ b/lib/signals.c
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "signals.h"
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <signal.h>
+#include <unistd.h>
+#include "poll-loop.h"
+#include "socket-util.h"
+#include "util.h"
+
+#if defined(_NSIG)
+#define N_SIGNALS _NSIG
+#elif defined(NSIG)
+#define N_SIGNALS NSIG
+#else
+/* We could try harder to get the maximum signal number, but in practice we
+ * only care about SIGHUP, which is normally signal 1 anyway. */
+#define N_SIGNALS 32
+#endif
+
+struct signal {
+ int signr;
+};
+
+static volatile sig_atomic_t signaled[N_SIGNALS];
+
+static int fds[2];
+
+static void signal_handler(int signr);
+
+/* Initializes the signals subsystem (if it is not already initialized). Calls
+ * ovs_fatal() if the pipe cannot be created.
+ *
+ * Calling this function is optional; it will be called automatically by
+ * signal_register() if necessary. Calling it explicitly allows the client to
+ * prevent the process from exiting at an unexpected time. */
+void
+signal_init(void)
+{
+ static bool inited;
+ if (!inited) {
+ inited = true;
+ /* fds[1] is written by signal_handler(); fds[0] is read by
+ * signal_poll() and waited on by signal_wait(). */
+ if (pipe(fds)) {
+ ovs_fatal(errno, "could not create pipe");
+ }
+ /* Both ends are made non-blocking. NOTE(review): the return values of
+ * set_nonblocking() are ignored here. */
+ set_nonblocking(fds[0]);
+ set_nonblocking(fds[1]);
+ }
+}
+
+/* Sets up a handler for 'signr' and returns a structure that represents it.
+ * 'signr' must be in the range 1...N_SIGNALS - 1. The returned structure is
+ * allocated with xmalloc(). Calls ovs_fatal() if sigaction() fails.
+ *
+ * Only one handler for a given signal may be registered at a time. */
+struct signal *
+signal_register(int signr)
+{
+ struct sigaction sa;
+ struct signal *s;
+
+ /* Make sure the wakeup pipe exists before the handler can run. */
+ signal_init();
+
+ /* Set up signal handler. */
+ assert(signr >= 1 && signr < N_SIGNALS);
+ memset(&sa, 0, sizeof sa);
+ sa.sa_handler = signal_handler;
+ sigemptyset(&sa.sa_mask);
+ sa.sa_flags = SA_RESTART;
+ if (sigaction(signr, &sa, NULL)) {
+ ovs_fatal(errno, "sigaction(%d) failed", signr);
+ }
+
+ /* Return structure. */
+ s = xmalloc(sizeof *s);
+ s->signr = signr;
+ return s;
+}
+
+/* Returns true if signal 's' has been received since the last call to this
+ * function with argument 's', and clears that indication.  Returns false
+ * otherwise. */
+bool
+signal_poll(struct signal *s)
+{
+    char drain[_POSIX_PIPE_BUF];
+    bool fired;
+
+    /* Empty the wakeup pipe.  The result is deliberately ignored: the bytes
+     * carry no information, only signaled[] does. */
+    read(fds[0], drain, sizeof drain);
+
+    fired = signaled[s->signr] != 0;
+    if (fired) {
+        signaled[s->signr] = 0;
+    }
+    return fired;
+}
+
+/* Causes the next call to poll_block() to wake up when signal_poll(s) would
+ * return true: immediately if 's' is already pending, otherwise as soon as
+ * the wakeup pipe becomes readable. */
+void
+signal_wait(struct signal *s)
+{
+    if (!signaled[s->signr]) {
+        poll_fd_wait(fds[0], POLLIN);
+        return;
+    }
+    poll_immediate_wake();
+}
+
+/* Signal handler installed by signal_register().  Records that 'signr' was
+ * received and writes one byte to the wakeup pipe so that a process blocked
+ * on fds[0] wakes up.  Signal numbers outside 1...N_SIGNALS - 1 are
+ * ignored. */
+static void
+signal_handler(int signr)
+{
+    if (signr >= 1 && signr < N_SIGNALS) {
+        /* write() may set errno (for example to EAGAIN when the non-blocking
+         * pipe is full).  The handler can interrupt code that is about to
+         * inspect errno, so the value must be preserved across it. */
+        int saved_errno = errno;
+
+        /* The result is deliberately ignored: if the pipe is full, a wakeup
+         * is already pending. */
+        write(fds[1], "", 1);
+        signaled[signr] = true;
+
+        errno = saved_errno;
+    }
+}
diff --git a/lib/signals.h b/lib/signals.h
new file mode 100644
index 000000000..7f25f662a
--- /dev/null
+++ b/lib/signals.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef SIGNALS_H
+#define SIGNALS_H 1
+
+#include <stdbool.h>
+
+void signal_init(void);
+struct signal *signal_register(int signr);
+bool signal_poll(struct signal *);
+void signal_wait(struct signal *);
+
+#endif /* signals.h */
diff --git a/lib/socket-util.c b/lib/socket-util.c
new file mode 100644
index 000000000..3d290e883
--- /dev/null
+++ b/lib/socket-util.c
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "socket-util.h"
+#include <arpa/inet.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <netdb.h>
+#include <poll.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/resource.h>
+#include <sys/un.h>
+#include <unistd.h>
+#include "fatal-signal.h"
+#include "util.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_socket_util
+
+/* Sets 'fd' to non-blocking mode by adding O_NONBLOCK to its file status
+ * flags.  Returns 0 if successful, otherwise a positive errno value (which is
+ * also logged). */
+int
+set_nonblocking(int fd)
+{
+    int flags = fcntl(fd, F_GETFL, 0);
+    if (flags == -1) {
+        /* Save errno before logging, which may change it. */
+        int error = errno;
+        VLOG_ERR("fcntl(F_GETFL) failed: %s", strerror(error));
+        return error;
+    }
+
+    if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) {
+        int error = errno;
+        VLOG_ERR("fcntl(F_SETFL) failed: %s", strerror(error));
+        return error;
+    }
+
+    return 0;
+}
+
+/* Returns the maximum valid FD value, plus 1.
+ *
+ * The value is taken from the soft RLIMIT_NOFILE limit the first time it is
+ * needed and cached afterward.  If the limit cannot be obtained or is not a
+ * definite number, 1024 is assumed and a warning is logged. */
+int
+get_max_fds(void)
+{
+    static int max_fds = -1;
+    struct rlimit limit;
+
+    if (max_fds >= 0) {
+        return max_fds;
+    }
+
+    if (getrlimit(RLIMIT_NOFILE, &limit)
+        || limit.rlim_cur == RLIM_INFINITY
+        || limit.rlim_cur == RLIM_SAVED_MAX
+        || limit.rlim_cur == RLIM_SAVED_CUR) {
+        VLOG_WARN("failed to obtain fd limit, defaulting to 1024");
+        max_fds = 1024;
+    } else {
+        max_fds = limit.rlim_cur;
+    }
+    return max_fds;
+}
+
+/* Translates 'host_name', which may be a DNS name or an IP address, into a
+ * numeric IP address in '*addr'.  Returns 0 if successful, otherwise a
+ * positive errno value (ENOENT if the name cannot be resolved).
+ *
+ * Resolution uses gethostbyname(), so this function may block and is not
+ * reentrant. */
+int
+lookup_ip(const char *host_name, struct in_addr *addr)
+{
+    if (!inet_aton(host_name, addr)) {
+        struct hostent *he = gethostbyname(host_name);
+        if (he == NULL) {
+            /* Must be static: the limiter keeps its state between calls, so
+             * an automatic variable would never limit anything. */
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+            VLOG_ERR_RL(&rl, "gethostbyname(%s): %s", host_name,
+                        (h_errno == HOST_NOT_FOUND ? "host not found"
+                         : h_errno == TRY_AGAIN ? "try again"
+                         : h_errno == NO_RECOVERY ? "non-recoverable error"
+                         : h_errno == NO_ADDRESS ? "no address"
+                         : "unknown error"));
+            return ENOENT;
+        }
+        /* h_addr is a char pointer with no alignment guarantee, so copy the
+         * bytes instead of dereferencing it as a uint32_t. */
+        memcpy(&addr->s_addr, he->h_addr, sizeof addr->s_addr);
+    }
+    return 0;
+}
+
+/* Returns the error condition associated with socket 'fd' and resets the
+ * socket's error status.  If the error condition cannot be retrieved, logs
+ * the failure and returns the errno value from getsockopt() instead. */
+int
+get_socket_error(int fd)
+{
+    int error;
+    socklen_t len = sizeof(error);
+    if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &error, &len) < 0) {
+        /* Must be static: the limiter keeps its state between calls, so an
+         * automatic variable would never limit anything. */
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 10);
+        error = errno;
+        VLOG_ERR_RL(&rl, "getsockopt(SO_ERROR): %s", strerror(error));
+    }
+    return error;
+}
+
+/* Checks, without blocking, on the progress of a connection attempt on
+ * socket 'fd'.
+ *
+ * Returns EAGAIN if poll() does not yet report 'fd' as ready, the result of
+ * get_socket_error() (0 if there is no error) once it does, or a positive
+ * errno value if poll() itself fails. */
+int
+check_connection_completion(int fd)
+{
+    struct pollfd pfd;
+    int retval;
+
+    pfd.fd = fd;
+    pfd.events = POLLOUT;
+    do {
+        retval = poll(&pfd, 1, 0);
+    } while (retval < 0 && errno == EINTR);
+
+    if (retval == 1) {
+        return get_socket_error(fd);
+    } else if (retval < 0) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 10);
+        /* Save errno before logging, which may change it. */
+        int error = errno;
+        VLOG_ERR_RL(&rl, "poll: %s", strerror(error));
+        return error;
+    } else {
+        return EAGAIN;
+    }
+}
+
+/* Drain all the data currently in the receive queue of a datagram socket (and
+ * possibly additional data).  There is no way to know how many packets are in
+ * the receive queue, but we do know that the total number of bytes queued does
+ * not exceed the receive buffer size, so we pull packets until none are left
+ * or we've read that many bytes.
+ *
+ * Returns 0 if successful, otherwise a positive errno value if the receive
+ * buffer size cannot be obtained. */
+int
+drain_rcvbuf(int fd)
+{
+    socklen_t rcvbuf_len;
+    /* SO_RCVBUF is an 'int' option.  Reading it into a wider type would
+     * leave the extra bytes uninitialized. */
+    int rcvbuf;
+
+    rcvbuf_len = sizeof rcvbuf;
+    if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &rcvbuf_len) < 0) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 10);
+        /* Save errno before logging, which may change it. */
+        int error = errno;
+        VLOG_ERR_RL(&rl, "getsockopt(SO_RCVBUF) failed: %s", strerror(error));
+        return error;
+    }
+    while (rcvbuf > 0) {
+        /* In Linux, specifying MSG_TRUNC in the flags argument causes the
+         * datagram length to be returned, even if that is longer than the
+         * buffer provided.  Thus, we can use a 1-byte buffer to discard the
+         * incoming datagram and still be able to account how many bytes were
+         * removed from the receive buffer.
+         *
+         * On other Unix-like OSes, MSG_TRUNC has no effect in the flags
+         * argument. */
+#ifdef __linux__
+#define BUFFER_SIZE 1
+#else
+#define BUFFER_SIZE 2048
+#endif
+        char buffer[BUFFER_SIZE];
+        ssize_t n_bytes = recv(fd, buffer, sizeof buffer,
+                               MSG_TRUNC | MSG_DONTWAIT);
+        /* Stop on error, on an empty queue, or once the byte budget is
+         * used up. */
+        if (n_bytes <= 0 || n_bytes >= rcvbuf) {
+            break;
+        }
+        rcvbuf -= n_bytes;
+    }
+    return 0;
+}
+
+/* Reads and discards up to 'n_packets' datagrams from 'fd', stopping as soon
+ * as no more data can be immediately read.  ('fd' should therefore be in
+ * non-blocking mode.) */
+void
+drain_fd(int fd, size_t n_packets)
+{
+    size_t remaining = n_packets;
+
+    while (remaining > 0) {
+        /* 'scratch' only needs to be 1 byte long in most circumstances.  This
+         * size is defensive against the possibility that we someday want to
+         * use a Linux tap device without TUN_NO_PI, in which case a buffer
+         * smaller than sizeof(struct tun_pi) will give EINVAL on read. */
+        char scratch[128];
+
+        if (read(fd, scratch, sizeof scratch) <= 0) {
+            return;
+        }
+        remaining--;
+    }
+}
+
+/* Stores in '*un' a sockaddr_un that refers to file 'name'.  Stores in
+ * '*un_len' the size of the sockaddr_un, counting the path's null
+ * terminator.
+ *
+ * A 'name' that does not fit in un->sun_path is silently truncated. */
+static void
+make_sockaddr_un(const char *name, struct sockaddr_un* un, socklen_t *un_len)
+{
+    const size_t last = sizeof un->sun_path - 1;
+    const size_t path_offset = offsetof(struct sockaddr_un, sun_path);
+
+    un->sun_family = AF_UNIX;
+
+    /* strncpy() does not terminate the destination when the source is too
+     * long, so terminate it explicitly. */
+    strncpy(un->sun_path, name, sizeof un->sun_path);
+    un->sun_path[last] = '\0';
+
+    *un_len = path_offset + strlen(un->sun_path) + 1;
+}
+
+/* Creates a Unix domain socket in the given 'style' (either SOCK_DGRAM or
+ * SOCK_STREAM) that is bound to '*bind_path' (if 'bind_path' is non-null) and
+ * connected to '*connect_path' (if 'connect_path' is non-null).  If 'nonblock'
+ * is true, the socket is made non-blocking.  If 'passcred' is true, the socket
+ * is configured to receive SCM_CREDENTIALS control messages (where those are
+ * available).
+ *
+ * Any existing file at 'bind_path' is unlinked first, and 'bind_path' is
+ * registered to be unlinked on a fatal signal.
+ *
+ * Returns the socket's fd if successful, otherwise a negative errno value. */
+int
+make_unix_socket(int style, bool nonblock, bool passcred UNUSED,
+                 const char *bind_path, const char *connect_path)
+{
+    int error;
+    int fd;
+
+    fd = socket(PF_UNIX, style, 0);
+    if (fd < 0) {
+        return -errno;
+    }
+
+    /* Set nonblocking mode right away, if we want it.  This prevents blocking
+     * in connect(), if connect_path != NULL.  (In turn, that's a corner case:
+     * it will only happen if style is SOCK_STREAM or SOCK_SEQPACKET, and only
+     * if a backlog of un-accepted connections has built up in the kernel.)  */
+    if (nonblock) {
+        int flags = fcntl(fd, F_GETFL, 0);
+        if (flags == -1) {
+            goto error;
+        }
+        if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == -1) {
+            goto error;
+        }
+    }
+
+    if (bind_path) {
+        struct sockaddr_un un;
+        socklen_t un_len;
+        make_sockaddr_un(bind_path, &un, &un_len);
+        if (unlink(un.sun_path) && errno != ENOENT) {
+            /* No trailing newline, like every other log call in this file. */
+            VLOG_WARN("unlinking \"%s\": %s", un.sun_path, strerror(errno));
+        }
+        fatal_signal_add_file_to_unlink(bind_path);
+        if (bind(fd, (struct sockaddr*) &un, un_len)
+            || fchmod(fd, S_IRWXU)) {
+            goto error;
+        }
+    }
+
+    if (connect_path) {
+        struct sockaddr_un un;
+        socklen_t un_len;
+        make_sockaddr_un(connect_path, &un, &un_len);
+        /* EINPROGRESS is expected for a non-blocking connect. */
+        if (connect(fd, (struct sockaddr*) &un, un_len)
+            && errno != EINPROGRESS) {
+            goto error;
+        }
+    }
+
+#ifdef SCM_CREDENTIALS
+    if (passcred) {
+        int enable = 1;
+        if (setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &enable, sizeof(enable))) {
+            goto error;
+        }
+    }
+#endif
+
+    return fd;
+
+error:
+    /* Capture errno first: the cleanup calls below may overwrite it, and it
+     * is the failing call's error that must be reported. */
+    error = errno;
+    if (bind_path) {
+        fatal_signal_remove_file_to_unlink(bind_path);
+    }
+    close(fd);
+    return -error;
+}
+
+/* Given 'sun_len', the size of a sockaddr_un, returns the number of bytes of
+ * it that belong to sun_path, or 0 if 'sun_len' is too small to include any
+ * part of sun_path. */
+int
+get_unix_name_len(socklen_t sun_len)
+{
+    const size_t path_offset = offsetof(struct sockaddr_un, sun_path);
+
+    if (sun_len < path_offset) {
+        return 0;
+    }
+    return sun_len - path_offset;
+}
+
+/* Returns the classful netmask for IPv4 address 'ip', judged by its leading
+ * bits: 255.0.0.0 for class A, 255.255.0.0 for class B, 255.255.255.0 for
+ * class C, and 0 for anything else.  Both 'ip' and the return value are in
+ * network byte order. */
+uint32_t
+guess_netmask(uint32_t ip)
+{
+    const uint32_t host_ip = ntohl(ip);
+
+    if ((host_ip & 0x80000000) == 0) {
+        return htonl(0xff000000);       /* Class A: leading bit 0. */
+    }
+    if ((host_ip & 0xc0000000) == 0x80000000) {
+        return htonl(0xffff0000);       /* Class B: leading bits 10. */
+    }
+    if ((host_ip & 0xe0000000) == 0xc0000000) {
+        return htonl(0xffffff00);       /* Class C: leading bits 110. */
+    }
+    return htonl(0);                    /* ??? */
+}
+
+/* Reads exactly 'size' bytes from 'fd' into 'p_', retrying after partial
+ * reads and EINTR.  Stores in '*bytes_read' the number of bytes actually
+ * read, even on failure.
+ *
+ * Returns 0 if all 'size' bytes were read, EOF if end of file was reached
+ * first, otherwise a positive errno value. */
+int
+read_fully(int fd, void *p_, size_t size, size_t *bytes_read)
+{
+    uint8_t *cursor = p_;
+    size_t remaining = size;
+
+    *bytes_read = 0;
+    while (remaining > 0) {
+        ssize_t n = read(fd, cursor, remaining);
+
+        if (n == 0) {
+            return EOF;
+        }
+        if (n < 0) {
+            if (errno != EINTR) {
+                return errno;
+            }
+            continue;
+        }
+        *bytes_read += n;
+        remaining -= n;
+        cursor += n;
+    }
+    return 0;
+}
+
+/* Writes exactly 'size' bytes from 'p_' to 'fd', retrying after partial
+ * writes and EINTR.  Stores in '*bytes_written' the number of bytes actually
+ * written, even on failure.
+ *
+ * Returns 0 if all 'size' bytes were written, EPROTO if write() reported
+ * writing 0 bytes, otherwise a positive errno value. */
+int
+write_fully(int fd, const void *p_, size_t size, size_t *bytes_written)
+{
+    const uint8_t *cursor = p_;
+    size_t remaining = size;
+
+    *bytes_written = 0;
+    while (remaining > 0) {
+        ssize_t n = write(fd, cursor, remaining);
+
+        if (n == 0) {
+            VLOG_WARN("write returned 0");
+            return EPROTO;
+        }
+        if (n < 0) {
+            if (errno != EINTR) {
+                return errno;
+            }
+            continue;
+        }
+        *bytes_written += n;
+        remaining -= n;
+        cursor += n;
+    }
+    return 0;
+}
diff --git a/lib/socket-util.h b/lib/socket-util.h
new file mode 100644
index 000000000..bdfb3dcb5
--- /dev/null
+++ b/lib/socket-util.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef SOCKET_UTIL_H
+#define SOCKET_UTIL_H 1
+
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <stdbool.h>
+
+int set_nonblocking(int fd);
+int get_max_fds(void);
+int lookup_ip(const char *host_name, struct in_addr *address);
+int get_socket_error(int sock);
+int check_connection_completion(int fd);
+int drain_rcvbuf(int fd);
+void drain_fd(int fd, size_t n_packets);
+int make_unix_socket(int style, bool nonblock, bool passcred,
+ const char *bind_path, const char *connect_path);
+int get_unix_name_len(socklen_t sun_len);
+uint32_t guess_netmask(uint32_t ip);
+
+int read_fully(int fd, void *, size_t, size_t *bytes_read);
+int write_fully(int fd, const void *, size_t, size_t *bytes_written);
+
+#endif /* socket-util.h */
diff --git a/lib/stp.c b/lib/stp.c
new file mode 100644
index 000000000..59f5c6158
--- /dev/null
+++ b/lib/stp.c
@@ -0,0 +1,1226 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* Based on sample implementation in 802.1D-1998. Above copyright and license
+ * applies to all modifications. */
+
+#include "stp.h"
+#include <arpa/inet.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include "ofpbuf.h"
+#include "packets.h"
+#include "util.h"
+#include "xtoxll.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_stp
+
+/* Ethernet address used as the destination for STP frames. */
+const uint8_t stp_eth_addr[ETH_ADDR_LEN]
+= { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x01 };
+
+#define STP_PROTOCOL_ID 0x0000
+#define STP_PROTOCOL_VERSION 0x00
+#define STP_TYPE_CONFIG 0x00
+#define STP_TYPE_TCN 0x80
+
+/* Header shared by every BPDU.  'protocol_id' is kept in network byte order:
+ * stp_received_bpdu() compares it against htons(STP_PROTOCOL_ID) and the
+ * transmit paths store htons(STP_PROTOCOL_ID) into it.  The structure is
+ * packed and asserted to be exactly 4 bytes. */
+struct stp_bpdu_header {
+ uint16_t protocol_id; /* STP_PROTOCOL_ID. */
+ uint8_t protocol_version; /* STP_PROTOCOL_VERSION. */
+ uint8_t bpdu_type; /* One of STP_TYPE_*. */
+} __attribute__((packed));
+BUILD_ASSERT_DECL(sizeof(struct stp_bpdu_header) == 4);
+
+/* Bits in the one-byte 'flags' member of struct stp_config_bpdu.  Because the
+ * member is a single byte, these values need no byte-order conversion. */
+enum stp_config_bpdu_flags {
+ STP_CONFIG_TOPOLOGY_CHANGE_ACK = 0x80,
+ STP_CONFIG_TOPOLOGY_CHANGE = 0x01
+};
+
+/* Configuration BPDU.  The multi-byte members are stored in network byte
+ * order: the code in this file fills them with htons()/htonl()/htonll() and
+ * reads them back with ntohs()/ntohl()/ntohll().  'flags' is a single byte.
+ * The structure is packed and asserted to be exactly 35 bytes. */
+struct stp_config_bpdu {
+ struct stp_bpdu_header header; /* Type STP_TYPE_CONFIG. */
+ uint8_t flags; /* STP_CONFIG_* flags (one byte, never byte-swapped). */
+ uint64_t root_id; /* 8.5.1.1: Bridge believed to be root. */
+ uint32_t root_path_cost; /* 8.5.1.2: Cost of path to root. */
+ uint64_t bridge_id; /* 8.5.1.3: ID of transmitting bridge. */
+ uint16_t port_id; /* 8.5.1.4: Port transmitting the BPDU. */
+ uint16_t message_age; /* 8.5.1.5: Age of BPDU at tx time. */
+ uint16_t max_age; /* 8.5.1.6: Timeout for received data. */
+ uint16_t hello_time; /* 8.5.1.7: Time between BPDU generation. */
+ uint16_t forward_delay; /* 8.5.1.8: State progression delay. */
+} __attribute__((packed));
+BUILD_ASSERT_DECL(sizeof(struct stp_config_bpdu) == 35);
+
+/* Topology change notification BPDU: just the common header, with no
+ * payload. */
+struct stp_tcn_bpdu {
+ struct stp_bpdu_header header; /* Type STP_TYPE_TCN. */
+} __attribute__((packed));
+BUILD_ASSERT_DECL(sizeof(struct stp_tcn_bpdu) == 4);
+
+/* A protocol timer.  Timers are manipulated through stp_start_timer(),
+ * stp_stop_timer() and stp_timer_expired(); 'value' is measured in the STP
+ * timer ticks that stp_tick() derives from elapsed milliseconds. */
+struct stp_timer {
+ bool active; /* Timer in use? */
+ int value; /* Current value of timer, counting up. */
+};
+
+/* Per-port spanning tree state.  Ports live in the 'ports' array embedded in
+ * their owning struct stp. */
+struct stp_port {
+ struct stp *stp; /* Bridge that owns this port. */
+ int port_id; /* 8.5.5.1: Unique port identifier. */
+ enum stp_state state; /* 8.5.5.2: Current state. */
+ int path_cost; /* 8.5.5.3: Cost of tx/rx on this port. */
+ stp_identifier designated_root; /* 8.5.5.4. */
+ int designated_cost; /* 8.5.5.5: Path cost to root on port. */
+ stp_identifier designated_bridge; /* 8.5.5.6. */
+ int designated_port; /* 8.5.5.7: Port to send config msgs on. */
+ bool topology_change_ack; /* 8.5.5.8: Flag for next config BPDU. */
+ bool config_pending; /* 8.5.5.9: Send BPDU when hold expires? */
+ bool change_detection_enabled; /* 8.5.5.10: Detect topology changes? */
+
+ struct stp_timer message_age_timer; /* 8.5.6.1: Age of received info. */
+ struct stp_timer forward_delay_timer; /* 8.5.6.2: State change timer. */
+ struct stp_timer hold_timer; /* 8.5.6.3: BPDU rate limit timer. */
+
+ /* Set by stp_set_port_state() when 'state' changes; cleared when the port
+  * is reported by stp_get_changed_port(). */
+ bool state_changed;
+};
+
+/* A spanning tree bridge instance, created by stp_create(). */
+struct stp {
+ /* Static bridge data. */
+ char *name; /* Human-readable name for log messages. */
+ stp_identifier bridge_id; /* 8.5.3.7: This bridge. */
+ int max_age; /* 8.5.3.4: Time to drop received data. */
+ int hello_time; /* 8.5.3.5: Time between sending BPDUs. */
+ int forward_delay; /* 8.5.3.6: Delay between state changes. */
+ int bridge_max_age; /* 8.5.3.8: max_age when we're root. */
+ int bridge_hello_time; /* 8.5.3.9: hello_time as root. */
+ int bridge_forward_delay; /* 8.5.3.10: forward_delay as root. */
+ int rq_max_age; /* User-requested max age, in ms. */
+ int rq_hello_time; /* User-requested hello time, in ms. */
+ int rq_forward_delay; /* User-requested forward delay, in ms. */
+ int elapsed_remainder; /* Left-over msecs from last stp_tick(). */
+
+ /* Dynamic bridge data. */
+ stp_identifier designated_root; /* 8.5.3.1: Bridge believed to be root. */
+ unsigned int root_path_cost; /* 8.5.3.2: Cost of path to root. */
+ struct stp_port *root_port; /* 8.5.3.3: Lowest cost port to root. */
+ bool topology_change_detected; /* 8.5.3.11: Detected a topology change? */
+ bool topology_change; /* 8.5.3.12: Received topology change? */
+
+ /* Bridge timers. */
+ struct stp_timer hello_timer; /* 8.5.4.1: Hello timer. */
+ struct stp_timer tcn_timer; /* 8.5.4.2: Topology change timer. */
+ struct stp_timer topology_change_timer; /* 8.5.4.3. */
+
+ /* Ports. */
+ struct stp_port ports[STP_MAX_PORTS];
+
+ /* Interface to client. */
+ /* Lowest-addressed port that may have 'state_changed' set, or one past the
+  * end of 'ports' if none; the starting point for stp_get_changed_port(). */
+ struct stp_port *first_changed_port;
+ void (*send_bpdu)(struct ofpbuf *bpdu, int port_no, void *aux);
+ void *aux; /* Saved by stp_create() alongside 'send_bpdu'. */
+};
+
+/* Iterates PORT over every port of STP whose state is not STP_DISABLED, in
+ * array order.  The state of each port is examined when the iteration reaches
+ * it, so ports disabled by the loop body before they are reached are
+ * skipped. */
+#define FOR_EACH_ENABLED_PORT(PORT, STP) \
+ for ((PORT) = stp_next_enabled_port((STP), (STP)->ports); \
+ (PORT); \
+ (PORT) = stp_next_enabled_port((STP), (PORT) + 1))
+/* Returns the first port at or after 'port' in 'stp''s port array whose state
+ * is not STP_DISABLED, or a null pointer if there is none.  The const is cast
+ * away from the result so that the macro above works with both const and
+ * non-const bridges. */
+static struct stp_port *
+stp_next_enabled_port(const struct stp *stp, const struct stp_port *port)
+{
+ for (; port < &stp->ports[ARRAY_SIZE(stp->ports)]; port++) {
+ if (port->state != STP_DISABLED) {
+ return (struct stp_port *) port;
+ }
+ }
+ return NULL;
+}
+
+#define MESSAGE_AGE_INCREMENT 1
+
+static void stp_transmit_config(struct stp_port *);
+static bool stp_supersedes_port_info(const struct stp_port *,
+ const struct stp_config_bpdu *);
+static void stp_record_config_information(struct stp_port *,
+ const struct stp_config_bpdu *);
+static void stp_record_config_timeout_values(struct stp *,
+ const struct stp_config_bpdu *);
+static bool stp_is_designated_port(const struct stp_port *);
+static void stp_config_bpdu_generation(struct stp *);
+static void stp_transmit_tcn(struct stp *);
+static void stp_configuration_update(struct stp *);
+static bool stp_supersedes_root(const struct stp_port *root,
+ const struct stp_port *);
+static void stp_root_selection(struct stp *);
+static void stp_designated_port_selection(struct stp *);
+static void stp_become_designated_port(struct stp_port *);
+static void stp_port_state_selection(struct stp *);
+static void stp_make_forwarding(struct stp_port *);
+static void stp_make_blocking(struct stp_port *);
+static void stp_set_port_state(struct stp_port *, enum stp_state);
+static void stp_topology_change_detection(struct stp *);
+static void stp_topology_change_acknowledged(struct stp *);
+static void stp_acknowledge_topology_change(struct stp_port *);
+static void stp_received_config_bpdu(struct stp *, struct stp_port *,
+ const struct stp_config_bpdu *);
+static void stp_received_tcn_bpdu(struct stp *, struct stp_port *);
+static void stp_hello_timer_expiry(struct stp *);
+static void stp_message_age_timer_expiry(struct stp_port *);
+static bool stp_is_designated_for_some_port(const struct stp *);
+static void stp_forward_delay_timer_expiry(struct stp_port *);
+static void stp_tcn_timer_expiry(struct stp *);
+static void stp_topology_change_timer_expiry(struct stp *);
+static void stp_hold_timer_expiry(struct stp_port *);
+static void stp_initialize_port(struct stp_port *, enum stp_state);
+static void stp_become_root_bridge(struct stp *);
+static void stp_update_bridge_timers(struct stp *);
+
+static int clamp(int x, int min, int max);
+static int ms_to_timer(int ms);
+static int ms_to_timer_remainder(int ms);
+static int timer_to_ms(int timer);
+static void stp_start_timer(struct stp_timer *, int value);
+static void stp_stop_timer(struct stp_timer *);
+static bool stp_timer_expired(struct stp_timer *, int elapsed, int timeout);
+
+static void stp_send_bpdu(struct stp_port *, const void *, size_t);
+
+/* Creates and returns a new STP instance that initially has no ports enabled.
+ *
+ * 'name' is copied; it is used as a human-readable name in log messages.
+ *
+ * 'bridge_id' should be a 48-bit MAC address as returned by
+ * eth_addr_to_uint64(). 'bridge_id' may also have a priority value in its top
+ * 16 bits; if those bits are set to 0, STP_DEFAULT_BRIDGE_PRIORITY is used.
+ * (This priority may be changed with stp_set_bridge_priority().)
+ *
+ * When the bridge needs to send out a BPDU, it calls 'send_bpdu'. This
+ * callback may be called from stp_tick() or stp_received_bpdu(). The
+ * arguments to 'send_bpdu' are an STP BPDU encapsulated in 'bpdu', the number
+ * 'port_no' of the port on which it should be transmitted, and the 'aux'
+ * value that was passed to this function.
+ */
+struct stp *
+stp_create(const char *name, stp_identifier bridge_id,
+ void (*send_bpdu)(struct ofpbuf *bpdu, int port_no, void *aux),
+ void *aux)
+{
+ struct stp *stp;
+ struct stp_port *p;
+
+ stp = xcalloc(1, sizeof *stp);
+ stp->name = xstrdup(name);
+ stp->bridge_id = bridge_id;
+ /* No priority supplied in the top 16 bits: use the default. */
+ if (!(stp->bridge_id >> 48)) {
+ stp->bridge_id |= (uint64_t) STP_DEFAULT_BRIDGE_PRIORITY << 48;
+ }
+
+ /* Requested timer values, in milliseconds.  stp_update_bridge_timers()
+  * derives the bridge_* values from them, which then seed the values in
+  * use, since a new bridge starts out believing that it is the root. */
+ stp->rq_max_age = 6000;
+ stp->rq_hello_time = 2000;
+ stp->rq_forward_delay = 4000;
+ stp_update_bridge_timers(stp);
+ stp->max_age = stp->bridge_max_age;
+ stp->hello_time = stp->bridge_hello_time;
+ stp->forward_delay = stp->bridge_forward_delay;
+
+ stp->designated_root = stp->bridge_id;
+ stp->root_path_cost = 0;
+ stp->root_port = NULL;
+ stp->topology_change_detected = false;
+ stp->topology_change = false;
+
+ stp_stop_timer(&stp->tcn_timer);
+ stp_stop_timer(&stp->topology_change_timer);
+ stp_start_timer(&stp->hello_timer, 0);
+
+ stp->send_bpdu = send_bpdu;
+ stp->aux = aux;
+
+ /* One past the end of 'ports' means "no port has a pending change". */
+ stp->first_changed_port = &stp->ports[ARRAY_SIZE(stp->ports)];
+ for (p = stp->ports; p < &stp->ports[ARRAY_SIZE(stp->ports)]; p++) {
+ p->stp = stp;
+ /* Low byte: 1-based port number.  High byte: port priority. */
+ p->port_id = (stp_port_no(p) + 1) | (STP_DEFAULT_PORT_PRIORITY << 8);
+ p->path_cost = 19; /* Recommended default for 100 Mb/s link. */
+ stp_initialize_port(p, STP_DISABLED);
+ }
+ return stp;
+}
+
+/* Destroys 'stp', freeing both the copy of the name made by stp_create() and
+ * the instance itself.  Does nothing if 'stp' is a null pointer. */
+void
+stp_destroy(struct stp *stp)
+{
+    if (stp) {
+        /* 'name' was allocated separately with xstrdup() in stp_create(), so
+         * it must be released before the structure that points to it. */
+        free(stp->name);
+        free(stp);
+    }
+}
+
+/* Runs 'stp' given that 'ms' milliseconds have passed.
+ *
+ * Advances every active bridge and port timer and runs the expiry handler of
+ * each one that expires, which may in turn invoke the 'send_bpdu' callback
+ * passed to stp_create().  Negative values of 'ms' are treated as 0. */
+void
+stp_tick(struct stp *stp, int ms)
+{
+ struct stp_port *p;
+ int elapsed;
+
+ /* Convert 'ms' to STP timer ticks. Preserve any leftover milliseconds
+ * from previous stp_tick() calls so that we don't lose STP ticks when we
+ * are called too frequently. */
+ /* The clamp to INT_MAX - 1000 presumably leaves headroom so that adding
+  * the remainder cannot overflow -- TODO confirm that the remainder is
+  * always below 1000. */
+ ms = clamp(ms, 0, INT_MAX - 1000) + stp->elapsed_remainder;
+ elapsed = ms_to_timer(ms);
+ stp->elapsed_remainder = ms_to_timer_remainder(ms);
+ if (!elapsed) {
+ return;
+ }
+
+ if (stp_timer_expired(&stp->hello_timer, elapsed, stp->hello_time)) {
+ stp_hello_timer_expiry(stp);
+ }
+ if (stp_timer_expired(&stp->tcn_timer, elapsed, stp->bridge_hello_time)) {
+ stp_tcn_timer_expiry(stp);
+ }
+ if (stp_timer_expired(&stp->topology_change_timer, elapsed,
+ stp->max_age + stp->forward_delay)) {
+ stp_topology_change_timer_expiry(stp);
+ }
+ /* All message age timers are processed before any forward delay or hold
+  * timer. */
+ FOR_EACH_ENABLED_PORT (p, stp) {
+ if (stp_timer_expired(&p->message_age_timer, elapsed, stp->max_age)) {
+ stp_message_age_timer_expiry(p);
+ }
+ }
+ FOR_EACH_ENABLED_PORT (p, stp) {
+ if (stp_timer_expired(&p->forward_delay_timer, elapsed,
+ stp->forward_delay)) {
+ stp_forward_delay_timer_expiry(p);
+ }
+ /* The hold time is fixed at 1000 ms. */
+ if (stp_timer_expired(&p->hold_timer, elapsed, ms_to_timer(1000))) {
+ stp_hold_timer_expiry(p);
+ }
+ }
+}
+
+/* Changes the complete bridge identifier (priority and MAC address) of 'stp'
+ * to 'new_bridge_id' and re-runs root and port state selection.  Does nothing
+ * if the identifier is unchanged. */
+static void
+set_bridge_id(struct stp *stp, stp_identifier new_bridge_id)
+{
+ if (new_bridge_id != stp->bridge_id) {
+ bool root;
+ struct stp_port *p;
+
+ /* Remember whether we were root so that a transition to root can be
+  * detected after the update. */
+ root = stp_is_root_bridge(stp);
+ /* Designated ports name this bridge as their designated bridge, so they
+  * must follow the identifier change.  This has to happen before
+  * 'bridge_id' itself changes, because stp_is_designated_port() compares
+  * against the current value. */
+ FOR_EACH_ENABLED_PORT (p, stp) {
+ if (stp_is_designated_port(p)) {
+ p->designated_bridge = new_bridge_id;
+ }
+ }
+ stp->bridge_id = new_bridge_id;
+ stp_configuration_update(stp);
+ stp_port_state_selection(stp);
+ if (stp_is_root_bridge(stp) && !root) {
+ stp_become_root_bridge(stp);
+ }
+ }
+}
+
+/* Replaces the MAC address portion (the low 48 bits) of 'stp''s bridge ID
+ * with the low 48 bits of 'bridge_id'.  The priority in the top 16 bits of
+ * the current bridge ID is kept; any priority bits in 'bridge_id' are
+ * ignored. */
+void
+stp_set_bridge_id(struct stp *stp, stp_identifier bridge_id)
+{
+    const uint64_t mac_mask = (UINT64_C(1) << 48) - 1;
+    uint64_t kept_priority = stp->bridge_id & ~mac_mask;
+    uint64_t new_mac = bridge_id & mac_mask;
+
+    set_bridge_id(stp, kept_priority | new_mac);
+}
+
+/* Replaces the priority portion (the top 16 bits) of 'stp''s bridge ID with
+ * 'new_priority'.  The MAC address in the low 48 bits is kept. */
+void
+stp_set_bridge_priority(struct stp *stp, uint16_t new_priority)
+{
+    const uint64_t mac_mask = (UINT64_C(1) << 48) - 1;
+    uint64_t kept_mac = stp->bridge_id & mac_mask;
+    uint64_t priority_bits = (uint64_t) new_priority << 48;
+
+    set_bridge_id(stp, kept_mac | priority_bits);
+}
+
+/* Sets the desired hello time for 'stp' to 'ms', in milliseconds. The actual
+ * hello time is clamped to the range of 1 to 10 seconds and subject to the
+ * relationship (bridge_max_age >= 2 * (bridge_hello_time + 1 s)). The bridge
+ * hello time is only used when 'stp' is the root bridge.
+ *
+ * The request is stored unmodified in 'rq_hello_time'; the effective value is
+ * then recomputed by stp_update_bridge_timers(). */
+void
+stp_set_hello_time(struct stp *stp, int ms)
+{
+ stp->rq_hello_time = ms;
+ stp_update_bridge_timers(stp);
+}
+
+/* Sets the desired max age for 'stp' to 'ms', in milliseconds. The actual max
+ * age is clamped to the range of 6 to 40 seconds and subject to the
+ * relationships (2 * (bridge_forward_delay - 1 s) >= bridge_max_age) and
+ * (bridge_max_age >= 2 * (bridge_hello_time + 1 s)). The bridge max age is
+ * only used when 'stp' is the root bridge.
+ *
+ * The request is stored unmodified in 'rq_max_age'; the effective value is
+ * then recomputed by stp_update_bridge_timers(). */
+void
+stp_set_max_age(struct stp *stp, int ms)
+{
+ stp->rq_max_age = ms;
+ stp_update_bridge_timers(stp);
+}
+
+/* Sets the desired forward delay for 'stp' to 'ms', in milliseconds. The
+ * actual forward delay is clamped to the range of 4 to 30 seconds and subject
+ * to the relationship (2 * (bridge_forward_delay - 1 s) >= bridge_max_age).
+ * The bridge forward delay is only used when 'stp' is the root bridge.
+ *
+ * The request is stored unmodified in 'rq_forward_delay'; the effective value
+ * is then recomputed by stp_update_bridge_timers(). */
+void
+stp_set_forward_delay(struct stp *stp, int ms)
+{
+ stp->rq_forward_delay = ms;
+ stp_update_bridge_timers(stp);
+}
+
+/* Returns the name given to 'stp' in the call to stp_create().  The returned
+ * pointer refers to the copy of the name stored inside 'stp'. */
+const char *
+stp_get_name(const struct stp *stp)
+{
+ return stp->name;
+}
+
+/* Returns the bridge ID for 'stp': the bridge priority in the top 16 bits and
+ * the MAC address in the low 48 bits. */
+stp_identifier
+stp_get_bridge_id(const struct stp *stp)
+{
+ return stp->bridge_id;
+}
+
+/* Returns the bridge ID of the bridge currently believed to be the root.
+ * This equals stp_get_bridge_id() exactly when stp_is_root_bridge() is
+ * true. */
+stp_identifier
+stp_get_designated_root(const struct stp *stp)
+{
+ return stp->designated_root;
+}
+
+/* Returns true if 'stp' believes itself to be the root of the spanning tree,
+ * false otherwise. */
+bool
+stp_is_root_bridge(const struct stp *stp)
+{
+ return stp->bridge_id == stp->designated_root;
+}
+
+/* Returns the cost of the path from 'stp' to the root of the spanning tree.
+ * The stored value is an unsigned int and is converted to int on return. */
+int
+stp_get_root_path_cost(const struct stp *stp)
+{
+ return stp->root_path_cost;
+}
+
+/* Returns the bridge hello time, in ms. The returned value is not necessarily
+ * the value passed to stp_set_hello_time(): it is clamped to the valid range
+ * and quantized to the STP timer resolution.
+ *
+ * This is the value 'stp' uses when it is the root bridge
+ * ('bridge_hello_time'), not necessarily the one currently in effect. */
+int
+stp_get_hello_time(const struct stp *stp)
+{
+ return timer_to_ms(stp->bridge_hello_time);
+}
+
+/* Returns the bridge max age, in ms. The returned value is not necessarily
+ * the value passed to stp_set_max_age(): it is clamped to the valid range,
+ * quantized to the STP timer resolution, and adjusted to match the constraints
+ * due to the hello time.
+ *
+ * This is the value 'stp' uses when it is the root bridge
+ * ('bridge_max_age'), not necessarily the one currently in effect. */
+int
+stp_get_max_age(const struct stp *stp)
+{
+ return timer_to_ms(stp->bridge_max_age);
+}
+
+/* Returns the bridge forward delay, in ms. The returned value is not
+ * necessarily the value passed to stp_set_forward_delay(): it is clamped to
+ * the valid range, quantized to the STP timer resolution, and adjusted to
+ * match the constraints due to the max age.
+ *
+ * This is the value 'stp' uses when it is the root bridge
+ * ('bridge_forward_delay'), not necessarily the one currently in effect. */
+int
+stp_get_forward_delay(const struct stp *stp)
+{
+ return timer_to_ms(stp->bridge_forward_delay);
+}
+
+/* Returns the port in 'stp' with index 'port_no', which must be at least 0
+ * and less than STP_MAX_PORTS.  Any other value fails an assertion. */
+struct stp_port *
+stp_get_port(struct stp *stp, int port_no)
+{
+ assert(port_no >= 0 && port_no < ARRAY_SIZE(stp->ports));
+ return &stp->ports[port_no];
+}
+
+/* Returns the port connecting 'stp' to the root bridge, or a null pointer if
+ * there is no such port (as is the case when root selection finds no port
+ * with better information than this bridge and 'stp' becomes the root). */
+struct stp_port *
+stp_get_root_port(struct stp *stp)
+{
+ return stp->root_port;
+}
+
+/* Finds a port whose state has changed. If successful, stores the port whose
+ * state changed in '*portp' and returns true. If no port has changed, stores
+ * NULL in '*portp' and returns false.
+ *
+ * Each changed port is reported once: its 'state_changed' flag is cleared as
+ * it is returned.  Call repeatedly until the function returns false to drain
+ * all pending changes. */
+bool
+stp_get_changed_port(struct stp *stp, struct stp_port **portp)
+{
+ struct stp_port *end = &stp->ports[ARRAY_SIZE(stp->ports)];
+ struct stp_port *p;
+
+ /* 'first_changed_port' is a lower bound on where a changed port can be, so
+  * ports before it need not be rescanned. */
+ for (p = stp->first_changed_port; p < end; p++) {
+ if (p->state_changed) {
+ p->state_changed = false;
+ stp->first_changed_port = p + 1;
+ *portp = p;
+ return true;
+ }
+ }
+ stp->first_changed_port = end;
+ *portp = NULL;
+ return false;
+}
+
+/* Returns the name for the given 'state' (for use in debugging and log
+ * messages).  The result is a string literal.  'state' must be exactly one of
+ * the STP_* states handled below; any other value reaches NOT_REACHED(). */
+const char *
+stp_state_name(enum stp_state state)
+{
+ switch (state) {
+ case STP_DISABLED:
+ return "disabled";
+ case STP_LISTENING:
+ return "listening";
+ case STP_LEARNING:
+ return "learning";
+ case STP_FORWARDING:
+ return "forwarding";
+ case STP_BLOCKING:
+ return "blocking";
+ default:
+ NOT_REACHED();
+ }
+}
+
+/* Returns true if 'state' is one in which packets received on a port should
+ * be forwarded, false otherwise.
+ *
+ * Returns true if 'state' is STP_DISABLED, since presumably in that case the
+ * port should still work, just not have STP applied to it.
+ *
+ * NOTE(review): the bitwise test assumes that the STP_* state values are
+ * distinct bits -- confirm against the enum definition in stp.h. */
+bool
+stp_forward_in_state(enum stp_state state)
+{
+ return (state & (STP_DISABLED | STP_FORWARDING)) != 0;
+}
+
+/* Returns true if 'state' is one in which MAC learning should be done on
+ * packets received on a port, false otherwise.
+ *
+ * Returns true if 'state' is STP_DISABLED, since presumably in that case the
+ * port should still work, just not have STP applied to it.
+ *
+ * NOTE(review): the bitwise test assumes that the STP_* state values are
+ * distinct bits -- confirm against the enum definition in stp.h. */
+bool
+stp_learn_in_state(enum stp_state state)
+{
+ return (state & (STP_DISABLED | STP_LEARNING | STP_FORWARDING)) != 0;
+}
+
+/* Notifies the STP entity that bridge protocol data unit 'bpdu', which is
+ * 'bpdu_size' bytes in length, was received on port 'p'.
+ *
+ * BPDUs received on a disabled port are ignored.  Malformed BPDUs (too short,
+ * wrong protocol ID, unknown type, or wrong size for their type) are logged
+ * and dropped.
+ *
+ * This function may call the 'send_bpdu' function provided to stp_create(). */
+void
+stp_received_bpdu(struct stp_port *p, const void *bpdu, size_t bpdu_size)
+{
+ struct stp *stp = p->stp;
+ const struct stp_bpdu_header *header;
+
+ if (p->state == STP_DISABLED) {
+ return;
+ }
+
+ if (bpdu_size < sizeof(struct stp_bpdu_header)) {
+ VLOG_WARN("%s: received runt %zu-byte BPDU", stp->name, bpdu_size);
+ return;
+ }
+
+ header = bpdu;
+ if (header->protocol_id != htons(STP_PROTOCOL_ID)) {
+ VLOG_WARN("%s: received BPDU with unexpected protocol ID %"PRIu16,
+ stp->name, ntohs(header->protocol_id));
+ return;
+ }
+ /* An unexpected protocol version is only logged, at debug level; the BPDU
+  * is still processed. */
+ if (header->protocol_version != STP_PROTOCOL_VERSION) {
+ VLOG_DBG("%s: received BPDU with unexpected protocol version %"PRIu8,
+ stp->name, header->protocol_version);
+ }
+
+ switch (header->bpdu_type) {
+ case STP_TYPE_CONFIG:
+ /* Config BPDUs may be longer than the structure (e.g. trailing padding)
+  * but not shorter. */
+ if (bpdu_size < sizeof(struct stp_config_bpdu)) {
+ VLOG_WARN("%s: received config BPDU with invalid size %zu",
+ stp->name, bpdu_size);
+ return;
+ }
+ stp_received_config_bpdu(stp, p, bpdu);
+ break;
+
+ case STP_TYPE_TCN:
+ /* TCN BPDUs must match the structure size exactly. */
+ if (bpdu_size != sizeof(struct stp_tcn_bpdu)) {
+ VLOG_WARN("%s: received TCN BPDU with invalid size %zu",
+ stp->name, bpdu_size);
+ return;
+ }
+ stp_received_tcn_bpdu(stp, p);
+ break;
+
+ default:
+ VLOG_WARN("%s: received BPDU of unexpected type %"PRIu8,
+ stp->name, header->bpdu_type);
+ return;
+ }
+}
+
+/* Returns the STP entity in which 'p' is nested, that is, the bridge whose
+ * port array contains 'p'. */
+struct stp *
+stp_port_get_stp(struct stp_port *p)
+{
+ return p->stp;
+}
+
+/* Returns the index of port 'p' within its bridge, which is at least 0 and
+ * less than STP_MAX_PORTS.  This is the inverse of stp_get_port().  Asserts
+ * that 'p' really lies within its bridge's port array. */
+int
+stp_port_no(const struct stp_port *p)
+{
+ struct stp *stp = p->stp;
+ assert(p >= stp->ports && p < &stp->ports[ARRAY_SIZE(stp->ports)]);
+ return p - stp->ports;
+}
+
+/* Returns the state of port 'p', which is STP_DISABLED until the port is
+ * enabled with stp_port_enable(). */
+enum stp_state
+stp_port_get_state(const struct stp_port *p)
+{
+ return p->state;
+}
+
+/* Disables STP on port 'p'.  Does nothing if the port is already disabled.
+ *
+ * The rest of the bridge is reconfigured as a consequence, which may make
+ * 'p''s bridge the root bridge. */
+void
+stp_port_disable(struct stp_port *p)
+{
+ struct stp *stp = p->stp;
+ if (p->state != STP_DISABLED) {
+ /* Remember whether we were root so that a transition to root can be
+  * detected after the update. */
+ bool root = stp_is_root_bridge(stp);
+ /* Overwrite the information previously recorded for the port with this
+  * bridge's own before root selection is re-run. */
+ stp_become_designated_port(p);
+ stp_set_port_state(p, STP_DISABLED);
+ p->topology_change_ack = false;
+ p->config_pending = false;
+ stp_stop_timer(&p->message_age_timer);
+ stp_stop_timer(&p->forward_delay_timer);
+ stp_configuration_update(stp);
+ stp_port_state_selection(stp);
+ if (stp_is_root_bridge(stp) && !root) {
+ stp_become_root_bridge(stp);
+ }
+ }
+}
+
+/* Enables STP on port 'p'. The port will initially be in "blocking" state.
+ * Does nothing if the port is already enabled.
+ *
+ * Port state selection runs immediately afterward, so by the time this
+ * function returns the port may already have moved on from "blocking". */
+void
+stp_port_enable(struct stp_port *p)
+{
+ if (p->state == STP_DISABLED) {
+ stp_initialize_port(p, STP_BLOCKING);
+ stp_port_state_selection(p->stp);
+ }
+}
+
+/* Sets the priority of port 'p' to 'new_priority'. Lower numerical values
+ * are interpreted as higher priorities.
+ *
+ * The priority occupies the high byte of the 16-bit port identifier; the low
+ * byte is kept unchanged. */
+void
+stp_port_set_priority(struct stp_port *p, uint8_t new_priority)
+{
+ uint16_t new_port_id = (p->port_id & 0xff) | (new_priority << 8);
+ if (p->port_id != new_port_id) {
+ struct stp *stp = p->stp;
+ /* A designated port advertises its own identifier, so the advertised
+  * copy must follow the change.  This test has to come before 'port_id'
+  * is updated, because stp_is_designated_port() compares against it. */
+ if (stp_is_designated_port(p)) {
+ p->designated_port = new_port_id;
+ }
+ p->port_id = new_port_id;
+ /* Another port of this same bridge is designated for 'p''s LAN, but 'p'
+  * now has the better identifier: take over as designated port. */
+ if (stp->bridge_id == p->designated_bridge
+ && p->port_id < p->designated_port) {
+ stp_become_designated_port(p);
+ stp_port_state_selection(stp);
+ }
+ }
+}
+
+/* Sets the path cost of port 'p' to 'path_cost'. Lower values are generally
+ * used to indicate faster links. Use stp_port_set_speed() to automatically
+ * generate a default path cost from a link speed.
+ *
+ * If the cost actually changes, root and designated port selection and port
+ * state selection are re-run for the whole bridge. */
+void
+stp_port_set_path_cost(struct stp_port *p, uint16_t path_cost)
+{
+ if (p->path_cost != path_cost) {
+ struct stp *stp = p->stp;
+ p->path_cost = path_cost;
+ stp_configuration_update(stp);
+ stp_port_state_selection(stp);
+ }
+}
+
+/* Sets the path cost of port 'p' based on 'speed' (measured in Mb/s).
+ *
+ * The speed is mapped onto the cost of the fastest standard link speed that
+ * does not exceed it.  Speeds below 4 Mb/s, including 0, are treated as
+ * unknown and get the 100 Mb/s cost. */
+void
+stp_port_set_speed(struct stp_port *p, unsigned int speed)
+{
+    int cost;
+
+    if (speed >= 10000) {
+        cost = 2;               /* 10 Gb/s. */
+    } else if (speed >= 1000) {
+        cost = 4;               /* 1 Gb/s. */
+    } else if (speed >= 100) {
+        cost = 19;              /* 100 Mb/s. */
+    } else if (speed >= 16) {
+        cost = 62;              /* 16 Mb/s. */
+    } else if (speed >= 10) {
+        cost = 100;             /* 10 Mb/s. */
+    } else if (speed >= 4) {
+        cost = 250;             /* 4 Mb/s. */
+    } else {
+        cost = 19;              /* 100 Mb/s (guess). */
+    }
+    stp_port_set_path_cost(p, cost);
+}
+
+/* Enables topology change detection on port 'p'.  The setting is consulted
+ * when a learning or forwarding port is moved to the blocking state. */
+void
+stp_port_enable_change_detection(struct stp_port *p)
+{
+ p->change_detection_enabled = true;
+}
+
+/* Disables topology change detection on port 'p'.  With detection disabled,
+ * moving 'p' from learning or forwarding to blocking does not by itself
+ * trigger topology change handling. */
+void
+stp_port_disable_change_detection(struct stp_port *p)
+{
+ p->change_detection_enabled = false;
+}
+
+/* Transmits a configuration BPDU on port 'p', or arranges for one to be sent
+ * later.
+ *
+ * Nothing happens if 'p''s bridge is neither the root bridge nor has a root
+ * port.  If 'p''s hold timer is running, the transmission is only recorded as
+ * pending in 'config_pending'.  Otherwise a BPDU is built from the bridge's
+ * current state and, provided that its message age is below the bridge's max
+ * age, handed to stp_send_bpdu(), after which the hold timer is started. */
+static void
+stp_transmit_config(struct stp_port *p)
+{
+    struct stp *stp = p->stp;
+    bool root = stp_is_root_bridge(stp);
+    if (!root && !stp->root_port) {
+        return;
+    }
+    if (p->hold_timer.active) {
+        p->config_pending = true;
+    } else {
+        struct stp_config_bpdu config;
+        memset(&config, 0, sizeof config);
+        config.header.protocol_id = htons(STP_PROTOCOL_ID);
+        config.header.protocol_version = STP_PROTOCOL_VERSION;
+        config.header.bpdu_type = STP_TYPE_CONFIG;
+        config.flags = 0;
+        /* 'flags' is a single byte, so the flag values must be stored as they
+         * are.  Passing them through htons() moves them into the high byte on
+         * little-endian hosts, where the store into a uint8_t then discards
+         * them and the flags are never transmitted. */
+        if (p->topology_change_ack) {
+            config.flags |= STP_CONFIG_TOPOLOGY_CHANGE_ACK;
+        }
+        if (stp->topology_change) {
+            config.flags |= STP_CONFIG_TOPOLOGY_CHANGE;
+        }
+        config.root_id = htonll(stp->designated_root);
+        config.root_path_cost = htonl(stp->root_path_cost);
+        config.bridge_id = htonll(stp->bridge_id);
+        config.port_id = htons(p->port_id);
+        if (root) {
+            config.message_age = htons(0);
+        } else {
+            /* Relay the age of the root port's information, increased to
+             * account for passing through this bridge. */
+            config.message_age = htons(stp->root_port->message_age_timer.value
+                                       + MESSAGE_AGE_INCREMENT);
+        }
+        config.max_age = htons(stp->max_age);
+        config.hello_time = htons(stp->hello_time);
+        config.forward_delay = htons(stp->forward_delay);
+        /* Information that has already reached max age is not propagated. */
+        if (ntohs(config.message_age) < stp->max_age) {
+            p->topology_change_ack = false;
+            p->config_pending = false;
+            stp_send_bpdu(p, &config, sizeof config);
+            stp_start_timer(&p->hold_timer, 0);
+        }
+    }
+}
+
+/* Returns true if the information in received BPDU 'config' should replace
+ * the information currently recorded for port 'p'.
+ *
+ * The root ID, root path cost, and transmitting bridge ID are compared in
+ * that order, lower values winning.  If all three are equal, 'config' wins
+ * unless it came from this very bridge through a port whose ID is greater
+ * than the recorded designated port. */
+static bool
+stp_supersedes_port_info(const struct stp_port *p,
+                         const struct stp_config_bpdu *config)
+{
+    const uint64_t rx_root = ntohll(config->root_id);
+    const uint32_t rx_cost = ntohl(config->root_path_cost);
+    const uint64_t rx_bridge = ntohll(config->bridge_id);
+    const uint16_t rx_port = ntohs(config->port_id);
+
+    if (rx_root != p->designated_root) {
+        return rx_root < p->designated_root;
+    }
+    if (rx_cost != p->designated_cost) {
+        return rx_cost < p->designated_cost;
+    }
+    if (rx_bridge != p->designated_bridge) {
+        return rx_bridge < p->designated_bridge;
+    }
+    return rx_bridge != p->stp->bridge_id || rx_port <= p->designated_port;
+}
+
+/* Copies the root, path cost, designated bridge, and designated port
+ * advertised in received BPDU 'config' into port 'p', converting each from
+ * network to host byte order, and restarts the port's message age timer at
+ * the message age carried by the BPDU. */
+static void
+stp_record_config_information(struct stp_port *p,
+ const struct stp_config_bpdu *config)
+{
+ p->designated_root = ntohll(config->root_id);
+ p->designated_cost = ntohl(config->root_path_cost);
+ p->designated_bridge = ntohll(config->bridge_id);
+ p->designated_port = ntohs(config->port_id);
+ stp_start_timer(&p->message_age_timer, ntohs(config->message_age));
+}
+
+/* Adopts the timer values (max age, hello time, forward delay) and the
+ * topology change flag carried by received BPDU 'config' as the values in
+ * effect for 'stp'. */
+static void
+stp_record_config_timeout_values(struct stp *stp,
+                                 const struct stp_config_bpdu *config)
+{
+    stp->max_age = ntohs(config->max_age);
+    stp->hello_time = ntohs(config->hello_time);
+    stp->forward_delay = ntohs(config->forward_delay);
+    /* 'flags' is a single byte, so it is tested against the flag value as it
+     * is.  Masking with htons(flag) tests a bit in the high byte on
+     * little-endian hosts, which can never be set in a one-byte value. */
+    stp->topology_change = (config->flags & STP_CONFIG_TOPOLOGY_CHANGE) != 0;
+}
+
+/* Returns true if 'p' is the designated port for the LAN that it is attached
+ * to, that is, if the designated bridge and port recorded for 'p' are this
+ * bridge and 'p' itself. */
+static bool
+stp_is_designated_port(const struct stp_port *p)
+{
+    if (p->designated_bridge != p->stp->bridge_id) {
+        return false;
+    }
+    return p->designated_port == p->port_id;
+}
+
+/* Transmits (or schedules) a configuration BPDU on every enabled port of
+ * 'stp' that is a designated port. */
+static void
+stp_config_bpdu_generation(struct stp *stp)
+{
+    struct stp_port *port;
+
+    FOR_EACH_ENABLED_PORT (port, stp) {
+        if (!stp_is_designated_port(port)) {
+            continue;
+        }
+        stp_transmit_config(port);
+    }
+}
+
+/* Sends a topology change notification BPDU out 'stp''s root port.  Does
+ * nothing if 'stp' has no root port. */
+static void
+stp_transmit_tcn(struct stp *stp)
+{
+ struct stp_port *p = stp->root_port;
+ struct stp_tcn_bpdu tcn_bpdu;
+ if (!p) {
+ return;
+ }
+ /* The header is the entire TCN BPDU, so every byte is initialized here. */
+ tcn_bpdu.header.protocol_id = htons(STP_PROTOCOL_ID);
+ tcn_bpdu.header.protocol_version = STP_PROTOCOL_VERSION;
+ tcn_bpdu.header.bpdu_type = STP_TYPE_TCN;
+ stp_send_bpdu(p, &tcn_bpdu, sizeof tcn_bpdu);
+}
+
+/* Recomputes which bridge 'stp' believes to be the root, along with its root
+ * port and root path cost, and then which of its ports are designated ports.
+ * Root selection must run first because designated port selection compares
+ * against its results. */
+static void
+stp_configuration_update(struct stp *stp)
+{
+ stp_root_selection(stp);
+ stp_designated_port_selection(stp);
+}
+
+/* Returns true if port 'p' is a better choice of root port than the current
+ * candidate 'root'.
+ *
+ * The designated root, the total cost to the root through the port, the
+ * designated bridge, the designated port, and finally the port's own ID are
+ * compared in that order, lower values winning. */
+static bool
+stp_supersedes_root(const struct stp_port *root, const struct stp_port *p)
+{
+    const int cost_via_p = p->designated_cost + p->path_cost;
+    const int cost_via_root = root->designated_cost + root->path_cost;
+
+    if (p->designated_root != root->designated_root) {
+        return p->designated_root < root->designated_root;
+    }
+    if (cost_via_p != cost_via_root) {
+        return cost_via_p < cost_via_root;
+    }
+    if (p->designated_bridge != root->designated_bridge) {
+        return p->designated_bridge < root->designated_bridge;
+    }
+    if (p->designated_port != root->designated_port) {
+        return p->designated_port < root->designated_port;
+    }
+    return p->port_id < root->port_id;
+}
+
+/* Selects the root port of 'stp' and updates the bridge's designated root and
+ * root path cost to match.
+ *
+ * Candidates are enabled ports that are not designated ports and whose
+ * recorded root is better (numerically lower) than this bridge's own ID; the
+ * best one according to stp_supersedes_root() wins.  If there is no
+ * candidate, 'stp' has no root port and considers itself the root. */
+static void
+stp_root_selection(struct stp *stp)
+{
+    struct stp_port *port;
+    struct stp_port *best = NULL;
+
+    FOR_EACH_ENABLED_PORT (port, stp) {
+        if (!stp_is_designated_port(port)
+            && port->designated_root < stp->bridge_id
+            && (!best || stp_supersedes_root(best, port))) {
+            best = port;
+        }
+    }
+
+    stp->root_port = best;
+    if (best) {
+        stp->designated_root = best->designated_root;
+        stp->root_path_cost = best->designated_cost + best->path_cost;
+    } else {
+        stp->designated_root = stp->bridge_id;
+        stp->root_path_cost = 0;
+    }
+}
+
+/* Makes every enabled port of 'stp' a designated port if this bridge offers
+ * its LAN information at least as good as what is currently recorded for the
+ * port.  Must be called after stp_root_selection(), whose results it uses. */
+static void
+stp_designated_port_selection(struct stp *stp)
+{
+ struct stp_port *p;
+
+ FOR_EACH_ENABLED_PORT (p, stp) {
+ /* 'p' becomes (or is refreshed as) designated if any of these holds:
+  *   - it already is the designated port;
+  *   - it has recorded a different root than the bridge now believes in;
+  *   - this bridge offers a cheaper path to the root;
+  *   - the cost is equal and this bridge has the lower ID;
+  *   - the cost and the bridge are equal and 'p' has a port ID no greater
+  *     than that of the recorded designated port. */
+ if (stp_is_designated_port(p)
+ || p->designated_root != stp->designated_root
+ || stp->root_path_cost < p->designated_cost
+ || (stp->root_path_cost == p->designated_cost
+ && (stp->bridge_id < p->designated_bridge
+ || (stp->bridge_id == p->designated_bridge
+ && p->port_id <= p->designated_port))))
+ {
+ stp_become_designated_port(p);
+ }
+ }
+}
+
+/* Makes 'p' the designated port for its LAN by recording this bridge's own
+ * view (its designated root, root path cost, bridge ID, and 'p''s port ID) as
+ * the port's information. */
+static void
+stp_become_designated_port(struct stp_port *p)
+{
+ struct stp *stp = p->stp;
+ p->designated_root = stp->designated_root;
+ p->designated_cost = stp->root_path_cost;
+ p->designated_bridge = stp->bridge_id;
+ p->designated_port = p->port_id;
+}
+
+/* Moves every enabled port of 'stp' toward the state that its current role
+ * calls for: the root port and designated ports toward forwarding, all other
+ * ports to blocking. */
+static void
+stp_port_state_selection(struct stp *stp)
+{
+ struct stp_port *p;
+
+ FOR_EACH_ENABLED_PORT (p, stp) {
+ if (p == stp->root_port) {
+ /* The root port does not transmit configuration BPDUs, so drop anything
+  * that was queued for it. */
+ p->config_pending = false;
+ p->topology_change_ack = false;
+ stp_make_forwarding(p);
+ } else if (stp_is_designated_port(p)) {
+ /* A designated port holds this bridge's own information, which does not
+  * age. */
+ stp_stop_timer(&p->message_age_timer);
+ stp_make_forwarding(p);
+ } else {
+ p->config_pending = false;
+ p->topology_change_ack = false;
+ stp_make_blocking(p);
+ }
+ }
+}
+
+/* Starts port 'p' on its way to the forwarding state: a blocking port enters
+ * the listening state and its forward delay timer is started.  Ports in any
+ * other state are left alone. */
+static void
+stp_make_forwarding(struct stp_port *p)
+{
+    if (p->state != STP_BLOCKING) {
+        return;
+    }
+    stp_set_port_state(p, STP_LISTENING);
+    stp_start_timer(&p->forward_delay_timer, 0);
+}
+
+/* Puts port 'p' into the blocking state and stops its forward delay timer,
+ * unless it is disabled or already blocking.
+ *
+ * If the port was learning or forwarding and has change detection enabled,
+ * this counts as a topology change, which is reported before the state is
+ * changed. */
+static void
+stp_make_blocking(struct stp_port *p)
+{
+    if (p->state & (STP_DISABLED | STP_BLOCKING)) {
+        return;
+    }
+    if ((p->state & (STP_FORWARDING | STP_LEARNING))
+        && p->change_detection_enabled) {
+        stp_topology_change_detection(p->stp);
+    }
+    stp_set_port_state(p, STP_BLOCKING);
+    stp_stop_timer(&p->forward_delay_timer);
+}
+
+/* Sets the state of port 'p' to 'state'.  If that is a change, marks the port
+ * so that stp_get_changed_port() will report it.
+ *
+ * The mark is sticky: it stays set until the port has been reported, even if
+ * a later call puts the port back into the state it started in. */
+static void
+stp_set_port_state(struct stp_port *p, enum stp_state state)
+{
+ if (state != p->state && !p->state_changed) {
+ p->state_changed = true;
+ /* Keep 'first_changed_port' a lower bound for the changed ports. */
+ if (p < p->stp->first_changed_port) {
+ p->stp->first_changed_port = p;
+ }
+ }
+ p->state = state;
+}
+
+/* Reacts to a topology change detected by, or reported to, 'stp'.
+ *
+ * The root bridge sets its topology change flag and (re)starts the topology
+ * change timer.  Any other bridge sends a topology change notification
+ * toward the root and starts the TCN timer, unless a previously detected
+ * change is still outstanding. */
+static void
+stp_topology_change_detection(struct stp *stp)
+{
+ if (stp_is_root_bridge(stp)) {
+ stp->topology_change = true;
+ stp_start_timer(&stp->topology_change_timer, 0);
+ } else if (!stp->topology_change_detected) {
+ stp_transmit_tcn(stp);
+ stp_start_timer(&stp->tcn_timer, 0);
+ }
+ stp->topology_change_detected = true;
+}
+
+/* Records that the topology change that 'stp' notified has been acknowledged:
+ * clears the "detected" flag and stops the TCN timer. */
+static void
+stp_topology_change_acknowledged(struct stp *stp)
+{
+ stp->topology_change_detected = false;
+ stp_stop_timer(&stp->tcn_timer);
+}
+
+/* Acknowledges a topology change notification received on port 'p' by
+ * flagging the port's next configuration BPDU as an acknowledgment and
+ * transmitting (or scheduling) that BPDU right away. */
+static void
+stp_acknowledge_topology_change(struct stp_port *p)
+{
+ p->topology_change_ack = true;
+ stp_transmit_config(p);
+}
+
+/* Processes configuration BPDU 'config' received on port 'p' of 'stp'.
+ *
+ * BPDUs whose message age has reached their max age are logged and dropped.
+ * If the BPDU carries better information than 'p' currently holds, that
+ * information is recorded and the bridge is reconfigured; if 'p' turns out
+ * to be the root port, the BPDU's timer values are adopted and passed on
+ * through the designated ports.  Otherwise, if 'p' is a designated port, the
+ * bridge replies with its own configuration BPDU.
+ *
+ * Defined 'static' to match the forward declaration earlier in this file. */
+static void
+stp_received_config_bpdu(struct stp *stp, struct stp_port *p,
+                         const struct stp_config_bpdu *config)
+{
+    if (ntohs(config->message_age) >= ntohs(config->max_age)) {
+        VLOG_WARN("%s: received config BPDU with message age (%u) greater "
+                  "than max age (%u)",
+                  stp->name,
+                  ntohs(config->message_age), ntohs(config->max_age));
+        return;
+    }
+    if (p->state != STP_DISABLED) {
+        /* Remember whether we were root so that losing the root role can be
+         * detected after the update. */
+        bool root = stp_is_root_bridge(stp);
+        if (stp_supersedes_port_info(p, config)) {
+            stp_record_config_information(p, config);
+            stp_configuration_update(stp);
+            stp_port_state_selection(stp);
+            if (!stp_is_root_bridge(stp) && root) {
+                /* No longer root: stop generating hellos, and hand any
+                 * topology change in progress over to the new root. */
+                stp_stop_timer(&stp->hello_timer);
+                if (stp->topology_change_detected) {
+                    stp_stop_timer(&stp->topology_change_timer);
+                    stp_transmit_tcn(stp);
+                    stp_start_timer(&stp->tcn_timer, 0);
+                }
+            }
+            if (p == stp->root_port) {
+                stp_record_config_timeout_values(stp, config);
+                stp_config_bpdu_generation(stp);
+                /* 'flags' is a single byte, so it is tested against the flag
+                 * value as it is.  Masking with htons(flag) tests a bit in
+                 * the high byte on little-endian hosts, so that the
+                 * acknowledgment would never be seen. */
+                if (config->flags & STP_CONFIG_TOPOLOGY_CHANGE_ACK) {
+                    stp_topology_change_acknowledged(stp);
+                }
+            }
+        } else if (stp_is_designated_port(p)) {
+            stp_transmit_config(p);
+        }
+    }
+}
+
+/* Processes a topology change notification BPDU received on port 'p' of
+ * 'stp'.  The notification is acted on only if 'p' is not disabled and is a
+ * designated port: a topology change is then signalled and acknowledged on
+ * 'p'. */
+void
+stp_received_tcn_bpdu(struct stp *stp, struct stp_port *p)
+{
+    if (p->state == STP_DISABLED || !stp_is_designated_port(p)) {
+        return;
+    }
+
+    stp_topology_change_detection(stp);
+    stp_acknowledge_topology_change(p);
+}
+
+/* Called when the hello timer of 'stp' expires: generates configuration
+ * BPDUs and restarts the hello timer from zero. */
+static void
+stp_hello_timer_expiry(struct stp *stp)
+{
+ stp_config_bpdu_generation(stp);
+ stp_start_timer(&stp->hello_timer, 0);
+}
+
+/* Called when the message age timer of 'p' expires.
+ *
+ * 'p' becomes a designated port and the bridge configuration and port states
+ * are recomputed.  If the bridge was not the root before and is the root
+ * afterward, it adopts its own configured timer values, signals a topology
+ * change, stops the TCN timer, generates configuration BPDUs and restarts the
+ * hello timer (the same steps as stp_become_root_bridge()). */
+static void
+stp_message_age_timer_expiry(struct stp_port *p)
+{
+    struct stp *stp = p->stp;
+    bool was_root = stp_is_root_bridge(stp);
+
+    stp_become_designated_port(p);
+    stp_configuration_update(stp);
+    stp_port_state_selection(stp);
+
+    if (!stp_is_root_bridge(stp) || was_root) {
+        return;
+    }
+
+    stp->max_age = stp->bridge_max_age;
+    stp->hello_time = stp->bridge_hello_time;
+    stp->forward_delay = stp->bridge_forward_delay;
+    stp_topology_change_detection(stp);
+    stp_stop_timer(&stp->tcn_timer);
+    stp_config_bpdu_generation(stp);
+    stp_start_timer(&stp->hello_timer, 0);
+}
+
+/* Returns true if at least one enabled port of 'stp' has 'stp' itself as its
+ * designated bridge, false otherwise. */
+static bool
+stp_is_designated_for_some_port(const struct stp *stp)
+{
+    const struct stp_port *port;
+
+    FOR_EACH_ENABLED_PORT (port, stp) {
+        if (stp->bridge_id == port->designated_bridge) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Called when the forward delay timer of 'p' expires.
+ *
+ * A listening port advances to learning and its timer is restarted.  A
+ * learning port advances to forwarding; a topology change is then signalled
+ * if the bridge is designated for some port and change detection is enabled
+ * on 'p'.  Ports in any other state are unaffected. */
+static void
+stp_forward_delay_timer_expiry(struct stp_port *p)
+{
+    switch (p->state) {
+    case STP_LISTENING:
+        stp_set_port_state(p, STP_LEARNING);
+        stp_start_timer(&p->forward_delay_timer, 0);
+        break;
+
+    case STP_LEARNING:
+        stp_set_port_state(p, STP_FORWARDING);
+        if (stp_is_designated_for_some_port(p->stp)
+            && p->change_detection_enabled) {
+            stp_topology_change_detection(p->stp);
+        }
+        break;
+
+    default:
+        break;
+    }
+}
+
+/* Called when the TCN timer of 'stp' expires: transmits another TCN BPDU and
+ * restarts the TCN timer from zero. */
+static void
+stp_tcn_timer_expiry(struct stp *stp)
+{
+ stp_transmit_tcn(stp);
+ stp_start_timer(&stp->tcn_timer, 0);
+}
+
+/* Called when the topology change timer of 'stp' expires: clears both the
+ * "topology change" and the "topology change detected" flags. */
+static void
+stp_topology_change_timer_expiry(struct stp *stp)
+{
+    stp->topology_change = false;
+    stp->topology_change_detected = false;
+}
+
+/* Called when the hold timer of 'p' expires: if a configuration BPDU is
+ * still pending on 'p', it is transmitted now. */
+static void
+stp_hold_timer_expiry(struct stp_port *p)
+{
+    if (!p->config_pending) {
+        return;
+    }
+    stp_transmit_config(p);
+}
+
+/* Resets 'p' to its initial condition in 'state', which must be STP_DISABLED
+ * or STP_BLOCKING.  The port becomes a designated port, its pending flags are
+ * cleared, change detection is enabled, and all three of its timers are
+ * stopped. */
+static void
+stp_initialize_port(struct stp_port *p, enum stp_state state)
+{
+    assert(state & (STP_DISABLED | STP_BLOCKING));
+
+    stp_become_designated_port(p);
+    stp_set_port_state(p, state);
+
+    p->change_detection_enabled = true;
+    p->config_pending = false;
+    p->topology_change_ack = false;
+
+    stp_stop_timer(&p->hold_timer);
+    stp_stop_timer(&p->forward_delay_timer);
+    stp_stop_timer(&p->message_age_timer);
+}
+
+/* Performs the actions taken when 'stp' becomes the root bridge: adopts the
+ * bridge's own configured timer values as the active ones, signals a topology
+ * change, stops the TCN timer, generates configuration BPDUs, and restarts
+ * the hello timer. */
+static void
+stp_become_root_bridge(struct stp *stp)
+{
+    stp->forward_delay = stp->bridge_forward_delay;
+    stp->hello_time = stp->bridge_hello_time;
+    stp->max_age = stp->bridge_max_age;
+
+    stp_topology_change_detection(stp);
+    stp_stop_timer(&stp->tcn_timer);
+    stp_config_bpdu_generation(stp);
+    stp_start_timer(&stp->hello_timer, 0);
+}
+
+/* Activates 'timer' with its elapsed-time counter set to 'value'. */
+static void
+stp_start_timer(struct stp_timer *timer, int value)
+{
+    timer->active = true;
+    timer->value = value;
+}
+
+/* Deactivates 'timer'.  Its stored value is left as it is. */
+static void
+stp_stop_timer(struct stp_timer *timer)
+{
+ timer->active = false;
+}
+
+/* Advances 'timer' by 'elapsed' and reports whether it has expired.
+ *
+ * An inactive timer is not advanced and never expires.  An active timer whose
+ * value reaches 'timeout' is deactivated, and true is returned; otherwise
+ * false is returned. */
+static bool
+stp_timer_expired(struct stp_timer *timer, int elapsed, int timeout)
+{
+    if (!timer->active) {
+        return false;
+    }
+
+    timer->value += elapsed;
+    if (timer->value < timeout) {
+        return false;
+    }
+
+    timer->active = false;
+    return true;
+}
+
+/* Returns the number of whole STP timer ticks in 'ms' milliseconds.  There
+ * are 256 STP timer ticks per second.
+ *
+ * The multiplication is carried out in 'long long' so that large values of
+ * 'ms' (above roughly 8.3 million, about 2.3 hours) do not overflow 'int'
+ * before the division. */
+static int
+ms_to_timer(int ms)
+{
+    return (int) ((long long int) ms * 0x100 / 1000);
+}
+
+/* Returns the number of leftover milliseconds when 'ms' is converted to STP
+ * timer ticks. */
+static int
+ms_to_timer_remainder(int ms)
+{
+ return ms * 0x100 % 1000;
+}
+
+/* Returns the number of whole milliseconds in 'timer' STP timer ticks.  There
+ * are 256 STP timer ticks per second.
+ *
+ * The multiplication is carried out in 'long long' so that large tick counts
+ * (above roughly 2.1 million) do not overflow 'int' before the division. */
+static int
+timer_to_ms(int timer)
+{
+    return (int) ((long long int) timer * 1000 / 0x100);
+}
+
+/* Returns 'x' limited to the range ['min', 'max'].  The lower bound is
+ * checked first, so 'min' is returned whenever 'x' is less than 'min'. */
+static int
+clamp(int x, int min, int max)
+{
+    if (x < min) {
+        return min;
+    }
+    if (x > max) {
+        return max;
+    }
+    return x;
+}
+
+/* Recomputes the bridge's configured timer values from the requested ones.
+ *
+ * The requested values (in milliseconds) are clamped as follows:
+ *
+ *   - hello time: 1000 to 10000;
+ *   - max age: at least the larger of 2 * (hello time + 1000) and 6000, at
+ *     most 40000;
+ *   - forward delay: at least max age / 2 + 1000, at most 30000.
+ *
+ * The results are stored in timer ticks.  If 'stp' is the root bridge they
+ * also become the active timer values. */
+static void
+stp_update_bridge_timers(struct stp *stp)
+{
+    int hello_ms, max_age_ms, fwd_delay_ms;
+    int max_age_floor;
+
+    hello_ms = clamp(stp->rq_hello_time, 1000, 10000);
+    max_age_floor = MAX(2 * (hello_ms + 1000), 6000);
+    max_age_ms = clamp(stp->rq_max_age, max_age_floor, 40000);
+    fwd_delay_ms = clamp(stp->rq_forward_delay, max_age_ms / 2 + 1000, 30000);
+
+    stp->bridge_hello_time = ms_to_timer(hello_ms);
+    stp->bridge_max_age = ms_to_timer(max_age_ms);
+    stp->bridge_forward_delay = ms_to_timer(fwd_delay_ms);
+
+    if (!stp_is_root_bridge(stp)) {
+        return;
+    }
+    stp->max_age = stp->bridge_max_age;
+    stp->hello_time = stp->bridge_hello_time;
+    stp->forward_delay = stp->bridge_forward_delay;
+}
+
+/* Wraps the 'bpdu_size' bytes at 'bpdu' in an Ethernet header and an LLC
+ * header and hands the resulting packet to the bridge's send_bpdu() callback
+ * along with the port number of 'p'.
+ *
+ * The Ethernet source address is left zeroed; the callback must fill it
+ * in. */
+static void
+stp_send_bpdu(struct stp_port *p, const void *bpdu, size_t bpdu_size)
+{
+    struct ofpbuf *pkt;
+    struct eth_header *eth;
+    struct llc_header *llc;
+
+    /* Lay out the frame: Ethernet header, LLC header, then the BPDU. */
+    pkt = ofpbuf_new(ETH_HEADER_LEN + LLC_HEADER_LEN + bpdu_size);
+    eth = ofpbuf_put_zeros(pkt, sizeof *eth);
+    pkt->l2 = eth;
+    llc = ofpbuf_put_zeros(pkt, sizeof *llc);
+    pkt->l3 = ofpbuf_put(pkt, bpdu, bpdu_size);
+
+    /* LLC header. */
+    llc->llc_cntl = STP_LLC_CNTL;
+    llc->llc_ssap = STP_LLC_SSAP;
+    llc->llc_dsap = STP_LLC_DSAP;
+
+    /* Ethernet header.  The type field carries the length of everything
+     * after the Ethernet header.  p->stp->send_bpdu() must fill in the
+     * source address. */
+    memcpy(eth->eth_dst, stp_eth_addr, ETH_ADDR_LEN);
+    eth->eth_type = htons(pkt->size - ETH_HEADER_LEN);
+
+    p->stp->send_bpdu(pkt, stp_port_no(p), p->stp->aux);
+}
diff --git a/lib/stp.h b/lib/stp.h
new file mode 100644
index 000000000..f29ac003f
--- /dev/null
+++ b/lib/stp.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef STP_H
+#define STP_H 1
+
+/* This is an implementation of Spanning Tree Protocol as described in IEEE
+ * 802.1D-1998, clauses 8 and 9. Section numbers refer to this standard. */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "compiler.h"
+#include "util.h"
+
+struct ofpbuf;
+
+/* Ethernet address used as the destination for STP frames. */
+extern const uint8_t stp_eth_addr[6];
+
+/* LLC field values used for STP frames. */
+#define STP_LLC_SSAP 0x42
+#define STP_LLC_DSAP 0x42
+#define STP_LLC_CNTL 0x03
+
+/* Bridge and port priorities that should be used by default. */
+#define STP_DEFAULT_BRIDGE_PRIORITY 32768
+#define STP_DEFAULT_PORT_PRIORITY 128
+
+/* Bridge identifier. Top 16 bits are a priority value (numerically lower
+ * values are higher priorities). Bottom 48 bits are MAC address of bridge. */
+typedef uint64_t stp_identifier;
+
+/* Basic STP functionality. */
+#define STP_MAX_PORTS 255
+struct stp *stp_create(const char *name, stp_identifier bridge_id,
+ void (*send_bpdu)(struct ofpbuf *bpdu, int port_no,
+ void *aux),
+ void *aux);
+void stp_destroy(struct stp *);
+void stp_tick(struct stp *, int ms);
+void stp_set_bridge_id(struct stp *, stp_identifier bridge_id);
+void stp_set_bridge_priority(struct stp *, uint16_t new_priority);
+void stp_set_hello_time(struct stp *, int ms);
+void stp_set_max_age(struct stp *, int ms);
+void stp_set_forward_delay(struct stp *, int ms);
+
+/* STP properties. */
+const char *stp_get_name(const struct stp *);
+stp_identifier stp_get_bridge_id(const struct stp *);
+stp_identifier stp_get_designated_root(const struct stp *);
+bool stp_is_root_bridge(const struct stp *);
+int stp_get_root_path_cost(const struct stp *);
+int stp_get_hello_time(const struct stp *);
+int stp_get_max_age(const struct stp *);
+int stp_get_forward_delay(const struct stp *);
+
+/* Obtaining STP ports. */
+struct stp_port *stp_get_port(struct stp *, int port_no);
+struct stp_port *stp_get_root_port(struct stp *);
+bool stp_get_changed_port(struct stp *, struct stp_port **portp);
+
+/* State of an STP port.
+ *
+ * A port is in exactly one state at any given time, but distinct bits are used
+ * for states to allow testing for more than one state with a bit mask. */
+enum stp_state {
+ STP_DISABLED = 1 << 0, /* 8.4.5: Disabled by management. */
+ STP_LISTENING = 1 << 1, /* 8.4.2: Not learning or relaying frames. */
+ STP_LEARNING = 1 << 2, /* 8.4.3: Learning but not relaying frames. */
+ STP_FORWARDING = 1 << 3, /* 8.4.4: Learning and relaying frames. */
+ STP_BLOCKING = 1 << 4 /* 8.4.1: Initial boot state. */
+};
+const char *stp_state_name(enum stp_state);
+bool stp_forward_in_state(enum stp_state);
+bool stp_learn_in_state(enum stp_state);
+
+void stp_received_bpdu(struct stp_port *, const void *bpdu, size_t bpdu_size);
+
+struct stp *stp_port_get_stp(struct stp_port *);
+int stp_port_no(const struct stp_port *);
+enum stp_state stp_port_get_state(const struct stp_port *);
+void stp_port_enable(struct stp_port *);
+void stp_port_disable(struct stp_port *);
+void stp_port_set_priority(struct stp_port *, uint8_t new_priority);
+void stp_port_set_path_cost(struct stp_port *, uint16_t path_cost);
+void stp_port_set_speed(struct stp_port *, unsigned int speed);
+void stp_port_enable_change_detection(struct stp_port *);
+void stp_port_disable_change_detection(struct stp_port *);
+
+#endif /* stp.h */
diff --git a/lib/svec.c b/lib/svec.c
new file mode 100644
index 000000000..4f8968d17
--- /dev/null
+++ b/lib/svec.c
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "svec.h"
+#include <assert.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include "dynamic-string.h"
+#include "util.h"
+
+#define THIS_MODULE VLM_svec
+#include "vlog.h"
+
+/* Initializes 'svec' as an empty string vector with no storage allocated. */
+void
+svec_init(struct svec *svec)
+{
+    svec->allocated = 0;
+    svec->n = 0;
+    svec->names = NULL;
+}
+
+/* Initializes 'svec' and appends a copy of every string in 'other' to it.
+ * Any previous contents of 'svec' are overwritten without being freed. */
+void
+svec_clone(struct svec *svec, const struct svec *other)
+{
+ svec_init(svec);
+ svec_append(svec, other);
+}
+
+/* Frees every string in 'svec' and then the array that held them.  The
+ * 'names' pointer itself is not reset, so 'svec' must be reinitialized with
+ * svec_init() before it is used again. */
+void
+svec_destroy(struct svec *svec)
+{
+ svec_clear(svec);
+ free(svec->names);
+}
+
+/* Frees every string in 'svec' and sets its length to zero.  The array that
+ * held the strings is kept for reuse. */
+void
+svec_clear(struct svec *svec)
+{
+    while (svec->n > 0) {
+        free(svec->names[--svec->n]);
+    }
+}
+
+/* Appends a copy of 'name', made with xstrdup(), to the end of 'svec'. */
+void
+svec_add(struct svec *svec, const char *name)
+{
+ svec_add_nocopy(svec, xstrdup(name));
+}
+
+/* Removes the string equal to 'name' from 'svec', if svec_find() locates one,
+ * freeing it and closing the gap.  'svec' must be sorted, because the lookup
+ * uses svec_find(). */
+void
+svec_del(struct svec *svec, const char *name)
+{
+    size_t pos = svec_find(svec, name);
+    size_t n_after;
+
+    if (pos == SIZE_MAX) {
+        return;
+    }
+
+    free(svec->names[pos]);
+    n_after = svec->n - pos - 1;
+    memmove(&svec->names[pos], &svec->names[pos + 1],
+            sizeof *svec->names * n_after);
+    svec->n--;
+}
+
+/* Ensures that 'svec' has room for at least one more element, growing the
+ * array with x2nrealloc() when it is full. */
+static void
+svec_expand(struct svec *svec)
+{
+    if (svec->n < svec->allocated) {
+        return;
+    }
+    svec->names = x2nrealloc(svec->names, &svec->allocated,
+                             sizeof *svec->names);
+}
+
+/* Appends 'name' itself (not a copy) to the end of 'svec'.  The other svec
+ * functions later release the stored pointer with free(). */
+void
+svec_add_nocopy(struct svec *svec, char *name)
+{
+    svec_expand(svec);
+    svec->names[svec->n] = name;
+    svec->n++;
+}
+
+/* Appends a copy of every string in 'other' to 'svec'.
+ *
+ * The number of strings to copy is fixed before the loop starts, so that
+ * appending a vector to itself doubles it rather than looping without end as
+ * 'other->n' grows with each addition. */
+void
+svec_append(struct svec *svec, const struct svec *other)
+{
+    const size_t n = other->n;
+    size_t i;
+
+    for (i = 0; i < n; i++) {
+        /* 'other->names' is re-read on every iteration because, when 'svec'
+         * and 'other' are the same vector, svec_add() may reallocate it. */
+        svec_add(svec, other->names[i]);
+    }
+}
+
+/* Stores a null pointer just past the last string in 'svec', growing the
+ * array first if necessary.  The length of 'svec' does not change. */
+void
+svec_terminate(struct svec *svec)
+{
+    char **end;
+
+    svec_expand(svec);
+    end = &svec->names[svec->n];
+    *end = NULL;
+}
+
+/* Comparison function for qsort() and bsearch() over arrays of strings:
+ * 'a_' and 'b_' each point to a 'char *', and the pointed-to strings are
+ * compared with strcmp(). */
+static int
+compare_strings(const void *a_, const void *b_)
+{
+    const char *lhs = *(char *const *) a_;
+    const char *rhs = *(char *const *) b_;
+
+    return strcmp(lhs, rhs);
+}
+
+/* Sorts the strings in 'svec' into ascending strcmp() order.
+ *
+ * Vectors with fewer than two strings are already sorted and are left alone.
+ * This also keeps a null 'names' pointer (an empty, never-grown vector) from
+ * being passed to qsort(), which requires a valid pointer even when the
+ * element count is zero. */
+void
+svec_sort(struct svec *svec)
+{
+    if (svec->n > 1) {
+        qsort(svec->names, svec->n, sizeof *svec->names, compare_strings);
+    }
+}
+
+/* Sorts 'svec' and then removes duplicate strings from it. */
+void
+svec_sort_unique(struct svec *svec)
+{
+ svec_sort(svec);
+ svec_unique(svec);
+}
+
+/* Removes duplicate strings from 'svec', which must be sorted.  The first
+ * string of each run of equal strings is kept and the rest are freed.
+ *
+ * The removal is done in place in a single pass, so no memory is allocated
+ * and no string is copied. */
+void
+svec_unique(struct svec *svec)
+{
+    assert(svec_is_sorted(svec));
+    if (svec->n > 1) {
+        size_t i, j;
+
+        /* names[0..j) holds the unique strings kept so far. */
+        for (i = j = 1; i < svec->n; i++) {
+            if (strcmp(svec->names[j - 1], svec->names[i])) {
+                svec->names[j++] = svec->names[i];
+            } else {
+                free(svec->names[i]);
+            }
+        }
+        svec->n = j;
+    }
+}
+
+/* Removes all null pointers from 'svec', keeping the remaining strings in
+ * their original order, and shortens 'svec' accordingly. */
+void
+svec_compact(struct svec *svec)
+{
+    size_t src;
+    size_t dst = 0;
+
+    for (src = 0; src < svec->n; src++) {
+        char *name = svec->names[src];
+
+        if (name == NULL) {
+            continue;
+        }
+        svec->names[dst] = name;
+        dst++;
+    }
+    svec->n = dst;
+}
+
+/* Appends a copy of 'name' to 'out', unless 'out' is null. */
+static void
+svec_diff_emit(struct svec *out, const char *name)
+{
+    if (out) {
+        svec_add(out, name);
+    }
+}
+
+/* Compares sorted vectors 'a' and 'b' by walking both in step.
+ *
+ * Each of 'a_only', 'both' and 'b_only' may be null.  Those that are not
+ * null are initialized and then receive copies of, respectively, the strings
+ * found only in 'a', the strings found in both, and the strings found only
+ * in 'b'. */
+void
+svec_diff(const struct svec *a, const struct svec *b,
+          struct svec *a_only, struct svec *both, struct svec *b_only)
+{
+    size_t ai = 0;
+    size_t bi = 0;
+
+    assert(svec_is_sorted(a));
+    assert(svec_is_sorted(b));
+    if (a_only) {
+        svec_init(a_only);
+    }
+    if (both) {
+        svec_init(both);
+    }
+    if (b_only) {
+        svec_init(b_only);
+    }
+
+    while (ai < a->n && bi < b->n) {
+        int cmp = strcmp(a->names[ai], b->names[bi]);
+
+        if (cmp < 0) {
+            svec_diff_emit(a_only, a->names[ai++]);
+        } else if (cmp > 0) {
+            svec_diff_emit(b_only, b->names[bi++]);
+        } else {
+            svec_diff_emit(both, a->names[ai]);
+            ai++;
+            bi++;
+        }
+    }
+
+    /* Whatever remains in either vector has no counterpart in the other. */
+    while (ai < a->n) {
+        svec_diff_emit(a_only, a->names[ai++]);
+    }
+    while (bi < b->n) {
+        svec_diff_emit(b_only, b->names[bi++]);
+    }
+}
+
+/* Returns true if svec_find() locates 'name' in 'svec', false otherwise.
+ * 'svec' must be sorted, as svec_find() requires. */
+bool
+svec_contains(const struct svec *svec, const char *name)
+{
+ return svec_find(svec, name) != SIZE_MAX;
+}
+
+/* Searches 'svec', which must be sorted, for a string equal to 'name' using
+ * binary search.  Returns the index of a matching string, or SIZE_MAX if
+ * there is none.
+ *
+ * An empty vector is handled without calling bsearch(), because its 'names'
+ * pointer may be null and bsearch() requires a valid pointer even when the
+ * element count is zero. */
+size_t
+svec_find(const struct svec *svec, const char *name)
+{
+    char **p;
+
+    assert(svec_is_sorted(svec));
+    if (!svec->n) {
+        return SIZE_MAX;
+    }
+    p = bsearch(&name, svec->names, svec->n, sizeof *svec->names,
+                compare_strings);
+    return p ? p - svec->names : SIZE_MAX;
+}
+
+/* Returns true if the strings in 'svec' are in non-descending strcmp()
+ * order, false otherwise.  Vectors with fewer than two strings are always
+ * sorted. */
+bool
+svec_is_sorted(const struct svec *svec)
+{
+    size_t i = 1;
+
+    while (i < svec->n) {
+        const char *prev = svec->names[i - 1];
+        const char *cur = svec->names[i];
+
+        if (strcmp(prev, cur) > 0) {
+            return false;
+        }
+        i++;
+    }
+    return true;
+}
+
+/* Returns true if svec_get_duplicate() finds no duplicate in 'svec', false
+ * otherwise.  'svec' must be sorted, as svec_get_duplicate() requires. */
+bool
+svec_is_unique(const struct svec *svec)
+{
+ return svec_get_duplicate(svec) == NULL;
+}
+
+/* Returns a string that appears more than once in 'svec', which must be
+ * sorted, or a null pointer if every string is distinct.  The returned
+ * pointer is the second string of the first adjacent equal pair and remains
+ * owned by 'svec'. */
+const char *
+svec_get_duplicate(const struct svec *svec)
+{
+    size_t i;
+
+    assert(svec_is_sorted(svec));
+    for (i = 1; i < svec->n; i++) {
+        if (strcmp(svec->names[i - 1], svec->names[i]) == 0) {
+            return svec->names[i];
+        }
+    }
+    return NULL;
+}
+
+/* Exchanges the contents of 'a' and 'b'. */
+void
+svec_swap(struct svec *a, struct svec *b)
+{
+    struct svec saved_b = *b;
+
+    *b = *a;
+    *a = saved_b;
+}
+
+/* Prints 'title' followed by a colon on one line of stdout, then each string
+ * in 'svec' on a line of its own, enclosed in double quotes. */
+void
+svec_print(const struct svec *svec, const char *title)
+{
+    size_t i = 0;
+
+    printf("%s:\n", title);
+    while (i < svec->n) {
+        printf("\"%s\"\n", svec->names[i]);
+        i++;
+    }
+}
+
+/* Breaks 'words' into words at white space, respecting shell-like quoting
+ * conventions, and appends the words to 'svec'.
+ *
+ * Single and double quotes group characters, including white space, into one
+ * word.  A backslash outside quotes or inside double quotes makes the next
+ * character literal; inside single quotes a backslash is an ordinary
+ * character.  While one kind of quote is open, the other kind of quote
+ * character is ordinary text, so "it's" yields the single word: it's
+ *
+ * A warning is logged if 'words' ends in a trailing backslash or inside a
+ * quoted string; the word collected so far is still added. */
+void
+svec_parse_words(struct svec *svec, const char *words)
+{
+    struct ds word = DS_EMPTY_INITIALIZER;
+    const char *p, *q;
+
+    for (p = words; *p != '\0'; p = q) {
+        int quote = 0;          /* Open quote character, or 0 if none. */
+
+        /* Skip white space between words. */
+        while (isspace((unsigned char) *p)) {
+            p++;
+        }
+        if (*p == '\0') {
+            break;
+        }
+
+        ds_clear(&word);
+        for (q = p; *q != '\0'; q++) {
+            if (*q == quote) {
+                /* Closing quote.  '*q' is never 0 here, so this cannot match
+                 * when no quote is open. */
+                quote = 0;
+            } else if (!quote && (*q == '\'' || *q == '"')) {
+                /* Opening quote.  Only recognized outside of quotes, so that
+                 * a ' inside "..." (or a " inside '...') stays literal
+                 * instead of replacing the open quote. */
+                quote = *q;
+            } else if (*q == '\\' && (!quote || quote == '"')) {
+                q++;
+                if (*q == '\0') {
+                    VLOG_WARN("%s: ends in trailing backslash", words);
+                    break;
+                }
+                ds_put_char(&word, *q);
+            } else if (isspace((unsigned char) *q) && !quote) {
+                /* Unquoted white space ends the word. */
+                q++;
+                break;
+            } else {
+                ds_put_char(&word, *q);
+            }
+        }
+        svec_add(svec, ds_cstr(&word));
+        if (quote) {
+            VLOG_WARN("%s: word ends inside quoted string", words);
+        }
+    }
+    ds_destroy(&word);
+}
+
+/* Returns true if 'a' and 'b' hold the same number of strings and the
+ * strings at each position compare equal, false otherwise. */
+bool
+svec_equal(const struct svec *a, const struct svec *b)
+{
+    size_t i = 0;
+
+    if (a->n != b->n) {
+        return false;
+    }
+    while (i < a->n) {
+        if (strcmp(a->names[i], b->names[i]) != 0) {
+            return false;
+        }
+        i++;
+    }
+    return true;
+}
+
+/* Returns a string made of the strings in 'svec' separated by 'delimiter'
+ * and followed by 'terminator'.  For an empty 'svec' the result is just
+ * 'terminator'.
+ *
+ * The string is the buffer of a dynamic string that is not destroyed here;
+ * presumably the caller is responsible for freeing it (NOTE(review): confirm
+ * against the callers). */
+char *
+svec_join(const struct svec *svec,
+          const char *delimiter, const char *terminator)
+{
+    struct ds joined;
+    size_t i;
+
+    ds_init(&joined);
+    if (svec->n > 0) {
+        ds_put_cstr(&joined, svec->names[0]);
+        for (i = 1; i < svec->n; i++) {
+            ds_put_cstr(&joined, delimiter);
+            ds_put_cstr(&joined, svec->names[i]);
+        }
+    }
+    ds_put_cstr(&joined, terminator);
+    return ds_cstr(&joined);
+}
+
+/* Returns the last string in 'svec', which must not be empty.  The string
+ * remains owned by 'svec'. */
+const char *
+svec_back(const struct svec *svec)
+{
+ assert(svec->n);
+ return svec->names[svec->n - 1];
+}
+
+/* Removes the last string from 'svec', which must not be empty, and frees
+ * it. */
+void
+svec_pop_back(struct svec *svec)
+{
+    assert(svec->n);
+    svec->n--;
+    free(svec->names[svec->n]);
+}
diff --git a/lib/svec.h b/lib/svec.h
new file mode 100644
index 000000000..4865d2f2a
--- /dev/null
+++ b/lib/svec.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef SVEC_H
+#define SVEC_H 1
+
+#include <stdbool.h>
+#include <stddef.h>
+
+/* A growable vector of strings.  The svec functions release the stored
+ * strings and the array with free(). */
+struct svec {
+ char **names; /* Array of strings; null until first grown. */
+ size_t n; /* Number of strings currently in 'names'. */
+ size_t allocated; /* Number of elements allocated in 'names'. */
+};
+
+#define SVEC_EMPTY_INITIALIZER { NULL, 0, 0 }
+
+void svec_init(struct svec *);
+void svec_clone(struct svec *, const struct svec *);
+void svec_destroy(struct svec *);
+void svec_clear(struct svec *);
+void svec_add(struct svec *, const char *);
+void svec_add_nocopy(struct svec *, char *);
+void svec_del(struct svec *, const char *);
+void svec_append(struct svec *, const struct svec *);
+void svec_terminate(struct svec *);
+void svec_sort(struct svec *);
+void svec_sort_unique(struct svec *);
+void svec_unique(struct svec *);
+void svec_compact(struct svec *);
+void svec_diff(const struct svec *a, const struct svec *b,
+ struct svec *a_only, struct svec *both, struct svec *b_only);
+bool svec_contains(const struct svec *, const char *);
+size_t svec_find(const struct svec *, const char *);
+bool svec_is_sorted(const struct svec *);
+bool svec_is_unique(const struct svec *);
+const char *svec_get_duplicate(const struct svec *);
+void svec_swap(struct svec *a, struct svec *b);
+void svec_print(const struct svec *svec, const char *title);
+void svec_parse_words(struct svec *svec, const char *words);
+bool svec_equal(const struct svec *, const struct svec *);
+char *svec_join(const struct svec *,
+ const char *delimiter, const char *terminator);
+const char *svec_back(const struct svec *);
+void svec_pop_back(struct svec *);
+
+#endif /* svec.h */
diff --git a/lib/tag.c b/lib/tag.c
new file mode 100644
index 000000000..8a4ee8925
--- /dev/null
+++ b/lib/tag.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "tag.h"
+#include <limits.h>
+#include "random.h"
+#include "type-props.h"
+#include "util.h"
+
+#define N_TAG_BITS (CHAR_BIT * sizeof(tag_type))
+BUILD_ASSERT_DECL(IS_POW2(N_TAG_BITS));
+
+#define LOG2_N_TAG_BITS (N_TAG_BITS == 32 ? 5 : N_TAG_BITS == 64 ? 6 : 0)
+BUILD_ASSERT_DECL(LOG2_N_TAG_BITS > 0);
+
+/* Returns a randomly selected tag.
+ *
+ * The two bit positions are taken from the low and the high
+ * LOG2_N_TAG_BITS bits of one 16-bit random number; the draw is repeated
+ * until the positions differ, so the result always has exactly two bits
+ * set.
+ *
+ * The bits are shifted as 'tag_type' rather than 'unsigned int', so that the
+ * function stays correct for the 64-bit tag_type that LOG2_N_TAG_BITS
+ * allows for. */
+tag_type
+tag_create_random(void)
+{
+    int x, y;
+    do {
+        uint16_t r = random_uint16();
+        x = r & (N_TAG_BITS - 1);
+        y = r >> (16 - LOG2_N_TAG_BITS);
+    } while (x == y);
+    return ((tag_type) 1 << x) | ((tag_type) 1 << y);
+}
+
+/* Returns a tag deterministically generated from 'seed'.
+ *
+ * 'seed' should have data in all of its bits; if it has data only in its
+ * low-order bits then the resulting tags will be poorly distributed.  Use a
+ * hash function such as hash_bytes() to generate 'seed' if necessary.
+ *
+ * The first bit position comes from the low LOG2_N_TAG_BITS bits of 'seed'.
+ * The second is chosen among the remaining N_TAG_BITS - 1 positions, so the
+ * two always differ.  Both the modulus and the shifted constant are derived
+ * from tag_type instead of being hard-coded for 32 bits. */
+tag_type
+tag_create_deterministic(uint32_t seed)
+{
+    int x = seed & (N_TAG_BITS - 1);
+    int y = (seed >> LOG2_N_TAG_BITS) % (N_TAG_BITS - 1);
+    y += y >= x;                /* Skip over 'x' so that y != x. */
+    return ((tag_type) 1 << x) | ((tag_type) 1 << y);
+}
+
+/* Initializes 'set' as an empty tag set by zeroing all of its members. */
+void
+tag_set_init(struct tag_set *set)
+{
+ memset(set, 0, sizeof *set);
+}
+
+/* Adds 'tag' to 'set'.
+ *
+ * Nothing is done if 'tag' is zero, or if 'tag' is a valid single tag that
+ * 'set' already intersects.  Otherwise 'tag' is ORed into the next slot,
+ * chosen round-robin, and into the set's running total. */
+void
+tag_set_add(struct tag_set *set, tag_type tag)
+{
+    tag_type *slot;
+
+    if (!tag) {
+        return;
+    }
+    if (tag_is_valid(tag) && tag_set_intersects(set, tag)) {
+        return;
+    }
+
+    /* XXX We could do better by finding the set member to which we would
+     * add the fewest number of 1-bits.  This would reduce the amount of
+     * ambiguity, since e.g. three 1-bits match 3 * 2 / 2 = 3 unique tags
+     * whereas four 1-bits match 4 * 3 / 2 = 6 unique tags. */
+    slot = &set->tags[set->n++ % TAG_SET_SIZE];
+    *slot |= tag;
+    if (*slot == TYPE_MAXIMUM(tag_type)) {
+        /* A saturated slot is also stored in the first slot. */
+        set->tags[0] = *slot;
+    }
+
+    set->total |= tag;
+}
+
diff --git a/lib/tag.h b/lib/tag.h
new file mode 100644
index 000000000..2002e5a39
--- /dev/null
+++ b/lib/tag.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef TAG_H
+#define TAG_H 1
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include "util.h"
+
+/*
+ * Tagging support.
+ *
+ * A 'tag' represents an arbitrary category. Currently, tags are used to
+ * represent categories of flows and in particular the dependencies for a flow
+ * switching decision. For example, if a flow's output port is based on
+ * knowledge that source MAC 00:02:e3:0f:80:a4 is on eth0, then a tag that
+ * represents that dependency is attached to that flow in the flowtracking hash
+ * table.
+ *
+ * As this example shows, the universe of possible categories is very large,
+ * and even the number of categories that are in use at a given time can be
+ * very large. This means that keeping track of category membership via
+ * conventional means (lists, bitmaps, etc.) is likely to be expensive.
+ *
+ * Tags are actually implemented via a "superimposed coding", as discussed in
+ * Knuth TAOCP v.3 section 6.5 "Retrieval on Secondary Keys". A tag is an
+ * unsigned integer in which exactly 2 bits are set to 1 and the rest set to 0.
+ * For 32-bit integers (as currently used) there are 32 * 31 / 2 = 496 unique
+ * tags; for 64-bit integers there are 64 * 63 / 2 = 2,016.
+ *
+ * Because there is a small finite number of unique tags, tags must collide
+ * after some number of them have been created. In practice we generally
+ * create tags by choosing bits randomly.
+ *
+ * The key property of tags is that we can combine them without increasing the
+ * amount of data required using bitwise-OR, since the result has the 1-bits
+ * from both tags set. The necessary tradeoff is that the result is even more
+ * ambiguous: if combining two tags yields a value with 4 bits set to 1, then
+ * the result value will test as having 4 * 3 / 2 = 6 unique tags, not just the
+ * two tags that we combined.
+ *
+ * The upshot is this: a value that is the bitwise-OR combination of a number
+ * of tags will always include the tags that were combined, but it may contain
+ * any number of additional tags as well. This is acceptable for flowtracking,
+ * since we want to be sure that we catch every flow that needs to be
+ * revalidated, but it is OK if we revalidate a few extra flows as well.
+ *
+ * If we combine too many tags, then the result will have every bit set, so
+ * that it will test as including every tag. Fortunately, this is not a big
+ * problem for us: although there are many flows overall, each individual flow
+ * belongs only to a small number of categories.
+ */
+
+/* Represents a tag, or the combination of 0 or more tags. */
+typedef uint32_t tag_type;
+
+tag_type tag_create_random(void);
+tag_type tag_create_deterministic(uint32_t seed);
+static inline bool tag_intersects(tag_type, tag_type);
+static inline bool tag_is_valid(tag_type);
+
+/* Returns true if 'a' and 'b' have at least one tag in common, false if
+ * their sets of tags are disjoint.  Since a tag has two bits set, this is the
+ * case exactly when 'a' and 'b' share at least two 1-bits. */
+static inline bool
+tag_intersects(tag_type a, tag_type b)
+{
+    tag_type common = a & b;
+    tag_type without_lowest = common & (common - 1);
+
+    return without_lowest != 0;
+}
+
+/* Returns true if 'tag' is a valid tag, that is, if exactly two bits are set
+ * to 1 and the rest to 0.  Otherwise, returns false. */
+static inline bool
+tag_is_valid(tag_type tag)
+{
+    /* Clear the lowest 1-bit twice: a valid tag still has a bit set after
+     * the first step and none after the second. */
+    tag_type minus_one_bit = tag & (tag - 1);
+    tag_type minus_two_bits = minus_one_bit & (minus_one_bit - 1);
+
+    return minus_one_bit != 0 && minus_two_bits == 0;
+}
+
+/*
+ * A tag set accumulates tags with reduced ambiguity compared to a single tag.
+ * The flow tracking uses tag sets to keep track of tags that need to be
+ * revalidated after a number of packets have been processed.
+ */
+#define TAG_SET_SIZE 4
+struct tag_set {
+ tag_type total; /* Bitwise-OR of every tag added. */
+ tag_type tags[TAG_SET_SIZE]; /* Slots that added tags are ORed into. */
+ unsigned int n; /* Number of tags added so far. */
+};
+
+void tag_set_init(struct tag_set *);
+void tag_set_add(struct tag_set *, tag_type);
+static inline bool tag_set_is_empty(const struct tag_set *);
+static inline bool tag_set_intersects(const struct tag_set *, tag_type);
+
+/* Returns true if 'set' will match no tags at all,
+ * false if it will match at least one tag.  The test is whether the set's
+ * count of added tags, 'n', is zero. */
+static inline bool
+tag_set_is_empty(const struct tag_set *set)
+{
+ return !set->n;
+}
+
+/* Returns true if any of the tags in 'tags' are also in 'set',
+ * false if the intersection is empty.
+ *
+ * The set's running total is checked first as a cheap filter; only if it
+ * intersects 'tags' are the individual slots examined, in order. */
+static inline bool
+tag_set_intersects(const struct tag_set *set, tag_type tags)
+{
+    BUILD_ASSERT_DECL(TAG_SET_SIZE == 4);
+    int i;
+
+    if (!tag_intersects(set->total, tags)) {
+        return false;
+    }
+    for (i = 0; i < TAG_SET_SIZE; i++) {
+        if (tag_intersects(set->tags[i], tags)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+#endif /* tag.h */
diff --git a/lib/timeval.c b/lib/timeval.c
new file mode 100644
index 000000000..b76993f9b
--- /dev/null
+++ b/lib/timeval.c
@@ -0,0 +1,305 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "timeval.h"
+#include <assert.h>
+#include <errno.h>
+#include <poll.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <unistd.h>
+#include "coverage.h"
+#include "fatal-signal.h"
+#include "util.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_timeval
+
+/* Initialized? */
+static bool inited;
+
+/* Has a timer tick occurred? */
+static volatile sig_atomic_t tick;
+
+/* The current time, as of the last refresh. */
+static struct timeval now;
+
+/* Time at which to die with SIGALRM (if not TIME_MIN). */
+static time_t deadline = TIME_MIN;
+
+static void sigalrm_handler(int);
+static void refresh_if_ticked(void);
+static time_t time_add(time_t, time_t);
+static void block_sigalrm(sigset_t *);
+static void unblock_sigalrm(const sigset_t *);
+static void log_poll_interval(long long int last_wakeup,
+ const struct rusage *last_rusage);
+static long long int timeval_to_msec(const struct timeval *);
+
+/* Initializes the timetracking module: records the current time, installs the
+ * SIGALRM handler, and starts a periodic real-time timer that fires every
+ * TIME_UPDATE_INTERVAL milliseconds.  Calling it again after the first call
+ * has no effect.  Exits through ovs_fatal() if the handler or the timer
+ * cannot be set up. */
+void
+time_init(void)
+{
+    struct sigaction sa;
+    struct itimerval itimer;
+
+    if (inited) {
+        return;
+    }
+
+    inited = true;
+    gettimeofday(&now, NULL);
+    tick = false;
+
+    /* Set up signal handler. */
+    memset(&sa, 0, sizeof sa);
+    sa.sa_handler = sigalrm_handler;
+    sigemptyset(&sa.sa_mask);
+    sa.sa_flags = SA_RESTART;
+    if (sigaction(SIGALRM, &sa, NULL)) {
+        ovs_fatal(errno, "sigaction(SIGALRM) failed");
+    }
+
+    /* Set up periodic timer.  The interval is split into whole seconds and
+     * microseconds so that tv_usec stays below one million even if
+     * TIME_UPDATE_INTERVAL is configured as one second or more. */
+    itimer.it_interval.tv_sec = TIME_UPDATE_INTERVAL / 1000;
+    itimer.it_interval.tv_usec = TIME_UPDATE_INTERVAL % 1000 * 1000;
+    itimer.it_value = itimer.it_interval;
+    if (setitimer(ITIMER_REAL, &itimer, NULL)) {
+        ovs_fatal(errno, "setitimer failed");
+    }
+}
+
+/* Forces a refresh of the current time from the kernel.  It is not usually
+ * necessary to call this function, since the time will be refreshed
+ * automatically at least every TIME_UPDATE_INTERVAL milliseconds. */
+void
+time_refresh(void)
+{
+    /* Clear the flag before reading the clock.  If SIGALRM arrives while the
+     * clock is being read, 'tick' is set again and the next reader simply
+     * refreshes once more; clearing it afterward would discard that tick and
+     * leave 'now' stale for an extra interval. */
+    tick = false;
+    gettimeofday(&now, NULL);
+}
+
+/* Returns the cached current time in whole seconds, refreshing the cache
+ * first if a timer tick has occurred since the last refresh. */
+time_t
+time_now(void)
+{
+    time_t seconds;
+
+    refresh_if_ticked();
+    seconds = now.tv_sec;
+    return seconds;
+}
+
+/* Returns the cached current time in milliseconds.  The value is accurate to
+ * within TIME_UPDATE_INTERVAL ms. */
+long long int
+time_msec(void)
+{
+    refresh_if_ticked();
+    return (long long int) now.tv_sec * 1000 + now.tv_usec / 1000;
+}
+
+/* Copies the cached current time, accurate to within TIME_UPDATE_INTERVAL
+ * ms, into '*tv'. */
+void
+time_timeval(struct timeval *tv)
+{
+    refresh_if_ticked();
+    memcpy(tv, &now, sizeof *tv);
+}
+
+/* Arranges for the program to die with SIGALRM 'secs' seconds from now if
+ * 'secs' is nonzero; a 'secs' of zero cancels any pending deadline.  SIGALRM
+ * is blocked while 'deadline' is updated because the signal handler reads
+ * it. */
+void
+time_alarm(unsigned int secs)
+{
+    sigset_t saved_mask;
+
+    time_init();
+    block_sigalrm(&saved_mask);
+    if (secs) {
+        deadline = time_add(time_now(), secs);
+    } else {
+        deadline = TIME_MIN;
+    }
+    unblock_sigalrm(&saved_mask);
+}
+
+/* Like poll(), except:
+ *
+ * - On error, returns a negative error code (instead of setting errno).
+ *
+ * - If interrupted by a signal, retries automatically until the original
+ * 'timeout' expires. (Because of this property, this function will
+ * never return -EINTR.)
+ *
+ * - As a side effect, refreshes the current time (like time_refresh()).
+ *
+ * A zero or negative 'timeout' is passed to poll() unchanged on every
+ * attempt; only a positive 'timeout' is reduced by the time already spent.
+ */
+int
+time_poll(struct pollfd *pollfds, int n_pollfds, int timeout)
+{
+ /* Wakeup time and resource usage saved at the end of the previous call;
+ * log_poll_interval() compares them against the current values. */
+ static long long int last_wakeup;
+ static struct rusage last_rusage;
+ long long int start;
+ sigset_t oldsigs;
+ bool blocked;
+ int retval;
+
+ time_refresh();
+ log_poll_interval(last_wakeup, &last_rusage);
+ coverage_clear();
+ start = time_msec();
+ blocked = false;
+ for (;;) {
+ int time_left;
+ if (timeout > 0) {
+ /* Deduct the time consumed by earlier, interrupted attempts. */
+ long long int elapsed = time_msec() - start;
+ time_left = timeout >= elapsed ? timeout - elapsed : 0;
+ } else {
+ time_left = timeout;
+ }
+
+ retval = poll(pollfds, n_pollfds, time_left);
+ if (retval < 0) {
+ retval = -errno;
+ }
+ time_refresh();
+ if (retval != -EINTR) {
+ break;
+ }
+
+ /* poll() was interrupted. When no time_alarm() deadline is pending,
+ * block SIGALRM for the remaining attempts, presumably so that the
+ * periodic timer does not keep interrupting the retries. With a
+ * deadline pending the signal stays unblocked so the handler can still
+ * enforce it. */
+ if (!blocked && deadline == TIME_MIN) {
+ block_sigalrm(&oldsigs);
+ blocked = true;
+ }
+ }
+ if (blocked) {
+ unblock_sigalrm(&oldsigs);
+ }
+ last_wakeup = time_msec();
+ getrusage(RUSAGE_SELF, &last_rusage);
+ return retval;
+}
+
+/* Adds 'a' and 'b' with saturation: the result is clamped to TIME_MAX if the
+ * sum would overflow and to TIME_MIN if it would underflow. */
+static time_t
+time_add(time_t a, time_t b)
+{
+    if (a >= 0) {
+        /* Only overflow is possible when 'a' is nonnegative. */
+        if (b > TIME_MAX - a) {
+            return TIME_MAX;
+        }
+    } else {
+        /* Only underflow is possible when 'a' is negative. */
+        if (b < TIME_MIN - a) {
+            return TIME_MIN;
+        }
+    }
+    return a + b;
+}
+
+/* SIGALRM handler.  Records that a timer tick occurred and, when a
+ * time_alarm() deadline is set and has passed, hands the signal to
+ * fatal_signal_handler(). */
+static void
+sigalrm_handler(int sig_nr)
+{
+    tick = true;
+
+    if (deadline == TIME_MIN) {
+        return;                 /* No deadline configured. */
+    }
+    if (time(0) > deadline) {
+        fatal_signal_handler(sig_nr);
+    }
+}
+
+/* Refreshes the cached time if SIGALRM has fired since the last refresh.
+ * The module must already have been initialized with time_init(). */
+static void
+refresh_if_ticked(void)
+{
+    assert(inited);
+    if (!tick) {
+        return;
+    }
+    time_refresh();
+}
+
+/* Blocks SIGALRM for the calling process and stores the previous signal mask
+ * in '*oldsigs' so that unblock_sigalrm() can restore it.  Exits through
+ * ovs_fatal() if the mask cannot be changed. */
+static void
+block_sigalrm(sigset_t *oldsigs)
+{
+    sigset_t only_sigalrm;
+    int failed;
+
+    sigemptyset(&only_sigalrm);
+    sigaddset(&only_sigalrm, SIGALRM);
+
+    failed = sigprocmask(SIG_BLOCK, &only_sigalrm, oldsigs);
+    if (failed) {
+        ovs_fatal(errno, "sigprocmask");
+    }
+}
+
+/* Restores the signal mask saved in '*oldsigs' by block_sigalrm().  Exits
+ * through ovs_fatal() if the mask cannot be restored. */
+static void
+unblock_sigalrm(const sigset_t *oldsigs)
+{
+    int failed = sigprocmask(SIG_SETMASK, oldsigs, NULL);
+
+    if (failed) {
+        ovs_fatal(errno, "sigprocmask");
+    }
+}
+
+/* Converts '*tv' to milliseconds, discarding any fraction of a
+ * millisecond. */
+static long long int
+timeval_to_msec(const struct timeval *tv)
+{
+    const long long int whole_seconds_ms = (long long int) tv->tv_sec * 1000;
+
+    return whole_seconds_ms + tv->tv_usec / 1000;
+}
+
+/* Returns '*a' minus '*b' in milliseconds.  Each operand is truncated to
+ * whole milliseconds before the subtraction. */
+static long long int
+timeval_diff_msec(const struct timeval *a, const struct timeval *b)
+{
+    const long long int a_ms = (long long int) a->tv_sec * 1000
+                               + a->tv_usec / 1000;
+    const long long int b_ms = (long long int) b->tv_sec * 1000
+                               + b->tv_usec / 1000;
+
+    return a_ms - b_ms;
+}
+
+/* Tracks the time between the end of one time_poll() call and the start of
+ * the next, and logs a warning with resource-usage deltas when an interval is
+ * more than 8 times the running mean.
+ *
+ * 'last_wakeup' is the time_msec() value saved when the previous poll
+ * returned; 'last_rusage' is the getrusage() snapshot taken at that time. */
+static void
+log_poll_interval(long long int last_wakeup, const struct rusage *last_rusage)
+{
+    static unsigned int mean_interval; /* In 16ths of a millisecond. */
+    static unsigned int n_samples;
+
+    long long int elapsed;      /* In milliseconds. */
+    unsigned int interval;      /* In 16ths of a millisecond. */
+
+    /* Compute interval from last wakeup to now in 16ths of a millisecond,
+     * capped at 10 seconds (160000 in this unit).  A negative difference,
+     * which happens if the wall clock is stepped backward, is treated as
+     * zero instead of being shifted and converted to a huge unsigned
+     * value. */
+    elapsed = time_msec() - last_wakeup;
+    if (elapsed < 0) {
+        elapsed = 0;
+    } else if (elapsed > 10000) {
+        elapsed = 10000;
+    }
+    interval = (unsigned int) elapsed << 4;
+
+    /* Warn if we took too much time between polls.  The check on
+     * 'mean_interval' protects the division below. */
+    if (n_samples > 10 && mean_interval > 0
+        && interval > mean_interval * 8) {
+        struct rusage rusage;
+
+        getrusage(RUSAGE_SELF, &rusage);
+        VLOG_WARN("%u ms poll interval (%lld ms user, %lld ms system) "
+                  "is over %u times the weighted mean interval %u ms "
+                  "(%u samples)",
+                  (interval + 8) / 16,
+                  timeval_diff_msec(&rusage.ru_utime, &last_rusage->ru_utime),
+                  timeval_diff_msec(&rusage.ru_stime, &last_rusage->ru_stime),
+                  interval / mean_interval,
+                  (mean_interval + 8) / 16, n_samples);
+        if (rusage.ru_minflt > last_rusage->ru_minflt
+            || rusage.ru_majflt > last_rusage->ru_majflt) {
+            VLOG_WARN("faults: %ld minor, %ld major",
+                      rusage.ru_minflt - last_rusage->ru_minflt,
+                      rusage.ru_majflt - last_rusage->ru_majflt);
+        }
+        if (rusage.ru_inblock > last_rusage->ru_inblock
+            || rusage.ru_oublock > last_rusage->ru_oublock) {
+            VLOG_WARN("disk: %ld reads, %ld writes",
+                      rusage.ru_inblock - last_rusage->ru_inblock,
+                      rusage.ru_oublock - last_rusage->ru_oublock);
+        }
+        if (rusage.ru_nvcsw > last_rusage->ru_nvcsw
+            || rusage.ru_nivcsw > last_rusage->ru_nivcsw) {
+            VLOG_WARN("context switches: %ld voluntary, %ld involuntary",
+                      rusage.ru_nvcsw - last_rusage->ru_nvcsw,
+                      rusage.ru_nivcsw - last_rusage->ru_nivcsw);
+        }
+        coverage_log(VLL_WARN);
+    }
+
+    /* Update exponentially weighted moving average.  With these parameters, a
+     * given value decays to 1% of its value in about 100 time steps. */
+    if (n_samples++) {
+        mean_interval = (mean_interval * 122 + interval * 6 + 64) / 128;
+    } else {
+        mean_interval = interval;
+    }
+}
diff --git a/lib/timeval.h b/lib/timeval.h
new file mode 100644
index 000000000..8e10ad360
--- /dev/null
+++ b/lib/timeval.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef TIMEVAL_H
+#define TIMEVAL_H 1
+
+#include <time.h>
+#include "type-props.h"
+#include "util.h"
+
+struct pollfd;
+struct timeval;
+
+/* POSIX allows floating-point time_t, but we don't support it. */
+BUILD_ASSERT_DECL(TYPE_IS_INTEGER(time_t));
+
+/* We do try to cater to unsigned time_t, but I want to know about it if we
+ * ever encounter such a platform. */
+BUILD_ASSERT_DECL(TYPE_IS_SIGNED(time_t));
+
+#define TIME_MAX TYPE_MAXIMUM(time_t)
+#define TIME_MIN TYPE_MINIMUM(time_t)
+
+/* Interval between updates to the time reported by time_now(), time_msec(),
+ * and time_timeval(), in ms. This should not be adjusted much below 10 ms or
+ * so with the current implementation, or too much time will be wasted in
+ * signal handlers and calls to time(0). */
+#define TIME_UPDATE_INTERVAL 100
+
+void time_init(void);
+void time_refresh(void);
+time_t time_now(void);
+long long int time_msec(void);
+void time_timeval(struct timeval *);
+void time_alarm(unsigned int secs);
+int time_poll(struct pollfd *, int n_pollfds, int timeout);
+
+#endif /* timeval.h */
diff --git a/lib/type-props.h b/lib/type-props.h
new file mode 100644
index 000000000..c718cf12f
--- /dev/null
+++ b/lib/type-props.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef TYPE_PROPS_H
+#define TYPE_PROPS_H 1
+
+#include <limits.h>
+
+/* True if TYPE is an integer type: converting 1.5 to an integer type
+ * truncates it to 1. */
+#define TYPE_IS_INTEGER(TYPE) ((TYPE) 1.5 == (TYPE) 1)
+
+/* True if TYPE is signed, that is, if -1 converted to TYPE is below zero. */
+#define TYPE_IS_SIGNED(TYPE) ((TYPE) 0 > (TYPE) -1)
+
+/* Number of value (non-sign) bits in integer type TYPE. */
+#define TYPE_VALUE_BITS(TYPE) (sizeof(TYPE) * CHAR_BIT - TYPE_IS_SIGNED(TYPE))
+
+/* Largest value of integer type TYPE.  Built as (2**(bits-1) - 1) * 2 + 1 so
+ * that no intermediate result overflows and no negative value is shifted. */
+#define TYPE_MAXIMUM(TYPE) \
+    ((((TYPE)1 << (TYPE_VALUE_BITS(TYPE) - 1)) - 1) * 2 + 1)
+
+/* Smallest value of integer type TYPE.  Derived by complementing the maximum
+ * rather than by left-shifting ~0, because shifting a negative value is
+ * undefined behavior in C. */
+#define TYPE_MINIMUM(TYPE) (TYPE_IS_SIGNED(TYPE) ? ~TYPE_MAXIMUM(TYPE) : 0)
+
+#endif /* type-props.h */
diff --git a/lib/unixctl.c b/lib/unixctl.c
new file mode 100644
index 000000000..42b6eeff1
--- /dev/null
+++ b/lib/unixctl.c
@@ -0,0 +1,592 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "unixctl.h"
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "coverage.h"
+#include "dirs.h"
+#include "dynamic-string.h"
+#include "fatal-signal.h"
+#include "list.h"
+#include "ofpbuf.h"
+#include "poll-loop.h"
+#include "shash.h"
+#include "socket-util.h"
+#include "util.h"
+
+#ifndef SCM_CREDENTIALS
+#include <time.h>
+#endif
+
+#define THIS_MODULE VLM_unixctl
+#include "vlog.h"
+
+/* A registered command: the callback that unixctl_command_register() stores
+ * under the command's name. */
+struct unixctl_command {
+ void (*cb)(struct unixctl_conn *, const char *args);
+};
+
+/* One accepted client connection on a unixctl_server. */
+struct unixctl_conn {
+ struct list node; /* Element of the owning server's 'conns' list. */
+ int fd; /* Connected socket, made non-blocking by new_connection(). */
+
+ /* S_RECV: reading a command line. S_PROCESS: a command has been handed
+ * to its callback and no reply has been queued yet. S_SEND: writing the
+ * reply held in 'out'. */
+ enum { S_RECV, S_PROCESS, S_SEND } state;
+ struct ofpbuf in; /* Bytes received but not yet consumed. */
+ struct ds out; /* Reply being sent. */
+ size_t out_pos; /* Number of bytes of 'out' already written. */
+};
+
+/* Server for control connection. */
+struct unixctl_server {
+ char *path; /* Socket file name; unlinked by unixctl_server_destroy(). */
+ int fd; /* Listening socket. */
+ struct list conns; /* List of struct unixctl_conn. */
+};
+
+/* Client for control connection. */
+struct unixctl_client {
+ char *connect_path; /* Path of the server socket connected to. */
+ char *bind_path; /* Path of the client's own socket; unlinked by
+ * unixctl_client_destroy(). */
+ FILE *stream; /* stdio stream opened on the connected socket. */
+};
+
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+
+static struct shash commands = SHASH_INITIALIZER(&commands);
+
+/* Callback for the built-in "help" command: replies with code 214 and a list
+ * of every registered command name, one per line.  'args' is ignored. */
+static void
+unixctl_help(struct unixctl_conn *conn, const char *args UNUSED)
+{
+    struct ds listing = DS_EMPTY_INITIALIZER;
+    struct shash_node *cmd;
+
+    ds_put_cstr(&listing, "The available commands are:\n");
+    HMAP_FOR_EACH (cmd, struct shash_node, node, &commands.map) {
+        ds_put_format(&listing, "\t%s\n", cmd->name);
+    }
+
+    unixctl_command_reply(conn, 214, ds_cstr(&listing));
+    ds_destroy(&listing);
+}
+
+/* Registers 'cb' as the handler for the command called 'name'.
+ *
+ * Registering the same 'name' again with the same 'cb' is allowed and has no
+ * further effect; registering it with a different 'cb' is a programming
+ * error caught by an assertion. */
+void
+unixctl_command_register(const char *name,
+                         void (*cb)(struct unixctl_conn *, const char *args))
+{
+    struct unixctl_command *existing = shash_find_data(&commands, name);
+    struct unixctl_command *command;
+
+    /* The table stores struct unixctl_command pointers, so the callback has
+     * to be compared through the stored record, not against the record
+     * pointer itself. */
+    assert(!existing || existing->cb == cb);
+    if (existing) {
+        /* Already registered: do not add a duplicate entry. */
+        return;
+    }
+
+    command = xmalloc(sizeof *command);
+    command->cb = cb;
+    shash_add(&commands, name, command);
+}
+
+/* Returns the fixed text that accompanies reply 'code' on the status line of
+ * a reply, or "Unknown" for a code that has no entry. */
+static const char *
+translate_reply_code(int code)
+{
+    static const struct {
+        int code;
+        const char *text;
+    } replies[] = {
+        { 200, "OK" },
+        { 201, "Created" },
+        { 202, "Accepted" },
+        { 204, "No Content" },
+        { 211, "System Status" },
+        { 214, "Help" },
+        { 400, "Bad Request" },
+        { 401, "Unauthorized" },
+        { 403, "Forbidden" },
+        { 404, "Not Found" },
+        { 500, "Internal Server Error" },
+        { 501, "Invalid Argument" },
+        { 503, "Service Unavailable" },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof replies / sizeof replies[0]; i++) {
+        if (replies[i].code == code) {
+            return replies[i].text;
+        }
+    }
+    return "Unknown";
+}
+
+/* Queues the reply to the command currently being processed on 'conn' and
+ * moves the connection from S_PROCESS to S_SEND.
+ *
+ * The reply consists of a status line with 'code' and its standard text,
+ * then each line of 'body' (which may be NULL for no body), then a line
+ * containing only ".".  A body line that begins with '.' gets an extra
+ * leading '.' so that it cannot be mistaken for the terminator. */
+void
+unixctl_command_reply(struct unixctl_conn *conn,
+                      int code, const char *body)
+{
+    struct ds *reply = &conn->out;
+    const char *line;
+
+    COVERAGE_INC(unixctl_replied);
+    assert(conn->state == S_PROCESS);
+    conn->state = S_SEND;
+    conn->out_pos = 0;
+
+    ds_clear(reply);
+    ds_put_format(reply, "%03d %s\n", code, translate_reply_code(code));
+
+    line = body ? body : "";
+    while (*line != '\0') {
+        const size_t len = strcspn(line, "\n");
+
+        if (line[0] == '.') {
+            ds_put_char(reply, '.');
+        }
+        ds_put_buffer(reply, line, len);
+        ds_put_char(reply, '\n');
+
+        /* Advance past the line and its new-line, if it has one. */
+        line += len;
+        if (*line == '\n') {
+            line++;
+        }
+    }
+    ds_put_cstr(reply, ".\n");
+}
+
+/* Creates a unixctl server that listens on a Unix domain socket chosen by
+ * 'path':
+ *
+ *      - NULL: <rundir>/<program>.<pid>.ctl is used.
+ *
+ *      - A name that does not start with '/': the socket is placed in
+ *        <rundir> under that name.
+ *
+ *      - An absolute path (starting with '/'): used exactly as given.
+ *
+ * A program that (optionally) daemonizes itself should call this function
+ * *after* daemonization, so that the default socket name contains the pid of
+ * the daemon rather than the pid of the process that exited.  (Otherwise,
+ * "ovs-appctl --target <program>.pid" will fail.)
+ *
+ * Also registers the built-in "help" command.
+ *
+ * Returns 0 if successful, otherwise a positive errno value.  On success
+ * '*serverp' is set to the new unixctl_server, otherwise to NULL. */
+int
+unixctl_server_create(const char *path, struct unixctl_server **serverp)
+{
+    struct unixctl_server *server;
+    int error;
+
+    unixctl_command_register("help", unixctl_help);
+
+    server = xmalloc(sizeof *server);
+    list_init(&server->conns);
+
+    if (!path) {
+        server->path = xasprintf("%s/%s.%ld.ctl", ovs_rundir,
+                                 program_name, (long int) getpid());
+    } else if (path[0] != '/') {
+        server->path = xasprintf("%s/%s", ovs_rundir, path);
+    } else {
+        server->path = xstrdup(path);
+    }
+
+    server->fd = make_unix_socket(SOCK_STREAM, true, false, server->path,
+                                  NULL);
+    if (server->fd < 0) {
+        error = -server->fd;
+        fprintf(stderr, "Could not initialize control socket %s (%s)\n",
+                server->path, strerror(error));
+    } else if (chmod(server->path, S_IRUSR | S_IWUSR) < 0) {
+        error = errno;
+        fprintf(stderr, "Failed to chmod control socket %s (%s)\n",
+                server->path, strerror(error));
+    } else if (listen(server->fd, 10) < 0) {
+        error = errno;
+        fprintf(stderr, "Failed to listen on control socket %s (%s)\n",
+                server->path, strerror(error));
+    } else {
+        *serverp = server;
+        return 0;
+    }
+
+    /* Failure: release whatever was acquired before the failing step. */
+    if (server->fd >= 0) {
+        close(server->fd);
+    }
+    free(server->path);
+    free(server);
+    *serverp = NULL;
+    return error;
+}
+
+/* Starts tracking the freshly accepted socket 'fd' as a connection of
+ * 'server'.  The socket is made non-blocking and the connection begins in
+ * S_RECV state with empty input and output buffers. */
+static void
+new_connection(struct unixctl_server *server, int fd)
+{
+    struct unixctl_conn *conn;
+
+    set_nonblocking(fd);
+
+    conn = xmalloc(sizeof *conn);
+    conn->fd = fd;
+    conn->state = S_RECV;
+    conn->out_pos = 0;
+    ofpbuf_init(&conn->in, 128);
+    ds_init(&conn->out);
+
+    list_push_back(&server->conns, &conn->node);
+}
+
+/* Writes as much of the pending reply on 'conn' as the socket accepts.
+ *
+ * Returns 0 once the whole reply has been written, after switching the
+ * connection back to S_RECV.  Otherwise returns the error reported by
+ * write_fully(); 'out_pos' then records how far the write got. */
+static int
+run_connection_output(struct unixctl_conn *conn)
+{
+    for (;;) {
+        size_t n_written;
+        int error;
+
+        if (conn->out_pos >= conn->out.length) {
+            break;
+        }
+
+        error = write_fully(conn->fd, conn->out.string + conn->out_pos,
+                            conn->out.length - conn->out_pos, &n_written);
+        conn->out_pos += n_written;
+        if (error) {
+            return error;
+        }
+    }
+
+    conn->state = S_RECV;
+    return 0;
+}
+
+/* Executes the command line 's' received on 'conn'.
+ *
+ * 's' is split in place at the first space: the text before it is the command
+ * name, and the text after any run of spaces is passed to the callback as its
+ * arguments.  An unregistered name is answered with a 400 reply. */
+static void
+process_command(struct unixctl_conn *conn, char *s)
+{
+    struct unixctl_command *command;
+    char *name_end;
+    char *args;
+
+    COVERAGE_INC(unixctl_received);
+    conn->state = S_PROCESS;
+
+    /* Locate the arguments before terminating the name in place. */
+    name_end = s + strcspn(s, " ");
+    args = name_end + strspn(name_end, " ");
+    *name_end = '\0';
+
+    command = shash_find_data(&commands, s);
+    if (!command) {
+        char *msg = xasprintf("\"%s\" is not a valid command", s);
+        unixctl_command_reply(conn, 400, msg);
+        free(msg);
+        return;
+    }
+    command->cb(conn, args);
+}
+
+/* Reads from 'conn' until one complete command line is available, then
+ * executes it with process_command().
+ *
+ * Returns 0 after processing a command, EAGAIN if no data was available,
+ * EPROTO if the buffered input exceeds 65536 bytes, or another error
+ * (including EOF) reported by read_fully(). */
+static int
+run_connection_input(struct unixctl_conn *conn)
+{
+    for (;;) {
+        size_t bytes_read;
+        char *newline;
+        int error;
+
+        newline = memchr(conn->in.data, '\n', conn->in.size);
+        if (newline) {
+            char *command = conn->in.data;
+            size_t n = newline - command + 1;
+
+            /* Strip a carriage return before the new-line, but only when
+             * the line is not empty: for an empty line 'newline' equals
+             * 'command' and newline[-1] would be outside the buffer. */
+            if (newline > command && newline[-1] == '\r') {
+                newline--;
+            }
+            *newline = '\0';
+
+            process_command(conn, command);
+
+            ofpbuf_pull(&conn->in, n);
+            if (!conn->in.size) {
+                ofpbuf_clear(&conn->in);
+            }
+            return 0;
+        }
+
+        ofpbuf_prealloc_tailroom(&conn->in, 128);
+        error = read_fully(conn->fd, ofpbuf_tail(&conn->in),
+                           ofpbuf_tailroom(&conn->in), &bytes_read);
+        conn->in.size += bytes_read;
+        if (conn->in.size > 65536) {
+            VLOG_WARN_RL(&rl, "excess command length, killing connection");
+            return EPROTO;
+        }
+        if (error) {
+            if (error == EAGAIN || error == EWOULDBLOCK) {
+                /* Nothing more to read right now.  If some bytes did
+                 * arrive, loop once more to look for a complete line. */
+                if (!bytes_read) {
+                    return EAGAIN;
+                }
+            } else {
+                /* A clean EOF between commands is not worth a warning. */
+                if (error != EOF || conn->in.size != 0) {
+                    VLOG_WARN_RL(&rl, "read failed: %s",
+                                 (error == EOF
+                                  ? "connection dropped mid-command"
+                                  : strerror(error)));
+                }
+                return error;
+            }
+        }
+    }
+}
+
+/* Drives the state machine of 'conn' until a step leaves the state unchanged
+ * or fails.  Returns 0 when the state has settled, otherwise the error
+ * returned by the failing step. */
+static int
+run_connection(struct unixctl_conn *conn)
+{
+    for (;;) {
+        const int state_before = conn->state;
+        int error;
+
+        switch (state_before) {
+        case S_RECV:
+            error = run_connection_input(conn);
+            break;
+
+        case S_PROCESS:
+            /* Waiting for the command callback to queue a reply. */
+            error = 0;
+            break;
+
+        case S_SEND:
+            error = run_connection_output(conn);
+            break;
+
+        default:
+            NOT_REACHED();
+        }
+
+        if (error) {
+            return error;
+        }
+        if (conn->state == state_before) {
+            return 0;
+        }
+    }
+}
+
+/* Tears down 'conn': removes it from its server's connection list, closes
+ * its socket, and frees its buffers and the connection itself. */
+static void
+kill_connection(struct unixctl_conn *conn)
+{
+    list_remove(&conn->node);
+
+    close(conn->fd);
+    ds_destroy(&conn->out);
+    ofpbuf_uninit(&conn->in);
+
+    free(conn);
+}
+
+/* Performs one round of work for 'server': accepts up to 10 pending
+ * connections, then runs every connection, dropping any that fail with an
+ * error other than EAGAIN. */
+void
+unixctl_server_run(struct unixctl_server *server)
+{
+    struct unixctl_conn *conn, *next;
+    int n_accepted = 0;
+
+    while (n_accepted < 10) {
+        int fd = accept(server->fd, NULL, NULL);
+
+        if (fd < 0) {
+            if (errno != EAGAIN && errno != EWOULDBLOCK) {
+                VLOG_WARN_RL(&rl, "accept failed: %s", strerror(errno));
+            }
+            break;
+        }
+        new_connection(server, fd);
+        n_accepted++;
+    }
+
+    LIST_FOR_EACH_SAFE (conn, next,
+                        struct unixctl_conn, node, &server->conns) {
+        int error = run_connection(conn);
+
+        if (error && error != EAGAIN) {
+            kill_connection(conn);
+        }
+    }
+}
+
+/* Registers with the poll loop the events that 'server' is waiting for: a
+ * new connection on the listening socket, input on connections in S_RECV
+ * state, and writability on connections in S_SEND state. */
+void
+unixctl_server_wait(struct unixctl_server *server)
+{
+    struct unixctl_conn *conn;
+
+    poll_fd_wait(server->fd, POLLIN);
+    LIST_FOR_EACH (conn, struct unixctl_conn, node, &server->conns) {
+        switch (conn->state) {
+        case S_RECV:
+            poll_fd_wait(conn->fd, POLLIN);
+            break;
+
+        case S_SEND:
+            poll_fd_wait(conn->fd, POLLOUT);
+            break;
+
+        default:
+            /* S_PROCESS: nothing to wait for on the socket. */
+            break;
+        }
+    }
+}
+
+/* Destroys 'server': closes all of its connections and its listening socket,
+ * removes the socket file, and frees the server.  Does nothing if 'server'
+ * is NULL. */
+void
+unixctl_server_destroy(struct unixctl_server *server)
+{
+    struct unixctl_conn *conn, *next;
+
+    if (!server) {
+        return;
+    }
+
+    LIST_FOR_EACH_SAFE (conn, next,
+                        struct unixctl_conn, node, &server->conns) {
+        kill_connection(conn);
+    }
+
+    close(server->fd);
+    unlink(server->path);
+    fatal_signal_remove_file_to_unlink(server->path);
+    free(server->path);
+    free(server);
+}
+
+/* Connects to a unixctl server socket.  'path' should be the name of the
+ * server's socket.  If it does not start with '/', it will be prefixed with
+ * ovs_rundir (e.g. /var/run).
+ *
+ * The client binds its own socket to /tmp/vlog.<pid>.<counter>; that file is
+ * removed by unixctl_client_destroy().
+ *
+ * Returns 0 if successful, otherwise a positive errno value.  If successful,
+ * sets '*clientp' to the new unixctl_client, otherwise to NULL. */
+int
+unixctl_client_create(const char *path, struct unixctl_client **clientp)
+{
+    static int counter;
+    struct unixctl_client *client;
+    int error;
+    int fd = -1;
+
+    /* Determine location. */
+    client = xmalloc(sizeof *client);
+    if (path[0] == '/') {
+        client->connect_path = xstrdup(path);
+    } else {
+        client->connect_path = xasprintf("%s/%s", ovs_rundir, path);
+    }
+    client->bind_path = xasprintf("/tmp/vlog.%ld.%d",
+                                  (long int) getpid(), counter++);
+
+    /* Open socket. */
+    fd = make_unix_socket(SOCK_STREAM, false, false,
+                          client->bind_path, client->connect_path);
+    if (fd < 0) {
+        error = -fd;
+        goto error;
+    }
+
+    /* Bind socket to stream. */
+    client->stream = fdopen(fd, "r+");
+    if (!client->stream) {
+        error = errno;
+        VLOG_WARN("%s: fdopen failed (%s)",
+                  client->connect_path, strerror(error));
+        goto error;
+    }
+    *clientp = client;
+    return 0;
+
+error:
+    if (fd >= 0) {
+        /* The socket was created and bound, so its file exists: remove it
+         * the same way unixctl_client_destroy() does instead of leaving it
+         * behind in /tmp. */
+        close(fd);
+        unlink(client->bind_path);
+        fatal_signal_remove_file_to_unlink(client->bind_path);
+    }
+    free(client->connect_path);
+    free(client->bind_path);
+    free(client);
+    *clientp = NULL;
+    return error;
+}
+
+/* Destroys 'client': removes the file of its bound socket, closes its stream,
+ * and frees it.  Does nothing if 'client' is NULL. */
+void
+unixctl_client_destroy(struct unixctl_client *client)
+{
+    if (!client) {
+        return;
+    }
+
+    unlink(client->bind_path);
+    fatal_signal_remove_file_to_unlink(client->bind_path);
+    free(client->bind_path);
+    free(client->connect_path);
+    fclose(client->stream);
+    free(client);
+}
+
+/* Sends 'request' to the server socket and waits for a reply.
+ *
+ * Returns 0 if successful, otherwise a positive errno value.  If successful,
+ * sets '*reply_code' to the 3-digit code from the reply's status line and
+ * '*reply_body' to the reply body, which the caller must free.  On failure
+ * sets '*reply_code' to 0 and '*reply_body' to NULL. */
+int
+unixctl_client_transact(struct unixctl_client *client,
+                        const char *request,
+                        int *reply_code, char **reply_body)
+{
+    struct ds line = DS_EMPTY_INITIALIZER;
+    struct ds reply = DS_EMPTY_INITIALIZER;
+    int error;
+
+    /* Send 'request' to server.  Add a new-line if 'request' didn't end in
+     * one. */
+    fputs(request, client->stream);
+    if (request[0] == '\0' || request[strlen(request) - 1] != '\n') {
+        putc('\n', client->stream);
+    }
+    /* The stream is opened for update ("r+"), so output must be flushed
+     * before the stream is read from. */
+    if (fflush(client->stream) == EOF || ferror(client->stream)) {
+        /* Save errno before logging can change it, and never report a
+         * failed send as success. */
+        error = errno ? errno : EIO;
+        VLOG_WARN("error sending request to %s: %s",
+                  client->connect_path, strerror(error));
+        goto error;
+    }
+
+    /* Wait for response. */
+    *reply_code = -1;
+    for (;;) {
+        const char *s;
+
+        error = ds_get_line(&line, client->stream);
+        if (error) {
+            VLOG_WARN("error reading reply from %s: %s",
+                      client->connect_path,
+                      (error == EOF ? "unexpected end of file"
+                       : strerror(error)));
+            goto error;
+        }
+
+        s = ds_cstr(&line);
+        if (*reply_code == -1) {
+            /* First line: the status line.  isdigit() needs an argument
+             * representable as unsigned char. */
+            if (!isdigit((unsigned char) s[0])
+                || !isdigit((unsigned char) s[1])
+                || !isdigit((unsigned char) s[2])) {
+                VLOG_WARN("reply from %s does not start with 3-digit code",
+                          client->connect_path);
+                error = EPROTO;
+                goto error;
+            }
+            sscanf(s, "%3d", reply_code);
+        } else {
+            /* Body line.  A lone "." ends the reply; any other line that
+             * starts with '.' has that first '.' removed. */
+            if (s[0] == '.') {
+                if (s[1] == '\0') {
+                    break;
+                }
+                s++;
+            }
+            ds_put_cstr(&reply, s);
+            ds_put_char(&reply, '\n');
+        }
+    }
+    *reply_body = ds_cstr(&reply);
+    ds_destroy(&line);
+    return 0;
+
+error:
+    ds_destroy(&line);
+    ds_destroy(&reply);
+    *reply_code = 0;
+    *reply_body = NULL;
+    return error == EOF ? EPROTO : error;
+}
+
+/* Returns the path of the server socket to which 'client' is connected. The
+ * caller must not modify or free the returned string: it belongs to 'client'
+ * and is freed by unixctl_client_destroy(). */
+const char *
+unixctl_client_target(const struct unixctl_client *client)
+{
+ return client->connect_path;
+}
diff --git a/lib/unixctl.h b/lib/unixctl.h
new file mode 100644
index 000000000..338eecfe4
--- /dev/null
+++ b/lib/unixctl.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef UNIXCTL_H
+#define UNIXCTL_H 1
+
+/* Server for Unix domain socket control connection. */
+struct unixctl_server;
+int unixctl_server_create(const char *path, struct unixctl_server **);
+void unixctl_server_run(struct unixctl_server *);
+void unixctl_server_wait(struct unixctl_server *);
+void unixctl_server_destroy(struct unixctl_server *);
+
+/* Client for Unix domain socket control connection. */
+struct unixctl_client;
+int unixctl_client_create(const char *path, struct unixctl_client **);
+void unixctl_client_destroy(struct unixctl_client *);
+int unixctl_client_transact(struct unixctl_client *,
+ const char *request,
+ int *reply_code, char **reply_body);
+const char *unixctl_client_target(const struct unixctl_client *);
+
+/* Command registration. */
+struct unixctl_conn;
+void unixctl_command_register(const char *name,
+ void (*cb)(struct unixctl_conn *,
+ const char *args));
+void unixctl_command_reply(struct unixctl_conn *, int code,
+ const char *body);
+
+#endif /* unixctl.h */
diff --git a/lib/util.c b/lib/util.c
new file mode 100644
index 000000000..edc24b36b
--- /dev/null
+++ b/lib/util.c
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "util.h"
+#include <errno.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "coverage.h"
+
+const char *program_name;
+
+/* Reports memory exhaustion through ovs_fatal(), which prints the message to
+ * stderr and exits with EXIT_FAILURE. */
+void
+out_of_memory(void)
+{
+ ovs_fatal(0, "virtual memory exhausted");
+}
+
+/* Allocates zero-filled memory for 'count' objects of 'size' bytes each.  If
+ * either is zero, a 1-byte block is allocated with malloc() instead, so the
+ * result is never NULL.  Calls out_of_memory() if allocation fails. */
+void *
+xcalloc(size_t count, size_t size)
+{
+    void *block;
+
+    if (count == 0 || size == 0) {
+        block = malloc(1);
+    } else {
+        block = calloc(count, size);
+    }
+    COVERAGE_INC(util_xalloc);
+    if (!block) {
+        out_of_memory();
+    }
+    return block;
+}
+
+/* Allocates 'size' bytes, or 1 byte if 'size' is zero, so the result is
+ * never NULL.  Calls out_of_memory() if allocation fails. */
+void *
+xmalloc(size_t size)
+{
+    void *block;
+
+    if (size == 0) {
+        size = 1;
+    }
+    block = malloc(size);
+    COVERAGE_INC(util_xalloc);
+    if (!block) {
+        out_of_memory();
+    }
+    return block;
+}
+
+/* Resizes the block 'p' to 'size' bytes, or to 1 byte if 'size' is zero, and
+ * returns the possibly relocated block.  Calls out_of_memory() if
+ * reallocation fails. */
+void *
+xrealloc(void *p, size_t size)
+{
+    void *resized;
+
+    if (size == 0) {
+        size = 1;
+    }
+    resized = realloc(p, size);
+    COVERAGE_INC(util_xalloc);
+    if (!resized) {
+        out_of_memory();
+    }
+    return resized;
+}
+
+/* Returns a newly allocated copy of the 'size' bytes starting at 'p_'.  The
+ * copy comes from xmalloc(). */
+void *
+xmemdup(const void *p_, size_t size)
+{
+    void *copy = xmalloc(size);
+
+    return memcpy(copy, p_, size);
+}
+
+/* Returns a newly allocated, null-terminated string holding a copy of the
+ * 'length' bytes starting at 'p_'.  The copy comes from xmalloc(). */
+char *
+xmemdup0(const char *p_, size_t length)
+{
+    char *copy = xmalloc(length + 1);
+
+    copy[length] = '\0';
+    memcpy(copy, p_, length);
+    return copy;
+}
+
+/* Returns a newly allocated copy of the null-terminated string 's'.  The
+ * copy comes from xmalloc(). */
+char *
+xstrdup(const char *s)
+{
+    const size_t length = strlen(s);
+
+    return xmemdup0(s, length);
+}
+
+/* Formats 'format' with 'args' as for vprintf() and returns the result as a
+ * newly allocated string from xmalloc().
+ *
+ * If vsnprintf() reports a formatting error, an empty string is returned, so
+ * the result is always null-terminated. */
+char *
+xvasprintf(const char *format, va_list args)
+{
+    va_list args2;
+    int needed;
+    char *s;
+
+    /* 'args' is consumed by the sizing pass, so keep a copy for the pass
+     * that actually writes the string. */
+    va_copy(args2, args);
+    needed = vsnprintf(NULL, 0, format, args);
+    if (needed < 0) {
+        /* Keep the negative error return out of the size arithmetic, where
+         * it would wrap around to a huge value. */
+        needed = 0;
+    }
+
+    s = xmalloc((size_t) needed + 1);
+    if (vsnprintf(s, (size_t) needed + 1, format, args2) < 0) {
+        s[0] = '\0';
+    }
+    va_end(args2);
+
+    return s;
+}
+
+/* Grows the array 'p', which currently has room for '*n' elements of 's'
+ * bytes each.  '*n' becomes 1 if it was 0 and is doubled otherwise; the
+ * array is reallocated to the new size and returned.
+ *
+ * Calls out_of_memory() if the new element count or byte size does not fit
+ * in size_t, instead of wrapping around and allocating too little. */
+void *
+x2nrealloc(void *p, size_t *n, size_t s)
+{
+    size_t new_n;
+
+    if (*n == 0) {
+        new_n = 1;
+    } else {
+        if (*n > SIZE_MAX / 2) {
+            out_of_memory();
+        }
+        new_n = 2 * *n;
+    }
+    if (s != 0 && new_n > SIZE_MAX / s) {
+        out_of_memory();
+    }
+
+    *n = new_n;
+    return xrealloc(p, new_n * s);
+}
+
+/* Formats its arguments as for printf() and returns the result as a newly
+ * allocated string, built by xvasprintf(). */
+char *
+xasprintf(const char *format, ...)
+{
+    va_list ap;
+    char *formatted;
+
+    va_start(ap, format);
+    formatted = xvasprintf(format, ap);
+    va_end(ap);
+
+    return formatted;
+}
+
+/* Copies 'src' into the 'size'-byte buffer 'dst', truncating it if necessary
+ * so that the result, including its null terminator, fits.  Does nothing if
+ * 'size' is zero. */
+void
+ovs_strlcpy(char *dst, const char *src, size_t size)
+{
+    size_t src_len;
+    size_t n_copy;
+
+    if (size == 0) {
+        return;
+    }
+
+    src_len = strlen(src);
+    n_copy = src_len < size - 1 ? src_len : size - 1;
+    memcpy(dst, src, n_copy);
+    dst[n_copy] = '\0';
+}
+
+/* Prints the program name and the printf()-style message given by 'format'
+ * to stderr, followed by the strerror() text for 'err_no' in parentheses if
+ * 'err_no' is nonzero, then exits with EXIT_FAILURE. */
+void
+ovs_fatal(int err_no, const char *format, ...)
+{
+    va_list ap;
+
+    fprintf(stderr, "%s: ", program_name);
+
+    va_start(ap, format);
+    vfprintf(stderr, format, ap);
+    va_end(ap);
+
+    if (err_no) {
+        fprintf(stderr, " (%s)", strerror(err_no));
+    }
+    fputc('\n', stderr);
+
+    exit(EXIT_FAILURE);
+}
+
+/* Prints the program name and the printf()-style message given by 'format'
+ * to stderr, followed by the strerror() text for 'err_no' in parentheses if
+ * 'err_no' is nonzero.  The caller's errno value is preserved. */
+void
+ovs_error(int err_no, const char *format, ...)
+{
+    const int saved_errno = errno;
+    va_list ap;
+
+    fprintf(stderr, "%s: ", program_name);
+
+    va_start(ap, format);
+    vfprintf(stderr, format, ap);
+    va_end(ap);
+
+    if (err_no) {
+        fprintf(stderr, " (%s)", strerror(err_no));
+    }
+    fputc('\n', stderr);
+
+    errno = saved_errno;
+}
+
+/* Sets program_name to the part of 'argv0' that follows its last '/', or to
+ * all of 'argv0' if it contains no '/'.  Should be called at the beginning
+ * of main(), as "set_program_name(argv[0]);". */
+void set_program_name(const char *argv0)
+{
+    const char *last_slash = strrchr(argv0, '/');
+
+    if (last_slash) {
+        program_name = last_slash + 1;
+    } else {
+        program_name = argv0;
+    }
+}
+
+/* Prints the program's version banner and the build 'date' and 'time' to
+ * stdout.  If 'min_ofp' or 'max_ofp' is nonzero, also prints them as the
+ * range of OpenFlow versions. */
+void
+ovs_print_version(char *date, char *time,
+                  uint8_t min_ofp, uint8_t max_ofp)
+{
+    printf("%s (Open vSwitch) "VERSION BUILDNR"\n", program_name);
+    printf("Compiled %s %s\n", date, time);
+
+    if (min_ofp == 0 && max_ofp == 0) {
+        return;
+    }
+    printf("OpenFlow versions %#x:%#x\n", min_ofp, max_ofp);
+}
+
+/* Writes the 'size' bytes in 'buf' to 'stream' as hex bytes arranged 16 per
+ * line. Numeric offsets are also included, starting at 'ofs' for the first
+ * byte in 'buf'. If 'ascii' is true then the corresponding ASCII characters
+ * are also rendered alongside.
+ *
+ * Lines are aligned to multiples of 16 in 'ofs': if 'ofs' is not a multiple
+ * of 16, the first line is padded on the left so that each byte appears in
+ * the column that matches its offset. */
+void
+ovs_hex_dump(FILE *stream, const void *buf_, size_t size,
+ uintptr_t ofs, bool ascii)
+{
+ const uint8_t *buf = buf_;
+ const size_t per_line = 16; /* Maximum bytes per line. */
+
+ while (size > 0)
+ {
+ size_t start, end, n;
+ size_t i;
+
+ /* Number of bytes on this line. 'start' and 'end' are column indexes
+ * within the line; 'n' is the count of bytes actually printed. */
+ start = ofs % per_line;
+ end = per_line;
+ if (end - start > size)
+ end = start + size;
+ n = end - start;
+
+ /* Print line. */
+ fprintf(stream, "%08jx ", (uintmax_t) ROUND_DOWN(ofs, per_line));
+ /* Padding for the columns before the first byte. */
+ for (i = 0; i < start; i++)
+ fprintf(stream, " ");
+ /* Hex bytes; a '-' follows the eighth column instead of a space. */
+ for (; i < end; i++)
+ fprintf(stream, "%02hhx%c",
+ buf[i - start], i == per_line / 2 - 1? '-' : ' ');
+ if (ascii)
+ {
+ /* Pad out the unused hex columns of a short line. */
+ for (; i < per_line; i++)
+ fprintf(stream, " ");
+ fprintf(stream, "|");
+ for (i = 0; i < start; i++)
+ fprintf(stream, " ");
+ /* Bytes outside the range 32...126 are shown as '.'. */
+ for (; i < end; i++) {
+ int c = buf[i - start];
+ putc(c >= 32 && c < 127 ? c : '.', stream);
+ }
+ for (; i < per_line; i++)
+ fprintf(stream, " ");
+ fprintf(stream, "|");
+ }
+ fprintf(stream, "\n");
+
+ ofs += n;
+ buf += n;
+ size -= n;
+ }
+}
+
+/* Parses all of 's' as a signed integer in 'base' (as for strtoll()) and
+ * stores the result in '*x'.
+ *
+ * Returns true if successful.  Returns false and stores 0 in '*x' if 's' is
+ * empty, is followed by anything that is not part of the number, or is out
+ * of range for long long.  The caller's errno value is preserved. */
+bool
+str_to_llong(const char *s, int base, long long *x)
+{
+    int save_errno = errno;
+    char *tail;
+    errno = 0;
+    *x = strtoll(s, &tail, base);
+    if (errno == EINVAL || errno == ERANGE || tail == s || *tail != '\0') {
+        errno = save_errno;
+        *x = 0;
+        return false;
+    } else {
+        errno = save_errno;
+        return true;
+    }
+}
+
+/* Like str_to_llong(), but also fails (storing 0 in '*i') if the value does
+ * not fit in an int, instead of truncating it and reporting success. */
+bool
+str_to_int(const char *s, int base, int *i)
+{
+    long long ll;
+
+    if (!str_to_llong(s, base, &ll) || ll < INT_MIN || ll > INT_MAX) {
+        *i = 0;
+        return false;
+    }
+    *i = ll;
+    return true;
+}
+
+/* Like str_to_llong(), but also fails (storing 0 in '*li') if the value does
+ * not fit in a long. */
+bool
+str_to_long(const char *s, int base, long *li)
+{
+    long long ll;
+
+    if (!str_to_llong(s, base, &ll) || ll < LONG_MIN || ll > LONG_MAX) {
+        *li = 0;
+        return false;
+    }
+    *li = ll;
+    return true;
+}
+
+/* Like str_to_llong(), but for unsigned int: fails (storing 0 in '*u') if
+ * the value is negative or greater than UINT_MAX. */
+bool
+str_to_uint(const char *s, int base, unsigned int *u)
+{
+    long long ll;
+
+    if (!str_to_llong(s, base, &ll) || ll < 0 || ll > UINT_MAX) {
+        *u = 0;
+        return false;
+    }
+    *u = ll;
+    return true;
+}
+
+/* Like str_to_llong(), but for unsigned long: fails (storing 0 in '*ul') if
+ * the value is negative or greater than ULONG_MAX.  Values above LLONG_MAX
+ * are rejected because parsing goes through str_to_llong(). */
+bool
+str_to_ulong(const char *s, int base, unsigned long *ul)
+{
+    long long ll;
+
+    if (!str_to_llong(s, base, &ll) || ll < 0
+        || (unsigned long long) ll > ULONG_MAX) {
+        *ul = 0;
+        return false;
+    }
+    *ul = ll;
+    return true;
+}
+
+/* Like str_to_llong(), but for unsigned long long: fails (storing 0 in
+ * '*ull') if the value is negative.  Values above LLONG_MAX are rejected
+ * because parsing goes through str_to_llong(). */
+bool
+str_to_ullong(const char *s, int base, unsigned long long *ull)
+{
+    long long ll;
+
+    if (!str_to_llong(s, base, &ll) || ll < 0) {
+        *ull = 0;
+        return false;
+    }
+    *ull = ll;
+    return true;
+}
diff --git a/lib/util.h b/lib/util.h
new file mode 100644
index 000000000..87e216b85
--- /dev/null
+++ b/lib/util.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef UTIL_H
+#define UTIL_H 1
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include "compiler.h"
+
+#ifndef va_copy
+#ifdef __va_copy
+#define va_copy __va_copy
+#else
+#define va_copy(dst, src) ((dst) = (src))
+#endif
+#endif
+
+#ifndef __cplusplus
+/* Build-time assertion building block. */
+#define BUILD_ASSERT__(EXPR) \
+ sizeof(struct { unsigned int build_assert_failed : (EXPR) ? 1 : -1; })
+
+/* Build-time assertion for use in a statement context. */
+#define BUILD_ASSERT(EXPR) (void) BUILD_ASSERT__(EXPR)
+
+/* Build-time assertion for use in a declaration context. */
+#define BUILD_ASSERT_DECL(EXPR) \
+ extern int (*build_assert(void))[BUILD_ASSERT__(EXPR)]
+#else /* __cplusplus */
+#include <boost/static_assert.hpp>
+#define BUILD_ASSERT BOOST_STATIC_ASSERT
+#define BUILD_ASSERT_DECL BOOST_STATIC_ASSERT
+#endif /* __cplusplus */
+
+extern const char *program_name;
+
+/* Number of elements in ARRAY, computed as the size of the whole object
+ * divided by the size of its first element.  ARRAY is not parenthesized in
+ * the expansion, so pass a simple array expression. */
+#define ARRAY_SIZE(ARRAY) (sizeof ARRAY / sizeof *ARRAY)
+/* X rounded up to a multiple of Y using integer division (assumes
+ * nonnegative integer operands).  Y is evaluated more than once. */
+#define ROUND_UP(X, Y) (((X) + ((Y) - 1)) / (Y) * (Y))
+/* X rounded down to a multiple of Y using integer division.  Y is evaluated
+ * more than once. */
+#define ROUND_DOWN(X, Y) ((X) / (Y) * (Y))
+/* Nonzero if X is nonzero and has exactly one bit set.  X is evaluated more
+ * than once. */
+#define IS_POW2(X) ((X) && !((X) & ((X) - 1)))
+
+/* Lesser of X and Y, unless MIN is already defined.  One of the arguments is
+ * evaluated twice. */
+#ifndef MIN
+#define MIN(X, Y) ((X) < (Y) ? (X) : (Y))
+#endif
+
+/* Greater of X and Y, unless MAX is already defined.  One of the arguments
+ * is evaluated twice. */
+#ifndef MAX
+#define MAX(X, Y) ((X) > (Y) ? (X) : (Y))
+#endif
+
+/* Markers for code paths that must never execute or are not written yet;
+ * both expand to abort().  NOT_TESTED() currently expands to a no-op.
+ *
+ * NOTE(review): abort() is declared in <stdlib.h>, which is not among the
+ * system headers this file includes directly -- confirm that "compiler.h" or
+ * every user provides it. */
+#define NOT_REACHED() abort()
+#define NOT_IMPLEMENTED() abort()
+#define NOT_TESTED() ((void) 0) /* XXX should print a message. */
+
+/* Given POINTER, the address of the given MEMBER in a STRUCT object, returns
+ the STRUCT object. */
+#define CONTAINER_OF(POINTER, STRUCT, MEMBER) \
+ ((STRUCT *) ((char *) (POINTER) - offsetof (STRUCT, MEMBER)))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void set_program_name(const char *);
+
+void ovs_print_version(char *date, char *time,
+ uint8_t min_ofp, uint8_t max_ofp);
+#define OVS_PRINT_VERSION(min_ofp, max_ofp) \
+ ovs_print_version(__DATE__, __TIME__, (min_ofp), (max_ofp))
+
+void out_of_memory(void) NO_RETURN;
+void *xmalloc(size_t) MALLOC_LIKE;
+void *xcalloc(size_t, size_t) MALLOC_LIKE;
+void *xrealloc(void *, size_t);
+void *xmemdup(const void *, size_t) MALLOC_LIKE;
+char *xmemdup0(const char *, size_t) MALLOC_LIKE;
+char *xstrdup(const char *) MALLOC_LIKE;
+char *xasprintf(const char *format, ...) PRINTF_FORMAT(1, 2) MALLOC_LIKE;
+char *xvasprintf(const char *format, va_list) PRINTF_FORMAT(1, 0) MALLOC_LIKE;
+void *x2nrealloc(void *p, size_t *n, size_t s);
+
+void ovs_strlcpy(char *dst, const char *src, size_t size);
+
+void ovs_fatal(int err_no, const char *format, ...)
+ PRINTF_FORMAT(2, 3) NO_RETURN;
+void ovs_error(int err_no, const char *format, ...) PRINTF_FORMAT(2, 3);
+void ovs_hex_dump(FILE *, const void *, size_t, uintptr_t offset, bool ascii);
+
+/* String-to-integer conversion.  Each function parses its string argument
+ * in the given 'base', stores the result through the final argument, and
+ * returns true on success or false on failure.  All of them are implemented
+ * on top of str_to_llong(), which requires the whole string to be consumed
+ * and stores 0 on failure. */
+bool str_to_int(const char *, int base, int *);
+bool str_to_long(const char *, int base, long *);
+bool str_to_llong(const char *, int base, long long *);
+bool str_to_uint(const char *, int base, unsigned int *);
+bool str_to_ulong(const char *, int base, unsigned long *);
+bool str_to_ullong(const char *, int base, unsigned long long *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* util.h */
diff --git a/lib/valgrind.h b/lib/valgrind.h
new file mode 100644
index 000000000..e15a7a70b
--- /dev/null
+++ b/lib/valgrind.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef VALGRIND_H
+#define VALGRIND_H 1
+
+/* Use the real Valgrind client header when HAVE_VALGRIND_VALGRIND_H is
+ * defined.  Otherwise define RUNNING_ON_VALGRIND as the constant 0 so that
+ * code testing it still compiles. */
+#ifdef HAVE_VALGRIND_VALGRIND_H
+#include <valgrind/valgrind.h>
+#else
+#define RUNNING_ON_VALGRIND 0
+#endif
+
+#endif /* valgrind.h */
diff --git a/lib/vconn-provider.h b/lib/vconn-provider.h
new file mode 100644
index 000000000..239d19e97
--- /dev/null
+++ b/lib/vconn-provider.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef VCONN_PROVIDER_H
+#define VCONN_PROVIDER_H 1
+
+/* Provider interface to vconns, which provide a virtual connection to an
+ * OpenFlow device. */
+
+#include <assert.h>
+#include "vconn.h"
+
+/* Active vconns. */
+
+/* Active virtual connection to an OpenFlow device.
+ *
+ * This structure should be treated as opaque by vconn implementations. */
+struct vconn {
+ /* Provider class; compared against the expected class by
+ * vconn_assert_class(). */
+ struct vconn_class *class;
+ int state;
+ int error;
+ int min_version;
+ int version;
+ /* NOTE(review): the SSL provider passes sin_addr.s_addr for the
+ * corresponding vconn_init() argument, so presumably this is the peer's
+ * IPv4 address in network byte order -- confirm against vconn_init(). */
+ uint32_t ip;
+ char *name;
+ bool reconnectable;
+};
+
+/* Initializer for the common 'struct vconn' part of a provider's connection
+ * object (defined outside this header). */
+void vconn_init(struct vconn *, struct vconn_class *, int connect_status,
+ uint32_t ip, const char *name, bool reconnectable);
+/* Asserts that 'vconn' was created by the provider 'class'.  Providers call
+ * this before converting a generic vconn to their own type. */
+static inline void vconn_assert_class(const struct vconn *vconn,
+ const struct vconn_class *class)
+{
+ assert(vconn->class == class);
+}
+
+/* Table of functions that implements one kind of active vconn.  Each provider
+ * supplies one instance (the extern declarations at the end of this header
+ * list them). */
+struct vconn_class {
+ /* Prefix for connection names, e.g. "nl", "tcp". */
+ const char *name;
+
+ /* Attempts to connect to an OpenFlow device. 'name' is the full
+ * connection name provided by the user, e.g. "tcp:1.2.3.4". This name is
+ * useful for error messages but must not be modified.
+ *
+ * 'suffix' is a copy of 'name' following the colon and may be modified.
+ *
+ * Returns 0 if successful, otherwise a positive errno value. If
+ * successful, stores a pointer to the new connection in '*vconnp'.
+ *
+ * The open function must not block waiting for a connection to complete.
+ * If the connection cannot be completed immediately, it should return
+ * EAGAIN (not EINPROGRESS, as returned by the connect system call) and
+ * continue the connection in the background. */
+ int (*open)(const char *name, char *suffix, struct vconn **vconnp);
+
+ /* Closes 'vconn' and frees associated memory. */
+ void (*close)(struct vconn *vconn);
+
+ /* Tries to complete the connection on 'vconn'. If 'vconn''s connection is
+ * complete, returns 0 if the connection was successful or a positive errno
+ * value if it failed. If the connection is still in progress, returns
+ * EAGAIN.
+ *
+ * The connect function must not block waiting for the connection to
+ * complete; instead, it should return EAGAIN immediately. */
+ int (*connect)(struct vconn *vconn);
+
+ /* Tries to receive an OpenFlow message from 'vconn'. If successful,
+ * stores the received message into '*msgp' and returns 0. The caller is
+ * responsible for destroying the message with ofpbuf_delete(). On
+ * failure, returns a positive errno value and stores a null pointer into
+ * '*msgp'.
+ *
+ * If the connection has been closed in the normal fashion, returns EOF.
+ *
+ * The recv function must not block waiting for a packet to arrive. If no
+ * packets have been received, it should return EAGAIN. */
+ int (*recv)(struct vconn *vconn, struct ofpbuf **msgp);
+
+ /* Tries to queue 'msg' for transmission on 'vconn'. If successful,
+ * returns 0, in which case ownership of 'msg' is transferred to the vconn.
+ * Success does not guarantee that 'msg' has been or ever will be delivered
+ * to the peer, only that it has been queued for transmission.
+ *
+ * Returns a positive errno value on failure, in which case the caller
+ * retains ownership of 'msg'.
+ *
+ * The send function must not block. If 'msg' cannot be immediately
+ * accepted for transmission, it should return EAGAIN. */
+ int (*send)(struct vconn *vconn, struct ofpbuf *msg);
+
+ /* Arranges for the poll loop to wake up when 'vconn' is ready to take an
+ * action of the given 'type'. */
+ void (*wait)(struct vconn *vconn, enum vconn_wait_type type);
+};
+
+/* Passive virtual connection to an OpenFlow device.
+ *
+ * This structure should be treated as opaque by vconn implementations. */
+struct pvconn {
+ /* Provider class; compared against the expected class by
+ * pvconn_assert_class(). */
+ struct pvconn_class *class;
+ char *name;
+};
+
+/* Initializer for the common 'struct pvconn' part of a provider's listener
+ * object (defined outside this header). */
+void pvconn_init(struct pvconn *, struct pvconn_class *, const char *name);
+/* Asserts that 'pvconn' was created by the provider 'class'.  Providers call
+ * this before converting a generic pvconn to their own type. */
+static inline void pvconn_assert_class(const struct pvconn *pvconn,
+ const struct pvconn_class *class)
+{
+ assert(pvconn->class == class);
+}
+
+/* Table of functions that implements one kind of passive (listening) vconn.
+ * Each provider supplies one instance (the extern declarations at the end of
+ * this header list them). */
+struct pvconn_class {
+ /* Prefix for connection names, e.g. "ptcp", "pssl". */
+ const char *name;
+
+ /* Attempts to start listening for OpenFlow connections. 'name' is the
+ * full connection name provided by the user, e.g. "ptcp:1234". This name
+ * is useful for error messages but must not be modified.
+ *
+ * 'suffix' is a copy of 'name' following the colon and may be modified.
+ *
+ * Returns 0 if successful, otherwise a positive errno value. If
+ * successful, stores a pointer to the new connection in '*pvconnp'.
+ *
+ * The listen function must not block. If the connection cannot be
+ * completed immediately, it should return EAGAIN (not EINPROGRESS, as
+ * returned by the connect system call) and continue the connection in the
+ * background. */
+ int (*listen)(const char *name, char *suffix, struct pvconn **pvconnp);
+
+ /* Closes 'pvconn' and frees associated memory. */
+ void (*close)(struct pvconn *pvconn);
+
+ /* Tries to accept a new connection on 'pvconn'. If successful, stores the
+ * new connection in '*new_vconnp' and returns 0. Otherwise, returns a
+ * positive errno value.
+ *
+ * The accept function must not block waiting for a connection. If no
+ * connection is ready to be accepted, it should return EAGAIN. */
+ int (*accept)(struct pvconn *pvconn, struct vconn **new_vconnp);
+
+ /* Arranges for the poll loop to wake up when a connection is ready to be
+ * accepted on 'pvconn'. */
+ void (*wait)(struct pvconn *pvconn);
+};
+
+/* Active and passive vconn classes. */
+extern struct vconn_class tcp_vconn_class;
+extern struct pvconn_class ptcp_pvconn_class;
+extern struct vconn_class unix_vconn_class;
+extern struct pvconn_class punix_pvconn_class;
+#ifdef HAVE_OPENSSL
+extern struct vconn_class ssl_vconn_class;
+extern struct pvconn_class pssl_pvconn_class;
+#endif
+
+#endif /* vconn-provider.h */
diff --git a/lib/vconn-ssl.c b/lib/vconn-ssl.c
new file mode 100644
index 000000000..20bfb9793
--- /dev/null
+++ b/lib/vconn-ssl.c
@@ -0,0 +1,1197 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "vconn-ssl.h"
+#include "dhparams.h"
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <string.h>
+#include <netinet/tcp.h>
+#include <openssl/err.h>
+#include <openssl/ssl.h>
+#include <openssl/x509v3.h>
+#include <poll.h>
+#include <sys/fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "dynamic-string.h"
+#include "leak-checker.h"
+#include "ofpbuf.h"
+#include "openflow/openflow.h"
+#include "packets.h"
+#include "poll-loop.h"
+#include "socket-util.h"
+#include "socket-util.h"
+#include "util.h"
+#include "vconn-provider.h"
+#include "vconn.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_vconn_ssl
+
+/* Active SSL. */
+
+/* Connection progress of an ssl_vconn, as driven by ssl_connect(). */
+enum ssl_state {
+ /* Waiting for the TCP connection to complete; ssl_connect() checks it
+ * with check_connection_completion(). */
+ STATE_TCP_CONNECTING,
+ /* TCP is connected; ssl_connect() is running SSL_connect() or
+ * SSL_accept(). */
+ STATE_SSL_CONNECTING
+};
+
+/* Which side of the SSL handshake an ssl_vconn plays. */
+enum session_type {
+ CLIENT, /* ssl_connect() calls SSL_connect(). */
+ SERVER /* ssl_connect() calls SSL_accept(). */
+};
+
+/* An active SSL connection.  The embedded 'vconn' is what the generic vconn
+ * layer sees; ssl_vconn_cast() converts back from it to this structure. */
+struct ssl_vconn
+{
+ struct vconn vconn;
+ enum ssl_state state;
+ /* NOTE(review): not assigned in new_ssl_vconn() and not read by the
+ * functions visible nearby -- confirm whether it is still used. */
+ int connect_error;
+ enum session_type type;
+ int fd; /* Socket; closed by ssl_close(). */
+ SSL *ssl; /* OpenSSL stream on 'fd'; freed by ssl_close(). */
+ struct ofpbuf *rxbuf; /* Partially received message, or null. */
+ struct ofpbuf *txbuf; /* Message being sent, or null if none. */
+ struct poll_waiter *tx_waiter; /* Pending callback for 'txbuf', if any. */
+
+ /* rx_want and tx_want record the result of the last call to SSL_read()
+ * and SSL_write(), respectively:
+ *
+ * - If the call reported that data needed to be read from the file
+ * descriptor, the corresponding member is set to SSL_READING.
+ *
+ * - If the call reported that data needed to be written to the file
+ * descriptor, the corresponding member is set to SSL_WRITING.
+ *
+ * - Otherwise, the member is set to SSL_NOTHING, indicating that the
+ * call completed successfully (or with an error) and that there is no
+ * need to block.
+ *
+ * These are needed because there is no way to ask OpenSSL what a data read
+ * or write would require without giving it a buffer to receive into or
+ * data to send, respectively. (Note that the SSL_want() status is
+ * overwritten by each SSL_read() or SSL_write() call, so we can't rely on
+ * its value.)
+ *
+ * A single call to SSL_read() or SSL_write() can perform both reading
+ * and writing and thus invalidate not one of these values but actually
+ * both. Consider this situation, for example:
+ *
+ * - SSL_write() blocks on a read, so tx_want gets SSL_READING.
+ *
+ * - SSL_read() later succeeds reading from 'fd' and clears out the
+ * whole receive buffer, so rx_want gets SSL_READING.
+ *
+ * - Client calls vconn_wait(WAIT_RECV) and vconn_wait(WAIT_SEND) and
+ * blocks.
+ *
+ * - Now we're stuck blocking until the peer sends us data, even though
+ * SSL_write() could now succeed, which could easily be a deadlock
+ * condition.
+ *
+ * On the other hand, we can't reset both tx_want and rx_want on every call
+ * to SSL_read() or SSL_write(), because that would produce livelock,
+ * e.g. in this situation:
+ *
+ * - SSL_write() blocks, so tx_want gets SSL_READING or SSL_WRITING.
+ *
+ * - SSL_read() blocks, so rx_want gets SSL_READING or SSL_WRITING,
+ * but tx_want gets reset to SSL_NOTHING.
+ *
+ * - Client calls vconn_wait(WAIT_RECV) and vconn_wait(WAIT_SEND) and
+ * blocks.
+ *
+ * - Client wakes up immediately since SSL_NOTHING in tx_want indicates
+ * that no blocking is necessary.
+ *
+ * The solution we adopt here is to set tx_want to SSL_NOTHING after
+ * calling SSL_read() only if the SSL state of the connection changed,
+ * which indicates that an SSL-level renegotiation made some progress, and
+ * similarly for rx_want and SSL_write(). This prevents both the
+ * deadlock and livelock situations above.
+ */
+ int rx_want, tx_want;
+};
+
+/* SSL context created by ssl_init(). */
+static SSL_CTX *ctx;
+
+/* Required configuration. */
+static bool has_private_key, has_certificate, has_ca_cert;
+
+/* Ordinarily, we require a CA certificate for the peer to be locally
+ * available. 'has_ca_cert' is true when this is the case, and neither of the
+ * following variables matter.
+ *
+ * We can, however, bootstrap the CA certificate from the peer at the beginning
+ * of our first connection then use that certificate on all subsequent
+ * connections, saving it to a file for use in future runs also. In this case,
+ * 'has_ca_cert' is false, 'bootstrap_ca_cert' is true, and 'ca_cert_file'
+ * names the file to be saved. */
+static bool bootstrap_ca_cert;
+static char *ca_cert_file;
+
+/* Who knows what can trigger various SSL errors, so let's throttle them down
+ * quite a bit. */
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 25);
+
+static int ssl_init(void);
+static int do_ssl_init(void);
+static bool ssl_wants_io(int ssl_error);
+static void ssl_close(struct vconn *);
+static void ssl_clear_txbuf(struct ssl_vconn *);
+static int interpret_ssl_error(const char *function, int ret, int error,
+ int *want);
+static void ssl_tx_poll_callback(int fd, short int revents, void *vconn_);
+static DH *tmp_dh_callback(SSL *ssl, int is_export UNUSED, int keylength);
+static void log_ca_cert(const char *file_name, X509 *cert);
+
+/* Maps an OpenSSL "want" status to the poll() event to wait for: SSL_READING
+ * yields POLLIN and SSL_WRITING yields POLLOUT.  Any other value, including
+ * SSL_NOTHING, is a caller bug and aborts the program. */
+static short int
+want_to_poll_events(int want)
+{
+    if (want == SSL_READING) {
+        return POLLIN;
+    } else if (want == SSL_WRITING) {
+        return POLLOUT;
+    }
+
+    /* SSL_NOTHING or an unknown status: there is nothing to wait for. */
+    NOT_REACHED();
+}
+
+/* Wraps the socket 'fd' in a new SSL vconn named 'name'.
+ *
+ * 'type' selects the client or server side of the handshake, 'state' is the
+ * initial connection state, and 'sin' supplies the peer address recorded in
+ * the vconn.
+ *
+ * Returns 0 and stores the new vconn in '*vconnp' if successful.  Otherwise
+ * returns a positive errno value, in which case 'fd' has been closed and
+ * '*vconnp' is not modified. */
+static int
+new_ssl_vconn(const char *name, int fd, enum session_type type,
+              enum ssl_state state, const struct sockaddr_in *sin,
+              struct vconn **vconnp)
+{
+    struct ssl_vconn *sslv;
+    SSL *ssl = NULL;
+    int on = 1;
+    int retval;
+
+    /* Check for all the needful configuration.  Every problem is logged
+     * before giving up, so that the user sees all of them at once. */
+    retval = 0;
+    if (!has_private_key) {
+        VLOG_ERR("Private key must be configured to use SSL");
+        retval = ENOPROTOOPT;
+    }
+    if (!has_certificate) {
+        VLOG_ERR("Certificate must be configured to use SSL");
+        retval = ENOPROTOOPT;
+    }
+    if (!has_ca_cert && !bootstrap_ca_cert) {
+        VLOG_ERR("CA certificate must be configured to use SSL");
+        retval = ENOPROTOOPT;
+    }
+    if (!SSL_CTX_check_private_key(ctx)) {
+        VLOG_ERR("Private key does not match certificate public key: %s",
+                 ERR_error_string(ERR_get_error(), NULL));
+        retval = ENOPROTOOPT;
+    }
+    if (retval) {
+        goto error;
+    }
+
+    /* Disable Nagle. */
+    if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof on)) {
+        /* Capture errno before logging, which may overwrite it. */
+        retval = errno;
+        VLOG_ERR("%s: setsockopt(TCP_NODELAY): %s", name, strerror(retval));
+        goto error;
+    }
+
+    /* Create and configure OpenSSL stream. */
+    ssl = SSL_new(ctx);
+    if (ssl == NULL) {
+        VLOG_ERR("SSL_new: %s", ERR_error_string(ERR_get_error(), NULL));
+        retval = ENOPROTOOPT;
+        goto error;
+    }
+    if (SSL_set_fd(ssl, fd) == 0) {
+        VLOG_ERR("SSL_set_fd: %s", ERR_error_string(ERR_get_error(), NULL));
+        retval = ENOPROTOOPT;
+        goto error;
+    }
+    if (bootstrap_ca_cert && type == CLIENT) {
+        /* No CA certificate yet, so the peer cannot be verified. */
+        SSL_set_verify(ssl, SSL_VERIFY_NONE, NULL);
+    }
+
+    /* Create and return the ssl_vconn. */
+    sslv = xmalloc(sizeof *sslv);
+    vconn_init(&sslv->vconn, &ssl_vconn_class, EAGAIN, sin->sin_addr.s_addr,
+               name, true);
+    sslv->state = state;
+    sslv->connect_error = 0;    /* Don't leave heap garbage in the field. */
+    sslv->type = type;
+    sslv->fd = fd;
+    sslv->ssl = ssl;
+    sslv->rxbuf = NULL;
+    sslv->txbuf = NULL;
+    sslv->tx_waiter = NULL;
+    sslv->rx_want = sslv->tx_want = SSL_NOTHING;
+    *vconnp = &sslv->vconn;
+    return 0;
+
+error:
+    if (ssl) {
+        SSL_free(ssl);
+    }
+    close(fd);
+    return retval;
+}
+
+/* Converts generic 'vconn' to the ssl_vconn that contains it, after asserting
+ * that 'vconn' really belongs to ssl_vconn_class. */
+static struct ssl_vconn *
+ssl_vconn_cast(struct vconn *vconn)
+{
+    struct ssl_vconn *sslv;
+
+    vconn_assert_class(vconn, &ssl_vconn_class);
+    sslv = CONTAINER_OF(vconn, struct ssl_vconn, vconn);
+    return sslv;
+}
+
+/* Implements the 'open' function of ssl_vconn_class.
+ *
+ * 'suffix' has the form "HOST[:PORT]"; PORT defaults to OFP_SSL_PORT when it
+ * is missing or empty.  Starts a nonblocking TCP connection to the peer and
+ * wraps the socket in a new SSL vconn.
+ *
+ * Returns 0 and stores the new vconn in '*vconnp' if successful, otherwise a
+ * positive errno value. */
+static int
+ssl_open(const char *name, char *suffix, struct vconn **vconnp)
+{
+    char *save_ptr, *host_name, *port_string;
+    struct sockaddr_in sin;
+    int retval;
+    int fd;
+
+    retval = ssl_init();
+    if (retval) {
+        return retval;
+    }
+
+    /* Glibc 2.7 has a bug in strtok_r when compiling with optimization that
+     * can cause segfaults here:
+     * http://sources.redhat.com/bugzilla/show_bug.cgi?id=5614.
+     * Using "::" instead of the obvious ":" works around it. */
+    host_name = strtok_r(suffix, "::", &save_ptr);
+    port_string = strtok_r(NULL, "::", &save_ptr);
+    if (!host_name) {
+        ovs_error(0, "%s: bad peer name format", name);
+        return EAFNOSUPPORT;
+    }
+
+    memset(&sin, 0, sizeof sin);
+    sin.sin_family = AF_INET;
+    if (lookup_ip(host_name, &sin.sin_addr)) {
+        return ENOENT;
+    }
+    sin.sin_port = htons(port_string && *port_string ? atoi(port_string)
+                         : OFP_SSL_PORT);
+
+    /* Create socket. */
+    fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (fd < 0) {
+        /* Capture errno before logging, which may overwrite it. */
+        int error = errno;
+        VLOG_ERR("%s: socket: %s", name, strerror(error));
+        return error;
+    }
+    retval = set_nonblocking(fd);
+    if (retval) {
+        close(fd);
+        return retval;
+    }
+
+    /* Connect socket.  new_ssl_vconn() takes over 'fd' and closes it itself
+     * if it fails. */
+    retval = connect(fd, (struct sockaddr *) &sin, sizeof sin);
+    if (retval < 0) {
+        if (errno == EINPROGRESS) {
+            return new_ssl_vconn(name, fd, CLIENT, STATE_TCP_CONNECTING,
+                                 &sin, vconnp);
+        } else {
+            int error = errno;
+            VLOG_ERR("%s: connect: %s", name, strerror(error));
+            close(fd);
+            return error;
+        }
+    } else {
+        return new_ssl_vconn(name, fd, CLIENT, STATE_SSL_CONNECTING,
+                             &sin, vconnp);
+    }
+}
+
+/* Completes CA certificate bootstrapping on 'vconn', whose SSL handshake has
+ * just succeeded without peer verification.
+ *
+ * Takes the last certificate in the peer's chain, requires it to be
+ * self-signed, writes it to 'ca_cert_file' (which must not already exist),
+ * and configures 'ctx' to trust it for later connections.
+ *
+ * Always returns a positive errno value, so the caller drops this
+ * connection: even after a successful bootstrap the connection itself was
+ * not verified and has to be retried against the new CA certificate. */
+static int
+do_ca_cert_bootstrap(struct vconn *vconn)
+{
+    struct ssl_vconn *sslv = ssl_vconn_cast(vconn);
+    STACK_OF(X509) *chain;
+    X509 *ca_cert;
+    FILE *file;
+    int error;
+    int fd;
+
+    chain = SSL_get_peer_cert_chain(sslv->ssl);
+    if (!chain || !sk_X509_num(chain)) {
+        VLOG_ERR("could not bootstrap CA cert: no certificate presented by "
+                 "peer");
+        return EPROTO;
+    }
+    ca_cert = sk_X509_value(chain, sk_X509_num(chain) - 1);
+
+    /* Check that 'ca_cert' is self-signed.  Otherwise it is not a CA
+     * certificate and we should not attempt to use it as one. */
+    error = X509_check_issued(ca_cert, ca_cert);
+    if (error) {
+        VLOG_ERR("could not bootstrap CA cert: obtained certificate is "
+                 "not self-signed (%s)",
+                 X509_verify_cert_error_string(error));
+        if (sk_X509_num(chain) < 2) {
+            VLOG_ERR("only one certificate was received, so probably the peer "
+                     "is not configured to send its CA certificate");
+        }
+        return EPROTO;
+    }
+
+    /* O_EXCL: never overwrite an existing CA certificate file. */
+    fd = open(ca_cert_file, O_CREAT | O_EXCL | O_WRONLY, 0444);
+    if (fd < 0) {
+        /* Capture errno before logging, which may overwrite it. */
+        error = errno;
+        VLOG_ERR("could not bootstrap CA cert: creating %s failed: %s",
+                 ca_cert_file, strerror(error));
+        return error;
+    }
+
+    file = fdopen(fd, "w");
+    if (!file) {
+        error = errno;
+        VLOG_ERR("could not bootstrap CA cert: fdopen failed: %s",
+                 strerror(error));
+        /* fdopen() failed, so 'fd' is still ours to close. */
+        close(fd);
+        unlink(ca_cert_file);
+        return error;
+    }
+
+    if (!PEM_write_X509(file, ca_cert)) {
+        VLOG_ERR("could not bootstrap CA cert: PEM_write_X509 to %s failed: "
+                 "%s", ca_cert_file, ERR_error_string(ERR_get_error(), NULL));
+        fclose(file);
+        unlink(ca_cert_file);
+        return EIO;
+    }
+
+    if (fclose(file)) {
+        error = errno;
+        VLOG_ERR("could not bootstrap CA cert: writing %s failed: %s",
+                 ca_cert_file, strerror(error));
+        unlink(ca_cert_file);
+        return error;
+    }
+
+    VLOG_INFO("successfully bootstrapped CA cert to %s", ca_cert_file);
+    log_ca_cert(ca_cert_file, ca_cert);
+    bootstrap_ca_cert = false;
+    has_ca_cert = true;
+
+    /* SSL_CTX_add_client_CA makes a copy of ca_cert's relevant data. */
+    SSL_CTX_add_client_CA(ctx, ca_cert);
+
+    /* The trusted certificate is loaded from the file just written, so no
+     * copy of 'ca_cert' (which is owned by sslv->ssl) is needed here. */
+    if (SSL_CTX_load_verify_locations(ctx, ca_cert_file, NULL) != 1) {
+        VLOG_ERR("SSL_CTX_load_verify_locations: %s",
+                 ERR_error_string(ERR_get_error(), NULL));
+        return EPROTO;
+    }
+    VLOG_INFO("killing successful connection to retry using CA cert");
+    return EPROTO;
+}
+
+/* Implements the 'connect' function of ssl_vconn_class.
+ *
+ * Advances 'vconn' through TCP connection completion and then the SSL
+ * handshake.  Returns 0 once the handshake has completed and the connection
+ * is usable, EAGAIN if more I/O is needed first, otherwise a positive errno
+ * value. */
+static int
+ssl_connect(struct vconn *vconn)
+{
+    struct ssl_vconn *sslv = ssl_vconn_cast(vconn);
+    int verify_mode;
+    int rc;
+
+    if (sslv->state == STATE_TCP_CONNECTING) {
+        rc = check_connection_completion(sslv->fd);
+        if (rc) {
+            return rc;
+        }
+        /* TCP is up; continue directly with the SSL handshake. */
+        sslv->state = STATE_SSL_CONNECTING;
+    }
+    if (sslv->state != STATE_SSL_CONNECTING) {
+        NOT_REACHED();
+    }
+
+    if (sslv->type == CLIENT) {
+        rc = SSL_connect(sslv->ssl);
+    } else {
+        rc = SSL_accept(sslv->ssl);
+    }
+    if (rc != 1) {
+        int ssl_error = SSL_get_error(sslv->ssl, rc);
+        int unused;
+
+        if (rc < 0 && ssl_wants_io(ssl_error)) {
+            return EAGAIN;
+        }
+        interpret_ssl_error((sslv->type == CLIENT ? "SSL_connect"
+                             : "SSL_accept"), rc, ssl_error, &unused);
+        shutdown(sslv->fd, SHUT_RDWR);
+        return EPROTO;
+    }
+
+    if (bootstrap_ca_cert) {
+        return do_ca_cert_bootstrap(vconn);
+    }
+
+    verify_mode = SSL_get_verify_mode(sslv->ssl);
+    if ((verify_mode & (SSL_VERIFY_NONE | SSL_VERIFY_PEER))
+        != SSL_VERIFY_PEER) {
+        /* Two or more SSL connections completed at the same time while we
+         * were in bootstrap mode.  Only one of these can finish the
+         * bootstrap successfully.  The other one(s) must be rejected
+         * because they were not verified against the bootstrapped CA
+         * certificate.  (Alternatively we could verify them against the CA
+         * certificate, but that's more trouble than it's worth.  These
+         * connections will succeed the next time they retry, assuming that
+         * they have a certificate against the correct CA.) */
+        VLOG_ERR("rejecting SSL connection during bootstrap race window");
+        return EPROTO;
+    }
+    return 0;
+}
+
+/* Implements the 'close' function of ssl_vconn_class: cancels any pending
+ * transmit callback, discards buffered data, releases the SSL stream and the
+ * socket, and frees 'vconn'. */
+static void
+ssl_close(struct vconn *vconn)
+{
+    struct ssl_vconn *conn = ssl_vconn_cast(vconn);
+
+    /* Stop the poll loop from calling back into a freed connection. */
+    poll_cancel(conn->tx_waiter);
+
+    /* Buffers. */
+    ssl_clear_txbuf(conn);
+    ofpbuf_delete(conn->rxbuf);
+
+    /* Transport. */
+    SSL_free(conn->ssl);
+    close(conn->fd);
+
+    free(conn);
+}
+
+/* Translates an OpenSSL failure into a positive errno value, logging anything
+ * unexpected.
+ *
+ * 'function' is the name of the OpenSSL function that failed (used only in
+ * log messages), 'ret' is its return value, and 'error' is the corresponding
+ * SSL_get_error() result.
+ *
+ * '*want' is set to SSL_READING or SSL_WRITING when the operation merely
+ * needs more I/O, in which case the return value is EAGAIN, and to
+ * SSL_NOTHING in every other case.
+ *
+ * Other return values: the saved errno for a system call failure that left
+ * nothing in the OpenSSL error queue, EPROTO for an unexpected connection
+ * close, and EIO for everything else. */
+static int
+interpret_ssl_error(const char *function, int ret, int error,
+                    int *want)
+{
+    *want = SSL_NOTHING;
+
+    switch (error) {
+    case SSL_ERROR_NONE:
+        VLOG_ERR_RL(&rl, "%s: unexpected SSL_ERROR_NONE", function);
+        break;
+
+    case SSL_ERROR_ZERO_RETURN:
+        VLOG_ERR_RL(&rl, "%s: unexpected SSL_ERROR_ZERO_RETURN", function);
+        break;
+
+    case SSL_ERROR_WANT_READ:
+        *want = SSL_READING;
+        return EAGAIN;
+
+    case SSL_ERROR_WANT_WRITE:
+        *want = SSL_WRITING;
+        return EAGAIN;
+
+    case SSL_ERROR_WANT_CONNECT:
+        VLOG_ERR_RL(&rl, "%s: unexpected SSL_ERROR_WANT_CONNECT", function);
+        break;
+
+    case SSL_ERROR_WANT_ACCEPT:
+        VLOG_ERR_RL(&rl, "%s: unexpected SSL_ERROR_WANT_ACCEPT", function);
+        break;
+
+    case SSL_ERROR_WANT_X509_LOOKUP:
+        VLOG_ERR_RL(&rl, "%s: unexpected SSL_ERROR_WANT_X509_LOOKUP",
+                    function);
+        break;
+
+    case SSL_ERROR_SYSCALL: {
+        /* ERR_get_error() returns 'unsigned long'; keep the full value so
+         * that ERR_error_string() receives exactly what was queued. */
+        unsigned long queued_error = ERR_get_error();
+        if (queued_error == 0) {
+            if (ret < 0) {
+                int status = errno;
+                VLOG_WARN_RL(&rl, "%s: system error (%s)",
+                             function, strerror(status));
+                return status;
+            } else {
+                VLOG_WARN_RL(&rl, "%s: unexpected SSL connection close",
+                             function);
+                return EPROTO;
+            }
+        } else {
+            VLOG_WARN_RL(&rl, "%s: %s",
+                         function, ERR_error_string(queued_error, NULL));
+            break;
+        }
+    }
+
+    case SSL_ERROR_SSL: {
+        unsigned long queued_error = ERR_get_error();
+        if (queued_error != 0) {
+            VLOG_WARN_RL(&rl, "%s: %s",
+                         function, ERR_error_string(queued_error, NULL));
+        } else {
+            VLOG_ERR_RL(&rl, "%s: SSL_ERROR_SSL without queued error",
+                        function);
+        }
+        break;
+    }
+
+    default:
+        VLOG_ERR_RL(&rl, "%s: bad SSL error code %d", function, error);
+        break;
+    }
+    return EIO;
+}
+
+/* Implements the 'recv' function of ssl_vconn_class.
+ *
+ * Accumulates one OpenFlow message in sslv->rxbuf across calls: first the
+ * fixed-size ofp_header, then as many more bytes as the header's 'length'
+ * field calls for.  Returns 0 and hands the complete message to the caller in
+ * '*bufferp', EAGAIN if the message is still incomplete, EOF if the peer
+ * closed the connection between messages, otherwise a positive errno value.
+ * This function does not itself write '*bufferp' on failure. */
+static int
+ssl_recv(struct vconn *vconn, struct ofpbuf **bufferp)
+{
+ struct ssl_vconn *sslv = ssl_vconn_cast(vconn);
+ struct ofpbuf *rx;
+ size_t want_bytes;
+ int old_state;
+ ssize_t ret;
+
+ /* Start a new message buffer unless a partial message is pending. */
+ if (sslv->rxbuf == NULL) {
+ sslv->rxbuf = ofpbuf_new(1564);
+ }
+ rx = sslv->rxbuf;
+
+again:
+ /* Figure out how many bytes are still missing: the rest of the header,
+ * or the rest of the message once the header says how long it is. */
+ if (sizeof(struct ofp_header) > rx->size) {
+ want_bytes = sizeof(struct ofp_header) - rx->size;
+ } else {
+ struct ofp_header *oh = rx->data;
+ size_t length = ntohs(oh->length);
+ if (length < sizeof(struct ofp_header)) {
+ VLOG_ERR_RL(&rl, "received too-short ofp_header (%zu bytes)",
+ length);
+ return EPROTO;
+ }
+ want_bytes = length - rx->size;
+ if (!want_bytes) {
+ /* Message complete: transfer ownership to the caller. */
+ *bufferp = rx;
+ sslv->rxbuf = NULL;
+ return 0;
+ }
+ }
+ ofpbuf_prealloc_tailroom(rx, want_bytes);
+
+ /* Behavior of zero-byte SSL_read is poorly defined. */
+ assert(want_bytes > 0);
+
+ old_state = SSL_get_state(sslv->ssl);
+ ret = SSL_read(sslv->ssl, ofpbuf_tail(rx), want_bytes);
+ if (old_state != SSL_get_state(sslv->ssl)) {
+ /* The SSL state changed, so a blocked transmission may now be able to
+ * proceed: forget what it was waiting for and retry it right away. */
+ sslv->tx_want = SSL_NOTHING;
+ if (sslv->tx_waiter) {
+ poll_cancel(sslv->tx_waiter);
+ ssl_tx_poll_callback(sslv->fd, POLLIN, vconn);
+ }
+ }
+ sslv->rx_want = SSL_NOTHING;
+
+ if (ret > 0) {
+ rx->size += ret;
+ if (ret == want_bytes) {
+ if (rx->size > sizeof(struct ofp_header)) {
+ *bufferp = rx;
+ sslv->rxbuf = NULL;
+ return 0;
+ } else {
+ /* Only the header has arrived so far; go around again to
+ * learn the message length and fetch the body, if any. */
+ goto again;
+ }
+ }
+ return EAGAIN;
+ } else {
+ int error = SSL_get_error(sslv->ssl, ret);
+ if (error == SSL_ERROR_ZERO_RETURN) {
+ /* Connection closed (EOF). */
+ if (rx->size) {
+ VLOG_WARN_RL(&rl, "SSL_read: unexpected connection close");
+ return EPROTO;
+ } else {
+ return EOF;
+ }
+ } else {
+ /* Records in rx_want what, if anything, the read is waiting for. */
+ return interpret_ssl_error("SSL_read", ret, error, &sslv->rx_want);
+ }
+ }
+}
+
+/* Discards the message queued for transmission on 'sslv', if any, and forgets
+ * the associated poll waiter.  The waiter itself is not cancelled here. */
+static void
+ssl_clear_txbuf(struct ssl_vconn *sslv)
+{
+    struct ofpbuf *pending = sslv->txbuf;
+
+    sslv->tx_waiter = NULL;
+    sslv->txbuf = NULL;
+    ofpbuf_delete(pending);
+}
+
+/* Arranges for ssl_tx_poll_callback() to be invoked on 'vconn' when its socket
+ * becomes ready for whatever the last SSL_write() was waiting on, as recorded
+ * in tx_want (which must not be SSL_NOTHING). */
+static void
+ssl_register_tx_waiter(struct vconn *vconn)
+{
+    struct ssl_vconn *sslv = ssl_vconn_cast(vconn);
+    short int events = want_to_poll_events(sslv->tx_want);
+
+    sslv->tx_waiter = poll_fd_callback(sslv->fd, events,
+                                       ssl_tx_poll_callback, vconn);
+}
+
+/* Writes as much of sslv->txbuf as possible to the SSL stream.
+ *
+ * Returns 0 when the whole buffer has been written, EAGAIN if the write has to
+ * wait for I/O (tx_want then says for what), EPIPE if the connection was
+ * closed, otherwise another positive errno value.  Data already written is
+ * removed from the front of the buffer. */
+static int
+ssl_do_tx(struct vconn *vconn)
+{
+    struct ssl_vconn *sslv = ssl_vconn_cast(vconn);
+
+    for (;;) {
+        int state_before = SSL_get_state(sslv->ssl);
+        int n = SSL_write(sslv->ssl, sslv->txbuf->data, sslv->txbuf->size);
+
+        if (SSL_get_state(sslv->ssl) != state_before) {
+            /* The SSL state changed, so a blocked read may be able to
+             * proceed now. */
+            sslv->rx_want = SSL_NOTHING;
+        }
+        sslv->tx_want = SSL_NOTHING;
+
+        if (n <= 0) {
+            int ssl_error = SSL_get_error(sslv->ssl, n);
+
+            if (ssl_error == SSL_ERROR_ZERO_RETURN) {
+                VLOG_WARN_RL(&rl, "SSL_write: connection closed");
+                return EPIPE;
+            }
+            return interpret_ssl_error("SSL_write", n, ssl_error,
+                                       &sslv->tx_want);
+        }
+
+        ofpbuf_pull(sslv->txbuf, n);
+        if (sslv->txbuf->size == 0) {
+            return 0;
+        }
+    }
+}
+
+/* Poll-loop callback that continues a transmission on the vconn 'vconn_'.  If
+ * the write still has to wait, registers another callback; otherwise (success
+ * or hard error) drops the transmit buffer. */
+static void
+ssl_tx_poll_callback(int fd UNUSED, short int revents UNUSED, void *vconn_)
+{
+    struct vconn *vconn = vconn_;
+    struct ssl_vconn *sslv = ssl_vconn_cast(vconn);
+
+    if (ssl_do_tx(vconn) == EAGAIN) {
+        ssl_register_tx_waiter(vconn);
+    } else {
+        ssl_clear_txbuf(sslv);
+    }
+}
+
+/* Implements the 'send' function of ssl_vconn_class.
+ *
+ * Only one message may be in flight at a time, so returns EAGAIN while an
+ * earlier message is still being written.  Otherwise starts writing 'buffer'
+ * and returns 0 if it was sent in full or had to be left queued for
+ * completion from the poll loop.  On any other error returns a positive errno
+ * value and leaves 'buffer' with the caller. */
+static int
+ssl_send(struct vconn *vconn, struct ofpbuf *buffer)
+{
+    struct ssl_vconn *sslv = ssl_vconn_cast(vconn);
+    int error;
+
+    if (sslv->txbuf) {
+        return EAGAIN;
+    }
+
+    sslv->txbuf = buffer;
+    error = ssl_do_tx(vconn);
+    if (error == 0) {
+        /* Sent in full. */
+        ssl_clear_txbuf(sslv);
+        return 0;
+    }
+    if (error == EAGAIN) {
+        /* Keep the buffer queued and finish from the poll loop. */
+        leak_checker_claim(buffer);
+        ssl_register_tx_waiter(vconn);
+        return 0;
+    }
+
+    /* Hard error: give the buffer back to the caller. */
+    sslv->txbuf = NULL;
+    return error;
+}
+
+/* Implements the 'wait' function of ssl_vconn_class: arranges for the poll
+ * loop to wake up when 'vconn' can make progress on the action 'wait'. */
+static void
+ssl_wait(struct vconn *vconn, enum vconn_wait_type wait)
+{
+ struct ssl_vconn *sslv = ssl_vconn_cast(vconn);
+
+ switch (wait) {
+ case WAIT_CONNECT:
+ /* Try to advance the connection first; if it finished (successfully or
+ * not) there is nothing to wait for. */
+ if (vconn_connect(vconn) != EAGAIN) {
+ poll_immediate_wake();
+ } else {
+ switch (sslv->state) {
+ case STATE_TCP_CONNECTING:
+ poll_fd_wait(sslv->fd, POLLOUT);
+ break;
+
+ case STATE_SSL_CONNECTING:
+ /* ssl_connect() called SSL_accept() or SSL_connect(), which
+ * set up the status that we test here. */
+ poll_fd_wait(sslv->fd,
+ want_to_poll_events(SSL_want(sslv->ssl)));
+ break;
+
+ default:
+ NOT_REACHED();
+ }
+ }
+ break;
+
+ case WAIT_RECV:
+ /* rx_want records what the last SSL_read() was blocked on, if
+ * anything. */
+ if (sslv->rx_want != SSL_NOTHING) {
+ poll_fd_wait(sslv->fd, want_to_poll_events(sslv->rx_want));
+ } else {
+ poll_immediate_wake();
+ }
+ break;
+
+ case WAIT_SEND:
+ if (!sslv->txbuf) {
+ /* We have room in our tx queue. */
+ poll_immediate_wake();
+ } else {
+ /* The call to ssl_tx_poll_callback() will wake us up. */
+ }
+ break;
+
+ default:
+ NOT_REACHED();
+ }
+}
+
+/* Active SSL vconn class: the "ssl" name followed by the functions that
+ * implement each vconn operation, as labeled on each field below. */
+struct vconn_class ssl_vconn_class = {
+ "ssl", /* name */
+ ssl_open, /* open */
+ ssl_close, /* close */
+ ssl_connect, /* connect */
+ ssl_recv, /* recv */
+ ssl_send, /* send */
+ ssl_wait, /* wait */
+};
+
+/* Passive SSL. */
+
+/* A passive (listening) SSL connection. */
+struct pssl_pvconn
+{
+ /* Embedded base object; pssl_pvconn_cast() recovers the container. */
+ struct pvconn pvconn;
+ /* Listening socket: set by pssl_open(), used by accept() in pssl_accept(). */
+ int fd;
+};
+
+struct pvconn_class pssl_pvconn_class;
+
+/* Converts 'pvconn', which must belong to pssl_pvconn_class (this is
+ * asserted), into the struct pssl_pvconn that embeds it. */
+static struct pssl_pvconn *
+pssl_pvconn_cast(struct pvconn *pvconn)
+{
+    struct pssl_pvconn *pssl;
+
+    pvconn_assert_class(pvconn, &pssl_pvconn_class);
+    pssl = CONTAINER_OF(pvconn, struct pssl_pvconn, pvconn);
+    return pssl;
+}
+
+/* Opens a passive SSL pvconn named 'name' that listens on all local IPv4
+ * addresses, on the TCP port given by 'suffix' (OFP_SSL_PORT if atoi(suffix)
+ * is 0).
+ *
+ * On success, stores the new pvconn in '*pvconnp' and returns 0.  On failure,
+ * closes any socket that was created and returns the nonzero error reported
+ * by ssl_init(), set_nonblocking(), or the failing system call (as an errno
+ * value). */
+static int
+pssl_open(const char *name, char *suffix, struct pvconn **pvconnp)
+{
+    struct sockaddr_in sin;
+    struct pssl_pvconn *pssl;
+    int retval;
+    int fd;
+    unsigned int yes = 1;
+
+    retval = ssl_init();
+    if (retval) {
+        return retval;
+    }
+
+    /* Create socket. */
+    fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (fd < 0) {
+        int error = errno;
+        VLOG_ERR("%s: socket: %s", name, strerror(error));
+        return error;
+    }
+
+    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof yes) < 0) {
+        int error = errno;
+        VLOG_ERR("%s: setsockopt(SO_REUSEADDR): %s", name, strerror(error));
+        /* Don't leak the socket: every other failure path closes it too. */
+        close(fd);
+        return error;
+    }
+
+    memset(&sin, 0, sizeof sin);
+    sin.sin_family = AF_INET;
+    sin.sin_addr.s_addr = htonl(INADDR_ANY);
+    sin.sin_port = htons(atoi(suffix) ? atoi(suffix) : OFP_SSL_PORT);
+    retval = bind(fd, (struct sockaddr *) &sin, sizeof sin);
+    if (retval < 0) {
+        int error = errno;
+        VLOG_ERR("%s: bind: %s", name, strerror(error));
+        close(fd);
+        return error;
+    }
+
+    retval = listen(fd, 10);
+    if (retval < 0) {
+        int error = errno;
+        VLOG_ERR("%s: listen: %s", name, strerror(error));
+        close(fd);
+        return error;
+    }
+
+    retval = set_nonblocking(fd);
+    if (retval) {
+        close(fd);
+        return retval;
+    }
+
+    pssl = xmalloc(sizeof *pssl);
+    pvconn_init(&pssl->pvconn, &pssl_pvconn_class, name);
+    pssl->fd = fd;
+    *pvconnp = &pssl->pvconn;
+    return 0;
+}
+
+/* Closes the listening socket of passive SSL connection 'pvconn' and frees
+ * the pvconn itself. */
+static void
+pssl_close(struct pvconn *pvconn)
+{
+    struct pssl_pvconn *pssl = pssl_pvconn_cast(pvconn);
+    int listen_fd = pssl->fd;
+
+    close(listen_fd);
+    free(pssl);
+}
+
+/* Accepts one pending connection on passive SSL connection 'pvconn'.
+ *
+ * On success, hands the new nonblocking socket to new_ssl_vconn() in server
+ * mode and returns its result, with the new vconn stored in '*new_vconnp'.
+ * If accept() fails, returns its errno value (EAGAIN when no connection is
+ * pending, which is not logged).  If the socket cannot be made nonblocking,
+ * closes it and returns the error from set_nonblocking(). */
+static int
+pssl_accept(struct pvconn *pvconn, struct vconn **new_vconnp)
+{
+    struct pssl_pvconn *pssl = pssl_pvconn_cast(pvconn);
+    struct sockaddr_in sin;
+    socklen_t sin_len = sizeof sin;
+    char name[128];
+    int new_fd;
+    int error;
+
+    /* accept() takes a generic 'struct sockaddr *', so cast explicitly. */
+    new_fd = accept(pssl->fd, (struct sockaddr *) &sin, &sin_len);
+    if (new_fd < 0) {
+        error = errno;
+        if (error != EAGAIN) {
+            VLOG_DBG_RL(&rl, "accept: %s", strerror(error));
+        }
+        return error;
+    }
+
+    error = set_nonblocking(new_fd);
+    if (error) {
+        close(new_fd);
+        return error;
+    }
+
+    /* Name the connection after the peer, omitting the default port. */
+    sprintf(name, "ssl:"IP_FMT, IP_ARGS(&sin.sin_addr));
+    if (sin.sin_port != htons(OFP_SSL_PORT)) {
+        sprintf(strchr(name, '\0'), ":%"PRIu16, ntohs(sin.sin_port));
+    }
+    return new_ssl_vconn(name, new_fd, SERVER, STATE_SSL_CONNECTING, &sin,
+                         new_vconnp);
+}
+
+/* Arranges for the poll loop to wake up when the listening socket of
+ * 'pvconn' becomes readable (POLLIN). */
+static void
+pssl_wait(struct pvconn *pvconn)
+{
+    poll_fd_wait(pssl_pvconn_cast(pvconn)->fd, POLLIN);
+}
+
+/* Passive SSL pvconn class: the "pssl" name followed by this file's open,
+ * close, accept, and wait implementations, in that order. */
+struct pvconn_class pssl_pvconn_class = {
+ "pssl",
+ pssl_open,
+ pssl_close,
+ pssl_accept,
+ pssl_wait,
+};
+
+/*
+ * Reports whether 'ssl_error' is SSL_ERROR_WANT_READ or SSL_ERROR_WANT_WRITE,
+ * that is, whether OpenSSL is asking to be called back once the socket is
+ * ready for reading or writing, respectively.
+ */
+static bool
+ssl_wants_io(int ssl_error)
+{
+    switch (ssl_error) {
+    case SSL_ERROR_WANT_WRITE:
+    case SSL_ERROR_WANT_READ:
+        return true;
+
+    default:
+        return false;
+    }
+}
+
+/* Initializes the SSL library on first use by calling do_ssl_init(), and
+ * caches the outcome.  Every call returns that cached status: 0 on success,
+ * otherwise the positive error returned by do_ssl_init(). */
+static int
+ssl_init(void)
+{
+    static int init_status = -1;
+
+    if (init_status >= 0) {
+        /* Already attempted; report the remembered result. */
+        return init_status;
+    }
+
+    init_status = do_ssl_init();
+    assert(init_status >= 0);
+    return init_status;
+}
+
+/* Performs one-time OpenSSL setup and creates the shared SSL context 'ctx'.
+ * Returns 0 on success, or ENOPROTOOPT if the TLSv1 method or the context
+ * cannot be obtained.  Called only through ssl_init(). */
+static int
+do_ssl_init(void)
+{
+ SSL_METHOD *method;
+
+ SSL_library_init();
+ SSL_load_error_strings();
+
+ method = TLSv1_method();
+ if (method == NULL) {
+ VLOG_ERR("TLSv1_method: %s", ERR_error_string(ERR_get_error(), NULL));
+ return ENOPROTOOPT;
+ }
+
+ ctx = SSL_CTX_new(method);
+ if (ctx == NULL) {
+ VLOG_ERR("SSL_CTX_new: %s", ERR_error_string(ERR_get_error(), NULL));
+ return ENOPROTOOPT;
+ }
+ /* Disable the older SSLv2 and SSLv3 protocols. */
+ SSL_CTX_set_options(ctx, SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3);
+ /* Diffie-Hellman parameters are supplied on demand by tmp_dh_callback(). */
+ SSL_CTX_set_tmp_dh_callback(ctx, tmp_dh_callback);
+ /* ssl_do_tx() consumes partial writes and retries from the remaining data. */
+ SSL_CTX_set_mode(ctx, SSL_MODE_ENABLE_PARTIAL_WRITE);
+ SSL_CTX_set_mode(ctx, SSL_MODE_ACCEPT_MOVING_WRITE_BUFFER);
+ /* Require the peer to present a certificate and verify it. */
+ SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER | SSL_VERIFY_FAIL_IF_NO_PEER_CERT,
+ NULL);
+
+ return 0;
+}
+
+/* OpenSSL callback that supplies Diffie-Hellman parameters for 'keylength'
+ * bits.  Parameters for each supported length are constructed on first use
+ * and cached in a static table.  Returns NULL, after logging an error, if
+ * 'keylength' is not one of the supported lengths; aborts the program if the
+ * parameters cannot be constructed. */
+static DH *
+tmp_dh_callback(SSL *ssl UNUSED, int is_export UNUSED, int keylength)
+{
+    struct dh {
+        int keylength;
+        DH *dh;
+        DH *(*constructor)(void);
+    };
+
+    static struct dh dh_table[] = {
+        {1024, NULL, get_dh1024},
+        {2048, NULL, get_dh2048},
+        {4096, NULL, get_dh4096},
+    };
+
+    size_t i;
+
+    for (i = 0; i < ARRAY_SIZE(dh_table); i++) {
+        struct dh *entry = &dh_table[i];
+
+        if (entry->keylength != keylength) {
+            continue;
+        }
+        if (!entry->dh) {
+            /* First request for this length: build and remember it. */
+            entry->dh = entry->constructor();
+            if (!entry->dh) {
+                ovs_fatal(ENOMEM, "out of memory constructing "
+                          "Diffie-Hellman parameters");
+            }
+        }
+        return entry->dh;
+    }
+    VLOG_ERR_RL(&rl, "no Diffie-Hellman parameters for key length %d",
+                keylength);
+    return NULL;
+}
+
+/* Returns true if at least one of the private key, certificate, or CA
+ * certificate has been configured, false if none of them has. */
+bool
+vconn_ssl_is_configured(void)
+{
+    if (has_private_key) {
+        return true;
+    }
+    if (has_certificate) {
+        return true;
+    }
+    return has_ca_cert;
+}
+
+/* Loads the PEM-format private key in 'file_name' into the shared SSL
+ * context.  On success, records that a private key has been configured.  On
+ * failure (including failure to initialize SSL), logs an error where one is
+ * available and leaves the configuration unchanged. */
+void
+vconn_ssl_set_private_key_file(const char *file_name)
+{
+    if (ssl_init()) {
+        return;
+    }
+    if (SSL_CTX_use_PrivateKey_file(ctx, file_name, SSL_FILETYPE_PEM) != 1) {
+        /* Name the function that actually failed. */
+        VLOG_ERR("SSL_CTX_use_PrivateKey_file: %s",
+                 ERR_error_string(ERR_get_error(), NULL));
+        return;
+    }
+    has_private_key = true;
+}
+
+/* Loads the certificate chain in 'file_name' into the shared SSL context.
+ * On success, records that a certificate has been configured.  On failure
+ * (including failure to initialize SSL), logs an error where one is available
+ * and leaves the configuration unchanged. */
+void
+vconn_ssl_set_certificate_file(const char *file_name)
+{
+    if (ssl_init()) {
+        return;
+    }
+    if (SSL_CTX_use_certificate_chain_file(ctx, file_name) != 1) {
+        /* Name the function that actually failed. */
+        VLOG_ERR("SSL_CTX_use_certificate_chain_file: %s",
+                 ERR_error_string(ERR_get_error(), NULL));
+        return;
+    }
+    has_certificate = true;
+}
+
+/* Reads the X509 certificate or certificates in file 'file_name'.  On success,
+ * stores the address of the first element in an array of pointers to
+ * certificates in '*certs' and the number of certificates in the array in
+ * '*n_certs', and returns 0.  On failure, stores a null pointer in '*certs', 0
+ * in '*n_certs', and returns a positive errno value.
+ *
+ * The caller is responsible for freeing '*certs'. */
+static int
+read_cert_file(const char *file_name, X509 ***certs, size_t *n_certs)
+{
+    FILE *file;
+    size_t allocated_certs = 0;
+
+    *certs = NULL;
+    *n_certs = 0;
+
+    file = fopen(file_name, "r");
+    if (!file) {
+        /* Save errno before logging, which might change it. */
+        int error = errno;
+        VLOG_ERR("failed to open %s for reading: %s",
+                 file_name, strerror(error));
+        return error;
+    }
+
+    for (;;) {
+        X509 *certificate;
+        int c;
+
+        /* Read certificate from file. */
+        certificate = PEM_read_X509(file, NULL, NULL, NULL);
+        if (!certificate) {
+            size_t i;
+
+            VLOG_ERR("PEM_read_X509 failed reading %s: %s",
+                     file_name, ERR_error_string(ERR_get_error(), NULL));
+            for (i = 0; i < *n_certs; i++) {
+                X509_free((*certs)[i]);
+            }
+            free(*certs);
+            *certs = NULL;
+            *n_certs = 0;
+            /* Don't leak the stream on the error path. */
+            fclose(file);
+            return EIO;
+        }
+
+        /* Add certificate to array. */
+        if (*n_certs >= allocated_certs) {
+            *certs = x2nrealloc(*certs, &allocated_certs, sizeof **certs);
+        }
+        (*certs)[(*n_certs)++] = certificate;
+
+        /* Are there additional certificates in the file? */
+        do {
+            c = getc(file);
+        } while (isspace(c));
+        if (c == EOF) {
+            break;
+        }
+        ungetc(c, file);
+    }
+    fclose(file);
+    return 0;
+}
+
+
+/* Sets 'file_name' as the name of a file containing one or more X509
+ * certificates to send to the peer.  Typical use in OpenFlow is to send the CA
+ * certificate to the peer, which enables a switch to pick up the controller's
+ * CA certificate on its first connection. */
+void
+vconn_ssl_set_peer_ca_cert_file(const char *file_name)
+{
+    X509 **certs;
+    size_t n_certs;
+    size_t i;
+
+    if (ssl_init()) {
+        return;
+    }
+
+    if (!read_cert_file(file_name, &certs, &n_certs)) {
+        for (i = 0; i < n_certs; i++) {
+            if (SSL_CTX_add_extra_chain_cert(ctx, certs[i]) != 1) {
+                VLOG_ERR("SSL_CTX_add_extra_chain_cert: %s",
+                         ERR_error_string(ERR_get_error(), NULL));
+                /* The context takes ownership only on success, so a rejected
+                 * certificate must be released here. */
+                X509_free(certs[i]);
+            }
+        }
+        free(certs);
+    }
+}
+
+/* Logs the subject name and SHA-1 fingerprint (colon-separated hex bytes) of
+ * CA certificate 'cert' obtained from 'file_name'. */
+static void
+log_ca_cert(const char *file_name, X509 *cert)
+{
+    unsigned char digest[EVP_MAX_MD_SIZE];
+    unsigned int n_bytes;
+    struct ds fp;
+    char *subject;
+
+    ds_init(&fp);
+    if (!X509_digest(cert, EVP_sha1(), digest, &n_bytes)) {
+        ds_put_cstr(&fp, "<out of memory>");
+    } else {
+        unsigned int i;
+        for (i = 0; i < n_bytes; i++) {
+            if (i) {
+                ds_put_char(&fp, ':');
+            }
+            ds_put_format(&fp, "%02hhx", digest[i]);
+        }
+    }
+    subject = X509_NAME_oneline(X509_get_subject_name(cert), NULL, 0);
+    VLOG_INFO("Trusting CA cert from %s (%s) (fingerprint %s)", file_name,
+              subject ? subject : "<out of memory>", ds_cstr(&fp));
+    /* With a null buffer argument X509_NAME_oneline() allocates the string
+     * through OpenSSL's allocator, so release it with OPENSSL_free() rather
+     * than free(). */
+    if (subject) {
+        OPENSSL_free(subject);
+    }
+    ds_destroy(&fp);
+}
+
+/* Sets 'file_name' as the name of the file from which to read the CA
+ * certificate used to verify the peer within SSL connections.  If 'bootstrap'
+ * is false, the file must exist.  If 'bootstrap' is true, then the file is
+ * read if it exists; if it does not, then it will be created from the CA
+ * certificate received from the peer on the first SSL connection. */
+void
+vconn_ssl_set_ca_cert_file(const char *file_name, bool bootstrap)
+{
+    X509 **certs;
+    size_t n_certs;
+    struct stat s;
+
+    if (ssl_init()) {
+        return;
+    }
+
+    if (bootstrap && stat(file_name, &s) && errno == ENOENT) {
+        bootstrap_ca_cert = true;
+        ca_cert_file = xstrdup(file_name);
+    } else if (!read_cert_file(file_name, &certs, &n_certs)) {
+        size_t i;
+
+        /* Set up list of CAs that the server will accept from the client. */
+        for (i = 0; i < n_certs; i++) {
+            /* SSL_CTX_add_client_CA makes a copy of the relevant data. */
+            if (SSL_CTX_add_client_CA(ctx, certs[i]) != 1) {
+                VLOG_ERR("failed to add client certificate %zu from %s: %s",
+                         i, file_name,
+                         ERR_error_string(ERR_get_error(), NULL));
+            } else {
+                log_ca_cert(file_name, certs[i]);
+            }
+            X509_free(certs[i]);
+        }
+        /* read_cert_file() leaves the array itself for the caller to free. */
+        free(certs);
+
+        /* Set up CAs for OpenSSL to trust in verifying the peer's
+         * certificate. */
+        if (SSL_CTX_load_verify_locations(ctx, file_name, NULL) != 1) {
+            VLOG_ERR("SSL_CTX_load_verify_locations: %s",
+                     ERR_error_string(ERR_get_error(), NULL));
+            return;
+        }
+
+        has_ca_cert = true;
+    }
+}
diff --git a/lib/vconn-ssl.h b/lib/vconn-ssl.h
new file mode 100644
index 000000000..f7ff130bd
--- /dev/null
+++ b/lib/vconn-ssl.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifndef VCONN_SSL_H
+#define VCONN_SSL_H 1
+
+#include <stdbool.h>
+
+#ifdef HAVE_OPENSSL
+bool vconn_ssl_is_configured(void);
+void vconn_ssl_set_private_key_file(const char *file_name);
+void vconn_ssl_set_certificate_file(const char *file_name);
+void vconn_ssl_set_ca_cert_file(const char *file_name, bool bootstrap);
+void vconn_ssl_set_peer_ca_cert_file(const char *file_name);
+
+/* Long-option table entries for the SSL command-line options.  Each entry
+ * takes a required argument and maps to the short option code handled by
+ * VCONN_SSL_OPTION_HANDLERS.  The entries end with a comma so the macro can
+ * be dropped into a larger initializer list. */
+#define VCONN_SSL_LONG_OPTIONS \
+ {"private-key", required_argument, 0, 'p'}, \
+ {"certificate", required_argument, 0, 'c'}, \
+ {"ca-cert", required_argument, 0, 'C'},
+
+/* 'case' arms for an option-parsing switch that pass 'optarg' to the
+ * matching vconn_ssl_set_*() function.  The CA certificate is always set
+ * with bootstrapping disabled. */
+#define VCONN_SSL_OPTION_HANDLERS \
+ case 'p': \
+ vconn_ssl_set_private_key_file(optarg); \
+ break; \
+ \
+ case 'c': \
+ vconn_ssl_set_certificate_file(optarg); \
+ break; \
+ \
+ case 'C': \
+ vconn_ssl_set_ca_cert_file(optarg, false); \
+ break;
+#else /* !HAVE_OPENSSL */
+/* Stub used when built without OpenSSL: SSL can never be configured. */
+static inline bool vconn_ssl_is_configured(void)
+{
+ return false;
+}
+#define VCONN_SSL_LONG_OPTIONS
+#define VCONN_SSL_OPTION_HANDLERS
+#endif /* !HAVE_OPENSSL */
+
+#endif /* vconn-ssl.h */
diff --git a/lib/vconn-stream.c b/lib/vconn-stream.c
new file mode 100644
index 000000000..468c112cc
--- /dev/null
+++ b/lib/vconn-stream.c
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "vconn-stream.h"
+#include <assert.h>
+#include <errno.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include "leak-checker.h"
+#include "ofpbuf.h"
+#include "openflow/openflow.h"
+#include "poll-loop.h"
+#include "socket-util.h"
+#include "util.h"
+#include "vconn-provider.h"
+#include "vconn.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_vconn_stream
+
+/* Active stream socket vconn. */
+
+/* An active vconn layered over a stream socket file descriptor. */
+struct stream_vconn
+{
+ /* Embedded base object; stream_vconn_cast() recovers the container. */
+ struct vconn vconn;
+ /* Socket that is read, written, and finally closed in stream_close(). */
+ int fd;
+ /* Message being received, possibly incomplete; NULL when none. */
+ struct ofpbuf *rxbuf;
+ /* Message not yet fully written to 'fd'; NULL when none is queued. */
+ struct ofpbuf *txbuf;
+ /* Poll callback registration that will retry sending 'txbuf'. */
+ struct poll_waiter *tx_waiter;
+};
+
+static struct vconn_class stream_vconn_class;
+
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 25);
+
+static void stream_clear_txbuf(struct stream_vconn *);
+
+/* Allocates a stream vconn named 'name' around socket 'fd' and stores it in
+ * '*vconnp'.  'connect_status', 'ip', and 'reconnectable' are passed through
+ * to vconn_init().  The new vconn starts with no receive buffer, no queued
+ * transmit buffer, and no transmit waiter.  Always returns 0. */
+int
+new_stream_vconn(const char *name, int fd, int connect_status,
+                 uint32_t ip, bool reconnectable, struct vconn **vconnp)
+{
+    struct stream_vconn *stream = xmalloc(sizeof *stream);
+
+    vconn_init(&stream->vconn, &stream_vconn_class, connect_status, ip, name,
+               reconnectable);
+    stream->fd = fd;
+    stream->rxbuf = NULL;
+    stream->txbuf = NULL;
+    stream->tx_waiter = NULL;
+
+    *vconnp = &stream->vconn;
+    return 0;
+}
+
+/* Converts 'vconn', which must belong to stream_vconn_class (this is
+ * asserted), into the struct stream_vconn that embeds it. */
+static struct stream_vconn *
+stream_vconn_cast(struct vconn *vconn)
+{
+    struct stream_vconn *stream;
+
+    vconn_assert_class(vconn, &stream_vconn_class);
+    stream = CONTAINER_OF(vconn, struct stream_vconn, vconn);
+    return stream;
+}
+
+/* Tears down stream vconn 'vconn': cancels any pending transmit callback,
+ * discards the queued transmit and partial receive buffers, closes the
+ * socket, and frees the vconn. */
+static void
+stream_close(struct vconn *vconn)
+{
+ struct stream_vconn *s = stream_vconn_cast(vconn);
+ /* Cancel first: stream_clear_txbuf() below resets 'tx_waiter' to NULL. */
+ poll_cancel(s->tx_waiter);
+ stream_clear_txbuf(s);
+ ofpbuf_delete(s->rxbuf);
+ close(s->fd);
+ free(s);
+}
+
+/* Returns the result of check_connection_completion() on the socket of
+ * 'vconn'. */
+static int
+stream_connect(struct vconn *vconn)
+{
+    return check_connection_completion(stream_vconn_cast(vconn)->fd);
+}
+
+/* Tries to receive one complete OpenFlow message from 'vconn'.
+ *
+ * Bytes accumulate in 's->rxbuf' across calls: first the fixed-size
+ * ofp_header, then the rest of the message as given by the header's length
+ * field.  On success, stores the complete message in '*bufferp' (detaching it
+ * from the vconn) and returns 0.  Returns EAGAIN if the message is still
+ * incomplete, EOF if the peer closed the connection between messages, EPROTO
+ * if the header is too short or the connection dropped mid-message, or the
+ * errno value from a failed read(). */
+static int
+stream_recv(struct vconn *vconn, struct ofpbuf **bufferp)
+{
+ struct stream_vconn *s = stream_vconn_cast(vconn);
+ struct ofpbuf *rx;
+ size_t want_bytes;
+ ssize_t retval;
+
+ if (s->rxbuf == NULL) {
+ s->rxbuf = ofpbuf_new(1564);
+ }
+ rx = s->rxbuf;
+
+again:
+ if (sizeof(struct ofp_header) > rx->size) {
+ /* Still collecting the header. */
+ want_bytes = sizeof(struct ofp_header) - rx->size;
+ } else {
+ /* Header is complete: its length field gives the total message size. */
+ struct ofp_header *oh = rx->data;
+ size_t length = ntohs(oh->length);
+ if (length < sizeof(struct ofp_header)) {
+ VLOG_ERR_RL(&rl, "received too-short ofp_header (%zu bytes)",
+ length);
+ return EPROTO;
+ }
+ want_bytes = length - rx->size;
+ if (!want_bytes) {
+ /* Header-only message: nothing more to read. */
+ *bufferp = rx;
+ s->rxbuf = NULL;
+ return 0;
+ }
+ }
+ ofpbuf_prealloc_tailroom(rx, want_bytes);
+
+ retval = read(s->fd, ofpbuf_tail(rx), want_bytes);
+ if (retval > 0) {
+ rx->size += retval;
+ if (retval == want_bytes) {
+ if (rx->size > sizeof(struct ofp_header)) {
+ /* Read everything requested past the header: message complete. */
+ *bufferp = rx;
+ s->rxbuf = NULL;
+ return 0;
+ } else {
+ /* Just finished the header; go work out the body size. */
+ goto again;
+ }
+ }
+ return EAGAIN;
+ } else if (retval == 0) {
+ if (rx->size) {
+ VLOG_ERR_RL(&rl, "connection dropped mid-packet");
+ return EPROTO;
+ } else {
+ return EOF;
+ }
+ } else {
+ return errno;
+ }
+}
+
+/* Deletes the queued transmit buffer of 's' and resets both 'txbuf' and
+ * 'tx_waiter' to NULL.  Does not cancel the waiter itself. */
+static void
+stream_clear_txbuf(struct stream_vconn *s)
+{
+ ofpbuf_delete(s->txbuf);
+ s->txbuf = NULL;
+ s->tx_waiter = NULL;
+}
+
+/* Poll-loop callback that writes more of the queued 'txbuf' of the stream
+ * vconn passed as 'vconn_'.  When the buffer has been fully sent, or a write
+ * error other than EAGAIN occurs (which is logged), the buffer is cleared.
+ * Otherwise the callback re-registers itself for when the socket is writable
+ * again. */
+static void
+stream_do_tx(int fd UNUSED, short int revents UNUSED, void *vconn_)
+{
+    struct vconn *vconn = vconn_;
+    struct stream_vconn *s = stream_vconn_cast(vconn);
+    ssize_t n_written = write(s->fd, s->txbuf->data, s->txbuf->size);
+    bool finished = false;
+
+    if (n_written > 0) {
+        ofpbuf_pull(s->txbuf, n_written);
+        finished = !s->txbuf->size;
+    } else if (n_written < 0 && errno != EAGAIN) {
+        VLOG_ERR_RL(&rl, "send: %s", strerror(errno));
+        finished = true;
+    }
+
+    if (finished) {
+        stream_clear_txbuf(s);
+    } else {
+        s->tx_waiter = poll_fd_callback(s->fd, POLLOUT, stream_do_tx, vconn);
+    }
+}
+
+/* Tries to send 'buffer' on 'vconn'.
+ *
+ * Returns EAGAIN, without touching 'buffer', if an earlier message is still
+ * queued.  Returns 0 if the whole message was written (the buffer is then
+ * deleted) or if the unsent remainder was queued for stream_do_tx() to
+ * finish.  Returns the errno value if write() fails with an error other than
+ * EAGAIN. */
+static int
+stream_send(struct vconn *vconn, struct ofpbuf *buffer)
+{
+    struct stream_vconn *s = stream_vconn_cast(vconn);
+    ssize_t n_written;
+
+    if (s->txbuf) {
+        return EAGAIN;
+    }
+
+    n_written = write(s->fd, buffer->data, buffer->size);
+    if (n_written == buffer->size) {
+        ofpbuf_delete(buffer);
+        return 0;
+    }
+    if (n_written < 0 && errno != EAGAIN) {
+        return errno;
+    }
+
+    /* Short write or EAGAIN: keep the rest and finish from the poll loop. */
+    leak_checker_claim(buffer);
+    s->txbuf = buffer;
+    if (n_written > 0) {
+        ofpbuf_pull(buffer, n_written);
+    }
+    s->tx_waiter = poll_fd_callback(s->fd, POLLOUT, stream_do_tx, vconn);
+    return 0;
+}
+
+/* Arranges for the poll loop to wake up when 'vconn' is ready for the
+ * operation identified by 'wait'. */
+static void
+stream_wait(struct vconn *vconn, enum vconn_wait_type wait)
+{
+    struct stream_vconn *s = stream_vconn_cast(vconn);
+
+    if (wait == WAIT_CONNECT) {
+        poll_fd_wait(s->fd, POLLOUT);
+    } else if (wait == WAIT_SEND) {
+        if (!s->txbuf) {
+            poll_fd_wait(s->fd, POLLOUT);
+        }
+        /* Otherwise nothing to do: need to drain txbuf first. */
+    } else if (wait == WAIT_RECV) {
+        poll_fd_wait(s->fd, POLLIN);
+    } else {
+        NOT_REACHED();
+    }
+}
+
+/* Class shared by all stream vconns.  It has no 'open' function: instances
+ * are created through new_stream_vconn() instead. */
+static struct vconn_class stream_vconn_class = {
+ "stream", /* name */
+ NULL, /* open */
+ stream_close, /* close */
+ stream_connect, /* connect */
+ stream_recv, /* recv */
+ stream_send, /* send */
+ stream_wait, /* wait */
+};
+
+/* Passive stream socket vconn. */
+
+/* A passive (listening) vconn layered over a stream socket. */
+struct pstream_pvconn
+{
+ /* Embedded base object; pstream_pvconn_cast() recovers the container. */
+ struct pvconn pvconn;
+ /* Listening socket. */
+ int fd;
+ /* Called by pstream_accept() with each accepted socket and its peer
+ * address; its return value becomes pstream_accept()'s return value. */
+ int (*accept_cb)(int fd, const struct sockaddr *, size_t sa_len,
+ struct vconn **);
+};
+
+static struct pvconn_class pstream_pvconn_class;
+
+/* Converts 'pvconn', which must belong to pstream_pvconn_class (this is
+ * asserted), into the struct pstream_pvconn that embeds it. */
+static struct pstream_pvconn *
+pstream_pvconn_cast(struct pvconn *pvconn)
+{
+    struct pstream_pvconn *ps;
+
+    pvconn_assert_class(pvconn, &pstream_pvconn_class);
+    ps = CONTAINER_OF(pvconn, struct pstream_pvconn, pvconn);
+    return ps;
+}
+
+/* Turns socket 'fd' into a listening passive vconn named 'name'.
+ *
+ * Makes 'fd' nonblocking and starts listening on it.  On success, stores a
+ * new pvconn in '*pvconnp' that hands each accepted connection to
+ * 'accept_cb', and returns 0.  On failure, closes 'fd' and returns the error
+ * from set_nonblocking() or the errno value from listen(). */
+int
+new_pstream_pvconn(const char *name, int fd,
+                   int (*accept_cb)(int fd, const struct sockaddr *,
+                                    size_t sa_len, struct vconn **),
+                   struct pvconn **pvconnp)
+{
+    struct pstream_pvconn *listener;
+    int status = set_nonblocking(fd);
+
+    if (status) {
+        close(fd);
+        return status;
+    }
+
+    if (listen(fd, 10) < 0) {
+        int listen_errno = errno;
+        VLOG_ERR("%s: listen: %s", name, strerror(listen_errno));
+        close(fd);
+        return listen_errno;
+    }
+
+    listener = xmalloc(sizeof *listener);
+    pvconn_init(&listener->pvconn, &pstream_pvconn_class, name);
+    listener->accept_cb = accept_cb;
+    listener->fd = fd;
+    *pvconnp = &listener->pvconn;
+    return 0;
+}
+
+/* Closes the listening socket of 'pvconn' and frees the pvconn itself. */
+static void
+pstream_close(struct pvconn *pvconn)
+{
+    struct pstream_pvconn *ps = pstream_pvconn_cast(pvconn);
+    int listen_fd = ps->fd;
+
+    close(listen_fd);
+    free(ps);
+}
+
+/* Accepts one pending connection on 'pvconn'.
+ *
+ * On success, passes the new nonblocking socket and the peer address to the
+ * pvconn's accept callback and returns the callback's result.  If accept()
+ * fails, returns its errno value (EAGAIN is not logged).  If the socket
+ * cannot be made nonblocking, closes it and returns the error from
+ * set_nonblocking(). */
+static int
+pstream_accept(struct pvconn *pvconn, struct vconn **new_vconnp)
+{
+    struct pstream_pvconn *ps = pstream_pvconn_cast(pvconn);
+    struct sockaddr_storage peer;
+    socklen_t peer_len = sizeof peer;
+    int conn_fd;
+    int error;
+
+    conn_fd = accept(ps->fd, (struct sockaddr *) &peer, &peer_len);
+    if (conn_fd < 0) {
+        error = errno;
+        if (error != EAGAIN) {
+            VLOG_DBG_RL(&rl, "accept: %s", strerror(error));
+        }
+        return error;
+    }
+
+    error = set_nonblocking(conn_fd);
+    if (error) {
+        close(conn_fd);
+        return error;
+    }
+
+    return ps->accept_cb(conn_fd, (const struct sockaddr *) &peer, peer_len,
+                         new_vconnp);
+}
+
+/* Arranges for the poll loop to wake up when the listening socket of
+ * 'pvconn' becomes readable (POLLIN). */
+static void
+pstream_wait(struct pvconn *pvconn)
+{
+    poll_fd_wait(pstream_pvconn_cast(pvconn)->fd, POLLIN);
+}
+
+/* Class shared by all passive stream vconns.  The second member (the slot
+ * that ptcp and punix fill with their open functions) is NULL: instances are
+ * created through new_pstream_pvconn() instead. */
+static struct pvconn_class pstream_pvconn_class = {
+ "pstream",
+ NULL,
+ pstream_close,
+ pstream_accept,
+ pstream_wait
+};
diff --git a/lib/vconn-stream.h b/lib/vconn-stream.h
new file mode 100644
index 000000000..a5aa2dd9a
--- /dev/null
+++ b/lib/vconn-stream.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef VCONN_STREAM_H
+#define VCONN_STREAM_H 1
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct vconn;
+struct pvconn;
+struct sockaddr;
+
+int new_stream_vconn(const char *name, int fd, int connect_status,
+ uint32_t ip, bool reconnectable, struct vconn **vconnp);
+int new_pstream_pvconn(const char *name, int fd,
+ int (*accept_cb)(int fd, const struct sockaddr *,
+ size_t sa_len, struct vconn **),
+ struct pvconn **pvconnp);
+
+#endif /* vconn-stream.h */
diff --git a/lib/vconn-tcp.c b/lib/vconn-tcp.c
new file mode 100644
index 000000000..081ac26de
--- /dev/null
+++ b/lib/vconn-tcp.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "vconn.h"
+#include <errno.h>
+#include <inttypes.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "packets.h"
+#include "socket-util.h"
+#include "util.h"
+#include "openflow/openflow.h"
+#include "vconn-provider.h"
+#include "vconn-stream.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_vconn_tcp
+
+/* Active TCP. */
+
+/* Wraps TCP socket 'fd' in a stream vconn named 'name', stored in '*vconnp'.
+ *
+ * Enables TCP_NODELAY on the socket first.  If that fails, logs the error,
+ * closes 'fd', and returns the errno value from setsockopt().  Otherwise
+ * returns the result of new_stream_vconn(), to which 'connect_status' and the
+ * IPv4 address in 'sin' are passed. */
+static int
+new_tcp_vconn(const char *name, int fd, int connect_status,
+              const struct sockaddr_in *sin, struct vconn **vconnp)
+{
+    int on = 1;
+    int retval;
+
+    retval = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof on);
+    if (retval) {
+        /* Save errno now: logging and close() may both overwrite it. */
+        int error = errno;
+        VLOG_ERR("%s: setsockopt(TCP_NODELAY): %s", name, strerror(error));
+        close(fd);
+        return error;
+    }
+
+    return new_stream_vconn(name, fd, connect_status, sin->sin_addr.s_addr,
+                            true, vconnp);
+}
+
+/* Opens an active TCP connection to the peer described by 'suffix', which
+ * has the form "HOST[:PORT]" (PORT defaults to OFP_TCP_PORT).  'suffix' is
+ * modified in place by strtok_r().
+ *
+ * The connection is started in nonblocking mode.  On success, stores the new
+ * vconn in '*vconnp' and returns 0; the vconn's connect status is EAGAIN if
+ * the connection is still in progress.  On failure, returns a positive errno
+ * value (EAFNOSUPPORT for a malformed name, ENOENT if the host cannot be
+ * looked up). */
+static int
+tcp_open(const char *name, char *suffix, struct vconn **vconnp)
+{
+    char *save_ptr;
+    const char *host_name;
+    const char *port_string;
+    struct sockaddr_in sin;
+    int retval;
+    int fd;
+
+    /* Glibc 2.7 has a bug in strtok_r when compiling with optimization that
+     * can cause segfaults here:
+     * http://sources.redhat.com/bugzilla/show_bug.cgi?id=5614.
+     * Using "::" instead of the obvious ":" works around it. */
+    host_name = strtok_r(suffix, "::", &save_ptr);
+    port_string = strtok_r(NULL, "::", &save_ptr);
+    if (!host_name) {
+        ovs_error(0, "%s: bad peer name format", name);
+        return EAFNOSUPPORT;
+    }
+
+    memset(&sin, 0, sizeof sin);
+    sin.sin_family = AF_INET;
+    if (lookup_ip(host_name, &sin.sin_addr)) {
+        return ENOENT;
+    }
+    sin.sin_port = htons(port_string ? atoi(port_string) : OFP_TCP_PORT);
+
+    fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (fd < 0) {
+        /* Save errno before logging, which might change it. */
+        int error = errno;
+        VLOG_ERR("%s: socket: %s", name, strerror(error));
+        return error;
+    }
+
+    retval = set_nonblocking(fd);
+    if (retval) {
+        close(fd);
+        return retval;
+    }
+
+    retval = connect(fd, (struct sockaddr *) &sin, sizeof sin);
+    if (retval < 0) {
+        if (errno == EINPROGRESS) {
+            return new_tcp_vconn(name, fd, EAGAIN, &sin, vconnp);
+        } else {
+            int error = errno;
+            VLOG_ERR("%s: connect: %s", name, strerror(error));
+            close(fd);
+            return error;
+        }
+    } else {
+        return new_tcp_vconn(name, fd, 0, &sin, vconnp);
+    }
+}
+
+/* Active TCP vconn class.  Only 'open' is provided: tcp_open() creates its
+ * vconns through new_stream_vconn(), which assigns them stream_vconn_class,
+ * so the remaining operations are left NULL here. */
+struct vconn_class tcp_vconn_class = {
+ "tcp", /* name */
+ tcp_open, /* open */
+ NULL, /* close */
+ NULL, /* connect */
+ NULL, /* recv */
+ NULL, /* send */
+ NULL, /* wait */
+};
+
+/* Passive TCP. */
+
+static int ptcp_accept(int fd, const struct sockaddr *sa, size_t sa_len,
+ struct vconn **vconnp);
+
+/* Opens a passive TCP pvconn that listens on all local IPv4 addresses, on the
+ * port given by 'suffix' (OFP_TCP_PORT if atoi(suffix) is 0).
+ *
+ * On success, returns the result of new_pstream_pvconn(), which stores the
+ * new pvconn in '*pvconnp'.  On an earlier failure, closes any socket that
+ * was created and returns the errno value of the failing system call. */
+static int
+ptcp_open(const char *name, char *suffix, struct pvconn **pvconnp)
+{
+    struct sockaddr_in sin;
+    int retval;
+    int fd;
+    unsigned int yes = 1;
+
+    fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (fd < 0) {
+        /* Save errno before logging, which might change it. */
+        int error = errno;
+        VLOG_ERR("%s: socket: %s", name, strerror(error));
+        return error;
+    }
+
+    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof yes) < 0) {
+        int error = errno;
+        VLOG_ERR("%s: setsockopt(SO_REUSEADDR): %s", name, strerror(error));
+        /* Don't leak the socket on this error path. */
+        close(fd);
+        return error;
+    }
+
+    memset(&sin, 0, sizeof sin);
+    sin.sin_family = AF_INET;
+    sin.sin_addr.s_addr = htonl(INADDR_ANY);
+    sin.sin_port = htons(atoi(suffix) ? atoi(suffix) : OFP_TCP_PORT);
+    retval = bind(fd, (struct sockaddr *) &sin, sizeof sin);
+    if (retval < 0) {
+        int error = errno;
+        VLOG_ERR("%s: bind: %s", name, strerror(error));
+        close(fd);
+        return error;
+    }
+
+    return new_pstream_pvconn("ptcp", fd, ptcp_accept, pvconnp);
+}
+
+/* Accept callback for passive TCP: wraps accepted socket 'fd' in a TCP vconn
+ * stored in '*vconnp'.  The vconn is named "tcp:IP[:PORT]" after the peer
+ * address 'sa' when that is an IPv4 address (the port is omitted when it is
+ * OFP_TCP_PORT), or just "tcp" otherwise.  Returns the result of
+ * new_tcp_vconn(). */
+static int
+ptcp_accept(int fd, const struct sockaddr *sa, size_t sa_len,
+            struct vconn **vconnp)
+{
+    const struct sockaddr_in *sin = (const struct sockaddr_in *) sa;
+    char name[128];
+
+    if (sa_len != sizeof(struct sockaddr_in) || sin->sin_family != AF_INET) {
+        strcpy(name, "tcp");
+    } else {
+        /* sprintf() returns the length written, i.e. where to append. */
+        char *tail = name + sprintf(name, "tcp:"IP_FMT,
+                                    IP_ARGS(&sin->sin_addr));
+        if (sin->sin_port != htons(OFP_TCP_PORT)) {
+            sprintf(tail, ":%"PRIu16, ntohs(sin->sin_port));
+        }
+    }
+    return new_tcp_vconn(name, fd, 0, sin, vconnp);
+}
+
+/* Passive TCP pvconn class.  Only the name and ptcp_open() are provided:
+ * ptcp_open() creates its pvconns through new_pstream_pvconn(), which assigns
+ * them pstream_pvconn_class, so the remaining members are left NULL here. */
+struct pvconn_class ptcp_pvconn_class = {
+ "ptcp",
+ ptcp_open,
+ NULL,
+ NULL,
+ NULL
+};
+
diff --git a/lib/vconn-unix.c b/lib/vconn-unix.c
new file mode 100644
index 000000000..54b5e23db
--- /dev/null
+++ b/lib/vconn-unix.c
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "vconn.h"
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <netdb.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "ofpbuf.h"
+#include "openflow/openflow.h"
+#include "packets.h"
+#include "poll-loop.h"
+#include "socket-util.h"
+#include "util.h"
+#include "vconn-provider.h"
+#include "vconn-stream.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_vconn_unix
+
+/* Active UNIX socket. */
+
+/* Number of unix sockets created so far, to ensure binding path uniqueness. */
+static int n_unix_sockets;
+
+/* Opens an active Unix domain socket connection to the path in 'suffix'.
+ *
+ * The local end is bound to a path built from the process ID and a
+ * per-process counter.  On success, returns the result of new_stream_vconn(),
+ * which stores the new vconn in '*vconnp'.  If make_unix_socket() fails,
+ * logs the error and returns its result negated. */
+static int
+unix_open(const char *name, char *suffix, struct vconn **vconnp)
+{
+    const char *peer_path = suffix;
+    char local_path[128];
+    int seqno = n_unix_sockets++;
+    int fd;
+
+    sprintf(local_path, "/tmp/vconn-unix.%ld.%d", (long int) getpid(), seqno);
+    fd = make_unix_socket(SOCK_STREAM, true, false, local_path, peer_path);
+    if (fd >= 0) {
+        return new_stream_vconn(name, fd, check_connection_completion(fd),
+                                0, true, vconnp);
+    }
+
+    VLOG_ERR("%s: connection to %s failed: %s",
+             local_path, peer_path, strerror(-fd));
+    return -fd;
+}
+
+/* Active Unix domain socket vconn class.  Only 'open' is provided:
+ * unix_open() creates its vconns through new_stream_vconn(), which assigns
+ * them stream_vconn_class, so the remaining operations are left NULL here. */
+struct vconn_class unix_vconn_class = {
+ "unix", /* name */
+ unix_open, /* open */
+ NULL, /* close */
+ NULL, /* connect */
+ NULL, /* recv */
+ NULL, /* send */
+ NULL, /* wait */
+};
+
+/* Passive UNIX socket. */
+
+static int punix_accept(int fd, const struct sockaddr *sa, size_t sa_len,
+ struct vconn **vconnp);
+
+/* Opens a passive Unix domain socket pvconn bound to the path in 'suffix'.
+ *
+ * On success, returns the result of new_pstream_pvconn(), which stores the
+ * new pvconn in '*pvconnp'.  If the socket cannot be created and bound, logs
+ * the error and returns a positive errno value. */
+static int
+punix_open(const char *name UNUSED, char *suffix, struct pvconn **pvconnp)
+{
+    int fd;
+
+    fd = make_unix_socket(SOCK_STREAM, true, true, suffix, NULL);
+    if (fd < 0) {
+        /* As in unix_open(), a negative result from make_unix_socket() is a
+         * negated errno value; errno itself is not reliable at this point. */
+        VLOG_ERR("%s: binding failed: %s", suffix, strerror(-fd));
+        return -fd;
+    }
+
+    return new_pstream_pvconn("punix", fd, punix_accept, pvconnp);
+}
+
+/* Accept callback for passive Unix sockets: wraps accepted socket 'fd' in a
+ * stream vconn stored in '*vconnp'.  The vconn is named "unix:PATH" when
+ * get_unix_name_len() reports a positive name length for the peer address
+ * 'sa', or just "unix" otherwise.  Returns the result of
+ * new_stream_vconn(). */
+static int
+punix_accept(int fd, const struct sockaddr *sa, size_t sa_len,
+             struct vconn **vconnp)
+{
+    const struct sockaddr_un *sun = (const struct sockaddr_un *) sa;
+    int path_len = get_unix_name_len(sa_len);
+    char name[128];
+
+    if (path_len <= 0) {
+        strcpy(name, "unix");
+    } else {
+        snprintf(name, sizeof name, "unix:%.*s", path_len, sun->sun_path);
+    }
+    return new_stream_vconn(name, fd, 0, 0, true, vconnp);
+}
+
+/* Passive Unix domain socket pvconn class.  Only the name and punix_open()
+ * are provided: punix_open() creates its pvconns through
+ * new_pstream_pvconn(), which assigns them pstream_pvconn_class, so the
+ * remaining members are left NULL here. */
+struct pvconn_class punix_pvconn_class = {
+ "punix",
+ punix_open,
+ NULL,
+ NULL,
+ NULL
+};
+
diff --git a/lib/vconn.c b/lib/vconn.c
new file mode 100644
index 000000000..e4afb22e6
--- /dev/null
+++ b/lib/vconn.c
@@ -0,0 +1,1405 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "vconn-provider.h"
+#include <assert.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <netinet/in.h>
+#include <poll.h>
+#include <stdlib.h>
+#include <string.h>
+#include "coverage.h"
+#include "dynamic-string.h"
+#include "flow.h"
+#include "ofp-print.h"
+#include "ofpbuf.h"
+#include "openflow/nicira-ext.h"
+#include "openflow/openflow.h"
+#include "packets.h"
+#include "poll-loop.h"
+#include "random.h"
+#include "util.h"
+
+#define THIS_MODULE VLM_vconn
+#include "vlog.h"
+
+/* State of an active vconn.
+ *
+ * vconn_connect() drives a vconn through these states, running one state
+ * handler per iteration until the state stops changing. */
+enum vconn_state {
+ /* This is the ordinary progression of states. */
+ VCS_CONNECTING, /* Underlying vconn is not connected. */
+ VCS_SEND_HELLO, /* Waiting to send OFPT_HELLO message. */
+ VCS_RECV_HELLO, /* Waiting to receive OFPT_HELLO message. */
+ VCS_CONNECTED, /* Connection established. */
+
+ /* These states are entered only when something goes wrong. */
+ VCS_SEND_ERROR, /* Sending OFPT_ERROR message. */
+ VCS_DISCONNECTED /* Connection failed or connection closed. */
+};
+
+/* Active vconn classes.  vconn_open() searches this table for the class whose
+ * name equals the "TYPE" prefix of a connection name.  The SSL class is
+ * present only when HAVE_OPENSSL is defined. */
+static struct vconn_class *vconn_classes[] = {
+ &tcp_vconn_class,
+ &unix_vconn_class,
+#ifdef HAVE_OPENSSL
+ &ssl_vconn_class,
+#endif
+};
+
+/* Passive vconn classes.  pvconn_open() searches this table for the class
+ * whose name equals the "TYPE" prefix of a connection name.  The SSL class is
+ * present only when HAVE_OPENSSL is defined. */
+static struct pvconn_class *pvconn_classes[] = {
+ &ptcp_pvconn_class,
+ &punix_pvconn_class,
+#ifdef HAVE_OPENSSL
+ &pssl_pvconn_class,
+#endif
+};
+
+/* Rate limit for individual OpenFlow messages going over the vconn, output at
+ * DBG level. This is very high because, if these are enabled, it is because
+ * we really need to see them. */
+static struct vlog_rate_limit ofmsg_rl = VLOG_RATE_LIMIT_INIT(600, 600);
+
+/* Rate limit for OpenFlow message parse errors. These always indicate a bug
+ * in the peer and so there's not much point in showing a lot of them. */
+static struct vlog_rate_limit bad_ofmsg_rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+static int do_recv(struct vconn *, struct ofpbuf **);
+static int do_send(struct vconn *, struct ofpbuf *);
+
+/* Sanity-checks every registered vconn and pvconn class: each must have a
+ * name and an open (or listen) callback, and must supply either all of its
+ * remaining checked callbacks or none of them.  Does nothing when NDEBUG is
+ * defined. */
+static void
+check_vconn_classes(void)
+{
+#ifndef NDEBUG
+    size_t i;
+
+    for (i = 0; i < ARRAY_SIZE(vconn_classes); i++) {
+        struct vconn_class *class = vconn_classes[i];
+        bool delegates;
+
+        assert(class->name != NULL);
+        assert(class->open != NULL);
+
+        /* A class that supplies none of these delegates to another one. */
+        delegates = (!class->close && !class->recv
+                     && !class->send && !class->wait);
+        if (!delegates) {
+            assert(class->close != NULL);
+            assert(class->recv != NULL);
+            assert(class->send != NULL);
+            assert(class->wait != NULL);
+        }
+    }
+
+    for (i = 0; i < ARRAY_SIZE(pvconn_classes); i++) {
+        struct pvconn_class *class = pvconn_classes[i];
+        bool delegates;
+
+        assert(class->name != NULL);
+        assert(class->listen != NULL);
+
+        /* A class that supplies none of these delegates to another one. */
+        delegates = !class->close && !class->accept && !class->wait;
+        if (!delegates) {
+            assert(class->close != NULL);
+            assert(class->accept != NULL);
+            assert(class->wait != NULL);
+        }
+    }
+#endif
+}
+
+/* Prints, to stdout, information on active (if 'active') and passive (if
+ * 'passive') connection methods supported by the vconn. If 'bootstrap' is
+ * true, also advertises options to bootstrap the CA certificate.
+ *
+ * SSL methods and the PKI options are printed only when HAVE_OPENSSL is
+ * defined; 'bootstrap' is marked UNUSED because it is referenced only in that
+ * configuration. */
+void
+vconn_usage(bool active, bool passive, bool bootstrap UNUSED)
+{
+ /* Really this should be implemented via callbacks into the vconn
+ * providers, but that seems too heavy-weight to bother with at the
+ * moment. */
+
+ printf("\n");
+ if (active) {
+ printf("Active OpenFlow connection methods:\n");
+ printf(" tcp:HOST[:PORT] "
+ "PORT (default: %d) on remote TCP HOST\n", OFP_TCP_PORT);
+#ifdef HAVE_OPENSSL
+ printf(" ssl:HOST[:PORT] "
+ "SSL PORT (default: %d) on remote HOST\n", OFP_SSL_PORT);
+#endif
+ printf(" unix:FILE Unix domain socket named FILE\n");
+ }
+
+ if (passive) {
+ printf("Passive OpenFlow connection methods:\n");
+ printf(" ptcp:[PORT] "
+ "listen to TCP PORT (default: %d)\n",
+ OFP_TCP_PORT);
+#ifdef HAVE_OPENSSL
+ printf(" pssl:[PORT] "
+ "listen for SSL on PORT (default: %d)\n",
+ OFP_SSL_PORT);
+#endif
+ printf(" punix:FILE "
+ "listen on Unix domain socket FILE\n");
+ }
+
+#ifdef HAVE_OPENSSL
+ /* The PKI options are listed regardless of 'active' and 'passive'. */
+ printf("PKI configuration (required to use SSL):\n"
+ " -p, --private-key=FILE file with private key\n"
+ " -c, --certificate=FILE file with certificate for private key\n"
+ " -C, --ca-cert=FILE file with peer CA certificate\n");
+ if (bootstrap) {
+ printf(" --bootstrap-ca-cert=FILE file with peer CA certificate "
+ "to read or create\n");
+ }
+#endif
+}
+
+/* Attempts to connect to an OpenFlow device. 'name' is a connection name in
+ * the form "TYPE:ARGS", where TYPE is an active vconn class's name and ARGS
+ * are vconn class-specific.
+ *
+ * The vconn will automatically negotiate an OpenFlow protocol version
+ * acceptable to both peers on the connection. The version negotiated will be
+ * no lower than 'min_version' and no higher than OFP_VERSION.
+ *
+ * Returns 0 if successful, otherwise a positive errno value. If successful,
+ * stores a pointer to the new connection in '*vconnp', otherwise a null
+ * pointer. */
+int
+vconn_open(const char *name, int min_version, struct vconn **vconnp)
+{
+    size_t type_len;
+    size_t i;
+
+    COVERAGE_INC(vconn_open);
+    check_vconn_classes();
+
+    *vconnp = NULL;
+
+    /* 'name' must contain a colon separating TYPE from ARGS. */
+    type_len = strcspn(name, ":");
+    if (name[type_len] == '\0') {
+        return EAFNOSUPPORT;
+    }
+
+    for (i = 0; i < ARRAY_SIZE(vconn_classes); i++) {
+        struct vconn_class *class = vconn_classes[i];
+        struct vconn *vconn;
+        char *args;
+        int error;
+
+        if (strlen(class->name) != type_len
+            || memcmp(class->name, name, type_len)) {
+            continue;
+        }
+
+        /* Hand the class a private, modifiable copy of ARGS. */
+        args = xstrdup(name + type_len + 1);
+        error = class->open(name, args, &vconn);
+        free(args);
+        if (error) {
+            return error;
+        }
+
+        /* A vconn that still has to connect needs a connect callback. */
+        assert(vconn->state != VCS_CONNECTING
+               || vconn->class->connect);
+        vconn->min_version = min_version;
+        *vconnp = vconn;
+        return 0;
+    }
+    return EAFNOSUPPORT;
+}
+
+/* Like vconn_open(), but blocks until vconn_connect() reports that the
+ * connection has either been established or failed.
+ *
+ * Returns 0 if successful, otherwise a nonzero value: a positive errno value
+ * or whatever failure vconn_connect() reports.  If successful, stores the
+ * connected vconn in '*vconnp', otherwise a null pointer. */
+int
+vconn_open_block(const char *name, int min_version, struct vconn **vconnp)
+{
+    struct vconn *vconn;
+    int error;
+
+    /* vconn_open() returns 0 as soon as the vconn object exists, even while
+     * the connection is still in progress, so completion has to be driven
+     * with vconn_connect() rather than inferred from vconn_open()'s return
+     * value. */
+    error = vconn_open(name, min_version, &vconn);
+    if (!error) {
+        while ((error = vconn_connect(vconn)) == EAGAIN) {
+            vconn_connect_wait(vconn);
+            poll_block();
+        }
+        assert(error != EINPROGRESS);
+    }
+
+    if (error) {
+        /* 'vconn' is null here if vconn_open() itself failed, which
+         * vconn_close() tolerates. */
+        vconn_close(vconn);
+        *vconnp = NULL;
+    } else {
+        *vconnp = vconn;
+    }
+    return error;
+}
+
+/* Closes 'vconn' and frees its name.  Does nothing if 'vconn' is null. */
+void
+vconn_close(struct vconn *vconn)
+{
+    char *saved_name;
+
+    if (!vconn) {
+        return;
+    }
+
+    /* Fetch the name before the class's close callback runs, then free it
+     * afterward. */
+    saved_name = vconn->name;
+    (vconn->class->close)(vconn);
+    free(saved_name);
+}
+
+/* Returns the name of 'vconn', that is, the string passed to vconn_open().
+ * The string belongs to 'vconn' (vconn_close() frees it), so the caller must
+ * not modify or free it. */
+const char *
+vconn_get_name(const struct vconn *vconn)
+{
+ return vconn->name;
+}
+
+/* Returns the IP address of the peer, or 0 if the peer is not connected over
+ * an IP-based protocol or if its IP address is not yet known.
+ *
+ * This simply reports the 'ip' member recorded in 'vconn'. */
+uint32_t
+vconn_get_ip(const struct vconn *vconn)
+{
+ return vconn->ip;
+}
+
+/* VCS_CONNECTING handler: asks the class to make progress on the underlying
+ * connection.  Moves to VCS_SEND_HELLO on success, stays put on EAGAIN, and
+ * moves to VCS_DISCONNECTED (recording the error) on any other failure. */
+static void
+vcs_connecting(struct vconn *vconn)
+{
+    int error = (vconn->class->connect)(vconn);
+
+    assert(error != EINPROGRESS);
+    if (error == EAGAIN) {
+        /* Still in progress; try again later. */
+        return;
+    }
+
+    if (error) {
+        vconn->state = VCS_DISCONNECTED;
+        vconn->error = error;
+    } else {
+        vconn->state = VCS_SEND_HELLO;
+    }
+}
+
+/* VCS_SEND_HELLO handler: tries to send an OFPT_HELLO message.  Moves to
+ * VCS_RECV_HELLO on success, stays put on EAGAIN, and moves to
+ * VCS_DISCONNECTED (recording the error) on any other failure. */
+static void
+vcs_send_hello(struct vconn *vconn)
+{
+    struct ofpbuf *hello;
+    int error;
+
+    make_openflow(sizeof(struct ofp_header), OFPT_HELLO, &hello);
+    error = do_send(vconn, hello);
+    if (!error) {
+        vconn->state = VCS_RECV_HELLO;
+        return;
+    }
+
+    /* The message was not accepted for sending, so it is still ours. */
+    ofpbuf_delete(hello);
+    if (error != EAGAIN) {
+        vconn->state = VCS_DISCONNECTED;
+        vconn->error = error;
+    }
+}
+
+/* VCS_RECV_HELLO handler: tries to receive the peer's OFPT_HELLO and
+ * negotiate a protocol version.
+ *
+ * On a hello, the negotiated version is the lesser of OFP_VERSION and the
+ * peer's version; the state becomes VCS_CONNECTED if that is at least
+ * 'min_version' and VCS_SEND_ERROR otherwise.  Any other message type is
+ * treated as EPROTO.  Any receive failure except EAGAIN moves the vconn to
+ * VCS_DISCONNECTED and records the error. */
+static void
+vcs_recv_hello(struct vconn *vconn)
+{
+ struct ofpbuf *b;
+ int retval;
+
+ retval = do_recv(vconn, &b);
+ if (!retval) {
+ struct ofp_header *oh = b->data;
+
+ if (oh->type == OFPT_HELLO) {
+ /* A hello longer than the bare header is logged but still accepted. */
+ if (b->size > sizeof *oh) {
+ struct ds msg = DS_EMPTY_INITIALIZER;
+ ds_put_format(&msg, "%s: extra-long hello:\n", vconn->name);
+ ds_put_hex_dump(&msg, b->data, b->size, 0, true);
+ VLOG_WARN_RL(&bad_ofmsg_rl, "%s", ds_cstr(&msg));
+ ds_destroy(&msg);
+ }
+
+ vconn->version = MIN(OFP_VERSION, oh->version);
+ if (vconn->version < vconn->min_version) {
+ VLOG_WARN_RL(&bad_ofmsg_rl,
+ "%s: version negotiation failed: we support "
+ "versions 0x%02x to 0x%02x inclusive but peer "
+ "supports no later than version 0x%02"PRIx8,
+ vconn->name, vconn->min_version, OFP_VERSION,
+ oh->version);
+ vconn->state = VCS_SEND_ERROR;
+ } else {
+ VLOG_DBG("%s: negotiated OpenFlow version 0x%02x "
+ "(we support versions 0x%02x to 0x%02x inclusive, "
+ "peer no later than version 0x%02"PRIx8")",
+ vconn->name, vconn->version, vconn->min_version,
+ OFP_VERSION, oh->version);
+ vconn->state = VCS_CONNECTED;
+ }
+ ofpbuf_delete(b);
+ return;
+ } else {
+ /* Not a hello: log it and fall through to disconnect with EPROTO. */
+ char *s = ofp_to_string(b->data, b->size, 1);
+ VLOG_WARN_RL(&bad_ofmsg_rl,
+ "%s: received message while expecting hello: %s",
+ vconn->name, s);
+ free(s);
+ retval = EPROTO;
+ ofpbuf_delete(b);
+ }
+ }
+
+ /* EAGAIN means "nothing received yet", so the state is left unchanged. */
+ if (retval != EAGAIN) {
+ vconn->state = VCS_DISCONNECTED;
+ vconn->error = retval;
+ }
+}
+
+/* VCS_SEND_ERROR handler: tries to send an OFPT_ERROR message of type
+ * OFPET_HELLO_FAILED, code OFPHFC_INCOMPATIBLE, with a human-readable
+ * explanation appended.  Unless the send returns EAGAIN, the vconn then moves
+ * to VCS_DISCONNECTED. */
+static void
+vcs_send_error(struct vconn *vconn)
+{
+ struct ofp_error_msg *error;
+ struct ofpbuf *b;
+ char s[128];
+ int retval;
+
+ snprintf(s, sizeof s, "We support versions 0x%02x to 0x%02x inclusive but "
+ "you support no later than version 0x%02"PRIx8".",
+ vconn->min_version, OFP_VERSION, vconn->version);
+ error = make_openflow(sizeof *error, OFPT_ERROR, &b);
+ error->type = htons(OFPET_HELLO_FAILED);
+ error->code = htons(OFPHFC_INCOMPATIBLE);
+ /* The text is appended without its null terminator, so the header length
+ * must be refreshed afterward. */
+ ofpbuf_put(b, s, strlen(s));
+ update_openflow_length(b);
+ retval = do_send(vconn, b);
+ if (retval) {
+ ofpbuf_delete(b);
+ }
+ if (retval != EAGAIN) {
+ vconn->state = VCS_DISCONNECTED;
+ /* Even when the error message was sent successfully the connection has
+ * failed, so report EPROTO in that case. */
+ vconn->error = retval ? retval : EPROTO;
+ }
+}
+
+/* Tries to complete the connection on 'vconn', which must be an active
+ * vconn. If 'vconn''s connection is complete, returns 0 if the connection
+ * was successful or a positive errno value if it failed. If the
+ * connection is still in progress, returns EAGAIN. */
+int
+vconn_connect(struct vconn *vconn)
+{
+ enum vconn_state last_state;
+
+ assert(vconn->min_version >= 0);
+ /* Run state handlers until one of them leaves the state unchanged, which
+ * means no further progress can be made right now. */
+ do {
+ last_state = vconn->state;
+ switch (vconn->state) {
+ case VCS_CONNECTING:
+ vcs_connecting(vconn);
+ break;
+
+ case VCS_SEND_HELLO:
+ vcs_send_hello(vconn);
+ break;
+
+ case VCS_RECV_HELLO:
+ vcs_recv_hello(vconn);
+ break;
+
+ case VCS_CONNECTED:
+ return 0;
+
+ case VCS_SEND_ERROR:
+ vcs_send_error(vconn);
+ break;
+
+ case VCS_DISCONNECTED:
+ /* The error recorded when the vconn entered this state. */
+ return vconn->error;
+
+ default:
+ NOT_REACHED();
+ }
+ } while (vconn->state != last_state);
+
+ return EAGAIN;
+}
+
+/* Tries to receive an OpenFlow message from 'vconn', which must be an active
+ * vconn. If successful, stores the received message into '*msgp' and returns
+ * 0. The caller is responsible for destroying the message with
+ * ofpbuf_delete(). On failure, returns a positive errno value and stores a
+ * null pointer into '*msgp'. On normal connection close, returns EOF.
+ *
+ * vconn_recv will not block waiting for a packet to arrive. If no packets
+ * have been received, it returns EAGAIN immediately. */
+int
+vconn_recv(struct vconn *vconn, struct ofpbuf **msgp)
+{
+    int retval = vconn_connect(vconn);
+    if (!retval) {
+        /* do_recv() itself stores a null pointer into '*msgp' on failure. */
+        retval = do_recv(vconn, msgp);
+    } else {
+        /* The connection is not (or not yet) established.  Honor the
+         * documented contract that '*msgp' is null whenever a nonzero value
+         * is returned. */
+        *msgp = NULL;
+    }
+    return retval;
+}
+
+/* Receives one message from 'vconn' through its class's recv callback,
+ * without first checking the connection state, and validates the version in
+ * its OpenFlow header.
+ *
+ * Returns 0 and stores the message in '*msgp' if successful.  Otherwise
+ * returns the callback's nonzero result, or EPROTO for a message with an
+ * unacceptable version, and stores a null pointer in '*msgp'. */
+static int
+do_recv(struct vconn *vconn, struct ofpbuf **msgp)
+{
+ int retval;
+
+again:
+ retval = (vconn->class->recv)(vconn, msgp);
+ if (!retval) {
+ struct ofp_header *oh;
+
+ COVERAGE_INC(vconn_received);
+ if (VLOG_IS_DBG_ENABLED()) {
+ char *s = ofp_to_string((*msgp)->data, (*msgp)->size, 1);
+ VLOG_DBG_RL(&ofmsg_rl, "%s: received: %s", vconn->name, s);
+ free(s);
+ }
+
+ oh = ofpbuf_at_assert(*msgp, 0, sizeof *oh);
+ /* The message types listed here are exempt from the version check. */
+ if (oh->version != vconn->version
+ && oh->type != OFPT_HELLO
+ && oh->type != OFPT_ERROR
+ && oh->type != OFPT_ECHO_REQUEST
+ && oh->type != OFPT_ECHO_REPLY
+ && oh->type != OFPT_VENDOR)
+ {
+ /* NOTE(review): a negative 'version' appears to mean that no version
+ * has been negotiated yet -- confirm against the code that
+ * initializes the vconn. */
+ if (vconn->version < 0) {
+ if (oh->type == OFPT_PACKET_IN
+ || oh->type == OFPT_FLOW_EXPIRED
+ || oh->type == OFPT_PORT_STATUS) {
+ /* The kernel datapath is stateless and doesn't really
+ * support version negotiation, so it can end up sending
+ * these asynchronous message before version negotiation
+ * is complete. Just ignore them.
+ *
+ * (After we move OFPT_PORT_STATUS messages from the kernel
+ * into secchan, we won't get those here, since secchan
+ * does proper version negotiation.) */
+ ofpbuf_delete(*msgp);
+ goto again;
+ }
+ VLOG_ERR_RL(&bad_ofmsg_rl,
+ "%s: received OpenFlow message type %"PRIu8" "
+ "before version negotiation complete",
+ vconn->name, oh->type);
+ } else {
+ VLOG_ERR_RL(&bad_ofmsg_rl,
+ "%s: received OpenFlow version 0x%02"PRIx8" "
+ "!= expected %02x",
+ vconn->name, oh->version, vconn->version);
+ }
+ ofpbuf_delete(*msgp);
+ retval = EPROTO;
+ }
+ }
+ /* Never hand a message pointer back to the caller on failure. */
+ if (retval) {
+ *msgp = NULL;
+ }
+ return retval;
+}
+
+/* Tries to queue 'msg' for transmission on 'vconn', which must be an active
+ * vconn. If successful, returns 0, in which case ownership of 'msg' is
+ * transferred to the vconn. Success does not guarantee that 'msg' has been or
+ * ever will be delivered to the peer, only that it has been queued for
+ * transmission.
+ *
+ * Returns a positive errno value on failure, in which case the caller
+ * retains ownership of 'msg'.
+ *
+ * vconn_send will not block. If 'msg' cannot be immediately accepted for
+ * transmission, it returns EAGAIN immediately. */
+int
+vconn_send(struct vconn *vconn, struct ofpbuf *msg)
+{
+    /* Nothing can be sent until the connection is fully established. */
+    int error = vconn_connect(vconn);
+
+    return error ? error : do_send(vconn, msg);
+}
+
+/* Passes 'msg' to the send callback of 'vconn''s class, without first
+ * checking the connection state.  'msg' must be at least an OpenFlow header
+ * long and its header length field must equal 'msg->size'.
+ *
+ * Returns the callback's result.  When debug logging is enabled, also logs
+ * the message and the outcome, unless the outcome is EAGAIN. */
+static int
+do_send(struct vconn *vconn, struct ofpbuf *msg)
+{
+    int retval;
+
+    assert(msg->size >= sizeof(struct ofp_header));
+    assert(((struct ofp_header *) msg->data)->length == htons(msg->size));
+
+    /* Count every send attempt, whether or not debug logging is enabled, so
+     * that the counter does not depend on the log level. */
+    COVERAGE_INC(vconn_sent);
+    if (!VLOG_IS_DBG_ENABLED()) {
+        retval = (vconn->class->send)(vconn, msg);
+    } else {
+        /* Render the message before sending, because a successful send
+         * transfers ownership of 'msg' away from us. */
+        char *s = ofp_to_string(msg->data, msg->size, 1);
+        retval = (vconn->class->send)(vconn, msg);
+        if (retval != EAGAIN) {
+            VLOG_DBG_RL(&ofmsg_rl, "%s: sent (%s): %s",
+                        vconn->name, strerror(retval), s);
+        }
+        free(s);
+    }
+    return retval;
+}
+
+/* Same as vconn_send, except that it waits until 'msg' can be transmitted. */
+int
+vconn_send_block(struct vconn *vconn, struct ofpbuf *msg)
+{
+    for (;;) {
+        int error = vconn_send(vconn, msg);
+        if (error != EAGAIN) {
+            return error;
+        }
+
+        /* Not ready yet: sleep until sending may be possible. */
+        vconn_send_wait(vconn);
+        poll_block();
+    }
+}
+
+/* Same as vconn_recv, except that it waits until a message is received. */
+int
+vconn_recv_block(struct vconn *vconn, struct ofpbuf **msgp)
+{
+    for (;;) {
+        int error = vconn_recv(vconn, msgp);
+        if (error != EAGAIN) {
+            return error;
+        }
+
+        /* Nothing available yet: sleep until receiving may be possible. */
+        vconn_recv_wait(vconn);
+        poll_block();
+    }
+}
+
+/* Waits until a message with a transaction ID matching 'xid' is received on
+ * 'vconn', discarding any messages with a different transaction ID that
+ * arrive first.  Returns 0 if successful, in which case the reply is stored
+ * in '*replyp' for the caller to examine and free.  Otherwise returns a
+ * positive errno value, or EOF, and sets '*replyp' to null. */
+int
+vconn_recv_xid(struct vconn *vconn, uint32_t xid, struct ofpbuf **replyp)
+{
+    for (;;) {
+        struct ofpbuf *msg;
+        uint32_t msg_xid;
+        int error;
+
+        error = vconn_recv_block(vconn, &msg);
+        if (error) {
+            *replyp = NULL;
+            return error;
+        }
+
+        msg_xid = ((struct ofp_header *) msg->data)->xid;
+        if (msg_xid != xid) {
+            /* Not the reply we are waiting for: drop it and keep going. */
+            VLOG_DBG_RL(&bad_ofmsg_rl, "%s: received reply with xid %08"PRIx32
+                        " != expected %08"PRIx32, vconn->name, msg_xid, xid);
+            ofpbuf_delete(msg);
+            continue;
+        }
+
+        *replyp = msg;
+        return 0;
+    }
+}
+
+/* Sends 'request' to 'vconn' and blocks until it receives a reply with a
+ * matching transaction ID. Returns 0 if successful, in which case the reply
+ * is stored in '*replyp' for the caller to examine and free. Otherwise
+ * returns a positive errno value, or EOF, and sets '*replyp' to null.
+ *
+ * 'request' is always destroyed, regardless of the return value. */
+int
+vconn_transact(struct vconn *vconn, struct ofpbuf *request,
+               struct ofpbuf **replyp)
+{
+    /* Read the xid up front: 'request' is no longer ours once it is sent. */
+    uint32_t request_xid = ((struct ofp_header *) request->data)->xid;
+    int error;
+
+    *replyp = NULL;
+
+    error = vconn_send_block(vconn, request);
+    if (error) {
+        /* The send failed, so we still own 'request' and must free it. */
+        ofpbuf_delete(request);
+        return error;
+    }
+    return vconn_recv_xid(vconn, request_xid, replyp);
+}
+
+/* Invokes the wait callback of 'vconn''s class for the event that 'vconn'
+ * currently needs.  'wait' says what the caller wants to do (connect, receive
+ * or send); it is honored only in VCS_CONNECTED.  In the earlier states it is
+ * replaced by whatever the connection state machine is waiting for, and in
+ * VCS_DISCONNECTED poll_immediate_wake() is called instead of the class's
+ * wait callback. */
+void
+vconn_wait(struct vconn *vconn, enum vconn_wait_type wait)
+{
+ assert(wait == WAIT_CONNECT || wait == WAIT_RECV || wait == WAIT_SEND);
+
+ switch (vconn->state) {
+ case VCS_CONNECTING:
+ wait = WAIT_CONNECT;
+ break;
+
+ case VCS_SEND_HELLO:
+ case VCS_SEND_ERROR:
+ wait = WAIT_SEND;
+ break;
+
+ case VCS_RECV_HELLO:
+ wait = WAIT_RECV;
+ break;
+
+ case VCS_CONNECTED:
+ /* Fully connected: wait for what the caller asked for. */
+ break;
+
+ case VCS_DISCONNECTED:
+ poll_immediate_wake();
+ return;
+ }
+ (vconn->class->wait)(vconn, wait);
+}
+
+/* Calls vconn_wait() on 'vconn' with WAIT_CONNECT.  vconn_open_block() calls
+ * this before poll_block() while a connection is still in progress. */
+void
+vconn_connect_wait(struct vconn *vconn)
+{
+ vconn_wait(vconn, WAIT_CONNECT);
+}
+
+/* Calls vconn_wait() on 'vconn' with WAIT_RECV.  vconn_recv_block() calls
+ * this before poll_block() when vconn_recv() returns EAGAIN. */
+void
+vconn_recv_wait(struct vconn *vconn)
+{
+ vconn_wait(vconn, WAIT_RECV);
+}
+
+/* Calls vconn_wait() on 'vconn' with WAIT_SEND.  vconn_send_block() calls
+ * this before poll_block() when vconn_send() returns EAGAIN. */
+void
+vconn_send_wait(struct vconn *vconn)
+{
+ vconn_wait(vconn, WAIT_SEND);
+}
+
+/* Attempts to start listening for OpenFlow connections.  'name' is a
+ * connection name in the form "TYPE:ARGS", where TYPE is a passive vconn
+ * class's name and ARGS are vconn class-specific.
+ *
+ * Returns 0 if successful, otherwise a positive errno value.  If successful,
+ * stores a pointer to the new connection in '*pvconnp', otherwise a null
+ * pointer. */
+int
+pvconn_open(const char *name, struct pvconn **pvconnp)
+{
+    size_t type_len;
+    size_t i;
+
+    check_vconn_classes();
+
+    *pvconnp = NULL;
+
+    /* 'name' must contain a colon separating TYPE from ARGS. */
+    type_len = strcspn(name, ":");
+    if (name[type_len] == '\0') {
+        return EAFNOSUPPORT;
+    }
+
+    for (i = 0; i < ARRAY_SIZE(pvconn_classes); i++) {
+        struct pvconn_class *class = pvconn_classes[i];
+        char *args;
+        int error;
+
+        if (strlen(class->name) != type_len
+            || memcmp(class->name, name, type_len)) {
+            continue;
+        }
+
+        /* Hand the class a private, modifiable copy of ARGS. */
+        args = xstrdup(name + type_len + 1);
+        error = class->listen(name, args, pvconnp);
+        free(args);
+        if (error) {
+            *pvconnp = NULL;
+        }
+        return error;
+    }
+    return EAFNOSUPPORT;
+}
+
+/* Returns the name that was used to open 'pvconn'. The caller must not
+ * modify or free the name: it belongs to 'pvconn' and is freed by
+ * pvconn_close(). */
+const char *
+pvconn_get_name(const struct pvconn *pvconn)
+{
+ return pvconn->name;
+}
+
+/* Closes 'pvconn'. */
+void
+pvconn_close(struct pvconn *pvconn)
+{
+    char *saved_name;
+
+    /* Closing a null pvconn is a no-op. */
+    if (!pvconn) {
+        return;
+    }
+
+    /* Fetch the name before the class's close callback runs, then free it
+     * afterward. */
+    saved_name = pvconn->name;
+    (pvconn->class->close)(pvconn);
+    free(saved_name);
+}
+
+/* Tries to accept a new connection on 'pvconn'. If successful, stores the new
+ * connection in '*new_vconn' and returns 0. Otherwise, returns a positive
+ * errno value.
+ *
+ * The new vconn will automatically negotiate an OpenFlow protocol version
+ * acceptable to both peers on the connection. The version negotiated will be
+ * no lower than 'min_version' and no higher than OFP_VERSION.
+ *
+ * pvconn_accept() will not block waiting for a connection. If no connection
+ * is ready to be accepted, it returns EAGAIN immediately. */
+int
+pvconn_accept(struct pvconn *pvconn, int min_version, struct vconn **new_vconn)
+{
+    int error = (pvconn->class->accept)(pvconn, new_vconn);
+
+    if (!error) {
+        struct vconn *vconn = *new_vconn;
+
+        /* A vconn that still has to connect needs a connect callback. */
+        assert(vconn->state != VCS_CONNECTING
+               || vconn->class->connect);
+        vconn->min_version = min_version;
+    } else {
+        *new_vconn = NULL;
+    }
+    return error;
+}
+
+/* Invokes the wait callback of 'pvconn''s class.
+ * NOTE(review): presumably this arranges for the poll loop to wake up when a
+ * connection is ready to be accepted with pvconn_accept() -- confirm against
+ * the provider implementations. */
+void
+pvconn_wait(struct pvconn *pvconn)
+{
+ (pvconn->class->wait)(pvconn);
+}
+
+/* Returns a transaction id for a new OpenFlow message, taken from
+ * random_uint32().
+ *
+ * XXX we should really use consecutive xids to avoid probabilistic
+ * failures. */
+static inline uint32_t
+alloc_xid(void)
+{
+ return random_uint32();
+}
+
+/* Allocates and stores in '*bufferp' a new ofpbuf with a size of
+ * 'openflow_len', starting with an OpenFlow header with the given 'type' and
+ * an arbitrary transaction id. Allocated bytes beyond the header, if any, are
+ * zeroed.
+ *
+ * The caller is responsible for freeing '*bufferp' when it is no longer
+ * needed.
+ *
+ * The OpenFlow header length is initially set to 'openflow_len'; if the
+ * message is later extended, the length should be updated with
+ * update_openflow_length() before sending.
+ *
+ * Returns the header. */
+void *
+make_openflow(size_t openflow_len, uint8_t type, struct ofpbuf **bufferp)
+{
+    uint32_t xid;
+
+    *bufferp = ofpbuf_new(openflow_len);
+    xid = alloc_xid();
+    return put_openflow_xid(openflow_len, type, xid, *bufferp);
+}
+
+/* Allocates and stores in '*bufferp' a new ofpbuf with a size of
+ * 'openflow_len', starting with an OpenFlow header with the given 'type' and
+ * transaction id 'xid'. Allocated bytes beyond the header, if any, are
+ * zeroed.
+ *
+ * The caller is responsible for freeing '*bufferp' when it is no longer
+ * needed.
+ *
+ * The OpenFlow header length is initially set to 'openflow_len'; if the
+ * message is later extended, the length should be updated with
+ * update_openflow_length() before sending.
+ *
+ * Returns the header. */
+void *
+make_openflow_xid(size_t openflow_len, uint8_t type, uint32_t xid,
+                  struct ofpbuf **bufferp)
+{
+    struct ofpbuf *buffer = ofpbuf_new(openflow_len);
+
+    *bufferp = buffer;
+    return put_openflow_xid(openflow_len, type, xid, buffer);
+}
+
+/* Appends 'openflow_len' bytes to 'buffer', starting with an OpenFlow header
+ * with the given 'type' and an arbitrary transaction id. Allocated bytes
+ * beyond the header, if any, are zeroed.
+ *
+ * The OpenFlow header length is initially set to 'openflow_len'; if the
+ * message is later extended, the length should be updated with
+ * update_openflow_length() before sending.
+ *
+ * Returns the header. */
+void *
+put_openflow(size_t openflow_len, uint8_t type, struct ofpbuf *buffer)
+{
+    uint32_t xid = alloc_xid();
+
+    return put_openflow_xid(openflow_len, type, xid, buffer);
+}
+
+/* Appends 'openflow_len' bytes to 'buffer', starting with an OpenFlow header
+ * with the given 'type' and transaction id 'xid'. Allocated bytes beyond
+ * the header, if any, are zeroed.
+ *
+ * The OpenFlow header length is initially set to 'openflow_len'; if the
+ * message is later extended, the length should be updated with
+ * update_openflow_length() before sending.
+ *
+ * Returns the header. */
+void *
+put_openflow_xid(size_t openflow_len, uint8_t type, uint32_t xid,
+                 struct ofpbuf *buffer)
+{
+    struct ofp_header *header;
+    size_t body_len;
+
+    /* The message must hold a header and fit in the 16-bit length field. */
+    assert(openflow_len >= sizeof *header);
+    assert(openflow_len <= UINT16_MAX);
+    body_len = openflow_len - sizeof *header;
+
+    header = ofpbuf_put_uninit(buffer, openflow_len);
+
+    /* Zero everything past the header, then fill in the header itself. */
+    memset(header + 1, 0, body_len);
+    header->xid = xid;
+    header->length = htons(openflow_len);
+    header->type = type;
+    header->version = OFP_VERSION;
+    return header;
+}
+
+/* Updates the 'length' field of the OpenFlow message in 'buffer' to
+ * 'buffer->size'.
+ *
+ * NOTE(review): the size is passed straight to htons() with no range check
+ * here, so a buffer larger than UINT16_MAX bytes would presumably get a
+ * truncated length -- confirm that callers never build one. */
+void
+update_openflow_length(struct ofpbuf *buffer)
+{
+ struct ofp_header *oh = ofpbuf_at_assert(buffer, 0, sizeof *oh);
+ oh->length = htons(buffer->size);
+}
+
+/* Creates and returns an OFPT_FLOW_MOD message with the given 'command' whose
+ * match is copied from 'flow' with no fields wildcarded.  The buffer is
+ * allocated with room for 'actions_len' bytes of actions, and the header
+ * length already counts them, but only the fixed-size part of the message is
+ * filled in; the caller appends the actions.  Fields not set here are zero.
+ *
+ * The caller is responsible for freeing the returned buffer. */
+struct ofpbuf *
+make_flow_mod(uint16_t command, const flow_t *flow, size_t actions_len)
+{
+ struct ofp_flow_mod *ofm;
+ size_t size = sizeof *ofm + actions_len;
+ struct ofpbuf *out = ofpbuf_new(size);
+ ofm = ofpbuf_put_zeros(out, sizeof *ofm);
+ ofm->header.version = OFP_VERSION;
+ ofm->header.type = OFPT_FLOW_MOD;
+ ofm->header.length = htons(size);
+ ofm->match.wildcards = htonl(0);
+ /* Translate the datapath's local port number into OpenFlow's. */
+ ofm->match.in_port = htons(flow->in_port == ODPP_LOCAL ? OFPP_LOCAL
+ : flow->in_port);
+ memcpy(ofm->match.dl_src, flow->dl_src, sizeof ofm->match.dl_src);
+ memcpy(ofm->match.dl_dst, flow->dl_dst, sizeof ofm->match.dl_dst);
+ /* NOTE(review): the remaining fields are copied without byte swapping,
+ * presumably because flow_t already keeps them in network byte order --
+ * confirm against flow.h. */
+ ofm->match.dl_vlan = flow->dl_vlan;
+ ofm->match.dl_type = flow->dl_type;
+ ofm->match.nw_src = flow->nw_src;
+ ofm->match.nw_dst = flow->nw_dst;
+ ofm->match.nw_proto = flow->nw_proto;
+ ofm->match.tp_src = flow->tp_src;
+ ofm->match.tp_dst = flow->tp_dst;
+ ofm->command = htons(command);
+ return out;
+}
+
+/* Creates and returns an OFPFC_ADD flow-mod message for 'flow' with the given
+ * 'buffer_id' and 'idle_timeout' and a hard timeout of OFP_FLOW_PERMANENT.
+ * The buffer has room for 'actions_len' bytes of actions, which the caller
+ * appends. */
+struct ofpbuf *
+make_add_flow(const flow_t *flow, uint32_t buffer_id,
+              uint16_t idle_timeout, size_t actions_len)
+{
+    struct ofpbuf *msg;
+    struct ofp_flow_mod *flow_mod;
+
+    msg = make_flow_mod(OFPFC_ADD, flow, actions_len);
+    flow_mod = msg->data;
+    flow_mod->buffer_id = htonl(buffer_id);
+    flow_mod->hard_timeout = htons(OFP_FLOW_PERMANENT);
+    flow_mod->idle_timeout = htons(idle_timeout);
+    return msg;
+}
+
+/* Creates and returns an OFPFC_DELETE_STRICT flow-mod message for 'flow',
+ * with no actions and 'out_port' set to OFPP_NONE. */
+struct ofpbuf *
+make_del_flow(const flow_t *flow)
+{
+    struct ofpbuf *msg;
+    struct ofp_flow_mod *flow_mod;
+
+    msg = make_flow_mod(OFPFC_DELETE_STRICT, flow, 0);
+    flow_mod = msg->data;
+    flow_mod->out_port = htons(OFPP_NONE);
+    return msg;
+}
+
+/* Creates and returns an add-flow message for 'flow' (see make_add_flow())
+ * whose only action outputs to 'out_port'. */
+struct ofpbuf *
+make_add_simple_flow(const flow_t *flow,
+                     uint32_t buffer_id, uint16_t out_port,
+                     uint16_t idle_timeout)
+{
+    struct ofp_action_output *output;
+    struct ofpbuf *msg;
+
+    msg = make_add_flow(flow, buffer_id, idle_timeout, sizeof *output);
+
+    /* Append a zeroed output action and fill in the fields we care about. */
+    output = ofpbuf_put_zeros(msg, sizeof *output);
+    output->port = htons(out_port);
+    output->len = htons(sizeof *output);
+    output->type = htons(OFPAT_OUTPUT);
+    return msg;
+}
+
+/* Creates and returns an OFPT_PACKET_OUT message with the given 'buffer_id'
+ * and 'in_port', followed by the 'n_actions' actions in 'actions' and then,
+ * if 'packet' is nonnull, by the contents of 'packet'.  The transaction id is
+ * 0. */
+struct ofpbuf *
+make_packet_out(const struct ofpbuf *packet, uint32_t buffer_id,
+                uint16_t in_port,
+                const struct ofp_action_header *actions, size_t n_actions)
+{
+    size_t actions_bytes = n_actions * sizeof *actions;
+    size_t packet_bytes = packet ? packet->size : 0;
+    struct ofp_packet_out *hdr;
+    size_t total_bytes = sizeof *hdr + actions_bytes + packet_bytes;
+    struct ofpbuf *msg = ofpbuf_new(total_bytes);
+
+    hdr = ofpbuf_put_uninit(msg, sizeof *hdr);
+    hdr->header.version = OFP_VERSION;
+    hdr->header.type = OFPT_PACKET_OUT;
+    hdr->header.length = htons(total_bytes);
+    hdr->header.xid = htonl(0);
+    hdr->buffer_id = htonl(buffer_id);
+    /* Translate the datapath's local port number into OpenFlow's. */
+    hdr->in_port = htons(in_port == ODPP_LOCAL ? OFPP_LOCAL : in_port);
+    hdr->actions_len = htons(actions_bytes);
+
+    ofpbuf_put(msg, actions, actions_bytes);
+    if (packet) {
+        ofpbuf_put(msg, packet->data, packet->size);
+    }
+    return msg;
+}
+
+/* Creates and returns an OFPT_PACKET_OUT message that carries the contents of
+ * 'packet' itself (buffer id UINT32_MAX) and has a single action that outputs
+ * it to 'out_port'. */
+struct ofpbuf *
+make_unbuffered_packet_out(const struct ofpbuf *packet,
+                           uint16_t in_port, uint16_t out_port)
+{
+    struct ofp_action_output action;
+
+    /* Zero the whole action first, as make_add_simple_flow() does, so that
+     * any bytes not assigned below (the action is copied into the message as
+     * a full ofp_action_header-sized element) are not sent as uninitialized
+     * stack contents. */
+    memset(&action, 0, sizeof action);
+    action.type = htons(OFPAT_OUTPUT);
+    action.len = htons(sizeof action);
+    action.port = htons(out_port);
+    return make_packet_out(packet, UINT32_MAX, in_port,
+                           (struct ofp_action_header *) &action, 1);
+}
+
+/* Creates and returns an OFPT_PACKET_OUT message that refers to the packet
+ * identified by 'buffer_id' (no packet data is included) and has a single
+ * action that outputs it to 'out_port'. */
+struct ofpbuf *
+make_buffered_packet_out(uint32_t buffer_id,
+                         uint16_t in_port, uint16_t out_port)
+{
+    struct ofp_action_output action;
+
+    /* Zero the whole action first, as make_add_simple_flow() does, so that
+     * any bytes not assigned below (the action is copied into the message as
+     * a full ofp_action_header-sized element) are not sent as uninitialized
+     * stack contents. */
+    memset(&action, 0, sizeof action);
+    action.type = htons(OFPAT_OUTPUT);
+    action.len = htons(sizeof action);
+    action.port = htons(out_port);
+    return make_packet_out(NULL, buffer_id, in_port,
+                           (struct ofp_action_header *) &action, 1);
+}
+
+/* Creates and returns an OFPT_ECHO_REQUEST message with an empty payload. */
+struct ofpbuf *
+make_echo_request(void)
+{
+    struct ofp_header *request;
+    struct ofpbuf *msg;
+
+    /* The message is nothing but a header, with transaction id 0. */
+    msg = ofpbuf_new(sizeof *request);
+    request = ofpbuf_put_uninit(msg, sizeof *request);
+    request->xid = 0;
+    request->length = htons(sizeof *request);
+    request->type = OFPT_ECHO_REQUEST;
+    request->version = OFP_VERSION;
+    return msg;
+}
+
+/* Creates and returns an OFPT_ECHO_REPLY message matching the
+ * OFPT_ECHO_REQUEST message in 'rq'. */
+struct ofpbuf *
+make_echo_reply(const struct ofp_header *rq)
+{
+    size_t request_len = ntohs(rq->length);
+    struct ofpbuf *msg = ofpbuf_new(request_len);
+    struct ofp_header *header;
+
+    /* The reply is a byte-for-byte copy of the request with only the message
+     * type changed. */
+    header = ofpbuf_put(msg, rq, request_len);
+    header->type = OFPT_ECHO_REPLY;
+    return msg;
+}
+
+/* Returns 0 if 'got_type' equals 'want_type'.  Otherwise logs a rate-limited
+ * warning and returns an OFPET_BAD_REQUEST / OFPBRC_BAD_TYPE error code
+ * produced with ofp_mkerr(). */
+static int
+check_message_type(uint8_t got_type, uint8_t want_type)
+{
+    char *want_type_name;
+    char *got_type_name;
+
+    if (got_type == want_type) {
+        return 0;
+    }
+
+    want_type_name = ofp_message_type_to_string(want_type);
+    got_type_name = ofp_message_type_to_string(got_type);
+    VLOG_WARN_RL(&bad_ofmsg_rl,
+                 "received bad message type %s (expected %s)",
+                 got_type_name, want_type_name);
+    free(want_type_name);
+    free(got_type_name);
+    return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_TYPE);
+}
+
+/* Checks that 'msg' has type 'type' and that it is exactly 'size' bytes long.
+ * Returns 0 if the checks pass, otherwise an OpenFlow error code (produced
+ * with ofp_mkerr()). */
+int
+check_ofp_message(const struct ofp_header *msg, uint8_t type, size_t size)
+{
+    size_t got_size;
+    int error;
+
+    error = check_message_type(msg->type, type);
+    if (error) {
+        return error;
+    }
+
+    got_size = ntohs(msg->length);
+    if (got_size != size) {
+        char *type_name = ofp_message_type_to_string(type);
+        /* 'got_size' is a size_t, so it must be printed with %zu; a 16-bit
+         * conversion specifier does not match the argument's type. */
+        VLOG_WARN_RL(&bad_ofmsg_rl,
+                     "received %s message of length %zu (expected %zu)",
+                     type_name, got_size, size);
+        free(type_name);
+        return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH);
+    }
+
+    return 0;
+}
+
+/* Checks that 'msg' has type 'type' and that 'msg' is 'size' plus a
+ * nonnegative integer multiple of 'array_elt_size' bytes long. Returns 0 if
+ * the checks pass, otherwise an OpenFlow error code (produced with
+ * ofp_mkerr()).
+ *
+ * If 'n_array_elts' is nonnull, then '*n_array_elts' is set to the number of
+ * 'array_elt_size' blocks in 'msg' past the first 'min_size' bytes, when
+ * successful. */
+int
+check_ofp_message_array(const struct ofp_header *msg, uint8_t type,
+                        size_t min_size, size_t array_elt_size,
+                        size_t *n_array_elts)
+{
+    size_t got_size;
+    int error;
+
+    assert(array_elt_size);
+
+    error = check_message_type(msg->type, type);
+    if (error) {
+        return error;
+    }
+
+    /* 'got_size' is a size_t, so the log messages below print it with %zu;
+     * a 16-bit conversion specifier does not match the argument's type. */
+    got_size = ntohs(msg->length);
+    if (got_size < min_size) {
+        char *type_name = ofp_message_type_to_string(type);
+        VLOG_WARN_RL(&bad_ofmsg_rl, "received %s message of length %zu "
+                     "(expected at least %zu)",
+                     type_name, got_size, min_size);
+        free(type_name);
+        return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH);
+    }
+    if ((got_size - min_size) % array_elt_size) {
+        char *type_name = ofp_message_type_to_string(type);
+        VLOG_WARN_RL(&bad_ofmsg_rl,
+                     "received %s message of bad length %zu: the "
+                     "excess over %zu (%zu) is not evenly divisible by %zu "
+                     "(remainder is %zu)",
+                     type_name, got_size, min_size, got_size - min_size,
+                     array_elt_size, (got_size - min_size) % array_elt_size);
+        free(type_name);
+        return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH);
+    }
+    if (n_array_elts) {
+        *n_array_elts = (got_size - min_size) / array_elt_size;
+    }
+    return 0;
+}
+
+/* Checks that 'oh' is a well-formed OFPT_PACKET_OUT message: the message and
+ * action lengths must be consistent and the actions must pass
+ * validate_actions() with 'max_ports'.
+ *
+ * Returns 0 if the checks pass, in which case '*n_actionsp' is set to the
+ * number of actions and 'data' is pointed at the bytes that follow them in
+ * the message (without copying).  Otherwise returns an OpenFlow error code
+ * and leaves '*n_actionsp' as 0. */
+int
+check_ofp_packet_out(const struct ofp_header *oh, struct ofpbuf *data,
+                     int *n_actionsp, int max_ports)
+{
+    const struct ofp_packet_out *opo;
+    unsigned int actions_len, n_actions;
+    size_t extra;
+    int error;
+
+    *n_actionsp = 0;
+    /* With an element size of 1, 'extra' receives the number of bytes that
+     * follow the fixed-size part of the message. */
+    error = check_ofp_message_array(oh, OFPT_PACKET_OUT,
+                                    sizeof *opo, 1, &extra);
+    if (error) {
+        return error;
+    }
+    opo = (const struct ofp_packet_out *) oh;
+
+    /* 'actions_len' is an unsigned int, so it is printed with %u below;
+     * %zu is only correct for the size_t arguments. */
+    actions_len = ntohs(opo->actions_len);
+    if (actions_len > extra) {
+        VLOG_WARN_RL(&bad_ofmsg_rl, "packet-out claims %u bytes of actions "
+                     "but message has room for only %zu bytes",
+                     actions_len, extra);
+        return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH);
+    }
+    if (actions_len % sizeof(union ofp_action)) {
+        VLOG_WARN_RL(&bad_ofmsg_rl, "packet-out claims %u bytes of actions, "
+                     "which is not a multiple of %zu",
+                     actions_len, sizeof(union ofp_action));
+        return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH);
+    }
+
+    n_actions = actions_len / sizeof(union ofp_action);
+    error = validate_actions((const union ofp_action *) opo->actions,
+                             n_actions, max_ports);
+    if (error) {
+        return error;
+    }
+
+    /* Whatever follows the actions is the packet payload. */
+    data->data = (void *) &opo->actions[n_actions];
+    data->size = extra - actions_len;
+    *n_actionsp = n_actions;
+    return 0;
+}
+
+/* Initializes 'iter' to walk the flow stats entries in the body of 'osr' and
+ * returns the first entry, as flow_stats_next() would.
+ *
+ * The end of the body is computed from the length in the message header.
+ * NOTE(review): nothing here checks that this length is at least the offset
+ * of 'body'; presumably callers validate the message first -- confirm. */
+const struct ofp_flow_stats *
+flow_stats_first(struct flow_stats_iterator *iter,
+ const struct ofp_stats_reply *osr)
+{
+ iter->pos = osr->body;
+ iter->end = osr->body + (ntohs(osr->header.length)
+ - offsetof(struct ofp_stats_reply, body));
+ return flow_stats_next(iter);
+}
+
+/* Returns the flow stats record at the current position of 'iter' and
+ * advances 'iter' past it.  Returns NULL when no complete record remains or
+ * when the record's length field is inconsistent; in the latter cases a
+ * rate-limited warning is logged and 'iter' is not advanced. */
+const struct ofp_flow_stats *
+flow_stats_next(struct flow_stats_iterator *iter)
+{
+ ptrdiff_t bytes_left = iter->end - iter->pos;
+ const struct ofp_flow_stats *fs;
+ size_t length;
+
+ /* NOTE(review): 'bytes_left' is signed but is compared against a size_t
+ * here and below, so a negative value would be converted to a large
+ * unsigned one.  Presumably 'end' is never before 'pos' -- confirm. */
+ if (bytes_left < sizeof *fs) {
+ if (bytes_left != 0) {
+ VLOG_WARN_RL(&bad_ofmsg_rl,
+ "%td leftover bytes in flow stats reply", bytes_left);
+ }
+ return NULL;
+ }
+
+ fs = (const void *) iter->pos;
+ length = ntohs(fs->length);
+ /* The record must hold at least the fixed part, fit in what is left, and
+ * carry a whole number of actions after the fixed part. */
+ if (length < sizeof *fs) {
+ VLOG_WARN_RL(&bad_ofmsg_rl, "flow stats length %zu is shorter than "
+ "min %zu", length, sizeof *fs);
+ return NULL;
+ } else if (length > bytes_left) {
+ VLOG_WARN_RL(&bad_ofmsg_rl, "flow stats length %zu but only %td "
+ "bytes left", length, bytes_left);
+ return NULL;
+ } else if ((length - sizeof *fs) % sizeof fs->actions[0]) {
+ VLOG_WARN_RL(&bad_ofmsg_rl, "flow stats length %zu has %zu bytes "
+ "left over in final action", length,
+ (length - sizeof *fs) % sizeof fs->actions[0]);
+ return NULL;
+ }
+ iter->pos += length;
+ return fs;
+}
+
+/* Alignment of ofp_actions. */
+#define ACTION_ALIGNMENT 8
+
+/* Returns 0 if 'len' (the host-order length of action 'a') equals
+ * 'required_len'.  Otherwise logs a rate-limited debug message and returns
+ * an OFPET_BAD_ACTION/OFPBAC_BAD_LEN error. */
+static int
+check_action_exact_len(const union ofp_action *a, unsigned int len,
+                       unsigned int required_len)
+{
+    if (len != required_len) {
+        /* Convert the type to host byte order for the log, as is already
+         * done for the length next to it. */
+        VLOG_DBG_RL(&bad_ofmsg_rl,
+                    "action %u has invalid length %"PRIu16" (must be %u)\n",
+                    ntohs(a->type), ntohs(a->header.len), required_len);
+        return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_LEN);
+    }
+    return 0;
+}
+
+/* Returns 0 if 'port' is acceptable as an output port: either a physical
+ * port number in the range [0, max_ports) or one of the reserved OFPP_*
+ * ports listed below.  Otherwise logs a rate-limited warning and returns an
+ * OFPET_BAD_ACTION/OFPBAC_BAD_OUT_PORT error. */
+static int
+check_action_port(int port, int max_ports)
+{
+    /* Ordinary switch ports. */
+    if (port >= 0 && port < max_ports) {
+        return 0;
+    }
+
+    /* Reserved ports that are valid output destinations. */
+    switch (port) {
+    case OFPP_IN_PORT:
+    case OFPP_TABLE:
+    case OFPP_NORMAL:
+    case OFPP_FLOOD:
+    case OFPP_ALL:
+    case OFPP_CONTROLLER:
+    case OFPP_LOCAL:
+        return 0;
+    }
+
+    VLOG_WARN_RL(&bad_ofmsg_rl, "unknown output port %x", port);
+    return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_OUT_PORT);
+}
+
+/* Validates Nicira vendor action 'a', whose host-order length is 'len'.
+ * The action must be at least 16 bytes long, and its subtype must be one
+ * handled here (currently only NXAST_RESUBMIT, which must be exactly 16
+ * bytes).  Returns 0 if valid, otherwise an OFPET_BAD_ACTION error. */
+static int
+check_nicira_action(const union ofp_action *a, unsigned int len)
+{
+    const struct nx_action_header *nx_hdr;
+
+    if (len < 16) {
+        VLOG_DBG_RL(&bad_ofmsg_rl,
+                    "Nicira vendor action only %u bytes", len);
+        return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_LEN);
+    }
+
+    nx_hdr = (const struct nx_action_header *) a;
+    if (ntohs(nx_hdr->subtype) == NXAST_RESUBMIT) {
+        return check_action_exact_len(a, len, 16);
+    }
+    return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_VENDOR_TYPE);
+}
+
+/* Validates the single action 'a', whose host-order length is 'len', for a
+ * switch with 'max_ports' ports.  Returns 0 if the action is acceptable,
+ * otherwise an OFPET_BAD_ACTION error built by ofp_mkerr().
+ *
+ * The generic length checks run first.  (Placed after the switch they could
+ * never execute, because every case of the switch returns.) */
+static int
+check_action(const union ofp_action *a, unsigned int len, int max_ports)
+{
+    int error;
+
+    if (!len) {
+        VLOG_DBG_RL(&bad_ofmsg_rl, "action has invalid length 0");
+        return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_LEN);
+    }
+    if (len % ACTION_ALIGNMENT) {
+        VLOG_DBG_RL(&bad_ofmsg_rl, "action length %u is not a multiple of %d",
+                    len, ACTION_ALIGNMENT);
+        return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_LEN);
+    }
+
+    /* The type field is read straight from the message, like the length and
+     * port fields, so convert it to host byte order before comparing it
+     * with the OFPAT_* constants. */
+    switch (ntohs(a->type)) {
+    case OFPAT_OUTPUT:
+        error = check_action_port(ntohs(a->output.port), max_ports);
+        if (error) {
+            return error;
+        }
+        return check_action_exact_len(a, len, 8);
+
+    case OFPAT_SET_VLAN_VID:
+    case OFPAT_SET_VLAN_PCP:
+    case OFPAT_STRIP_VLAN:
+    case OFPAT_SET_NW_SRC:
+    case OFPAT_SET_NW_DST:
+    case OFPAT_SET_TP_SRC:
+    case OFPAT_SET_TP_DST:
+        return check_action_exact_len(a, len, 8);
+
+    case OFPAT_SET_DL_SRC:
+    case OFPAT_SET_DL_DST:
+        return check_action_exact_len(a, len, 16);
+
+    case OFPAT_VENDOR:
+        if (a->vendor.vendor == htonl(NX_VENDOR_ID)) {
+            return check_nicira_action(a, len);
+        }
+        return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_VENDOR);
+
+    default:
+        VLOG_WARN_RL(&bad_ofmsg_rl, "unknown action type %"PRIu16,
+                     ntohs(a->type));
+        return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_TYPE);
+    }
+}
+
+/* Validates the 'n_actions' action slots starting at 'actions' for a switch
+ * with 'max_ports' ports.  Each action occupies ntohs(len) /
+ * ACTION_ALIGNMENT slots; an action may not extend past the end of the
+ * array, and each one must pass check_action().
+ *
+ * Returns 0 if all actions are valid, otherwise an error built by
+ * ofp_mkerr(). */
+int
+validate_actions(const union ofp_action *actions, size_t n_actions,
+                 int max_ports)
+{
+    const union ofp_action *a;
+
+    for (a = actions; a < &actions[n_actions]; ) {
+        unsigned int len = ntohs(a->header.len);
+        unsigned int n_slots = len / ACTION_ALIGNMENT;
+        unsigned int slots_left = &actions[n_actions] - a;
+        int error;
+
+        if (n_slots > slots_left) {
+            /* Both values are unsigned int, so print both with %u. */
+            VLOG_DBG_RL(&bad_ofmsg_rl,
+                        "action requires %u slots but only %u remain",
+                        n_slots, slots_left);
+            return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_BAD_LEN);
+        }
+        error = check_action(a, len, max_ports);
+        if (error) {
+            return error;
+        }
+        a += n_slots;
+    }
+    return 0;
+}
+
+/* Initializes 'iter' to walk the 'n_actions' action slots starting at 'oa'
+ * and returns the first action, as actions_next() would.
+ *
+ * The set of actions must either come from a trusted source or have been
+ * previously validated with validate_actions(). */
+const union ofp_action *
+actions_first(struct actions_iterator *iter,
+ const union ofp_action *oa, size_t n_actions)
+{
+ iter->pos = oa;
+ iter->end = oa + n_actions;
+ return actions_next(iter);
+}
+
+/* Returns the action at the current position of 'iter' and advances 'iter'
+ * by that action's length in slots (its length field divided by
+ * ACTION_ALIGNMENT).  Returns NULL once the position reaches the end. */
+const union ofp_action *
+actions_next(struct actions_iterator *iter)
+{
+    const union ofp_action *cur = iter->pos;
+
+    if (cur >= iter->end) {
+        return NULL;
+    }
+    iter->pos = cur + ntohs(cur->header.len) / ACTION_ALIGNMENT;
+    return cur;
+}
+
+/* Rewrites 'm' in place into a normalized form: fields that are wildcarded
+ * are zeroed, and the wildcard bits for network and transport fields are
+ * adjusted to be consistent with the data link type and network protocol
+ * being matched.  The wildcards are read and written in network byte
+ * order. */
+void
+normalize_match(struct ofp_match *m)
+{
+ enum { OFPFW_NW = OFPFW_NW_SRC_MASK | OFPFW_NW_DST_MASK | OFPFW_NW_PROTO };
+ enum { OFPFW_TP = OFPFW_TP_SRC | OFPFW_TP_DST };
+ uint32_t wc;
+
+ wc = ntohl(m->wildcards) & OFPFW_ALL;
+ if (wc & OFPFW_DL_TYPE) {
+ m->dl_type = 0;
+
+ /* Can't sensibly match on network or transport headers if the
+ * data link type is unknown. */
+ wc |= OFPFW_NW | OFPFW_TP;
+ m->nw_src = m->nw_dst = m->nw_proto = 0;
+ m->tp_src = m->tp_dst = 0;
+ } else if (m->dl_type == htons(ETH_TYPE_IP)) {
+ if (wc & OFPFW_NW_PROTO) {
+ m->nw_proto = 0;
+
+ /* Can't sensibly match on transport headers if the network
+ * protocol is unknown. */
+ wc |= OFPFW_TP;
+ m->tp_src = m->tp_dst = 0;
+ } else if (m->nw_proto == IPPROTO_TCP ||
+ m->nw_proto == IPPROTO_UDP ||
+ m->nw_proto == IPPROTO_ICMP) {
+ /* Zero whichever transport fields are wildcarded. */
+ if (wc & OFPFW_TP_SRC) {
+ m->tp_src = 0;
+ }
+ if (wc & OFPFW_TP_DST) {
+ m->tp_dst = 0;
+ }
+ } else {
+ /* Transport layer fields will always be extracted as zeros, so we
+ * can do an exact-match on those values. */
+ wc &= ~OFPFW_TP;
+ m->tp_src = m->tp_dst = 0;
+ }
+ /* Clear the address bits that the wildcards say to ignore. */
+ if (wc & OFPFW_NW_SRC_MASK) {
+ m->nw_src &= flow_nw_bits_to_mask(wc, OFPFW_NW_SRC_SHIFT);
+ }
+ if (wc & OFPFW_NW_DST_MASK) {
+ m->nw_dst &= flow_nw_bits_to_mask(wc, OFPFW_NW_DST_SHIFT);
+ }
+ } else {
+ /* Network and transport layer fields will always be extracted as
+ * zeros, so we can do an exact-match on those values. */
+ wc &= ~(OFPFW_NW | OFPFW_TP);
+ m->nw_proto = m->nw_src = m->nw_dst = 0;
+ m->tp_src = m->tp_dst = 0;
+ }
+ if (wc & OFPFW_DL_SRC) {
+ memset(m->dl_src, 0, sizeof m->dl_src);
+ }
+ if (wc & OFPFW_DL_DST) {
+ memset(m->dl_dst, 0, sizeof m->dl_dst);
+ }
+ m->wildcards = htonl(wc);
+}
+
+/* Initializes the common fields of 'vconn'.  The initial state is derived
+ * from 'connect_status': EAGAIN means the connection is still in progress,
+ * 0 means it is already connected (so the next step is to send a hello), and
+ * anything else means the connection attempt failed.  'name' is copied. */
+void
+vconn_init(struct vconn *vconn, struct vconn_class *class, int connect_status,
+           uint32_t ip, const char *name, bool reconnectable)
+{
+    vconn->class = class;
+    if (connect_status == EAGAIN) {
+        vconn->state = VCS_CONNECTING;
+    } else if (connect_status == 0) {
+        vconn->state = VCS_SEND_HELLO;
+    } else {
+        vconn->state = VCS_DISCONNECTED;
+    }
+    vconn->error = connect_status;
+    vconn->version = -1;
+    vconn->min_version = -1;
+    vconn->ip = ip;
+    vconn->name = xstrdup(name);
+    vconn->reconnectable = reconnectable;
+}
+
+/* Initializes the common fields of 'pvconn': records 'class' and stores a
+ * copy of 'name' made with xstrdup(). */
+void
+pvconn_init(struct pvconn *pvconn, struct pvconn_class *class,
+ const char *name)
+{
+ pvconn->class = class;
+ pvconn->name = xstrdup(name);
+}
diff --git a/lib/vconn.h b/lib/vconn.h
new file mode 100644
index 000000000..e17f9e247
--- /dev/null
+++ b/lib/vconn.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef VCONN_H
+#define VCONN_H 1
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "flow.h"
+
+struct ofpbuf;
+struct ofp_action_header;
+struct ofp_header;
+struct ofp_match;
+struct ofp_stats_reply;
+struct pvconn;
+struct vconn;
+
+void vconn_usage(bool active, bool passive, bool bootstrap);
+
+/* Active vconns: virtual connections to OpenFlow devices. */
+int vconn_open(const char *name, int min_version, struct vconn **);
+void vconn_close(struct vconn *);
+const char *vconn_get_name(const struct vconn *);
+uint32_t vconn_get_ip(const struct vconn *);
+int vconn_connect(struct vconn *);
+int vconn_recv(struct vconn *, struct ofpbuf **);
+int vconn_send(struct vconn *, struct ofpbuf *);
+int vconn_recv_xid(struct vconn *, uint32_t xid, struct ofpbuf **);
+int vconn_transact(struct vconn *, struct ofpbuf *, struct ofpbuf **);
+
+int vconn_open_block(const char *name, int min_version, struct vconn **);
+int vconn_send_block(struct vconn *, struct ofpbuf *);
+int vconn_recv_block(struct vconn *, struct ofpbuf **);
+
+enum vconn_wait_type {
+ WAIT_CONNECT,
+ WAIT_RECV,
+ WAIT_SEND
+};
+void vconn_wait(struct vconn *, enum vconn_wait_type);
+void vconn_connect_wait(struct vconn *);
+void vconn_recv_wait(struct vconn *);
+void vconn_send_wait(struct vconn *);
+
+/* Passive vconns: virtual listeners for incoming OpenFlow connections. */
+int pvconn_open(const char *name, struct pvconn **);
+const char *pvconn_get_name(const struct pvconn *);
+void pvconn_close(struct pvconn *);
+int pvconn_accept(struct pvconn *, int min_version, struct vconn **);
+void pvconn_wait(struct pvconn *);
+
+/* OpenFlow protocol utility functions. */
+void *make_openflow(size_t openflow_len, uint8_t type, struct ofpbuf **);
+void *make_openflow_xid(size_t openflow_len, uint8_t type,
+ uint32_t xid, struct ofpbuf **);
+void *put_openflow(size_t openflow_len, uint8_t type, struct ofpbuf *);
+void *put_openflow_xid(size_t openflow_len, uint8_t type, uint32_t xid,
+ struct ofpbuf *);
+void update_openflow_length(struct ofpbuf *);
+struct ofpbuf *make_flow_mod(uint16_t command, const flow_t *,
+ size_t actions_len);
+struct ofpbuf *make_add_flow(const flow_t *, uint32_t buffer_id,
+ uint16_t max_idle, size_t actions_len);
+struct ofpbuf *make_del_flow(const flow_t *);
+struct ofpbuf *make_add_simple_flow(const flow_t *,
+ uint32_t buffer_id, uint16_t out_port,
+ uint16_t max_idle);
+struct ofpbuf *make_packet_out(const struct ofpbuf *packet, uint32_t buffer_id,
+ uint16_t in_port,
+ const struct ofp_action_header *,
+ size_t n_actions);
+struct ofpbuf *make_buffered_packet_out(uint32_t buffer_id,
+ uint16_t in_port, uint16_t out_port);
+struct ofpbuf *make_unbuffered_packet_out(const struct ofpbuf *packet,
+ uint16_t in_port, uint16_t out_port);
+struct ofpbuf *make_echo_request(void);
+struct ofpbuf *make_echo_reply(const struct ofp_header *rq);
+int check_ofp_message(const struct ofp_header *, uint8_t type, size_t size);
+int check_ofp_message_array(const struct ofp_header *, uint8_t type,
+ size_t size, size_t array_elt_size,
+ size_t *n_array_elts);
+int check_ofp_packet_out(const struct ofp_header *, struct ofpbuf *data,
+ int *n_actions, int max_ports);
+
+struct flow_stats_iterator {
+ const uint8_t *pos, *end;
+};
+const struct ofp_flow_stats *flow_stats_first(struct flow_stats_iterator *,
+ const struct ofp_stats_reply *);
+const struct ofp_flow_stats *flow_stats_next(struct flow_stats_iterator *);
+
+struct actions_iterator {
+ const union ofp_action *pos, *end;
+};
+const union ofp_action *actions_first(struct actions_iterator *,
+ const union ofp_action *,
+ size_t n_actions);
+const union ofp_action *actions_next(struct actions_iterator *);
+int validate_actions(const union ofp_action *, size_t n_actions,
+ int max_ports);
+
+void normalize_match(struct ofp_match *);
+
+/* Packs an OpenFlow error 'type' and 'code' into a single int, with 'type'
+ * in the upper 16 bits and 'code' in the lower 16 bits.  'type' must be in
+ * the range 1...0x7fff (asserted), so the result is always positive. */
+static inline int
+ofp_mkerr(uint16_t type, uint16_t code)
+{
+ assert(type > 0 && type <= 0x7fff);
+ return (type << 16) | code;
+}
+
+#endif /* vconn.h */
diff --git a/lib/vlog-modules.def b/lib/vlog-modules.def
new file mode 100644
index 000000000..833c20a4c
--- /dev/null
+++ b/lib/vlog-modules.def
@@ -0,0 +1,65 @@
+/* Modules that can emit log messages. */
+VLOG_MODULE(backtrace)
+VLOG_MODULE(brcompatd)
+VLOG_MODULE(bridge)
+VLOG_MODULE(chain)
+VLOG_MODULE(cfg)
+VLOG_MODULE(cfg_mod)
+VLOG_MODULE(controller)
+VLOG_MODULE(coverage)
+VLOG_MODULE(ctlpath)
+VLOG_MODULE(daemon)
+VLOG_MODULE(datapath)
+VLOG_MODULE(dhcp)
+VLOG_MODULE(dhcp_client)
+VLOG_MODULE(discovery)
+VLOG_MODULE(dpif)
+VLOG_MODULE(dpctl)
+VLOG_MODULE(executer)
+VLOG_MODULE(ezio_term)
+VLOG_MODULE(fail_open)
+VLOG_MODULE(fault)
+VLOG_MODULE(flow)
+VLOG_MODULE(in_band)
+VLOG_MODULE(leak_checker)
+VLOG_MODULE(learning_switch)
+VLOG_MODULE(mac_learning)
+VLOG_MODULE(mgmt)
+VLOG_MODULE(netdev)
+VLOG_MODULE(netflow)
+VLOG_MODULE(netlink)
+VLOG_MODULE(ofctl)
+VLOG_MODULE(ovs_discover)
+VLOG_MODULE(ofproto)
+VLOG_MODULE(pktbuf)
+VLOG_MODULE(pcap)
+VLOG_MODULE(poll_loop)
+VLOG_MODULE(port_watcher)
+VLOG_MODULE(proc_net_compat)
+VLOG_MODULE(process)
+VLOG_MODULE(secchan)
+VLOG_MODULE(rconn)
+VLOG_MODULE(stp)
+VLOG_MODULE(stp_secchan)
+VLOG_MODULE(stats)
+VLOG_MODULE(status)
+VLOG_MODULE(svec)
+VLOG_MODULE(switch)
+VLOG_MODULE(terminal)
+VLOG_MODULE(timeval)
+VLOG_MODULE(tty)
+VLOG_MODULE(socket_util)
+VLOG_MODULE(switchui)
+VLOG_MODULE(unixctl)
+VLOG_MODULE(vconn_tcp)
+VLOG_MODULE(vconn_ssl)
+VLOG_MODULE(vconn_stream)
+VLOG_MODULE(vconn_unix)
+VLOG_MODULE(vconn)
+VLOG_MODULE(vlog)
+VLOG_MODULE(wcelim)
+VLOG_MODULE(vswitchd)
+VLOG_MODULE(vt)
+VLOG_MODULE(xenserver)
+
+#undef VLOG_MODULE
diff --git a/lib/vlog.c b/lib/vlog.c
new file mode 100644
index 000000000..408cc74a0
--- /dev/null
+++ b/lib/vlog.c
@@ -0,0 +1,711 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "vlog.h"
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <syslog.h>
+#include <time.h>
+#include <unistd.h>
+#include "dirs.h"
+#include "dynamic-string.h"
+#include "sat-math.h"
+#include "timeval.h"
+#include "unixctl.h"
+#include "util.h"
+
+#define THIS_MODULE VLM_vlog
+
+/* Name for each logging level. */
+static const char *level_names[VLL_N_LEVELS] = {
+#define VLOG_LEVEL(NAME, SYSLOG_LEVEL) #NAME,
+ VLOG_LEVELS
+#undef VLOG_LEVEL
+};
+
+/* Syslog value for each logging level. */
+static int syslog_levels[VLL_N_LEVELS] = {
+#define VLOG_LEVEL(NAME, SYSLOG_LEVEL) SYSLOG_LEVEL,
+ VLOG_LEVELS
+#undef VLOG_LEVEL
+};
+
+/* Name for each logging module */
+static const char *module_names[VLM_N_MODULES] = {
+#define VLOG_MODULE(NAME) #NAME,
+#include "vlog-modules.def"
+#undef VLOG_MODULE
+};
+
+/* Information about each facility. */
+struct facility {
+ const char *name; /* Name. */
+ char *pattern; /* Current pattern. */
+ bool default_pattern; /* Whether current pattern is the default. */
+};
+static struct facility facilities[VLF_N_FACILITIES] = {
+#define VLOG_FACILITY(NAME, PATTERN) {#NAME, PATTERN, true},
+ VLOG_FACILITIES
+#undef VLOG_FACILITY
+};
+
+/* Current log levels. */
+static int levels[VLM_N_MODULES][VLF_N_FACILITIES];
+
+/* For fast checking whether we're logging anything for a given module and
+ * level.*/
+enum vlog_level min_vlog_levels[VLM_N_MODULES];
+
+/* Time at which vlog was initialized, in milliseconds. */
+static long long int boot_time;
+
+/* VLF_FILE configuration. */
+static char *log_file_name;
+static FILE *log_file;
+
+static void format_log_message(enum vlog_module, enum vlog_level,
+ enum vlog_facility, unsigned int msg_num,
+ const char *message, va_list, struct ds *)
+ PRINTF_FORMAT(5, 0);
+
+/* Looks up 'target', ignoring case, among the first 'n_names' entries of
+ * 'names'.  Returns the index of the first entry that matches, or 'n_names'
+ * if none does.  Every entry examined must be nonnull. */
+static size_t
+search_name_array(const char *target, const char **names, size_t n_names)
+{
+    size_t idx = 0;
+
+    while (idx < n_names) {
+        assert(names[idx]);
+        if (strcasecmp(names[idx], target) == 0) {
+            return idx;
+        }
+        idx++;
+    }
+    return n_names;
+}
+
+/* Returns the name for logging level 'level'. */
+const char *
+vlog_get_level_name(enum vlog_level level)
+{
+ assert(level < VLL_N_LEVELS);
+ return level_names[level];
+}
+
+/* Returns the logging level with the given 'name', or VLL_N_LEVELS if 'name'
+ * is not the name of a logging level. */
+enum vlog_level
+vlog_get_level_val(const char *name)
+{
+ return search_name_array(name, level_names, ARRAY_SIZE(level_names));
+}
+
+/* Returns the name for logging facility 'facility'. */
+const char *
+vlog_get_facility_name(enum vlog_facility facility)
+{
+ assert(facility < VLF_N_FACILITIES);
+ return facilities[facility].name;
+}
+
+/* Maps 'name', compared without regard to case, to its logging facility.
+ * Returns VLF_N_FACILITIES if 'name' does not name a logging facility. */
+enum vlog_facility
+vlog_get_facility_val(const char *name)
+{
+    size_t idx = 0;
+
+    while (idx < VLF_N_FACILITIES
+           && strcasecmp(facilities[idx].name, name) != 0) {
+        idx++;
+    }
+    return idx;
+}
+
+/* Returns the name for logging module 'module'. */
+const char *vlog_get_module_name(enum vlog_module module)
+{
+ assert(module < VLM_N_MODULES);
+ return module_names[module];
+}
+
+/* Returns the logging module named 'name', or VLM_N_MODULES if 'name' is not
+ * the name of a logging module. */
+enum vlog_module
+vlog_get_module_val(const char *name)
+{
+ return search_name_array(name, module_names, ARRAY_SIZE(module_names));
+}
+
+/* Returns the current logging level for the given 'module' and 'facility'. */
+enum vlog_level
+vlog_get_level(enum vlog_module module, enum vlog_facility facility)
+{
+ assert(module < VLM_N_MODULES);
+ assert(facility < VLF_N_FACILITIES);
+ return levels[module][facility];
+}
+
+/* Recomputes min_vlog_levels['module'] as the largest level configured for
+ * 'module' across the facilities currently in use.  VLF_FILE is counted only
+ * while a log file is open.  The result starts from VLL_EMER, so it is never
+ * below that. */
+static void
+update_min_level(enum vlog_module module)
+{
+ enum vlog_level min_level = VLL_EMER;
+ enum vlog_facility facility;
+
+ for (facility = 0; facility < VLF_N_FACILITIES; facility++) {
+ /* Ignore the file facility's level unless a log file is open. */
+ if (log_file || facility != VLF_FILE) {
+ min_level = MAX(min_level, levels[module][facility]);
+ }
+ }
+ min_vlog_levels[module] = min_level;
+}
+
+/* Sets the level of 'facility' to 'level' for 'module', or for every module
+ * if 'module' is VLM_ANY_MODULE, and refreshes the cached minimum level of
+ * each module touched. */
+static void
+set_facility_level(enum vlog_facility facility, enum vlog_module module,
+                   enum vlog_level level)
+{
+    assert(facility >= 0 && facility < VLF_N_FACILITIES);
+    assert(level < VLL_N_LEVELS);
+
+    if (module != VLM_ANY_MODULE) {
+        levels[module][facility] = level;
+        update_min_level(module);
+        return;
+    }
+
+    for (module = 0; module < VLM_N_MODULES; module++) {
+        levels[module][facility] = level;
+        update_min_level(module);
+    }
+}
+
+/* Sets the logging level of 'module' on 'facility' to 'level'.  Passing
+ * VLF_ANY_FACILITY as 'facility' applies 'level' to every facility. */
+void
+vlog_set_levels(enum vlog_module module, enum vlog_facility facility,
+                enum vlog_level level)
+{
+    assert(facility < VLF_N_FACILITIES || facility == VLF_ANY_FACILITY);
+
+    if (facility != VLF_ANY_FACILITY) {
+        set_facility_level(facility, module, level);
+        return;
+    }
+
+    for (facility = 0; facility < VLF_N_FACILITIES; facility++) {
+        set_facility_level(facility, module, level);
+    }
+}
+
+/* Replaces the pattern of 'facility' with a copy of 'pattern'.  The previous
+ * pattern is freed only if it was not the built-in default, which is a
+ * string literal from the initializer and must not be passed to free(). */
+static void
+do_set_pattern(enum vlog_facility facility, const char *pattern)
+{
+ struct facility *f = &facilities[facility];
+ if (!f->default_pattern) {
+ free(f->pattern);
+ } else {
+ /* From now on the pattern is heap-allocated. */
+ f->default_pattern = false;
+ }
+ f->pattern = xstrdup(pattern);
+}
+
+/* Sets the log message pattern of 'facility' to 'pattern'.  Passing
+ * VLF_ANY_FACILITY as 'facility' applies 'pattern' to every facility. */
+void
+vlog_set_pattern(enum vlog_facility facility, const char *pattern)
+{
+    assert(facility < VLF_N_FACILITIES || facility == VLF_ANY_FACILITY);
+
+    if (facility != VLF_ANY_FACILITY) {
+        do_set_pattern(facility, pattern);
+        return;
+    }
+
+    for (facility = 0; facility < VLF_N_FACILITIES; facility++) {
+        do_set_pattern(facility, pattern);
+    }
+}
+
+/* Returns the name of the log file used by VLF_FILE, or a null pointer if no
+ * log file has been set. (A non-null return value does not assert that the
+ * named log file is in use: if vlog_set_log_file() or vlog_reopen_log_file()
+ * fails, it still sets the log file name.) */
+const char *
+vlog_get_log_file(void)
+{
+ return log_file_name;
+}
+
+/* Sets the name of the log file used by VLF_FILE to 'file_name', or to the
+ * default file name if 'file_name' is null. Returns 0 if successful,
+ * otherwise a positive errno value.
+ *
+ * Any log file already open is closed first.  The new name is recorded even
+ * if opening the file fails. */
+int
+vlog_set_log_file(const char *file_name)
+{
+ char *old_log_file_name;
+ enum vlog_module module;
+ int error;
+
+ /* Close old log file. */
+ if (log_file) {
+ VLOG_INFO("closing log file");
+ fclose(log_file);
+ log_file = NULL;
+ }
+
+ /* Update log file name and free old name. The ordering is important
+ * because 'file_name' might be 'log_file_name' or some suffix of it. */
+ old_log_file_name = log_file_name;
+ log_file_name = (file_name
+ ? xstrdup(file_name)
+ : xasprintf("%s/%s.log", ovs_logdir, program_name));
+ free(old_log_file_name);
+ file_name = NULL; /* Might have been freed. */
+
+ /* Open new log file and update min_vlog_levels[] to reflect whether we
+ * actually have a log_file.  The file is opened in append mode. */
+ log_file = fopen(log_file_name, "a");
+ for (module = 0; module < VLM_N_MODULES; module++) {
+ update_min_level(module);
+ }
+
+ /* Log success or failure. */
+ if (!log_file) {
+ /* NOTE(review): 'errno' is read after the logging call; this relies on
+ * logging preserving errno, as vlog_valist() documents. */
+ VLOG_WARN("failed to open %s for logging: %s",
+ log_file_name, strerror(errno));
+ error = errno;
+ } else {
+ VLOG_INFO("opened log file %s", log_file_name);
+ error = 0;
+ }
+
+ return error;
+}
+
+/* Closes and then attempts to re-open the current log file. (This is useful
+ * just after log rotation, to ensure that the new log file starts being used.)
+ * Returns 0 if successful, otherwise a positive errno value. */
+int
+vlog_reopen_log_file(void)
+{
+ return log_file_name ? vlog_set_log_file(log_file_name) : 0;
+}
+
+/* Set debugging levels:
+ *
+ * mod[:facility[:level]] mod2[:facility[:level]] ...
+ *
+ * "ANY" is accepted as the module or the facility to mean all of them.  If
+ * the module is "PATTERN", the remainder of the string after the facility is
+ * installed as that facility's log pattern and parsing stops.  If no level
+ * is given, VLL_DBG is used.
+ *
+ * Return null if successful, otherwise an error message that the caller must
+ * free().
+ */
+char *
+vlog_set_levels_from_string(const char *s_)
+{
+ char *save_ptr;
+ char *s = xstrdup(s_);
+ char *module, *facility;
+
+ /* strtok_r() modifies its input, hence the private copy 's'. */
+ for (module = strtok_r(s, ": \t", &save_ptr); module != NULL;
+ module = strtok_r(NULL, ": \t", &save_ptr)) {
+ enum vlog_module e_module;
+ enum vlog_facility e_facility;
+
+ /* NOTE(review): the facility (and the level below) is delimited only
+ * by ":", whereas modules are delimited by ":", space, or tab --
+ * confirm how whitespace-separated specs are meant to parse. */
+ facility = strtok_r(NULL, ":", &save_ptr);
+
+ if (!facility || !strcmp(facility, "ANY")) {
+ e_facility = VLF_ANY_FACILITY;
+ } else {
+ e_facility = vlog_get_facility_val(facility);
+ if (e_facility >= VLF_N_FACILITIES) {
+ char *msg = xasprintf("unknown facility \"%s\"", facility);
+ free(s);
+ return msg;
+ }
+ }
+
+ if (!strcmp(module, "PATTERN")) {
+ /* 'save_ptr' is strtok_r()'s saved position, i.e. the unparsed
+ * remainder of the string. */
+ vlog_set_pattern(e_facility, save_ptr);
+ break;
+ } else {
+ char *level;
+ enum vlog_level e_level;
+
+ if (!strcmp(module, "ANY")) {
+ e_module = VLM_ANY_MODULE;
+ } else {
+ e_module = vlog_get_module_val(module);
+ if (e_module >= VLM_N_MODULES) {
+ char *msg = xasprintf("unknown module \"%s\"", module);
+ free(s);
+ return msg;
+ }
+ }
+
+ level = strtok_r(NULL, ":", &save_ptr);
+ e_level = level ? vlog_get_level_val(level) : VLL_DBG;
+ if (e_level >= VLL_N_LEVELS) {
+ char *msg = xasprintf("unknown level \"%s\"", level);
+ free(s);
+ return msg;
+ }
+
+ vlog_set_levels(e_module, e_facility, e_level);
+ }
+ }
+ free(s);
+ return NULL;
+}
+
+/* If 'arg' is null, configure maximum verbosity. Otherwise, sets
+ * configuration according to 'arg' (see vlog_set_levels_from_string()). */
+void
+vlog_set_verbosity(const char *arg)
+{
+ if (arg) {
+ char *msg = vlog_set_levels_from_string(arg);
+ if (msg) {
+ ovs_fatal(0, "processing \"%s\": %s", arg, msg);
+ }
+ } else {
+ vlog_set_levels(VLM_ANY_MODULE, VLF_ANY_FACILITY, VLL_DBG);
+ }
+}
+
+static void
+vlog_unixctl_set(struct unixctl_conn *conn, const char *args)
+{
+ char *msg = vlog_set_levels_from_string(args);
+ unixctl_command_reply(conn, msg ? 501 : 202, msg);
+ free(msg);
+}
+
+static void
+vlog_unixctl_list(struct unixctl_conn *conn, const char *args UNUSED)
+{
+ char *msg = vlog_get_levels();
+ unixctl_command_reply(conn, 200, msg);
+ free(msg);
+}
+
+/* unixctl handler for "vlog/reopen": closes and reopens the log file.
+ * Replies 202 on success, 503 with the failure reason if reopening failed,
+ * or 403 if no log file name has been configured. */
+static void
+vlog_unixctl_reopen(struct unixctl_conn *conn, const char *args UNUSED)
+{
+    if (log_file_name) {
+        int error = vlog_reopen_log_file();
+        if (error) {
+            /* Report the returned error code rather than 'errno', which is
+             * not guaranteed to still describe the failure. */
+            unixctl_command_reply(conn, 503, strerror(error));
+        } else {
+            unixctl_command_reply(conn, 202, NULL);
+        }
+    } else {
+        unixctl_command_reply(conn, 403, "Logging to file not configured");
+    }
+}
+
+/* Initializes the logging subsystem. */
+void
+vlog_init(void)
+{
+ time_t now;
+
+ openlog(program_name, LOG_NDELAY, LOG_DAEMON);
+ vlog_set_levels(VLM_ANY_MODULE, VLF_ANY_FACILITY, VLL_INFO);
+
+ boot_time = time_msec();
+ now = time_now();
+ if (now < 0) {
+ struct tm tm;
+ char s[128];
+
+ localtime_r(&now, &tm);
+ strftime(s, sizeof s, "%a, %d %b %Y %H:%M:%S %z", &tm);
+ VLOG_ERR("current time is negative: %s (%ld)", s, (long int) now);
+ }
+
+ unixctl_command_register("vlog/set", vlog_unixctl_set);
+ unixctl_command_register("vlog/list", vlog_unixctl_list);
+ unixctl_command_register("vlog/reopen", vlog_unixctl_reopen);
+}
+
+/* Closes the logging subsystem. */
+void
+vlog_exit(void)
+{
+ closelog();
+}
+
+/* Print the current logging level for each module. */
+char *
+vlog_get_levels(void)
+{
+ struct ds s = DS_EMPTY_INITIALIZER;
+ enum vlog_module module;
+
+ ds_put_format(&s, " console syslog file\n");
+ ds_put_format(&s, " ------- ------ ------\n");
+
+ for (module = 0; module < VLM_N_MODULES; module++) {
+ ds_put_format(&s, "%-16s %4s %4s %4s\n",
+ vlog_get_module_name(module),
+ vlog_get_level_name(vlog_get_level(module, VLF_CONSOLE)),
+ vlog_get_level_name(vlog_get_level(module, VLF_SYSLOG)),
+ vlog_get_level_name(vlog_get_level(module, VLF_FILE)));
+ }
+
+ return ds_cstr(&s);
+}
+
+/* Returns true if a log message emitted for the given 'module' and 'level'
+ * would cause some log output, false if that module and level are completely
+ * disabled.
+ *
+ * This compares 'level' against the cached min_vlog_levels[] entry for
+ * 'module', which update_min_level() maintains. */
+bool
+vlog_is_enabled(enum vlog_module module, enum vlog_level level)
+{
+ return min_vlog_levels[module] >= level;
+}
+
+/* Parses an optional "{...}" argument at 'p' in a log pattern.
+ *
+ * If 'p' points to '{', copies the text up to the matching '}' into 'out'
+ * (truncated to fit 'out_size' bytes including the null terminator) and
+ * returns a pointer just past the '}'.  If the closing '}' is missing,
+ * everything up to the end of the string is taken as the argument and the
+ * returned pointer addresses the string's null terminator, never beyond it.
+ *
+ * Otherwise copies the default 'def' into 'out' and returns 'p' unchanged. */
+static const char *
+fetch_braces(const char *p, const char *def, char *out, size_t out_size)
+{
+    if (*p == '{') {
+        size_t n = strcspn(p + 1, "}");
+        size_t n_copy = MIN(n, out_size - 1);
+        memcpy(out, p + 1, n_copy);
+        out[n_copy] = '\0';
+
+        /* Skip '{' and the argument, then the '}' only if it is really
+         * there, so that an unterminated brace cannot step past the end of
+         * the pattern. */
+        p += n + 1;
+        if (*p == '}') {
+            p++;
+        }
+    } else {
+        ovs_strlcpy(out, def, out_size);
+    }
+    return p;
+}
+
+/* Formats a log message into 's' (which is cleared first) according to the
+ * pattern configured for 'facility'.
+ *
+ * Ordinary characters in the pattern are copied as-is.  A conversion has the
+ * form %[-][0][WIDTH]C, where '-' left-justifies, '0' pads with zeros
+ * instead of spaces, WIDTH is the minimum field width, and C is one of:
+ *
+ *     A  program name               N  message sequence number 'msg_num'
+ *     c  name of 'module'           n  new-line
+ *     d  date/time, with optional   p  name of 'level'
+ *        {strftime format}          P  process ID
+ *     m  'message' formatted with   r  milliseconds since vlog_init()
+ *        'args_', trailing
+ *        new-lines removed
+ *
+ * Any other character C is emitted literally.  A '%' that is not followed by
+ * a conversion character before the end of the pattern is emitted as a
+ * literal '%'.
+ *
+ * 'args_' is copied before use, so the caller may reuse it. */
+static void
+format_log_message(enum vlog_module module, enum vlog_level level,
+                   enum vlog_facility facility, unsigned int msg_num,
+                   const char *message, va_list args_, struct ds *s)
+{
+    char tmp[128];
+    va_list args;
+    const char *p;
+
+    ds_clear(s);
+    for (p = facilities[facility].pattern; *p != '\0'; ) {
+        enum { LEFT, RIGHT } justify = RIGHT;
+        int pad = ' ';          /* Spaces unless the '0' flag is given. */
+        size_t length, field, used;
+
+        if (*p != '%') {
+            ds_put_char(s, *p++);
+            continue;
+        }
+
+        p++;
+        if (*p == '-') {
+            justify = LEFT;
+            p++;
+        }
+        if (*p == '0') {
+            pad = '0';
+            p++;
+        }
+        field = 0;
+        /* isdigit() requires a value representable as unsigned char. */
+        while (isdigit((unsigned char) *p)) {
+            field = (field * 10) + (*p - '0');
+            p++;
+        }
+
+        if (*p == '\0') {
+            /* Pattern ends in the middle of a conversion: emit the '%' and
+             * stop rather than reading past the terminator. */
+            ds_put_char(s, '%');
+            break;
+        }
+
+        length = s->length;
+        switch (*p++) {
+        case 'A':
+            ds_put_cstr(s, program_name);
+            break;
+        case 'c':
+            /* Any {...} argument is parsed and discarded. */
+            p = fetch_braces(p, "", tmp, sizeof tmp);
+            ds_put_cstr(s, vlog_get_module_name(module));
+            break;
+        case 'd':
+            p = fetch_braces(p, "%Y-%m-%d %H:%M:%S", tmp, sizeof tmp);
+            ds_put_strftime(s, tmp, NULL);
+            break;
+        case 'm':
+            /* Format user-supplied log message and trim trailing new-lines. */
+            va_copy(args, args_);
+            ds_put_format_valist(s, message, args);
+            va_end(args);
+            while (s->length > length && s->string[s->length - 1] == '\n') {
+                s->length--;
+            }
+            break;
+        case 'N':
+            ds_put_format(s, "%u", msg_num);
+            break;
+        case 'n':
+            ds_put_char(s, '\n');
+            break;
+        case 'p':
+            ds_put_cstr(s, vlog_get_level_name(level));
+            break;
+        case 'P':
+            ds_put_format(s, "%ld", (long int) getpid());
+            break;
+        case 'r':
+            ds_put_format(s, "%lld", time_msec() - boot_time);
+            break;
+        default:
+            ds_put_char(s, p[-1]);
+            break;
+        }
+
+        /* Pad the field out to the requested width. */
+        used = s->length - length;
+        if (used < field) {
+            size_t n_pad = field - used;
+            if (justify == RIGHT) {
+                /* Make room at the end, then shift the field right and fill
+                 * the gap in front of it. */
+                ds_put_uninit(s, n_pad);
+                memmove(&s->string[length + n_pad], &s->string[length], used);
+                memset(&s->string[length], pad, n_pad);
+            } else {
+                ds_put_char_multiple(s, pad, n_pad);
+            }
+        }
+    }
+}
+
+/* Writes 'message' to the log at the given 'level' and as coming from the
+ * given 'module'.
+ *
+ * The message is formatted separately for each facility (console, syslog,
+ * file) whose configured level for 'module' admits 'level', using that
+ * facility's own pattern.  File output additionally requires an open log
+ * file.
+ *
+ * Guaranteed to preserve errno. */
+void
+vlog_valist(enum vlog_module module, enum vlog_level level,
+ const char *message, va_list args)
+{
+ bool log_to_console = levels[module][VLF_CONSOLE] >= level;
+ bool log_to_syslog = levels[module][VLF_SYSLOG] >= level;
+ bool log_to_file = levels[module][VLF_FILE] >= level && log_file;
+ if (log_to_console || log_to_syslog || log_to_file) {
+ int save_errno = errno;
+ /* Sequence number shared by all facilities for this message. */
+ static unsigned int msg_num;
+ struct ds s;
+
+ ds_init(&s);
+ ds_reserve(&s, 1024);
+ msg_num++;
+
+ if (log_to_console) {
+ format_log_message(module, level, VLF_CONSOLE, msg_num,
+ message, args, &s);
+ ds_put_char(&s, '\n');
+ fputs(ds_cstr(&s), stderr);
+ }
+
+ if (log_to_syslog) {
+ int syslog_level = syslog_levels[level];
+ char *save_ptr = NULL;
+ char *line;
+
+ format_log_message(module, level, VLF_SYSLOG, msg_num,
+ message, args, &s);
+ /* Submit each line of the message to syslog separately. */
+ for (line = strtok_r(s.string, "\n", &save_ptr); line;
+ line = strtok_r(NULL, "\n", &save_ptr)) {
+ syslog(syslog_level, "%s", line);
+ }
+ }
+
+ if (log_to_file) {
+ format_log_message(module, level, VLF_FILE, msg_num,
+ message, args, &s);
+ ds_put_char(&s, '\n');
+ fputs(ds_cstr(&s), log_file);
+ fflush(log_file);
+ }
+
+ ds_destroy(&s);
+ errno = save_errno;
+ }
+}
+
+void
+vlog(enum vlog_module module, enum vlog_level level, const char *message, ...)
+{
+ va_list args;
+
+ va_start(args, message);
+ vlog_valist(module, level, message, args);
+ va_end(args);
+}
+
+/* Returns true if a message for 'module' at 'level' should be suppressed,
+ * either because logging at that level is disabled for the module or
+ * because rate limiter 'rl' has fewer than VLOG_MSG_TOKENS tokens left.
+ * Returns false if the message should be logged, in which case
+ * VLOG_MSG_TOKENS tokens are consumed from 'rl'.
+ *
+ * When a message is allowed through after earlier ones were dropped, a
+ * summary of how many were dropped is logged first. */
+bool
+vlog_should_drop(enum vlog_module module, enum vlog_level level,
+ struct vlog_rate_limit *rl)
+{
+ if (!vlog_is_enabled(module, level)) {
+ return true;
+ }
+
+ if (rl->tokens < VLOG_MSG_TOKENS) {
+ /* Not enough tokens for a message: try to refill based on how much
+ * time has passed since the last refill. */
+ time_t now = time_now();
+ if (rl->last_fill > now) {
+ /* Last filled in the future? Time must have gone backward, or
+ * 'rl' has not been used before. */
+ rl->tokens = rl->burst;
+ } else if (rl->last_fill < now) {
+ /* Saturating arithmetic, capped at the burst size. */
+ unsigned int add = sat_mul(rl->rate, now - rl->last_fill);
+ unsigned int tokens = sat_add(rl->tokens, add);
+ rl->tokens = MIN(tokens, rl->burst);
+ rl->last_fill = now;
+ }
+ if (rl->tokens < VLOG_MSG_TOKENS) {
+ /* Still short: drop, remembering when dropping began. */
+ if (!rl->n_dropped) {
+ rl->first_dropped = now;
+ }
+ rl->n_dropped++;
+ return true;
+ }
+ }
+ rl->tokens -= VLOG_MSG_TOKENS;
+
+ if (rl->n_dropped) {
+ vlog(module, level,
+ "Dropped %u log messages in last %u seconds "
+ "due to excessive rate",
+ rl->n_dropped, (unsigned int) (time_now() - rl->first_dropped));
+ rl->n_dropped = 0;
+ }
+ return false;
+}
+
+void
+vlog_rate_limit(enum vlog_module module, enum vlog_level level,
+ struct vlog_rate_limit *rl, const char *message, ...)
+{
+ if (!vlog_should_drop(module, level, rl)) {
+ va_list args;
+
+ va_start(args, message);
+ vlog_valist(module, level, message, args);
+ va_end(args);
+ }
+}
+
+void
+vlog_usage(void)
+{
+ printf("\nLogging options:\n"
+ " -v, --verbose=MODULE[:FACILITY[:LEVEL]] set logging levels\n"
+ " -v, --verbose set maximum verbosity level\n"
+ " --log-file[=FILE] enable logging to specified FILE\n"
+ " (default: %s/%s.log)\n",
+ ovs_logdir, program_name);
+}
diff --git a/lib/vlog.h b/lib/vlog.h
new file mode 100644
index 000000000..87036151b
--- /dev/null
+++ b/lib/vlog.h
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef VLOG_H
+#define VLOG_H 1
+
+#include <limits.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <time.h>
+#include "util.h"
+
+/* Logging importance levels. */
+#define VLOG_LEVELS \
+ VLOG_LEVEL(EMER, LOG_ALERT) \
+ VLOG_LEVEL(ERR, LOG_ERR) \
+ VLOG_LEVEL(WARN, LOG_WARNING) \
+ VLOG_LEVEL(INFO, LOG_NOTICE) \
+ VLOG_LEVEL(DBG, LOG_DEBUG)
+enum vlog_level {
+#define VLOG_LEVEL(NAME, SYSLOG_LEVEL) VLL_##NAME,
+ VLOG_LEVELS
+#undef VLOG_LEVEL
+ VLL_N_LEVELS
+};
+
+const char *vlog_get_level_name(enum vlog_level);
+enum vlog_level vlog_get_level_val(const char *name);
+
+/* Facilities that we can log to. */
+#define VLOG_FACILITIES \
+ VLOG_FACILITY(SYSLOG, "%05N|%c|%p|%m") \
+ VLOG_FACILITY(CONSOLE, "%d{%b %d %H:%M:%S}|%05N|%c|%p|%m") \
+ VLOG_FACILITY(FILE, "%d{%b %d %H:%M:%S}|%05N|%c|%p|%m")
+enum vlog_facility {
+#define VLOG_FACILITY(NAME, PATTERN) VLF_##NAME,
+ VLOG_FACILITIES
+#undef VLOG_FACILITY
+ VLF_N_FACILITIES,
+ VLF_ANY_FACILITY = -1
+};
+
+const char *vlog_get_facility_name(enum vlog_facility);
+enum vlog_facility vlog_get_facility_val(const char *name);
+
+/* VLM_ constant for each vlog module. */
+enum vlog_module {
+#define VLOG_MODULE(NAME) VLM_##NAME,
+#include "vlog-modules.def"
+ VLM_N_MODULES,
+ VLM_ANY_MODULE = -1
+};
+
+const char *vlog_get_module_name(enum vlog_module);
+enum vlog_module vlog_get_module_val(const char *name);
+
+/* Rate-limiter for log messages. */
+struct vlog_rate_limit {
+ /* Configuration settings. */
+ unsigned int rate; /* Tokens per second. */
+ unsigned int burst; /* Max cumulative tokens credit. */
+
+ /* Current status. */
+ unsigned int tokens; /* Current number of tokens. */
+ time_t last_fill; /* Last time tokens added. */
+ time_t first_dropped; /* Time first message was dropped. */
+ unsigned int n_dropped; /* Number of messages dropped. */
+};
+
+/* Number of tokens required to emit one message. 'rate' is configured in
+ * messages per minute, but the bucket gains 'rate' tokens every second (60
+ * times as often), so each message costs 60 tokens to compensate. */
+#define VLOG_MSG_TOKENS 60
+
+/* Initializer for a struct vlog_rate_limit, to set up a maximum rate of RATE
+ * messages per minute and a maximum burst size of BURST messages. */
+#define VLOG_RATE_LIMIT_INIT(RATE, BURST) \
+ { \
+ RATE, /* rate */ \
+ (MIN(BURST, UINT_MAX / VLOG_MSG_TOKENS) \
+ * VLOG_MSG_TOKENS), /* burst */ \
+ 0, /* tokens */ \
+ 0, /* last_fill */ \
+ 0, /* first_dropped */ \
+ 0, /* n_dropped */ \
+ }
+
+/* Configuring how each module logs messages. */
+enum vlog_level vlog_get_level(enum vlog_module, enum vlog_facility);
+void vlog_set_levels(enum vlog_module, enum vlog_facility, enum vlog_level);
+char *vlog_set_levels_from_string(const char *);
+char *vlog_get_levels(void);
+bool vlog_is_enabled(enum vlog_module, enum vlog_level);
+bool vlog_should_drop(enum vlog_module, enum vlog_level,
+ struct vlog_rate_limit *);
+void vlog_set_verbosity(const char *arg);
+
+/* Configuring log facilities. */
+void vlog_set_pattern(enum vlog_facility, const char *pattern);
+const char *vlog_get_log_file(void);
+int vlog_set_log_file(const char *file_name);
+int vlog_reopen_log_file(void);
+
+/* Function for actual logging. */
+void vlog_init(void);
+void vlog_exit(void);
+void vlog(enum vlog_module, enum vlog_level, const char *format, ...)
+ __attribute__((format(printf, 3, 4)));
+void vlog_valist(enum vlog_module, enum vlog_level, const char *, va_list)
+ __attribute__((format(printf, 3, 0)));
+void vlog_rate_limit(enum vlog_module, enum vlog_level,
+ struct vlog_rate_limit *, const char *, ...)
+ __attribute__((format(printf, 4, 5)));
+
+/* Convenience macros. To use these, define THIS_MODULE as a macro that
+ * expands to the module used by the current source file, e.g.
+ * #include "vlog.h"
+ * #define THIS_MODULE VLM_netlink
+ * Guaranteed to preserve errno.
+ */
+#define VLOG_EMER(...) VLOG(VLL_EMER, __VA_ARGS__)
+#define VLOG_ERR(...) VLOG(VLL_ERR, __VA_ARGS__)
+#define VLOG_WARN(...) VLOG(VLL_WARN, __VA_ARGS__)
+#define VLOG_INFO(...) VLOG(VLL_INFO, __VA_ARGS__)
+#define VLOG_DBG(...) VLOG(VLL_DBG, __VA_ARGS__)
+
+/* More convenience macros, for testing whether a given level is enabled in
+ * THIS_MODULE. When constructing a log message is expensive, this enables it
+ * to be skipped.
+ *
+ * Each macro tests the level named in the macro itself, except that
+ * VLOG_IS_EMER_ENABLED() is unconditionally true. */
+#define VLOG_IS_EMER_ENABLED() true
+#define VLOG_IS_ERR_ENABLED() vlog_is_enabled(THIS_MODULE, VLL_ERR)
+#define VLOG_IS_WARN_ENABLED() vlog_is_enabled(THIS_MODULE, VLL_WARN)
+#define VLOG_IS_INFO_ENABLED() vlog_is_enabled(THIS_MODULE, VLL_INFO)
+#define VLOG_IS_DBG_ENABLED() vlog_is_enabled(THIS_MODULE, VLL_DBG)
+
+/* Convenience macros for rate-limiting.
+ * Guaranteed to preserve errno.
+ */
+#define VLOG_ERR_RL(RL, ...) VLOG_RL(RL, VLL_ERR, __VA_ARGS__)
+#define VLOG_WARN_RL(RL, ...) VLOG_RL(RL, VLL_WARN, __VA_ARGS__)
+#define VLOG_INFO_RL(RL, ...) VLOG_RL(RL, VLL_INFO, __VA_ARGS__)
+#define VLOG_DBG_RL(RL, ...) VLOG_RL(RL, VLL_DBG, __VA_ARGS__)
+
+#define VLOG_DROP_ERR(RL) vlog_should_drop(THIS_MODULE, VLL_ERR, RL)
+#define VLOG_DROP_WARN(RL) vlog_should_drop(THIS_MODULE, VLL_WARN, RL)
+#define VLOG_DROP_INFO(RL) vlog_should_drop(THIS_MODULE, VLL_INFO, RL)
+#define VLOG_DROP_DBG(RL) vlog_should_drop(THIS_MODULE, VLL_DBG, RL)
+
+/* Command line processing. */
+#define VLOG_OPTION_ENUMS OPT_LOG_FILE
+#define VLOG_LONG_OPTIONS \
+ {"verbose", optional_argument, 0, 'v'}, \
+ {"log-file", optional_argument, 0, OPT_LOG_FILE}
+#define VLOG_OPTION_HANDLERS \
+ case 'v': \
+ vlog_set_verbosity(optarg); \
+ break; \
+ case OPT_LOG_FILE: \
+ vlog_set_log_file(optarg); \
+ break;
+void vlog_usage(void);
+
+/* Implementation details.
+ *
+ * VLOG() and VLOG_RL() back the VLOG_<LEVEL>() and VLOG_<LEVEL>_RL()
+ * convenience macros. Both first compare LEVEL against the entry for
+ * THIS_MODULE in 'min_vlog_levels' and only call into the logging functions
+ * when that entry is at least LEVEL, so the format arguments are not
+ * evaluated otherwise. The do { } while (0) wrapper makes each macro behave
+ * as a single statement.
+ *
+ * NOTE(review): 'min_vlog_levels' is presumably the most verbose level enabled
+ * for each module across all facilities -- confirm against vlog.c. */
+#define VLOG(LEVEL, ...) \
+ do { \
+ if (min_vlog_levels[THIS_MODULE] >= LEVEL) { \
+ vlog(THIS_MODULE, LEVEL, __VA_ARGS__); \
+ } \
+ } while (0)
+#define VLOG_RL(RL, LEVEL, ...) \
+ do { \
+ if (min_vlog_levels[THIS_MODULE] >= LEVEL) { \
+ vlog_rate_limit(THIS_MODULE, LEVEL, RL, __VA_ARGS__); \
+ } \
+ } while (0)
+extern enum vlog_level min_vlog_levels[VLM_N_MODULES];
+
+
+#endif /* vlog.h */
diff --git a/lib/vlog.man b/lib/vlog.man
new file mode 100644
index 000000000..0bd8a26e8
--- /dev/null
+++ b/lib/vlog.man
@@ -0,0 +1,44 @@
+.TP
+\fB-v\fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]], \fB--verbose=\fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]]
+
+Sets the logging level for \fImodule\fR in \fIfacility\fR to
+\fIlevel\fR:
+
+.RS
+.IP \(bu
+\fImodule\fR may be any valid module name (as displayed by the
+\fB--list\fR action on \fBovs-appctl\fR(8)), or the special name
+\fBANY\fR to set the logging levels for all modules.
+
+.IP \(bu
+\fIfacility\fR may be \fBsyslog\fR, \fBconsole\fR, or \fBfile\fR to
+set the levels for logging to the system log, the console, or a file
+respectively, or \fBANY\fR to set the logging levels for all
+facilities. If it is omitted, \fIfacility\fR defaults to \fBANY\fR.
+
+Regardless of the log levels set for \fBfile\fR, logging to a file
+will not take place unless \fB--log-file\fR is also specified (see
+below).
+
+.IP \(bu
+\fIlevel\fR must be one of \fBemer\fR, \fBerr\fR, \fBwarn\fR,
+\fBinfo\fR, or
+\fBdbg\fR, designating the minimum severity of a message for it to be
+logged. If it is omitted, \fIlevel\fR defaults to \fBdbg\fR.
+.RE
+
+.TP
+\fB-v\fR, \fB--verbose\fR
+Sets the maximum logging verbosity level, equivalent to
+\fB--verbose=ANY:ANY:dbg\fR.
+
+.TP
+\fB-vPATTERN:\fIfacility\fB:\fIpattern\fR, \fB--verbose=PATTERN:\fIfacility\fB:\fIpattern\fR
+Sets the log pattern for \fIfacility\fR to \fIpattern\fR. Refer to
+\fBovs-appctl\fR(8) for a description of the valid syntax for \fIpattern\fR.
+
+.TP
+\fB--log-file\fR[\fB=\fIfile\fR]
+Enables logging to a file. If \fIfile\fR is specified, then it is
+used as the exact name for the log file. The default log file name
+used if \fIfile\fR is omitted is \fB@LOGDIR@/\*(PN.log\fR.
diff --git a/lib/xtoxll.h b/lib/xtoxll.h
new file mode 100644
index 000000000..853a80637
--- /dev/null
+++ b/lib/xtoxll.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2008 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifndef XTOXLL_H
+#define XTOXLL_H 1
+
+#include <arpa/inet.h>
+#include <sys/types.h>
+
+/* Converts the 64-bit value 'n' from host byte order to network byte order.
+ * On a host where htonl() is the identity, 'n' is returned unchanged;
+ * otherwise the two 32-bit halves are each converted with htonl() and
+ * exchanged. */
+static inline uint64_t
+htonll(uint64_t n)
+{
+    uint32_t low_swapped;
+    uint32_t high_swapped;
+
+    if (htonl(1) == 1) {
+        return n;
+    }
+
+    low_swapped = htonl(n);
+    high_swapped = htonl(n >> 32);
+    return ((uint64_t) low_swapped << 32) | high_swapped;
+}
+
+/* Converts the 64-bit value 'n' from network byte order to host byte order.
+ * On a host where htonl() is the identity, 'n' is returned unchanged;
+ * otherwise the two 32-bit halves are each converted with ntohl() and
+ * exchanged. */
+static inline uint64_t
+ntohll(uint64_t n)
+{
+    uint32_t low_swapped;
+    uint32_t high_swapped;
+
+    if (htonl(1) == 1) {
+        return n;
+    }
+
+    low_swapped = ntohl(n);
+    high_swapped = ntohl(n >> 32);
+    return ((uint64_t) low_swapped << 32) | high_swapped;
+}
+
+#endif /* xtoxll.h */
diff --git a/m4/nx-build.m4 b/m4/nx-build.m4
new file mode 100644
index 000000000..ed2a5897c
--- /dev/null
+++ b/m4/nx-build.m4
@@ -0,0 +1,53 @@
+# -*- autoconf -*-
+
+# Copyright (c) 2008 Nicira Networks.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+dnl NX_BUILDNR
+dnl
+dnl If --with-build-number=NUMBER is used, substitutes a Makefile
+dnl variable BUILDNR with NUMBER, and sets a C preprocessor variable
+dnl BUILDNR to "+buildNUMBER".
+dnl
+dnl Otherwise, if --with-build-number is not used, substitutes BUILDNR
+dnl with 0 and sets C preprocessor variable BUILDNR to "".
+AC_DEFUN([NX_BUILDNR],
+ [AC_ARG_WITH(
+ [build-number],
+ [AS_HELP_STRING([--with-build-number=NUMBER],
+ [Official build number (default is none)])])
+ AC_MSG_CHECKING([build number])
+ case $with_build_number in # (
+ [[0-9]] | \
+ [[0-9]][[0-9]] | \
+ [[0-9]][[0-9]][[0-9]] | \
+ [[0-9]][[0-9]][[0-9]][[0-9]] | \
+ [[0-9]][[0-9]][[0-9]][[0-9]][[0-9]])
+ BUILDNR=$with_build_number
+ buildnr='"+build'$BUILDNR'"'
+ AC_MSG_RESULT([$with_build_number])
+ ;; # (
+ ''|no)
+ BUILDNR=0
+ buildnr='""'
+ AC_MSG_RESULT([none])
+ ;; # (
+ *)
+ AC_MSG_ERROR([invalid build number $with_build_number])
+ ;;
+ esac
+ AC_SUBST([BUILDNR])
+ AC_DEFINE_UNQUOTED([BUILDNR], [$buildnr],
+ [Official build number as a VERSION suffix string, e.g. "+build123",
+ or "" if this is not an official build.])])
diff --git a/m4/openvswitch.m4 b/m4/openvswitch.m4
new file mode 100644
index 000000000..7b23516a4
--- /dev/null
+++ b/m4/openvswitch.m4
@@ -0,0 +1,210 @@
+# -*- autoconf -*-
+
+# Copyright (c) 2008, 2009 Nicira Networks.
+#
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+dnl Checks for --enable-ndebug and sets the NDEBUG Automake conditional if it
+dnl is specified.
+AC_DEFUN([OVS_CHECK_NDEBUG],
+ [AC_ARG_ENABLE(
+ [ndebug],
+ [AC_HELP_STRING([--enable-ndebug],
+ [Disable debugging features for max performance])],
+ [case "${enableval}" in
+ (yes) ndebug=true ;;
+ (no) ndebug=false ;;
+ (*) AC_MSG_ERROR([bad value ${enableval} for --enable-ndebug]) ;;
+ esac],
+ [ndebug=false])
+ AM_CONDITIONAL([NDEBUG], [test x$ndebug = xtrue])])
+
+dnl Checks for Netlink support.
+AC_DEFUN([OVS_CHECK_NETLINK],
+ [AC_CHECK_HEADER([linux/netlink.h],
+ [HAVE_NETLINK=yes],
+ [HAVE_NETLINK=no],
+ [#include <sys/socket.h>
+ #include <linux/types.h>
+ ])
+ AM_CONDITIONAL([HAVE_NETLINK], [test "$HAVE_NETLINK" = yes])
+ if test "$HAVE_NETLINK" = yes; then
+ AC_DEFINE([HAVE_NETLINK], [1],
+ [Define to 1 if Netlink protocol is available.])
+ fi])
+
+dnl Checks for OpenSSL, if --enable-ssl is passed in.
+dnl
+dnl Sets the HAVE_OPENSSL Automake conditional and, when libssl is found via
+dnl pkg-config, defines the HAVE_OPENSSL C preprocessor symbol.  Without
+dnl --enable-ssl, or when libssl cannot be found, HAVE_OPENSSL is "no".
+AC_DEFUN([OVS_CHECK_OPENSSL],
+  [AC_ARG_ENABLE(
+     [ssl],
+     [AC_HELP_STRING([--enable-ssl],
+                     [Enable ssl support (requires libssl)])],
+     [case "${enableval}" in
+        (yes) ssl=true ;;
+        (no) ssl=false ;;
+        (*) AC_MSG_ERROR([bad value ${enableval} for --enable-ssl]) ;;
+      esac],
+     [ssl=false])
+
+   dnl Start from a known value so that a HAVE_OPENSSL variable inherited
+   dnl from the environment cannot enable SSL when --enable-ssl was not given.
+   HAVE_OPENSSL=no
+   if test "$ssl" = true; then
+     dnl Make sure that pkg-config is installed.
+     m4_pattern_forbid([PKG_CHECK_MODULES])
+     PKG_CHECK_MODULES([SSL], [libssl],
+       [HAVE_OPENSSL=yes],
+       [HAVE_OPENSSL=no
+ AC_MSG_WARN([Cannot find libssl:
+
+ $SSL_PKG_ERRORS
+
+ OpenFlow connections over SSL will not be supported.])])
+
+   fi
+   AM_CONDITIONAL([HAVE_OPENSSL], [test "$HAVE_OPENSSL" = yes])
+   if test "$HAVE_OPENSSL" = yes; then
+     AC_DEFINE([HAVE_OPENSSL], [1], [Define to 1 if OpenSSL is installed.])
+   fi])
+
+dnl Checks for libraries needed by lib/fault.c.
+AC_DEFUN([OVS_CHECK_FAULT_LIBS],
+ [AC_CHECK_LIB([dl], [dladdr], [FAULT_LIBS=-ldl])
+ AC_SUBST([FAULT_LIBS])])
+
+dnl Checks for libraries needed by lib/socket-util.c.
+AC_DEFUN([OVS_CHECK_SOCKET_LIBS],
+ [AC_CHECK_LIB([socket], [connect])
+ AC_SEARCH_LIBS([gethostbyname], [resolv], [RESOLVER_LIBS=-lresolv])])
+
+dnl Checks for the directory in which to store the PKI.
+AC_DEFUN([OVS_CHECK_PKIDIR],
+ [AC_ARG_WITH(
+ [pkidir],
+ AC_HELP_STRING([--with-pkidir=DIR],
+ [PKI hierarchy directory [[DATADIR/openvswitch/pki]]]),
+ [PKIDIR=$withval],
+ [PKIDIR='${pkgdatadir}/pki'])
+ AC_SUBST([PKIDIR])])
+
+dnl Checks for the directory in which to store pidfiles.
+AC_DEFUN([OVS_CHECK_RUNDIR],
+ [AC_ARG_WITH(
+ [rundir],
+ AC_HELP_STRING([--with-rundir=DIR],
+ [directory used for pidfiles [[LOCALSTATEDIR/run]]]),
+ [RUNDIR=$withval],
+ [RUNDIR='${localstatedir}/run'])
+ AC_SUBST([RUNDIR])])
+
+dnl Checks for the directory in which to store logs.
+AC_DEFUN([OVS_CHECK_LOGDIR],
+ [AC_ARG_WITH(
+ [logdir],
+ AC_HELP_STRING([--with-logdir=DIR],
+ [directory used for logs [[LOCALSTATEDIR/log/PACKAGE]]]),
+ [LOGDIR=$withval],
+ [LOGDIR='${localstatedir}/log/${PACKAGE}'])
+ AC_SUBST([LOGDIR])])
+
+dnl Checks for __malloc_hook, etc., supported by glibc.
+AC_DEFUN([OVS_CHECK_MALLOC_HOOKS],
+ [AC_CACHE_CHECK(
+ [whether libc supports hooks for malloc and related functions],
+ [ovs_cv_malloc_hooks],
+ [AC_COMPILE_IFELSE(
+ [AC_LANG_PROGRAM(
+ [#include <malloc.h>
+ ],
+ [(void) __malloc_hook;
+ (void) __realloc_hook;
+ (void) __free_hook;])],
+ [ovs_cv_malloc_hooks=yes],
+ [ovs_cv_malloc_hooks=no])])
+ if test $ovs_cv_malloc_hooks = yes; then
+ AC_DEFINE([HAVE_MALLOC_HOOKS], [1],
+ [Define to 1 if you have __malloc_hook, __realloc_hook, and
+ __free_hook in <malloc.h>.])
+ fi])
+
+dnl Checks for valgrind/valgrind.h.
+AC_DEFUN([OVS_CHECK_VALGRIND],
+ [AC_CHECK_HEADERS([valgrind/valgrind.h])])
+
+dnl Searches for a directory to put lockfiles for tty devices.
+dnl Defines C preprocessor variable TTY_LOCK_DIR to a quoted string
+dnl for that directory.
+AC_DEFUN([OVS_CHECK_TTY_LOCK_DIR],
+ [AC_CACHE_CHECK([directory used for serial device lockfiles],
+ [ovs_cv_path_tty_locks],
+ [# This list of candidate directories is from minicom.
+ ovs_cv_path_tty_locks=none
+ for dir in /etc/locks /var/lock /usr/spool/locks \
+ /var/spool/locks /var/spool/lock \
+ /usr/spool/uucp /var/spool/uucp /var/run; do
+ if test -d $dir; then
+ ovs_cv_path_tty_locks=$dir
+ break
+ fi
+ done])
+ if test "$ovs_cv_path_tty_locks" = none; then
+ AC_MSG_ERROR([cannot find a directory for tty locks])
+ fi
+ AC_DEFINE_UNQUOTED([TTY_LOCK_DIR], "$ovs_cv_path_tty_locks",
+ [Directory used for serial device lockfiles])])
+
+dnl The following check is adapted from GNU PSPP.
+dnl It searches for the ncurses library. If it finds it, it sets
+dnl HAVE_CURSES to yes and sets NCURSES_LIBS and NCURSES_CFLAGS
+dnl appropriate. Otherwise, it sets HAVE_CURSES to no.
+AC_DEFUN([OVS_CHECK_CURSES],
+ [if test "$cross_compiling" != yes; then
+ AC_CHECK_PROGS([NCURSES_CONFIG], [ncurses5-config ncurses8-config])
+ fi
+ if test "$NCURSES_CONFIG" = ""; then
+ AC_SEARCH_LIBS([tgetent], [ncurses],
+ [AC_CHECK_HEADERS([term.h curses.h],
+ [HAVE_CURSES=yes],
+ [HAVE_CURSES=no])])
+ else
+ save_cflags=$CFLAGS
+ CFLAGS="$CFLAGS $($NCURSES_CONFIG --cflags)"
+ AC_CHECK_HEADERS([term.h curses.h],
+ [HAVE_CURSES=yes],
+ [HAVE_CURSES=no])
+ CFLAGS=$save_cflags
+ if test "$HAVE_CURSES" = yes; then
+ NCURSES_LIBS=$($NCURSES_CONFIG --libs)
+ NCURSES_CFLAGS=$($NCURSES_CONFIG --cflags)
+ AC_SUBST(NCURSES_CFLAGS)
+ AC_SUBST(NCURSES_LIBS)
+ fi
+ fi
+ AM_CONDITIONAL([HAVE_CURSES], [test "$HAVE_CURSES" = yes])])
+
+dnl Checks for linux/vt.h.
+AC_DEFUN([OVS_CHECK_LINUX_VT_H],
+ [AC_CHECK_HEADER([linux/vt.h],
+ [HAVE_LINUX_VT_H=yes],
+ [HAVE_LINUX_VT_H=no])
+ AM_CONDITIONAL([HAVE_LINUX_VT_H], [test "$HAVE_LINUX_VT_H" = yes])
+ if test "$HAVE_LINUX_VT_H" = yes; then
+ AC_DEFINE([HAVE_LINUX_VT_H], [1],
+ [Define to 1 if linux/vt.h is available.])
+ fi])
+
+dnl Checks for libpcre.
+AC_DEFUN([OVS_CHECK_PCRE],
+ [dnl Make sure that pkg-config is installed.
+ m4_pattern_forbid([PKG_CHECK_MODULES])
+ PKG_CHECK_MODULES([PCRE], [libpcre], [HAVE_PCRE=yes], [HAVE_PCRE=no])
+ AM_CONDITIONAL([HAVE_PCRE], [test "$HAVE_PCRE" = yes])
+ if test "$HAVE_PCRE" = yes; then
+ AC_DEFINE([HAVE_PCRE], [1], [Define to 1 if libpcre is installed.])
+ fi])
diff --git a/secchan/.gitignore b/secchan/.gitignore
new file mode 100644
index 000000000..ada656655
--- /dev/null
+++ b/secchan/.gitignore
@@ -0,0 +1,4 @@
+/Makefile
+/Makefile.in
+/secchan
+/secchan.8
diff --git a/secchan/automake.mk b/secchan/automake.mk
new file mode 100644
index 000000000..d6bf1b0c5
--- /dev/null
+++ b/secchan/automake.mk
@@ -0,0 +1,42 @@
+# Copyright (C) 2009 Nicira Networks, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved. This file is offered as-is,
+# without warranty of any kind.
+
+bin_PROGRAMS += secchan/secchan
+man_MANS += secchan/secchan.8
+
+secchan_secchan_SOURCES = secchan/main.c
+secchan_secchan_LDADD = \
+ secchan/libsecchan.a \
+ lib/libopenvswitch.a \
+ $(FAULT_LIBS) \
+ $(SSL_LIBS)
+
+noinst_LIBRARIES += secchan/libsecchan.a
+secchan_libsecchan_a_SOURCES = \
+ secchan/discovery.c \
+ secchan/discovery.h \
+ secchan/executer.c \
+ secchan/executer.h \
+ secchan/fail-open.c \
+ secchan/fail-open.h \
+ secchan/in-band.c \
+ secchan/in-band.h \
+ secchan/netflow.c \
+ secchan/netflow.h \
+ secchan/ofproto.c \
+ secchan/ofproto.h \
+ secchan/pktbuf.c \
+ secchan/pktbuf.h \
+ secchan/pinsched.c \
+ secchan/pinsched.h \
+ secchan/status.c \
+ secchan/status.h
+
+EXTRA_DIST += secchan/secchan.8.in
+DISTCLEANFILES += secchan/secchan.8
+
+include secchan/commands/automake.mk
diff --git a/secchan/commands/automake.mk b/secchan/commands/automake.mk
new file mode 100644
index 000000000..cbe44d8c6
--- /dev/null
+++ b/secchan/commands/automake.mk
@@ -0,0 +1,3 @@
+commandsdir = ${pkgdatadir}/commands
+dist_commands_SCRIPTS = \
+ secchan/commands/reboot
diff --git a/secchan/commands/reboot b/secchan/commands/reboot
new file mode 100755
index 000000000..42fd10c11
--- /dev/null
+++ b/secchan/commands/reboot
@@ -0,0 +1,3 @@
+#! /bin/sh
+ovs-kill --force --signal=USR1 ovs-switchui.pid
+reboot
diff --git a/secchan/discovery.c b/secchan/discovery.c
new file mode 100644
index 000000000..06de6f07c
--- /dev/null
+++ b/secchan/discovery.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "discovery.h"
+#include <errno.h>
+#include <inttypes.h>
+#include <net/if.h>
+#include <regex.h>
+#include <stdlib.h>
+#include <string.h>
+#include "dhcp-client.h"
+#include "dhcp.h"
+#include "dpif.h"
+#include "netdev.h"
+#include "openflow/openflow.h"
+#include "packets.h"
+#include "status.h"
+#include "vconn-ssl.h"
+
+#define THIS_MODULE VLM_discovery
+#include "vlog.h"
+
+/* State for controller discovery over DHCP. */
+struct discovery {
+ char *re; /* Source text of 'regex' (always '^'-anchored); owned. */
+ bool update_resolv_conf; /* Update resolv.conf when the binding changes? */
+ regex_t *regex; /* Compiled 're'; offered controller names must match. */
+ struct dhclient *dhcp; /* DHCP client used to find the controller. */
+ int n_changes; /* Times a controller was discovered or lost. */
+ struct status_category *ss_cat; /* "discovery" switch status category. */
+};
+
+static void modify_dhcp_request(struct dhcp_msg *, void *aux);
+static bool validate_dhcp_offer(const struct dhcp_msg *, void *aux);
+
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60);
+
+/* Switch status callback for the "discovery" category.  'd_' is the
+ * struct discovery that was passed to switch_status_register().
+ *
+ * Always reports the accepted-controller regex and the change counter.  If a
+ * DHCP client exists, also reports its state, and if that client is bound,
+ * the details of the current lease. */
+static void
+discovery_status_cb(struct status_reply *sr, void *d_)
+{
+    struct discovery *d = d_;
+    const struct dhcp_msg *config;
+    uint32_t ip, netmask, router;
+    uint32_t dns;
+    char *domain;
+    int idx;
+
+    status_reply_put(sr, "accept-remote=%s", d->re);
+    status_reply_put(sr, "n-changes=%d", d->n_changes);
+    if (!d->dhcp) {
+        return;
+    }
+
+    status_reply_put(sr, "state=%s", dhclient_get_state(d->dhcp));
+    status_reply_put(sr, "state-elapsed=%u",
+                     dhclient_get_state_elapsed(d->dhcp));
+    if (!dhclient_is_bound(d->dhcp)) {
+        return;
+    }
+
+    /* Bound: describe the lease. */
+    ip = dhclient_get_ip(d->dhcp);
+    netmask = dhclient_get_netmask(d->dhcp);
+    router = dhclient_get_router(d->dhcp);
+    config = dhclient_get_config(d->dhcp);
+
+    status_reply_put(sr, "ip="IP_FMT, IP_ARGS(&ip));
+    status_reply_put(sr, "netmask="IP_FMT, IP_ARGS(&netmask));
+    if (router) {
+        status_reply_put(sr, "router="IP_FMT, IP_ARGS(&router));
+    }
+
+    /* One "dnsN" entry per DNS server present in the configuration. */
+    idx = 0;
+    while (dhcp_msg_get_ip(config, DHCP_CODE_DNS_SERVER, idx, &dns)) {
+        status_reply_put(sr, "dns%d="IP_FMT, idx, IP_ARGS(&dns));
+        idx++;
+    }
+
+    domain = dhcp_msg_get_string(config, DHCP_CODE_DOMAIN_NAME);
+    if (domain) {
+        status_reply_put(sr, "domain=%s", domain);
+        free(domain);
+    }
+
+    status_reply_put(sr, "lease-remaining=%u",
+                     dhclient_get_lease_remaining(d->dhcp));
+}
+
+/* Creates a controller discovery object that runs a DHCP client on the local
+ * port of 'dpif' and registers a "discovery" status category with 'ss'.
+ *
+ * 're' is the regular expression that an offered controller name must match;
+ * it is handed to discovery_set_accept_controller_re(), which also supplies a
+ * default when 're' is null.  'update_resolv_conf' says whether resolv.conf
+ * should be updated when the DHCP binding changes.
+ *
+ * Returns 0 and stores the new object in '*discoveryp' on success.  On
+ * failure, returns a positive errno value, stores a null pointer in
+ * '*discoveryp', and releases everything that was allocated. */
+int
+discovery_create(const char *re, bool update_resolv_conf,
+                 struct dpif *dpif, struct switch_status *ss,
+                 struct discovery **discoveryp)
+{
+    struct discovery *d;
+    char local_name[IF_NAMESIZE];
+    int error;
+
+    d = xcalloc(1, sizeof *d);
+
+    /* Controller regular expression. */
+    error = discovery_set_accept_controller_re(d, re);
+    if (error) {
+        goto error_free;
+    }
+    d->update_resolv_conf = update_resolv_conf;
+
+    /* Initialize DHCP client. */
+    error = dpif_get_name(dpif, local_name, sizeof local_name);
+    if (error) {
+        VLOG_ERR("failed to query datapath local port: %s", strerror(error));
+        goto error_regfree;
+    }
+    error = dhclient_create(local_name, modify_dhcp_request,
+                            validate_dhcp_offer, d, &d->dhcp);
+    if (error) {
+        VLOG_ERR("failed to initialize DHCP client: %s", strerror(error));
+        goto error_regfree;
+    }
+    dhclient_set_max_timeout(d->dhcp, 3);
+    dhclient_init(d->dhcp, 0);
+
+    d->ss_cat = switch_status_register(ss, "discovery",
+                                       discovery_status_cb, d);
+
+    *discoveryp = d;
+    return 0;
+
+error_regfree:
+    /* discovery_set_accept_controller_re() succeeded, so 'd' owns both the
+     * compiled regex and the pattern string; release both. */
+    regfree(d->regex);
+    free(d->regex);
+    free(d->re);
+error_free:
+    free(d);
+    *discoveryp = NULL;
+    return error;
+}
+
+/* Releases 'd' and everything it owns: the pattern string, the compiled
+ * regex, the DHCP client, and the status category registration.  Does
+ * nothing if 'd' is null. */
+void
+discovery_destroy(struct discovery *d)
+{
+    if (!d) {
+        return;
+    }
+
+    free(d->re);
+    regfree(d->regex);
+    free(d->regex);
+    dhclient_destroy(d->dhcp);
+    switch_status_unregister(d->ss_cat);
+    free(d);
+}
+
+/* Sets whether 'd' updates resolv.conf when its DHCP binding changes; the
+ * flag is consulted by discovery_run(). */
+void
+discovery_set_update_resolv_conf(struct discovery *d,
+ bool update_resolv_conf)
+{
+ d->update_resolv_conf = update_resolv_conf;
+}
+
+/* Replaces the regular expression that controller names offered via DHCP
+ * must match in order to be accepted by 'd'.
+ *
+ * If 're_' is null, a default is used: "^ssl:.*" when SSL is configured,
+ * otherwise ".*".  If 're_' does not begin with '^', one is prepended so that
+ * the match is always anchored at the start of the name.
+ *
+ * Returns 0 on success.  If the expression fails to compile, logs a warning,
+ * leaves the previous expression in 'd' untouched, and returns EINVAL. */
+int
+discovery_set_accept_controller_re(struct discovery *d, const char *re_)
+{
+    regex_t *regex;
+    int error;
+    char *re;
+
+    re = (!re_ ? xstrdup(vconn_ssl_is_configured() ? "^ssl:.*" : ".*")
+          : re_[0] == '^' ? xstrdup(re_) : xasprintf("^%s", re_));
+    regex = xmalloc(sizeof *regex);
+    error = regcomp(regex, re, REG_NOSUB | REG_EXTENDED);
+    if (error) {
+        /* The first regerror() call only measures the message. */
+        size_t length = regerror(error, regex, NULL, 0);
+        char *buffer = xmalloc(length);
+        regerror(error, regex, buffer, length);
+        VLOG_WARN("%s: %s", re, buffer);
+        free(buffer);
+        free(regex);
+        free(re);
+        return EINVAL;
+    } else {
+        /* Only discard the old expression once the new one is known good. */
+        if (d->regex) {
+            regfree(d->regex);
+            free(d->regex);
+        }
+        free(d->re);
+
+        d->regex = regex;
+        d->re = re;
+        return 0;
+    }
+}
+
+/* Asks the DHCP client of 'd', if there is one, to force a renewal by
+ * calling dhclient_force_renew() with an argument of 15. */
+void
+discovery_question_connectivity(struct discovery *d)
+{
+    if (!d->dhcp) {
+        return;
+    }
+    dhclient_force_renew(d->dhcp, 15);
+}
+
+/* Runs the DHCP client of 'd' and reports whether the discovered controller
+ * changed.
+ *
+ * Returns false, leaving '*controller_name' untouched, if nothing changed.
+ * Otherwise returns true and sets '*controller_name' either to the string
+ * obtained from dhcp_msg_get_string() for the controller vconn option, when
+ * the client is bound, or to a null pointer when it is not (or when 'd' has
+ * no DHCP client at all). */
+bool
+discovery_run(struct discovery *d, char **controller_name)
+{
+    struct dhclient *client = d->dhcp;
+
+    if (!client) {
+        *controller_name = NULL;
+        return true;
+    }
+
+    dhclient_run(client);
+    if (!dhclient_changed(client)) {
+        return false;
+    }
+
+    /* The binding changed: apply it before reporting it. */
+    dhclient_configure_netdev(client);
+    if (d->update_resolv_conf) {
+        dhclient_update_resolv_conf(client);
+    }
+
+    if (!dhclient_is_bound(client)) {
+        *controller_name = NULL;
+        /* Only count a loss if a controller had been found before. */
+        if (d->n_changes) {
+            VLOG_INFO("discovered controller no longer available");
+            d->n_changes++;
+        }
+        return true;
+    }
+
+    *controller_name = dhcp_msg_get_string(dhclient_get_config(client),
+                                           DHCP_CODE_OFP_CONTROLLER_VCONN);
+    VLOG_INFO("%s: discovered controller", *controller_name);
+    d->n_changes++;
+    return true;
+}
+
+/* Calls dhclient_wait() on the DHCP client of 'd', if there is one. */
+void
+discovery_wait(struct discovery *d)
+{
+    if (!d->dhcp) {
+        return;
+    }
+    dhclient_wait(d->dhcp);
+}
+
+/* Callback passed to dhclient_create(): adds a vendor class option with the
+ * value "OpenFlow" to the DHCP message 'msg'. 'aux' is unused. */
+static void
+modify_dhcp_request(struct dhcp_msg *msg, void *aux UNUSED)
+{
+ dhcp_msg_put_string(msg, DHCP_CODE_VENDOR_CLASS, "OpenFlow");
+}
+
+/* Callback passed to dhclient_create(): decides whether the DHCP offer 'msg'
+ * is acceptable to the struct discovery 'd_'.
+ *
+ * An offer is accepted only if it carries a controller vconn name and that
+ * name matches the accept regex of 'd_'.  Rejections are logged through the
+ * rate limiter 'rl'. */
+static bool
+validate_dhcp_offer(const struct dhcp_msg *msg, void *d_)
+{
+    const struct discovery *d = d_;
+    char *offered;
+    bool matched;
+
+    offered = dhcp_msg_get_string(msg, DHCP_CODE_OFP_CONTROLLER_VCONN);
+    if (!offered) {
+        VLOG_WARN_RL(&rl, "rejecting DHCP offer missing controller vconn");
+        return false;
+    }
+
+    /* regexec() returns 0 on a match. */
+    matched = regexec(d->regex, offered, 0, NULL, 0) == 0;
+    if (!matched) {
+        VLOG_WARN_RL(&rl, "rejecting controller vconn that fails to match %s",
+                     d->re);
+    }
+    free(offered);
+    return matched;
+}
diff --git a/secchan/discovery.h b/secchan/discovery.h
new file mode 100644
index 000000000..cf8a925f0
--- /dev/null
+++ b/secchan/discovery.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef DISCOVERY_H
+#define DISCOVERY_H 1
+
+#include <stdbool.h>
+
+struct dpif;
+struct discovery;
+struct settings;
+struct switch_status;
+
+int discovery_create(const char *accept_controller_re, bool update_resolv_conf,
+ struct dpif *, struct switch_status *,
+ struct discovery **);
+void discovery_destroy(struct discovery *);
+void discovery_set_update_resolv_conf(struct discovery *,
+ bool update_resolv_conf);
+int discovery_set_accept_controller_re(struct discovery *, const char *re);
+void discovery_question_connectivity(struct discovery *);
+bool discovery_run(struct discovery *, char **controller_name);
+void discovery_wait(struct discovery *);
+
+#endif /* discovery.h */
diff --git a/secchan/executer.c b/secchan/executer.c
new file mode 100644
index 000000000..304df56c1
--- /dev/null
+++ b/secchan/executer.c
@@ -0,0 +1,519 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "executer.h"
+#include <errno.h>
+#include <fcntl.h>
+#include <fnmatch.h>
+#include <poll.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <string.h>
+#include <unistd.h>
+#include "dirs.h"
+#include "dynamic-string.h"
+#include "fatal-signal.h"
+#include "openflow/nicira-ext.h"
+#include "ofpbuf.h"
+#include "openflow/openflow.h"
+#include "poll-loop.h"
+#include "rconn.h"
+#include "socket-util.h"
+#include "util.h"
+#include "vconn.h"
+
+#define THIS_MODULE VLM_executer
+#include "vlog.h"
+
+#define MAX_CHILDREN 8
+
+/* One subprocess started by executer_handle_request() and tracked in
+ * 'struct executer' until executer_run() reaps it. */
+struct child {
+ /* Information about child process. */
+ char *name; /* argv[0] passed to child (heap-allocated copy). */
+ pid_t pid; /* Child's process ID. */
+
+ /* For sending a reply to the controller when the child dies.
+ * 'rconn' is not owned by the child; executer_rconn_closing() resets it to
+ * NULL when the connection goes away, in which case no reply is sent. */
+ struct rconn *rconn;
+ uint32_t xid; /* Transaction ID used by controller. */
+
+ /* We read up to MAX_OUTPUT bytes of output and send them back to the
+ * controller when the child dies. */
+#define MAX_OUTPUT 4096
+ int output_fd; /* FD from which to read child's output, -1 once closed. */
+ uint8_t *output; /* Output data (buffer of MAX_OUTPUT bytes). */
+ size_t output_size; /* Number of bytes of output data so far. */
+};
+
+/* State for running controller-requested commands as subprocesses. */
+struct executer {
+ /* Settings. */
+ char *command_acl; /* Command white/blacklist, as shell globs. */
+ char *command_dir; /* Directory that contains commands. */
+
+ /* Children. Only the first 'n_children' slots of 'children' are in use;
+ * executer_run() fills a vacated slot with the last active entry, so the
+ * order of entries is not stable. */
+ struct child children[MAX_CHILDREN];
+ size_t n_children;
+};
+
+/* File descriptors for waking up when a child dies. */
+static int signal_fds[2];
+
+/* File descriptor for /dev/null. */
+static int null_fd = -1;
+
+static void send_child_status(struct rconn *, uint32_t xid, uint32_t status,
+ const void *data, size_t size);
+static void send_child_message(struct rconn *, uint32_t xid, uint32_t status,
+ const char *message);
+
+/* Returns true if 'cmd' is allowed by 'acl_', which is a comma-separated
+ * access control list in the format described for --command-acl in
+ * secchan(8).
+ *
+ * 'cmd' must consist only of ASCII alphanumerics, '_', and '-'; anything else
+ * is rejected outright.  Each ACL entry is an fnmatch() glob: a plain entry
+ * whitelists the commands it matches and an entry prefixed with '!'
+ * blacklists them.  Scanning stops at the first blacklist match, and a
+ * blacklist match always overrides a whitelist match.  The decision is
+ * logged either way. */
+static bool
+executer_is_permitted(const char *acl_, const char *cmd)
+{
+    char *acl, *save_ptr, *pattern;
+    bool allowed, denied;
+
+    /* Verify that 'cmd' consists only of alphanumerics plus _ or -. */
+    if (cmd[strspn(cmd, "abcdefghijklmnopqrstuvwxyz"
+                   "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-")] != '\0') {
+        VLOG_WARN("rejecting command name \"%s\" that contain forbidden "
+                  "characters", cmd);
+        return false;
+    }
+
+    /* Check 'cmd' against 'acl'.  strsep() modifies its argument, so work on
+     * a private copy. */
+    acl = xstrdup(acl_);
+    save_ptr = acl;
+    allowed = denied = false;
+    while ((pattern = strsep(&save_ptr, ",")) != NULL && !denied) {
+        if (pattern[0] != '!' && !fnmatch(pattern, cmd, 0)) {
+            allowed = true;
+        } else if (pattern[0] == '!' && !fnmatch(pattern + 1, cmd, 0)) {
+            denied = true;
+        }
+    }
+    free(acl);
+
+    /* Log the command white/blacklisted state.  'denied' is tested first so
+     * that a command that is only blacklisted is reported as such instead of
+     * falling into the "not whitelisted" case. */
+    if (denied) {
+        if (allowed) {
+            VLOG_WARN("denying command execution: \"%s\" is both blacklisted "
+                      "and whitelisted", cmd);
+        } else {
+            VLOG_WARN("denying command execution: \"%s\" is blacklisted",
+                      cmd);
+        }
+    } else if (allowed) {
+        VLOG_INFO("permitting command execution: \"%s\" is whitelisted", cmd);
+    } else {
+        VLOG_WARN("denying command execution: \"%s\" is not whitelisted", cmd);
+    }
+    return allowed && !denied;
+}
+
+/* Handles 'request', a command execution request received on 'rconn'.
+ *
+ * The bytes that follow the header in 'request' are treated as a sequence of
+ * null-delimited arguments, the first of which names the command. The
+ * command must pass executer_is_permitted() and must be a regular file in
+ * e->command_dir. If so, it is started as a subprocess with stdin and stderr
+ * connected to /dev/null and stdout connected to a pipe that poll_child()
+ * reads.
+ *
+ * The outcome is reported to the controller through send_child_status():
+ * NXT_STATUS_STARTED on success, NXT_STATUS_ERROR with a short message
+ * otherwise. Always returns 0. */
+int
+executer_handle_request(struct executer *e, struct rconn *rconn,
+ struct nicira_header *request)
+{
+ char **argv;
+ char *args;
+ char *exec_file = NULL;
+ int max_fds;
+ struct stat s;
+ size_t args_size;
+ size_t argc;
+ size_t i;
+ pid_t pid;
+ int output_fds[2];
+
+ /* Verify limit on children not exceeded.
+ * XXX should probably kill children when the connection drops? */
+ if (e->n_children >= MAX_CHILDREN) {
+ send_child_message(rconn, request->header.xid, NXT_STATUS_ERROR,
+ "too many child processes");
+ return 0;
+ }
+
+ /* Copy argument buffer, adding a null terminator at the end. Now every
+ * argument is null-terminated, instead of being merely null-delimited.
+ *
+ * NOTE(review): assumes the caller already verified that the message
+ * length is at least sizeof *request; otherwise this subtraction wraps
+ * around. Confirm against the caller. */
+ args_size = ntohs(request->header.length) - sizeof *request;
+ args = xmemdup0((const void *) (request + 1), args_size);
+
+ /* Count arguments. The terminator added above is included in the scan,
+ * so 'argc' is always at least 1. */
+ argc = 0;
+ for (i = 0; i <= args_size; i++) {
+ argc += args[i] == '\0';
+ }
+
+ /* Set argv[*] to point to each argument. */
+ argv = xmalloc((argc + 1) * sizeof *argv);
+ argv[0] = args;
+ for (i = 1; i < argc; i++) {
+ argv[i] = strchr(argv[i - 1], '\0') + 1;
+ }
+ argv[argc] = NULL;
+
+ /* Check permissions. */
+ if (!executer_is_permitted(e->command_acl, argv[0])) {
+ send_child_message(rconn, request->header.xid, NXT_STATUS_ERROR,
+ "command not allowed");
+ goto done;
+ }
+
+ /* Find the executable. The controller gets the same "command not
+ * allowed" reply as for an ACL failure; the specific reason is only
+ * logged locally. */
+ exec_file = xasprintf("%s/%s", e->command_dir, argv[0]);
+ if (stat(exec_file, &s)) {
+ VLOG_WARN("failed to stat \"%s\": %s", exec_file, strerror(errno));
+ send_child_message(rconn, request->header.xid, NXT_STATUS_ERROR,
+ "command not allowed");
+ goto done;
+ }
+ if (!S_ISREG(s.st_mode)) {
+ VLOG_WARN("\"%s\" is not a regular file", exec_file);
+ send_child_message(rconn, request->header.xid, NXT_STATUS_ERROR,
+ "command not allowed");
+ goto done;
+ }
+ argv[0] = exec_file;
+
+ /* Arrange to capture output. */
+ if (pipe(output_fds)) {
+ VLOG_WARN("pipe failed: %s", strerror(errno));
+ send_child_message(rconn, request->header.xid, NXT_STATUS_ERROR,
+ "internal error (pipe)");
+ goto done;
+ }
+
+ pid = fork();
+ if (!pid) {
+ /* Running in child.
+ * XXX should run in new process group so that we can signal all
+ * subprocesses at once? Would also want to catch fatal signals and
+ * kill them at the same time though. */
+ fatal_signal_fork();
+ /* stdin and stderr go to /dev/null, stdout to the pipe. */
+ dup2(null_fd, 0);
+ dup2(output_fds[1], 1);
+ dup2(null_fd, 2);
+ /* Close everything else, including the original pipe and /dev/null
+ * descriptors, so the command inherits only fds 0, 1 and 2. */
+ max_fds = get_max_fds();
+ for (i = 3; i < max_fds; i++) {
+ close(i);
+ }
+ /* From here on stdout is the pipe, so printf() output ends up in the
+ * output that is reported back to the controller. */
+ if (chdir(e->command_dir)) {
+ printf("could not change directory to \"%s\": %s",
+ e->command_dir, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ execv(argv[0], argv);
+ printf("failed to start \"%s\": %s\n", argv[0], strerror(errno));
+ exit(EXIT_FAILURE);
+ } else if (pid > 0) {
+ /* Running in parent. */
+ struct child *child;
+
+ VLOG_INFO("started \"%s\" subprocess", argv[0]);
+ send_child_status(rconn, request->header.xid, NXT_STATUS_STARTED,
+ NULL, 0);
+ child = &e->children[e->n_children++];
+ /* argv[0] is 'exec_file', which is freed below, so keep a copy. */
+ child->name = xstrdup(argv[0]);
+ child->pid = pid;
+ child->rconn = rconn;
+ child->xid = request->header.xid;
+ child->output_fd = output_fds[0];
+ child->output = xmalloc(MAX_OUTPUT);
+ child->output_size = 0;
+ set_nonblocking(output_fds[0]);
+ /* Only the child writes to the pipe. */
+ close(output_fds[1]);
+ } else {
+ VLOG_WARN("fork failed: %s", strerror(errno));
+ send_child_message(rconn, request->header.xid, NXT_STATUS_ERROR,
+ "internal error (fork)");
+ close(output_fds[0]);
+ close(output_fds[1]);
+ }
+
+done:
+ free(exec_file);
+ free(args);
+ free(argv);
+ return 0;
+}
+
+/* Sends an NXT_COMMAND_REPLY with the given 'status', followed by the 'size'
+ * bytes at 'data', to 'rconn' as a reply to transaction 'xid'.  Does nothing
+ * if 'rconn' is null. */
+static void
+send_child_status(struct rconn *rconn, uint32_t xid, uint32_t status,
+                  const void *data, size_t size)
+{
+    struct nx_command_reply *reply;
+    struct ofpbuf *msg;
+
+    if (!rconn) {
+        return;
+    }
+
+    /* Fill in the fixed-size part of the reply before appending the
+     * payload. */
+    reply = make_openflow_xid(sizeof *reply, OFPT_VENDOR, xid, &msg);
+    reply->nxh.vendor = htonl(NX_VENDOR_ID);
+    reply->nxh.subtype = htonl(NXT_COMMAND_REPLY);
+    reply->status = htonl(status);
+
+    ofpbuf_put(msg, data, size);
+    update_openflow_length(msg);
+
+    /* A nonzero return means the message was not sent, so discard it. */
+    if (rconn_send(rconn, msg, NULL)) {
+        ofpbuf_delete(msg);
+    }
+}
+
+/* Like send_child_status(), but the payload is the null-terminated string
+ * 'message' (sent without its terminator). */
+static void
+send_child_message(struct rconn *rconn, uint32_t xid, uint32_t status,
+                   const char *message)
+{
+    size_t length = strlen(message);
+
+    send_child_status(rconn, xid, status, message, length);
+}
+
+/* 'child' died with 'status' as its wait status (as reported by waitpid()).
+ * Logs how it terminated and sends the final status, together with whatever
+ * output was captured, to the controller that requested the command (if its
+ * connection is still around). */
+static void
+child_terminated(struct child *child, int status)
+{
+    struct ds ds;
+    uint32_t ofp_status;
+    bool core_dumped = false;
+
+    /* WCOREDUMP is not available everywhere and is only meaningful for a
+     * process that was killed by a signal. */
+#ifdef WCOREDUMP
+    if (WIFSIGNALED(status) && WCOREDUMP(status)) {
+        core_dumped = true;
+    }
+#endif
+
+    /* Log how it terminated. */
+    ds_init(&ds);
+    if (WIFEXITED(status)) {
+        ds_put_format(&ds, "normally with status %d", WEXITSTATUS(status));
+    } else if (WIFSIGNALED(status)) {
+        const char *name = NULL;
+#ifdef HAVE_STRSIGNAL
+        name = strsignal(WTERMSIG(status));
+#endif
+        ds_put_format(&ds, "by signal %d", WTERMSIG(status));
+        if (name) {
+            ds_put_format(&ds, " (%s)", name);
+        }
+    }
+    if (core_dumped) {
+        ds_put_cstr(&ds, " (core dumped)");
+    }
+    VLOG_INFO("child process \"%s\" with pid %ld terminated %s",
+              child->name, (long int) child->pid, ds_cstr(&ds));
+    ds_destroy(&ds);
+
+    /* Send a status message back to the controller that requested the
+     * command. */
+    if (WIFEXITED(status)) {
+        ofp_status = WEXITSTATUS(status) | NXT_STATUS_EXITED;
+    } else if (WIFSIGNALED(status)) {
+        ofp_status = WTERMSIG(status) | NXT_STATUS_SIGNALED;
+    } else {
+        ofp_status = NXT_STATUS_UNKNOWN;
+    }
+    if (core_dumped) {
+        ofp_status |= NXT_STATUS_COREDUMP;
+    }
+    send_child_status(child->rconn, child->xid, ofp_status,
+                      child->output, child->output_size);
+}
+
+/* Makes one attempt to read more output from 'child' into its output buffer.
+ *
+ * The output fd is closed (and set to -1) once the buffer holds MAX_OUTPUT
+ * bytes, on end of file, or on any read error other than EAGAIN.  Does
+ * nothing if the fd is already closed. */
+static void
+poll_child(struct child *child)
+{
+    ssize_t n_read;
+    bool keep_open;
+
+    if (child->output_fd < 0) {
+        return;
+    }
+
+    /* Retry the read for as long as it is interrupted by a signal. */
+    for (;;) {
+        n_read = read(child->output_fd, &child->output[child->output_size],
+                      MAX_OUTPUT - child->output_size);
+        if (n_read >= 0 || errno != EINTR) {
+            break;
+        }
+    }
+
+    if (n_read > 0) {
+        child->output_size += n_read;
+        keep_open = child->output_size < MAX_OUTPUT;
+    } else {
+        keep_open = n_read < 0 && errno == EAGAIN;
+    }
+
+    if (!keep_open) {
+        close(child->output_fd);
+        child->output_fd = -1;
+    }
+}
+
+/* Performs periodic work for 'e': collects pending output from every child
+ * and, if SIGCHLD has been received since the last call, reaps the children
+ * that have died, reports their status to the controller, and removes them
+ * from 'e'. */
+void
+executer_run(struct executer *e)
+{
+    char buffer[MAX_CHILDREN];
+    size_t i;
+
+    if (!e->n_children) {
+        return;
+    }
+
+    /* Read output from children. */
+    for (i = 0; i < e->n_children; i++) {
+        struct child *child = &e->children[i];
+        poll_child(child);
+    }
+
+    /* If SIGCHLD was received, reap dead children.  The bytes read are only
+     * wakeup tokens written by sigchld_handler(); their content does not
+     * matter. */
+    if (read(signal_fds[0], buffer, sizeof buffer) <= 0) {
+        return;
+    }
+    for (;;) {
+        int status;
+        pid_t pid;
+
+        /* Get dead child in 'pid' and its return code in 'status'. */
+        pid = waitpid(WAIT_ANY, &status, WNOHANG);
+        if (pid < 0 && errno == EINTR) {
+            continue;
+        } else if (pid <= 0) {
+            return;
+        }
+
+        /* Find child with given 'pid' and drop it from the list. */
+        for (i = 0; i < e->n_children; i++) {
+            struct child *child = &e->children[i];
+            if (child->pid == pid) {
+                /* Pick up any last output before reporting. */
+                poll_child(child);
+                child_terminated(child, status);
+
+                /* poll_child() leaves the fd open after a short read or
+                 * EAGAIN.  The slot is about to be reused, so close it here
+                 * to avoid leaking the descriptor. */
+                if (child->output_fd >= 0) {
+                    close(child->output_fd);
+                }
+                free(child->name);
+                free(child->output);
+
+                /* Fill the vacated slot with the last active entry. */
+                *child = e->children[--e->n_children];
+                goto found;
+            }
+        }
+        VLOG_WARN("child with unknown pid %ld terminated", (long int) pid);
+    found:;
+    }
+
+}
+
+/* Arranges for the poll loop to wake up when executer_run() has work to do
+ * for 'e': when SIGCHLD is delivered or when any child produces output.
+ * Nothing is registered if there are no children. */
+void
+executer_wait(struct executer *e)
+{
+    size_t i;
+
+    if (!e->n_children) {
+        return;
+    }
+
+    /* Wake up on SIGCHLD. */
+    poll_fd_wait(signal_fds[0], POLLIN);
+
+    /* Wake up when we get output from a child. */
+    for (i = 0; i < e->n_children; i++) {
+        int fd = e->children[i].output_fd;
+
+        if (fd >= 0) {
+            poll_fd_wait(fd, POLLIN);
+        }
+    }
+}
+
+/* Notifies 'e' that 'rconn' is about to go away.
+ *
+ * Every child that was started through 'rconn' is detached from it, so that
+ * no reply is sent over a dead connection when the process terminates later.
+ * The children themselves keep running.
+ * XXX kill the children started by 'rconn'? */
+void
+executer_rconn_closing(struct executer *e, struct rconn *rconn)
+{
+    struct child *end = &e->children[e->n_children];
+    struct child *child;
+
+    for (child = e->children; child < end; child++) {
+        if (child->rconn == rconn) {
+            child->rconn = NULL;
+        }
+    }
+}
+
+/* SIGCHLD handler: writes one byte to the signal pipe so that the poll loop
+ * wakes up and executer_run() reaps the dead child.
+ *
+ * 'errno' is saved and restored because write() may modify it, and the
+ * handler can interrupt code that is about to inspect 'errno'. */
+static void
+sigchld_handler(int signr UNUSED)
+{
+    int saved_errno = errno;
+    ssize_t retval;
+
+    /* The write end is nonblocking.  If the pipe is full, the write fails,
+     * but then a wakeup is already pending, so the result can be ignored. */
+    retval = write(signal_fds[1], "", 1);
+    (void) retval;
+
+    errno = saved_errno;
+}
+
+/* Creates a new executer that permits the commands described by
+ * 'command_acl' and looks for them in 'command_dir', or in the "commands"
+ * subdirectory of ovs_pkgdatadir if 'command_dir' is null.
+ *
+ * The first successful call also sets up the process-wide SIGCHLD
+ * notification pipe and opens /dev/null; every call (re)installs the SIGCHLD
+ * handler.
+ *
+ * Returns 0 and stores the new executer in '*executerp' if successful,
+ * otherwise returns a positive errno value and sets '*executerp' to null. */
+int
+executer_create(const char *command_acl, const char *command_dir,
+                struct executer **executerp)
+{
+    struct executer *e;
+    struct sigaction sa;
+
+    *executerp = NULL;
+    if (null_fd == -1) {
+        /* Create pipe for notifying us that SIGCHLD was invoked. */
+        if (pipe(signal_fds)) {
+            /* Save 'errno' first: logging may change it. */
+            int error = errno;
+            VLOG_ERR("pipe failed: %s", strerror(error));
+            return error;
+        }
+        set_nonblocking(signal_fds[0]);
+        set_nonblocking(signal_fds[1]);
+
+        /* Open /dev/null. */
+        null_fd = open("/dev/null", O_RDWR);
+        if (null_fd < 0) {
+            int error = errno;
+            VLOG_ERR("could not open /dev/null: %s", strerror(error));
+            close(signal_fds[0]);
+            close(signal_fds[1]);
+            return error;
+        }
+    }
+
+    /* Set up signal handler. */
+    memset(&sa, 0, sizeof sa);
+    sa.sa_handler = sigchld_handler;
+    sigemptyset(&sa.sa_mask);
+    sa.sa_flags = SA_NOCLDSTOP | SA_RESTART;
+    if (sigaction(SIGCHLD, &sa, NULL)) {
+        int error = errno;
+        VLOG_ERR("sigaction(SIGCHLD) failed: %s", strerror(error));
+        return error;
+    }
+
+    e = xcalloc(1, sizeof *e);
+    e->command_acl = xstrdup(command_acl);
+    e->command_dir = (command_dir
+                      ? xstrdup(command_dir)
+                      : xasprintf("%s/commands", ovs_pkgdatadir));
+    e->n_children = 0;
+    *executerp = e;
+    return 0;
+}
+
+/* Destroys 'e' (which may be null).  Every child that is still being tracked
+ * is sent SIGHUP and its bookkeeping is released; the children are not waited
+ * for. */
+void
+executer_destroy(struct executer *e)
+{
+    if (e) {
+        size_t i;
+
+        free(e->command_acl);
+        free(e->command_dir);
+        for (i = 0; i < e->n_children; i++) {
+            struct child *child = &e->children[i];
+
+            free(child->name);
+            kill(child->pid, SIGHUP);
+            /* We don't own child->rconn. */
+            if (child->output_fd >= 0) {
+                close(child->output_fd);
+            }
+            free(child->output);
+            /* 'child' itself is a slot in the array embedded in '*e', so it
+             * must not be freed on its own; free(e) below releases it. */
+        }
+        free(e);
+    }
+}
+
+/* Replaces the command ACL and command directory of 'e' by copies of 'acl'
+ * and 'dir'.  As in executer_create(), a null 'dir' selects the "commands"
+ * subdirectory of ovs_pkgdatadir.
+ *
+ * The new strings are copied before the old ones are freed, so it is safe to
+ * pass e's own current settings as arguments. */
+void
+executer_set_acl(struct executer *e, const char *acl, const char *dir)
+{
+    char *new_acl = xstrdup(acl);
+    char *new_dir = (dir
+                     ? xstrdup(dir)
+                     : xasprintf("%s/commands", ovs_pkgdatadir));
+
+    free(e->command_acl);
+    e->command_acl = new_acl;
+    free(e->command_dir);
+    e->command_dir = new_dir;
+}
diff --git a/secchan/executer.h b/secchan/executer.h
new file mode 100644
index 000000000..a663367b5
--- /dev/null
+++ b/secchan/executer.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef EXECUTER_H
+#define EXECUTER_H 1
+
+struct executer;
+struct nicira_header;
+struct rconn;
+
+int executer_create(const char *acl, const char *dir, struct executer **);
+void executer_set_acl(struct executer *, const char *acl, const char *dir);
+void executer_destroy(struct executer *);
+void executer_run(struct executer *);
+void executer_wait(struct executer *);
+void executer_rconn_closing(struct executer *, struct rconn *);
+int executer_handle_request(struct executer *, struct rconn *,
+ struct nicira_header *);
+
+#endif /* executer.h */
diff --git a/secchan/fail-open.c b/secchan/fail-open.c
new file mode 100644
index 000000000..1ef989a02
--- /dev/null
+++ b/secchan/fail-open.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "fail-open.h"
+#include <inttypes.h>
+#include <stdlib.h>
+#include "flow.h"
+#include "mac-learning.h"
+#include "odp-util.h"
+#include "ofproto.h"
+#include "rconn.h"
+#include "status.h"
+#include "timeval.h"
+
+#define THIS_MODULE VLM_fail_open
+#include "vlog.h"
+
+/* State for fail-open mode: when the controller has been unreachable for at
+ * least 'trigger_duration', the flow table is flushed and replaced by a
+ * single rule that sends every packet to OFPP_NORMAL. */
+struct fail_open {
+ struct ofproto *ofproto; /* Switch whose flow table we manipulate. */
+ struct rconn *controller; /* Controller connection (not owned). */
+ int trigger_duration; /* Failure duration that triggers fail-open. */
+ int last_disconn_secs; /* 0 if not in fail-open mode, otherwise the
+ * failure duration when last logged. */
+ struct status_category *ss_cat; /* "fail-open" status registration. */
+};
+
+/* Causes the switch to enter or leave fail-open mode, if appropriate.
+ *
+ * While the switch remains in fail-open mode, a reminder is logged each time
+ * the failure duration has grown by more than 60 seconds since the last
+ * message. */
+void
+fail_open_run(struct fail_open *fo)
+{
+    int disconn_secs = rconn_failure_duration(fo->controller);
+    bool should_be_open = disconn_secs >= fo->trigger_duration;
+    bool is_open = fo->last_disconn_secs != 0;
+
+    if (should_be_open && !is_open) {
+        /* Entering fail-open mode. */
+        VLOG_WARN("Could not connect to controller for %d seconds, "
+                  "failing open", disconn_secs);
+        fo->last_disconn_secs = disconn_secs;
+
+        /* Flush all OpenFlow and datapath flows.  We will set up our
+         * fail-open rule from fail_open_flushed() when
+         * ofproto_flush_flows() calls back to us. */
+        ofproto_flush_flows(fo->ofproto);
+    } else if (!should_be_open && is_open) {
+        /* Leaving fail-open mode: remove the catch-all rule. */
+        flow_t flow;
+
+        VLOG_WARN("No longer in fail-open mode");
+        fo->last_disconn_secs = 0;
+
+        memset(&flow, 0, sizeof flow);
+        ofproto_delete_flow(fo->ofproto, &flow, OFPFW_ALL, 70000);
+    } else if (should_be_open
+               && disconn_secs > fo->last_disconn_secs + 60) {
+        VLOG_INFO("Still in fail-open mode after %d seconds disconnected "
+                  "from controller", disconn_secs);
+        fo->last_disconn_secs = disconn_secs;
+    }
+}
+
+/* Counterpart of fail_open_run() for the poll loop. Currently registers no
+ * wakeup events. */
+void
+fail_open_wait(struct fail_open *fo UNUSED)
+{
+ /* Nothing to do. */
+}
+
+/* Called after the flow table has been flushed.  If the controller has been
+ * unreachable for at least the trigger duration, (re)installs the fail-open
+ * rule; otherwise does nothing. */
+void
+fail_open_flushed(struct fail_open *fo)
+{
+    union ofp_action action;
+    flow_t flow;
+
+    if (rconn_failure_duration(fo->controller) < fo->trigger_duration) {
+        return;
+    }
+
+    /* Set up a flow that matches every packet and directs them to
+     * OFPP_NORMAL. */
+    memset(&flow, 0, sizeof flow);
+    memset(&action, 0, sizeof action);
+    action.type = htons(OFPAT_OUTPUT);
+    action.output.len = htons(sizeof action);
+    action.output.port = htons(OFPP_NORMAL);
+    ofproto_add_flow(fo->ofproto, &flow, OFPFW_ALL, 70000, &action, 1, 0);
+}
+
+/* Status callback for the "fail-open" category: reports the trigger
+ * duration, the controller's current failure duration, and whether the
+ * latter has reached the former.  'fo_' is the 'struct fail_open' that was
+ * registered. */
+static void
+fail_open_status_cb(struct status_reply *sr, void *fo_)
+{
+    struct fail_open *fo = fo_;
+    int cur_duration = rconn_failure_duration(fo->controller);
+    const char *triggered;
+
+    if (cur_duration >= fo->trigger_duration) {
+        triggered = "true";
+    } else {
+        triggered = "false";
+    }
+
+    status_reply_put(sr, "trigger-duration=%d", fo->trigger_duration);
+    status_reply_put(sr, "current-duration=%d", cur_duration);
+    status_reply_put(sr, "triggered=%s", triggered);
+}
+
+/* Creates and returns a fail-open monitor for 'ofproto' that triggers when
+ * 'controller' has been failing for at least 'trigger_duration', and
+ * registers a "fail-open" status category with 'switch_status'.  The new
+ * object starts out not in fail-open mode. */
+struct fail_open *
+fail_open_create(struct ofproto *ofproto,
+                 int trigger_duration, struct switch_status *switch_status,
+                 struct rconn *controller)
+{
+    struct fail_open *fo;
+
+    fo = xmalloc(sizeof *fo);
+    fo->last_disconn_secs = 0;
+    fo->trigger_duration = trigger_duration;
+    fo->controller = controller;
+    fo->ofproto = ofproto;
+
+    /* Register last, once every other field is valid. */
+    fo->ss_cat = switch_status_register(switch_status, "fail-open",
+                                        fail_open_status_cb, fo);
+    return fo;
+}
+
+/* Changes the failure duration at which 'fo' fails open to
+ * 'trigger_duration'. The new value is used the next time the duration is
+ * compared, e.g. in fail_open_run(). */
+void
+fail_open_set_trigger_duration(struct fail_open *fo, int trigger_duration)
+{
+ fo->trigger_duration = trigger_duration;
+}
+
+/* Unregisters the status category of 'fo' and frees it.  Does nothing if
+ * 'fo' is null. */
+void
+fail_open_destroy(struct fail_open *fo)
+{
+    if (!fo) {
+        return;
+    }
+
+    /* We don't own fo->controller. */
+    switch_status_unregister(fo->ss_cat);
+    free(fo);
+}
diff --git a/secchan/fail-open.h b/secchan/fail-open.h
new file mode 100644
index 000000000..8e01da2d3
--- /dev/null
+++ b/secchan/fail-open.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef FAIL_OPEN_H
+#define FAIL_OPEN_H 1
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "flow.h"
+
+struct fail_open;
+struct ofproto;
+struct rconn;
+struct switch_status;
+
+struct fail_open *fail_open_create(struct ofproto *, int trigger_duration,
+ struct switch_status *,
+ struct rconn *controller);
+void fail_open_set_trigger_duration(struct fail_open *, int trigger_duration);
+void fail_open_destroy(struct fail_open *);
+void fail_open_wait(struct fail_open *);
+void fail_open_run(struct fail_open *);
+void fail_open_flushed(struct fail_open *);
+
+#endif /* fail-open.h */
diff --git a/secchan/in-band.c b/secchan/in-band.c
new file mode 100644
index 000000000..51bf9ab4b
--- /dev/null
+++ b/secchan/in-band.c
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "in-band.h"
+#include <arpa/inet.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <net/if.h>
+#include <stdlib.h>
+#include <string.h>
+#include "dpif.h"
+#include "flow.h"
+#include "mac-learning.h"
+#include "netdev.h"
+#include "odp-util.h"
+#include "ofp-print.h"
+#include "ofproto.h"
+#include "ofpbuf.h"
+#include "openflow/openflow.h"
+#include "packets.h"
+#include "poll-loop.h"
+#include "rconn.h"
+#include "status.h"
+#include "timeval.h"
+#include "vconn.h"
+
+#define THIS_MODULE VLM_in_band
+#include "vlog.h"
+
+#define IB_BASE_PRIORITY 18181800
+
+/* Indexes into the 'rules' array of 'struct in_band', one per in-band
+ * control rule. setup_flow() gives rule number 'i' the priority
+ * IB_BASE_PRIORITY + (N_IB_RULES - i), so rules listed earlier get higher
+ * priority. */
+enum {
+ IBR_FROM_LOCAL_PORT, /* Sent by secure channel. */
+ IBR_TO_LOCAL_PORT, /* Sent to secure channel. */
+ IBR_ARP_FROM_CTL, /* ARP from the controller. */
+ IBR_TO_CTL_OFP_SRC, /* To controller, OpenFlow source port. */
+ IBR_TO_CTL_OFP_DST, /* To controller, OpenFlow dest port. */
+ IBR_FROM_CTL_OFP_SRC, /* From controller, OpenFlow source port. */
+ IBR_FROM_CTL_OFP_DST, /* From controller, OpenFlow dest port. */
+ /* The OpenFlow rules match on OFP_TCP_PORT only, which is sufficient only
+ * while the TCP and SSL ports are the same. */
+#if OFP_TCP_PORT != OFP_SSL_PORT
+#error Need to support separate TCP and SSL flows.
+#endif
+ N_IB_RULES
+};
+
+/* One in-band control rule, remembered so that drop_flow() can delete exactly
+ * what setup_flow() added. */
+struct ib_rule {
+ bool installed; /* Set by setup_flow(); cleared by drop_flow() and
+ * in_band_flushed(). */
+ flow_t flow; /* Flow passed to ofproto_add_flow(). */
+ uint32_t wildcards; /* OFPFW_* wildcards for 'flow'. */
+ unsigned int priority; /* Priority the rule was added with. */
+};
+
+/* State for in-band control: keeps a set of flows installed so that traffic
+ * between the switch's local port and the controller keeps flowing. */
+struct in_band {
+ struct ofproto *ofproto; /* Switch in which the rules are installed. */
+ struct netdev *netdev; /* Device opened in in_band_create(). */
+ struct rconn *controller; /* Controller connection (not owned). */
+ struct status_category *ss_cat; /* "in-band" status registration. */
+
+ /* Keeping track of controller's MAC address. */
+ uint32_t ip; /* Current IP, 0 if unknown. */
+ uint32_t last_ip; /* Last known IP, 0 if never known. */
+ uint8_t mac[ETH_ADDR_LEN]; /* Current MAC, 0 if unknown. */
+ uint8_t last_mac[ETH_ADDR_LEN]; /* Last known MAC, 0 if never known */
+ time_t next_refresh; /* Next time to refresh MAC address. */
+
+ /* Keeping track of the local port's MAC address. */
+ uint8_t local_mac[ETH_ADDR_LEN]; /* Current MAC. */
+ time_t next_local_refresh; /* Next time to refresh MAC address. */
+
+ /* Rules that we set up, indexed by the IBR_* constants. */
+ struct ib_rule rules[N_IB_RULES];
+};
+
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60);
+
+/* Returns the controller's MAC address, or a null pointer if it is not
+ * currently known.
+ *
+ * The cached address is looked up again whenever the controller's IP address
+ * has changed or the refresh timer has expired.  Changes in the IP or MAC
+ * address are logged at debug level. */
+static const uint8_t *
+get_controller_mac(struct in_band *ib)
+{
+    time_t now = time_now();
+    uint32_t controller_ip = rconn_get_ip(ib->controller);
+    bool refresh = controller_ip != ib->ip || now >= ib->next_refresh;
+
+    if (refresh) {
+        bool mac_known;
+        int delay;
+
+        ib->ip = controller_ip;
+
+        /* Look up MAC address. */
+        memset(ib->mac, 0, sizeof ib->mac);
+        if (ib->ip) {
+            int error = netdev_arp_lookup(ib->netdev, ib->ip, ib->mac);
+            if (error) {
+                VLOG_DBG_RL(&rl, "cannot look up controller hw address "
+                            "("IP_FMT"): %s",
+                            IP_ARGS(&ib->ip), strerror(error));
+            }
+        }
+        mac_known = !eth_addr_is_zero(ib->mac);
+
+        /* Log changes in IP, MAC addresses. */
+        if (ib->ip && ib->ip != ib->last_ip) {
+            VLOG_DBG("controller IP address changed from "IP_FMT
+                     " to "IP_FMT, IP_ARGS(&ib->last_ip), IP_ARGS(&ib->ip));
+            ib->last_ip = ib->ip;
+        }
+        if (mac_known && memcmp(ib->last_mac, ib->mac, ETH_ADDR_LEN)) {
+            VLOG_DBG("controller MAC address changed from "ETH_ADDR_FMT" to "
+                     ETH_ADDR_FMT,
+                     ETH_ADDR_ARGS(ib->last_mac), ETH_ADDR_ARGS(ib->mac));
+            memcpy(ib->last_mac, ib->mac, ETH_ADDR_LEN);
+        }
+
+        /* Schedule next refresh.
+         *
+         * If we have an IP address but not a MAC address, then refresh
+         * quickly, since we probably will get a MAC address soon (via ARP).
+         * Otherwise, we can afford to wait a little while. */
+        if (ib->ip && !mac_known) {
+            delay = 1;
+        } else {
+            delay = 10;
+        }
+        ib->next_refresh = now + delay;
+    }
+
+    return eth_addr_is_zero(ib->mac) ? NULL : ib->mac;
+}
+
+/* Returns the cached MAC address of the local port, or a null pointer if the
+ * cached address is all-zeros.  The address is re-read from the device at
+ * most once per second; if reading fails, the previous value is kept. */
+static const uint8_t *
+get_local_mac(struct in_band *ib)
+{
+    time_t now = time_now();
+
+    if (now >= ib->next_local_refresh) {
+        const char *name = netdev_get_name(ib->netdev);
+        uint8_t addr[ETH_ADDR_LEN];
+        int error;
+
+        error = netdev_nodev_get_etheraddr(name, addr);
+        if (!error) {
+            memcpy(ib->local_mac, addr, ETH_ADDR_LEN);
+        }
+        ib->next_local_refresh = now + 1;
+    }
+
+    return eth_addr_is_zero(ib->local_mac) ? NULL : ib->local_mac;
+}
+
+/* Status callback for the "in-band" category: reports whichever of the local
+ * IP, local MAC, controller IP and controller MAC are currently available.
+ * 'in_band_' is the 'struct in_band' that was registered. */
+static void
+in_band_status_cb(struct status_reply *sr, void *in_band_)
+{
+    struct in_band *ib = in_band_;
+    struct in_addr in4;
+    const uint8_t *mac;
+    uint32_t ctl_ip;
+
+    /* Local port. */
+    if (netdev_get_in4(ib->netdev, &in4)) {
+        status_reply_put(sr, "local-ip="IP_FMT, IP_ARGS(&in4.s_addr));
+    }
+    mac = get_local_mac(ib);
+    if (mac) {
+        status_reply_put(sr, "local-mac="ETH_ADDR_FMT, ETH_ADDR_ARGS(mac));
+    }
+
+    /* Controller. */
+    ctl_ip = rconn_get_ip(ib->controller);
+    if (ctl_ip) {
+        status_reply_put(sr, "controller-ip="IP_FMT, IP_ARGS(&ctl_ip));
+    }
+    mac = get_controller_mac(ib);
+    if (mac) {
+        status_reply_put(sr, "controller-mac="ETH_ADDR_FMT,
+                         ETH_ADDR_ARGS(mac));
+    }
+}
+
+/* Deletes in-band rule number 'rule_idx' from the flow table, if it is
+ * marked as installed, and marks it as not installed. */
+static void
+drop_flow(struct in_band *in_band, int rule_idx)
+{
+    struct ib_rule *r = &in_band->rules[rule_idx];
+
+    if (!r->installed) {
+        return;
+    }
+
+    r->installed = false;
+    ofproto_delete_flow(in_band->ofproto, &r->flow, r->wildcards,
+                        r->priority);
+}
+
+/* Makes sure that in-band rule number 'rule_idx' is installed and matches
+ * 'flow' on the fields in 'fixed_fields' (all other fields are wildcarded),
+ * sending matching packets to 'out_port'.
+ *
+ * Nothing is done if the rule is already installed with an identical 'flow'.
+ * Only 'flow' is compared: out_port and fixed_fields are assumed never to
+ * change for a given rule. */
+static void
+setup_flow(struct in_band *in_band, int rule_idx, const flow_t *flow,
+           uint32_t fixed_fields, uint16_t out_port)
+{
+    struct ib_rule *r = &in_band->rules[rule_idx];
+    union ofp_action output;
+
+    if (r->installed && !memcmp(flow, &r->flow, sizeof *flow)) {
+        return;
+    }
+
+    /* Remove the stale version of the rule, if any, before replacing it. */
+    drop_flow(in_band, rule_idx);
+
+    r->installed = true;
+    r->flow = *flow;
+    r->wildcards = OFPFW_ALL & ~fixed_fields;
+    r->priority = IB_BASE_PRIORITY + (N_IB_RULES - rule_idx);
+
+    output.type = htons(OFPAT_OUTPUT);
+    output.output.len = htons(sizeof output);
+    output.output.port = htons(out_port);
+    output.output.max_len = htons(0);
+    ofproto_add_flow(in_band->ofproto, &r->flow, r->wildcards,
+                     r->priority, &output, 1, 0);
+}
+
+/* Performs periodic work for 'in_band': once either MAC refresh timer has
+ * expired, re-reads the controller's and the local port's MAC addresses and
+ * installs, updates or removes the in-band control rules to match. All
+ * rules send matching packets to OFPP_NORMAL. */
+void
+in_band_run(struct in_band *in_band)
+{
+ const uint8_t *controller_mac;
+ const uint8_t *local_mac;
+ flow_t flow;
+
+ /* Nothing can have changed until one of the refresh timers expires. */
+ if (time_now() < MIN(in_band->next_refresh, in_band->next_local_refresh)) {
+ return;
+ }
+ controller_mac = get_controller_mac(in_band);
+ local_mac = get_local_mac(in_band);
+
+ /* Switch traffic sent by the secure channel. */
+ memset(&flow, 0, sizeof flow);
+ flow.in_port = ODPP_LOCAL;
+ setup_flow(in_band, IBR_FROM_LOCAL_PORT, &flow, OFPFW_IN_PORT,
+ OFPP_NORMAL);
+
+ /* Deliver traffic sent to the secure channel to the local port. */
+ if (local_mac) {
+ memset(&flow, 0, sizeof flow);
+ memcpy(flow.dl_dst, local_mac, ETH_ADDR_LEN);
+ setup_flow(in_band, IBR_TO_LOCAL_PORT, &flow, OFPFW_DL_DST,
+ OFPP_NORMAL);
+ } else {
+ drop_flow(in_band, IBR_TO_LOCAL_PORT);
+ }
+
+ if (controller_mac) {
+ /* Switch ARP requests sent by the controller. (OFPP_NORMAL will "do
+ * the right thing" regarding VLANs here.) */
+ memset(&flow, 0, sizeof flow);
+ flow.dl_type = htons(ETH_TYPE_ARP);
+ memcpy(flow.dl_dst, eth_addr_broadcast, ETH_ADDR_LEN);
+ memcpy(flow.dl_src, controller_mac, ETH_ADDR_LEN);
+ setup_flow(in_band, IBR_ARP_FROM_CTL, &flow,
+ OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_DL_SRC,
+ OFPP_NORMAL);
+
+ /* OpenFlow traffic to or from the controller.
+ *
+ * (A given field's value is completely ignored if it is wildcarded,
+ * which is why we can get away with using a single 'flow' in each
+ * case here.) */
+ memset(&flow, 0, sizeof flow);
+ flow.dl_type = htons(ETH_TYPE_IP);
+ memcpy(flow.dl_src, controller_mac, ETH_ADDR_LEN);
+ memcpy(flow.dl_dst, controller_mac, ETH_ADDR_LEN);
+ flow.nw_proto = IP_TYPE_TCP;
+ flow.tp_src = htons(OFP_TCP_PORT);
+ flow.tp_dst = htons(OFP_TCP_PORT);
+ /* "To controller" rules fix dl_dst; "from controller" rules fix
+ * dl_src. Each direction gets one rule per TCP port field. */
+ setup_flow(in_band, IBR_TO_CTL_OFP_SRC, &flow,
+ (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO
+ | OFPFW_TP_SRC), OFPP_NORMAL);
+ setup_flow(in_band, IBR_TO_CTL_OFP_DST, &flow,
+ (OFPFW_DL_TYPE | OFPFW_DL_DST | OFPFW_NW_PROTO
+ | OFPFW_TP_DST), OFPP_NORMAL);
+ setup_flow(in_band, IBR_FROM_CTL_OFP_SRC, &flow,
+ (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO
+ | OFPFW_TP_SRC), OFPP_NORMAL);
+ setup_flow(in_band, IBR_FROM_CTL_OFP_DST, &flow,
+ (OFPFW_DL_TYPE | OFPFW_DL_SRC | OFPFW_NW_PROTO
+ | OFPFW_TP_DST), OFPP_NORMAL);
+ } else {
+ /* Without the controller's MAC address none of the controller rules
+ * can be expressed, so remove any that are installed. */
+ drop_flow(in_band, IBR_ARP_FROM_CTL);
+ drop_flow(in_band, IBR_TO_CTL_OFP_DST);
+ drop_flow(in_band, IBR_TO_CTL_OFP_SRC);
+ drop_flow(in_band, IBR_FROM_CTL_OFP_DST);
+ drop_flow(in_band, IBR_FROM_CTL_OFP_SRC);
+ }
+}
+
+/* Arranges for the poll loop to wake up when the earlier of the two MAC
+ * refresh timers of 'in_band' expires, or immediately if it already has. */
+void
+in_band_wait(struct in_band *in_band)
+{
+    time_t now = time_now();
+    time_t wakeup = MIN(in_band->next_refresh, in_band->next_local_refresh);
+
+    if (wakeup <= now) {
+        poll_immediate_wake();
+    } else {
+        /* The timers count seconds; the poll timer takes milliseconds. */
+        poll_timer_wait((wakeup - now) * 1000);
+    }
+}
+
+/* Called after the flow table has been flushed: marks every in-band rule as
+ * not installed, so that setup_flow() adds it again instead of assuming that
+ * it is still in place. */
+void
+in_band_flushed(struct in_band *in_band)
+{
+    struct ib_rule *r;
+
+    for (r = in_band->rules; r < &in_band->rules[N_IB_RULES]; r++) {
+        r->installed = false;
+    }
+}
+
+/* Creates an in-band control object for 'ofproto' that keeps the connection
+ * to 'controller' working, using the network device whose name
+ * dpif_get_name() reports for 'dpif'.  Also registers an "in-band" status
+ * category with 'ss'.
+ *
+ * Returns 0 and stores the new object in '*in_bandp' if successful,
+ * otherwise returns the nonzero error code reported by the failing call and
+ * sets '*in_bandp' to null. */
+int
+in_band_create(struct ofproto *ofproto,
+               struct dpif *dpif, struct switch_status *ss,
+               struct rconn *controller, struct in_band **in_bandp)
+{
+    char devname[IF_NAMESIZE];
+    struct netdev *local_netdev;
+    struct in_band *ib;
+    int error;
+
+    *in_bandp = NULL;
+
+    /* Find and open the network device. */
+    error = dpif_get_name(dpif, devname, sizeof devname);
+    if (!error) {
+        error = netdev_open(devname, NETDEV_ETH_TYPE_NONE, &local_netdev);
+        if (error) {
+            VLOG_ERR("failed to open %s network device: %s",
+                     devname, strerror(error));
+        }
+    }
+    if (error) {
+        return error;
+    }
+
+    /* Zero-initialized, so all addresses start out unknown and no rule is
+     * marked as installed.  TIME_MIN makes both refresh timers expire right
+     * away. */
+    ib = xcalloc(1, sizeof *ib);
+    ib->ofproto = ofproto;
+    ib->netdev = local_netdev;
+    ib->controller = controller;
+    ib->ss_cat = switch_status_register(ss, "in-band", in_band_status_cb, ib);
+    ib->next_refresh = TIME_MIN;
+    ib->next_local_refresh = TIME_MIN;
+
+    *in_bandp = ib;
+    return 0;
+}
+
+/* Destroys 'in_band' (which may be null): closes its network device,
+ * unregisters its status category, and frees the object itself.  Rules that
+ * were installed in the flow table are left untouched. */
+void
+in_band_destroy(struct in_band *in_band)
+{
+    if (in_band) {
+        netdev_close(in_band->netdev);
+        switch_status_unregister(in_band->ss_cat);
+        /* We don't own the rconn. */
+        free(in_band);
+    }
+}
+
diff --git a/secchan/in-band.h b/secchan/in-band.h
new file mode 100644
index 000000000..8673d2ade
--- /dev/null
+++ b/secchan/in-band.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef IN_BAND_H
+#define IN_BAND_H 1
+
+#include "flow.h"
+
+/* Forward declarations, so that this header does not need the headers that
+ * define these types.
+ *
+ * NOTE(review): 'struct secchan' and 'struct settings' are not used by any
+ * prototype below. */
+struct dpif;
+struct in_band;
+struct ofproto;
+struct rconn;
+struct secchan;
+struct settings;
+struct switch_status;
+
+/* Creates an in-band control object and stores it in the final argument.
+ * Returns 0 on success, otherwise a nonzero error code. */
+int in_band_create(struct ofproto *, struct dpif *, struct switch_status *,
+ struct rconn *controller, struct in_band **);
+/* Destroys an in-band control object; a null pointer is accepted. */
+void in_band_destroy(struct in_band *);
+void in_band_run(struct in_band *);
+/* Schedules the poll-loop wakeup for the next refresh time. */
+void in_band_wait(struct in_band *);
+/* Marks all of the object's rules as not installed. */
+void in_band_flushed(struct in_band *);
+
+#endif /* in-band.h */
diff --git a/secchan/main.c b/secchan/main.c
new file mode 100644
index 000000000..ca385766c
--- /dev/null
+++ b/secchan/main.c
@@ -0,0 +1,565 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include <assert.h>
+#include <errno.h>
+#include <getopt.h>
+#include <inttypes.h>
+#include <netinet/in.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+
+#include "command-line.h"
+#include "compiler.h"
+#include "daemon.h"
+#include "dirs.h"
+#include "discovery.h"
+#include "dpif.h"
+#include "fail-open.h"
+#include "fault.h"
+#include "in-band.h"
+#include "leak-checker.h"
+#include "list.h"
+#include "netdev.h"
+#include "ofpbuf.h"
+#include "ofproto.h"
+#include "openflow/openflow.h"
+#include "packets.h"
+#include "poll-loop.h"
+#include "rconn.h"
+#include "status.h"
+#include "svec.h"
+#include "timeval.h"
+#include "unixctl.h"
+#include "util.h"
+#include "vconn-ssl.h"
+#include "vconn.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_secchan
+
+/* Behavior when the connection to the controller fails.  Selected with the
+ * --fail option; parse_options() defaults to FAIL_OPEN. */
+enum fail_mode {
+ FAIL_OPEN, /* Act as learning switch. */
+ FAIL_CLOSED /* Drop all packets. */
+};
+
+/* Settings that may be configured by the user.  Filled in by
+ * parse_options() and then applied to the ofproto by main(). */
+struct ofsettings {
+ /* Overall mode of operation. */
+ bool discovery; /* Discover the controller automatically? */
+ bool in_band; /* Connect to controller in-band? */
+
+ /* Datapath. */
+ uint64_t datapath_id; /* Datapath ID (0 if not specified). */
+ const char *dp_name; /* Name of local datapath. */
+
+ /* Description strings (NULL if not specified). */
+ const char *mfr_desc; /* Manufacturer. */
+ const char *hw_desc; /* Hardware. */
+ const char *sw_desc; /* Software version. */
+ const char *serial_desc; /* Serial number. */
+
+ /* Related vconns and network devices. */
+ const char *controller_name; /* Controller (NULL in discovery mode). */
+ struct svec listeners; /* Listen for management connections. */
+ struct svec snoops; /* Listen for controller snooping conns. */
+
+ /* Failure behavior. */
+ enum fail_mode fail_mode; /* Act as learning switch if no controller? */
+ int max_idle; /* Idle time for flows in fail-open mode.
+ * NOTE(review): parsed but not used by main(). */
+ int probe_interval; /* # seconds idle before sending echo request. */
+ int max_backoff; /* Max # seconds between connection attempts. */
+
+ /* Packet-in rate-limiting. */
+ int rate_limit; /* Tokens added to bucket per second. */
+ int burst_limit; /* Maximum token bucket size. */
+
+ /* Discovery behavior. */
+ const char *accept_controller_re; /* Controller vconns to accept. */
+ bool update_resolv_conf; /* Update /etc/resolv.conf? */
+
+ /* Spanning tree protocol. */
+ bool enable_stp; /* Enable STP? */
+
+ /* Remote command execution. */
+ char *command_acl; /* Command white/blacklist, as shell globs. */
+ char *command_dir; /* Directory that contains commands. */
+
+ /* Management. */
+ uint64_t mgmt_id; /* Management ID (0 if not specified). */
+
+ /* NetFlow. */
+ struct svec netflow; /* NetFlow targets. */
+};
+
+static void parse_options(int argc, char *argv[], struct ofsettings *);
+static void usage(void) NO_RETURN;
+
+/* Program entry point: parses the command line into a 'struct ofsettings',
+ * daemonizes, creates an ofproto for the named datapath, applies every
+ * setting to it, and then runs the main loop until the ofproto is no longer
+ * alive.  Any configuration failure is fatal. */
+int
+main(int argc, char *argv[])
+{
+ struct unixctl_server *unixctl;
+ struct ofproto *ofproto;
+ struct ofsettings s;
+ int error;
+
+ set_program_name(argv[0]);
+ register_fault_handlers();
+ time_init();
+ vlog_init();
+ parse_options(argc, argv, &s);
+ signal(SIGPIPE, SIG_IGN);
+
+ die_if_already_running();
+ daemonize();
+
+ /* Start listening for ovs-appctl requests. */
+ error = unixctl_server_create(NULL, &unixctl);
+ if (error) {
+ ovs_fatal(error, "Could not listen for unixctl connections");
+ }
+
+ VLOG_INFO("Open vSwitch version %s", VERSION BUILDNR);
+ VLOG_INFO("OpenFlow protocol version 0x%02x", OFP_VERSION);
+
+ /* Start OpenFlow processing. */
+ error = ofproto_create(s.dp_name, NULL, NULL, &ofproto);
+ if (error) {
+ ovs_fatal(error, "could not initialize openflow switch");
+ }
+ error = ofproto_set_in_band(ofproto, s.in_band);
+ if (error) {
+ ovs_fatal(error, "failed to configure in-band control");
+ }
+ error = ofproto_set_discovery(ofproto, s.discovery, s.accept_controller_re,
+ s.update_resolv_conf);
+ if (error) {
+ ovs_fatal(error, "failed to configure controller discovery");
+ }
+ /* A zero ID means the user did not specify one, so it is not set. */
+ if (s.datapath_id) {
+ ofproto_set_datapath_id(ofproto, s.datapath_id);
+ }
+ if (s.mgmt_id) {
+ ofproto_set_mgmt_id(ofproto, s.mgmt_id);
+ }
+ ofproto_set_desc(ofproto, s.mfr_desc, s.hw_desc, s.sw_desc, s.serial_desc);
+ error = ofproto_set_listeners(ofproto, &s.listeners);
+ if (error) {
+ ovs_fatal(error, "failed to configure management connections");
+ }
+ error = ofproto_set_snoops(ofproto, &s.snoops);
+ if (error) {
+ ovs_fatal(error,
+ "failed to configure controller snooping connections");
+ }
+ error = ofproto_set_netflow(ofproto, &s.netflow, 0, 0, false);
+ if (error) {
+ ovs_fatal(error, "failed to configure NetFlow collectors");
+ }
+ ofproto_set_failure(ofproto, s.fail_mode == FAIL_OPEN);
+ ofproto_set_probe_interval(ofproto, s.probe_interval);
+ ofproto_set_max_backoff(ofproto, s.max_backoff);
+ ofproto_set_rate_limit(ofproto, s.rate_limit, s.burst_limit);
+ error = ofproto_set_stp(ofproto, s.enable_stp);
+ if (error) {
+ ovs_fatal(error, "failed to configure STP");
+ }
+ error = ofproto_set_remote_execution(ofproto, s.command_acl,
+ s.command_dir);
+ if (error) {
+ ovs_fatal(error, "failed to configure remote command execution");
+ }
+ /* In discovery mode there is no controller name to configure. */
+ if (!s.discovery) {
+ error = ofproto_set_controller(ofproto, s.controller_name);
+ if (error) {
+ ovs_fatal(error, "failed to configure controller");
+ }
+ }
+
+ /* Main loop: do pending work, register wakeup events, then block. */
+ while (ofproto_is_alive(ofproto)) {
+ error = ofproto_run(ofproto);
+ if (error) {
+ ovs_fatal(error, "unrecoverable datapath error");
+ }
+ unixctl_server_run(unixctl);
+
+ ofproto_wait(ofproto);
+ unixctl_server_wait(unixctl);
+ poll_block();
+ }
+
+ return 0;
+}
+
+/* User interface. */
+
+/* Parses the command line in 'argc' and 'argv' into '*s'.
+ *
+ * Every member of '*s' is initialized here, either to a default or from an
+ * option.  After the options, one or two non-option arguments are required:
+ * the datapath name and, optionally, the controller.  With no controller
+ * argument, discovery mode is selected.
+ *
+ * Invalid option values and invalid combinations are reported with
+ * ovs_fatal(); getopt errors exit with EXIT_FAILURE. */
+static void
+parse_options(int argc, char *argv[], struct ofsettings *s)
+{
+    /* Codes for options that have no single-character form.  They start
+     * above UCHAR_MAX so that they cannot collide with any character. */
+    enum {
+        OPT_DATAPATH_ID = UCHAR_MAX + 1,
+        OPT_MANUFACTURER,
+        OPT_HARDWARE,
+        OPT_SOFTWARE,
+        OPT_SERIAL,
+        OPT_ACCEPT_VCONN,
+        OPT_NO_RESOLV_CONF,
+        OPT_BR_NAME,
+        OPT_FAIL_MODE,
+        OPT_INACTIVITY_PROBE,
+        OPT_MAX_IDLE,
+        OPT_MAX_BACKOFF,
+        OPT_SNOOP,
+        OPT_RATE_LIMIT,
+        OPT_BURST_LIMIT,
+        OPT_BOOTSTRAP_CA_CERT,
+        OPT_STP,
+        OPT_NO_STP,
+        OPT_OUT_OF_BAND,
+        OPT_IN_BAND,
+        OPT_COMMAND_ACL,
+        OPT_COMMAND_DIR,
+        OPT_NETFLOW,
+        OPT_MGMT_ID,
+        VLOG_OPTION_ENUMS,
+        LEAK_CHECKER_OPTION_ENUMS
+    };
+    /* NOTE(review): "config" ('F') and "br-name" (OPT_BR_NAME) are accepted
+     * by this table but have no 'case' in the switch below, so unless one of
+     * the *_OPTION_HANDLERS macros covers them they reach 'default' and
+     * abort().  Confirm whether these entries are still wanted. */
+    static struct option long_options[] = {
+        {"datapath-id", required_argument, 0, OPT_DATAPATH_ID},
+        {"manufacturer", required_argument, 0, OPT_MANUFACTURER},
+        {"hardware", required_argument, 0, OPT_HARDWARE},
+        {"software", required_argument, 0, OPT_SOFTWARE},
+        {"serial", required_argument, 0, OPT_SERIAL},
+        {"accept-vconn", required_argument, 0, OPT_ACCEPT_VCONN},
+        {"no-resolv-conf", no_argument, 0, OPT_NO_RESOLV_CONF},
+        {"config", required_argument, 0, 'F'},
+        {"br-name", required_argument, 0, OPT_BR_NAME},
+        {"fail", required_argument, 0, OPT_FAIL_MODE},
+        {"inactivity-probe", required_argument, 0, OPT_INACTIVITY_PROBE},
+        {"max-idle", required_argument, 0, OPT_MAX_IDLE},
+        {"max-backoff", required_argument, 0, OPT_MAX_BACKOFF},
+        {"listen", required_argument, 0, 'l'},
+        {"snoop", required_argument, 0, OPT_SNOOP},
+        {"rate-limit", optional_argument, 0, OPT_RATE_LIMIT},
+        {"burst-limit", required_argument, 0, OPT_BURST_LIMIT},
+        {"stp", no_argument, 0, OPT_STP},
+        {"no-stp", no_argument, 0, OPT_NO_STP},
+        {"out-of-band", no_argument, 0, OPT_OUT_OF_BAND},
+        {"in-band", no_argument, 0, OPT_IN_BAND},
+        {"command-acl", required_argument, 0, OPT_COMMAND_ACL},
+        {"command-dir", required_argument, 0, OPT_COMMAND_DIR},
+        {"netflow", required_argument, 0, OPT_NETFLOW},
+        {"mgmt-id", required_argument, 0, OPT_MGMT_ID},
+        {"verbose", optional_argument, 0, 'v'},
+        {"help", no_argument, 0, 'h'},
+        {"version", no_argument, 0, 'V'},
+        DAEMON_LONG_OPTIONS,
+        VLOG_LONG_OPTIONS,
+        LEAK_CHECKER_LONG_OPTIONS,
+#ifdef HAVE_OPENSSL
+        VCONN_SSL_LONG_OPTIONS
+        {"bootstrap-ca-cert", required_argument, 0, OPT_BOOTSTRAP_CA_CERT},
+#endif
+        {0, 0, 0, 0},
+    };
+    char *short_options = long_options_to_short_options(long_options);
+
+    /* Set defaults that we can figure out before parsing options. */
+    s->datapath_id = 0;
+    s->mfr_desc = NULL;
+    s->hw_desc = NULL;
+    s->sw_desc = NULL;
+    s->serial_desc = NULL;
+    svec_init(&s->listeners);
+    svec_init(&s->snoops);
+    s->fail_mode = FAIL_OPEN;
+    s->max_idle = 0;
+    s->probe_interval = 0;
+    s->max_backoff = 15;
+    s->update_resolv_conf = true;
+    s->rate_limit = 0;
+    s->burst_limit = 0;
+    s->accept_controller_re = NULL;
+    s->enable_stp = false;
+    s->in_band = true;
+    s->command_acl = "";
+    s->command_dir = NULL;
+    svec_init(&s->netflow);
+    s->mgmt_id = 0;
+    for (;;) {
+        int c;
+
+        c = getopt_long(argc, argv, short_options, long_options, NULL);
+        if (c == -1) {
+            break;
+        }
+
+        switch (c) {
+        case OPT_DATAPATH_ID:
+            if (strlen(optarg) != 12
+                || strspn(optarg, "0123456789abcdefABCDEF") != 12) {
+                ovs_fatal(0, "argument to --datapath-id must be "
+                          "exactly 12 hex digits");
+            }
+            s->datapath_id = strtoll(optarg, NULL, 16);
+            if (!s->datapath_id) {
+                ovs_fatal(0, "argument to --datapath-id must be nonzero");
+            }
+            break;
+
+        case OPT_MANUFACTURER:
+            s->mfr_desc = optarg;
+            break;
+
+        case OPT_HARDWARE:
+            s->hw_desc = optarg;
+            break;
+
+        case OPT_SOFTWARE:
+            s->sw_desc = optarg;
+            break;
+
+        case OPT_SERIAL:
+            s->serial_desc = optarg;
+            break;
+
+        case OPT_ACCEPT_VCONN:
+            s->accept_controller_re = optarg;
+            break;
+
+        case OPT_NO_RESOLV_CONF:
+            s->update_resolv_conf = false;
+            break;
+
+        case OPT_FAIL_MODE:
+            if (!strcmp(optarg, "open")) {
+                s->fail_mode = FAIL_OPEN;
+            } else if (!strcmp(optarg, "closed")) {
+                s->fail_mode = FAIL_CLOSED;
+            } else {
+                /* "fail" has no single-character form in the table above,
+                 * so the message names only the long option. */
+                ovs_fatal(0, "--fail argument must be \"open\" "
+                          "or \"closed\"");
+            }
+            break;
+
+        case OPT_INACTIVITY_PROBE:
+            s->probe_interval = atoi(optarg);
+            if (s->probe_interval < 5) {
+                ovs_fatal(0, "--inactivity-probe argument must be at least 5");
+            }
+            break;
+
+        case OPT_MAX_IDLE:
+            if (!strcmp(optarg, "permanent")) {
+                s->max_idle = OFP_FLOW_PERMANENT;
+            } else {
+                s->max_idle = atoi(optarg);
+                if (s->max_idle < 1 || s->max_idle > 65535) {
+                    ovs_fatal(0, "--max-idle argument must be between 1 and "
+                              "65535 or the word 'permanent'");
+                }
+            }
+            break;
+
+        case OPT_MAX_BACKOFF:
+            s->max_backoff = atoi(optarg);
+            if (s->max_backoff < 1) {
+                ovs_fatal(0, "--max-backoff argument must be at least 1");
+            } else if (s->max_backoff > 3600) {
+                /* Silently clamp to one hour. */
+                s->max_backoff = 3600;
+            }
+            break;
+
+        case OPT_RATE_LIMIT:
+            /* The argument is optional; without it, use 1000. */
+            if (optarg) {
+                s->rate_limit = atoi(optarg);
+                if (s->rate_limit < 1) {
+                    ovs_fatal(0, "--rate-limit argument must be at least 1");
+                }
+            } else {
+                s->rate_limit = 1000;
+            }
+            break;
+
+        case OPT_BURST_LIMIT:
+            s->burst_limit = atoi(optarg);
+            if (s->burst_limit < 1) {
+                ovs_fatal(0, "--burst-limit argument must be at least 1");
+            }
+            break;
+
+        case OPT_STP:
+            s->enable_stp = true;
+            break;
+
+        case OPT_NO_STP:
+            s->enable_stp = false;
+            break;
+
+        case OPT_OUT_OF_BAND:
+            s->in_band = false;
+            break;
+
+        case OPT_IN_BAND:
+            s->in_band = true;
+            break;
+
+        case OPT_COMMAND_ACL:
+            /* Repeated options accumulate into one comma-separated list. */
+            s->command_acl = (s->command_acl[0]
+                              ? xasprintf("%s,%s", s->command_acl, optarg)
+                              : optarg);
+            break;
+
+        case OPT_COMMAND_DIR:
+            s->command_dir = optarg;
+            break;
+
+        case OPT_NETFLOW:
+            svec_add(&s->netflow, optarg);
+            break;
+
+        case OPT_MGMT_ID:
+            if (strlen(optarg) != 12
+                || strspn(optarg, "0123456789abcdefABCDEF") != 12) {
+                ovs_fatal(0, "argument to --mgmt-id must be "
+                          "exactly 12 hex digits");
+            }
+            s->mgmt_id = strtoll(optarg, NULL, 16);
+            if (!s->mgmt_id) {
+                ovs_fatal(0, "argument to --mgmt-id must be nonzero");
+            }
+            break;
+
+        case 'l':
+            svec_add(&s->listeners, optarg);
+            break;
+
+        case OPT_SNOOP:
+            svec_add(&s->snoops, optarg);
+            break;
+
+        case 'h':
+            /* usage() is declared NO_RETURN, so there is no fall-through. */
+            usage();
+
+        case 'V':
+            OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION);
+            exit(EXIT_SUCCESS);
+
+        DAEMON_OPTION_HANDLERS
+
+        VLOG_OPTION_HANDLERS
+
+        LEAK_CHECKER_OPTION_HANDLERS
+
+#ifdef HAVE_OPENSSL
+        VCONN_SSL_OPTION_HANDLERS
+
+        case OPT_BOOTSTRAP_CA_CERT:
+            vconn_ssl_set_ca_cert_file(optarg, true);
+            break;
+#endif
+
+        case '?':
+            exit(EXIT_FAILURE);
+
+        default:
+            abort();
+        }
+    }
+    free(short_options);
+
+    argc -= optind;
+    argv += optind;
+    if (argc < 1 || argc > 2) {
+        ovs_fatal(0, "need one or two non-option arguments; "
+                  "use --help for usage");
+    }
+
+    /* Local and remote vconns. */
+    s->dp_name = argv[0];
+    s->controller_name = argc > 1 ? xstrdup(argv[1]) : NULL;
+
+    /* Set accept_controller_regex. */
+    if (!s->accept_controller_re) {
+        s->accept_controller_re = vconn_ssl_is_configured() ? "^ssl:.*" : ".*";
+    }
+
+    /* Mode of operation. */
+    s->discovery = s->controller_name == NULL;
+    if (s->discovery && !s->in_band) {
+        ovs_fatal(0, "Cannot perform discovery with out-of-band control");
+    }
+
+    /* Rate limiting. */
+    if (s->rate_limit && s->rate_limit < 100) {
+        VLOG_WARN("Rate limit set to unusually low value %d", s->rate_limit);
+    }
+}
+
+/* Prints a help message to stdout and exits with EXIT_SUCCESS.
+ *
+ * Only options that parse_options() actually accepts in the listed form are
+ * shown: --datapath-id has no single-character form in its option table, so
+ * none is advertised here. */
+static void
+usage(void)
+{
+    printf("%s: an OpenFlow switch implementation.\n"
+           "usage: %s [OPTIONS] DATAPATH [CONTROLLER]\n"
+           "DATAPATH is a local datapath (e.g. \"dp0\").\n"
+           "CONTROLLER is an active OpenFlow connection method; if it is\n"
+           "omitted, then secchan performs controller discovery.\n",
+           program_name, program_name);
+    vconn_usage(true, true, true);
+    printf("\nOpenFlow options:\n"
+           " --datapath-id=ID Use ID as the OpenFlow switch ID\n"
+           " (ID must consist of 12 hex digits)\n"
+           " --mgmt-id=ID Use ID as the management ID\n"
+           " (ID must consist of 12 hex digits)\n"
+           " --manufacturer=MFR Identify manufacturer as MFR\n"
+           " --hardware=HW Identify hardware as HW\n"
+           " --software=SW Identify software as SW\n"
+           " --serial=SERIAL Identify serial number as SERIAL\n"
+           "\nController discovery options:\n"
+           " --accept-vconn=REGEX accept matching discovered controllers\n"
+           " --no-resolv-conf do not update /etc/resolv.conf\n"
+           "\nNetworking options:\n"
+           " --fail=open|closed when controller connection fails:\n"
+           " closed: drop all packets\n"
+           " open (default): act as learning switch\n"
+           " --inactivity-probe=SECS time between inactivity probes\n"
+           " --max-idle=SECS max idle for flows set up by secchan\n"
+           " --max-backoff=SECS max time between controller connection\n"
+           " attempts (default: 15 seconds)\n"
+           " -l, --listen=METHOD allow management connections on METHOD\n"
+           " (a passive OpenFlow connection method)\n"
+           " --snoop=METHOD allow controller snooping on METHOD\n"
+           " (a passive OpenFlow connection method)\n"
+           " --out-of-band controller connection is out-of-band\n"
+           " --netflow=HOST:PORT configure NetFlow output target\n"
+           "\nRate-limiting of \"packet-in\" messages to the controller:\n"
+           " --rate-limit[=PACKETS] max rate, in packets/s (default: 1000)\n"
+           " --burst-limit=BURST limit on packet credit for idle time\n"
+           "\nRemote command execution options:\n"
+           " --command-acl=[!]GLOB[,[!]GLOB...] set allowed/denied commands\n"
+           " --command-dir=DIR set command dir (default: %s/commands)\n",
+           ovs_pkgdatadir);
+    daemon_usage();
+    vlog_usage();
+    printf("\nOther options:\n"
+           " -h, --help display this help message\n"
+           " -V, --version display version information\n");
+    leak_checker_usage();
+    exit(EXIT_SUCCESS);
+}
diff --git a/secchan/netflow.c b/secchan/netflow.c
new file mode 100644
index 000000000..99f3eea4a
--- /dev/null
+++ b/secchan/netflow.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "netflow.h"
+#include <arpa/inet.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "cfg.h"
+#include "flow.h"
+#include "netflow.h"
+#include "ofpbuf.h"
+#include "ofproto.h"
+#include "packets.h"
+#include "socket-util.h"
+#include "svec.h"
+#include "timeval.h"
+#include "util.h"
+#include "xtoxll.h"
+
+#define THIS_MODULE VLM_netflow
+#include "vlog.h"
+
+#define NETFLOW_V5_VERSION 5
+
+/* Every NetFlow v5 message contains the header that follows. This is
+ * followed by up to thirty records that describe a terminating flow.
+ * netflow_expire() appends records to one message until it reaches 1400
+ * bytes, and stores the multi-byte fields with htons()/htonl().
+ */
+struct netflow_v5_header {
+ uint16_t version; /* NetFlow version is 5. */
+ uint16_t count; /* Number of records in this message. */
+ uint32_t sysuptime; /* System uptime in milliseconds. */
+ uint32_t unix_secs; /* Number of seconds since Unix epoch. */
+ uint32_t unix_nsecs; /* Number of residual nanoseconds
+ after epoch seconds. */
+ uint32_t flow_seq; /* Number of flows since sending
+ messages began. */
+ uint8_t engine_type; /* Engine type. */
+ uint8_t engine_id; /* Engine id. */
+ uint16_t sampling_interval; /* Set to zero. */
+};
+/* The header must be exactly 24 bytes, with no padding. */
+BUILD_ASSERT_DECL(sizeof(struct netflow_v5_header) == 24);
+
+/* A NetFlow v5 description of a terminating flow. It is preceded by a
+ * NetFlow v5 header. netflow_expire() fills in one of these per expired
+ * IP flow, starting from an all-zeros record.
+ */
+struct netflow_v5_record {
+ uint32_t src_addr; /* Source IP address. */
+ uint32_t dst_addr; /* Destination IP address. */
+ uint32_t nexthop; /* IP address of next hop. Set to 0. */
+ uint16_t input; /* Input interface index. */
+ uint16_t output; /* Output interface index. */
+ uint32_t packet_count; /* Number of packets. */
+ uint32_t byte_count; /* Number of bytes. */
+ uint32_t init_time; /* Value of sysuptime on first packet. */
+ uint32_t used_time; /* Value of sysuptime on last packet. */
+
+ /* The 'src_port' and 'dst_port' identify the source and destination
+ * port, respectively, for TCP and UDP. For ICMP, the high-order
+ * byte identifies the type and low-order byte identifies the code
+ * in the 'dst_port' field. */
+ uint16_t src_port;
+ uint16_t dst_port;
+
+ uint8_t pad1;
+ uint8_t tcp_flags; /* Union of seen TCP flags. */
+ uint8_t ip_proto; /* IP protocol. */
+ uint8_t ip_tos; /* IP TOS value. */
+ uint16_t src_as; /* Source AS ID. Set to 0. */
+ uint16_t dst_as; /* Destination AS ID. Set to 0. */
+ uint8_t src_mask; /* Source mask bits. Set to 0. */
+ uint8_t dst_mask; /* Destination mask bits. Set to 0. */
+ uint8_t pad[2];
+};
+/* Each record must be exactly 48 bytes, with no padding. */
+BUILD_ASSERT_DECL(sizeof(struct netflow_v5_record) == 48);
+
+/* State of a NetFlow exporter, created by netflow_create(). */
+struct netflow {
+ uint8_t engine_type; /* Value of engine_type to use. */
+ uint8_t engine_id; /* Value of engine_id to use. */
+ long long int boot_time; /* Time when netflow_create() was called. */
+ int *fds; /* Sockets for NetFlow collectors. */
+ size_t n_fds; /* Number of NetFlow collectors. */
+ bool add_id_to_iface; /* Put the 7 least significant bits of
+ * 'engine_id' into the most significant
+ * bits of the interface fields. */
+ uint32_t netflow_cnt; /* Flow sequence number for NetFlow. */
+ struct ofpbuf packet; /* NetFlow packet being accumulated. */
+};
+
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+/* Opens a non-blocking UDP socket connected to the NetFlow collector named
+ * by 'dst', a string of the form "HOST:PORT".  'dst' is split in place by
+ * strtok_r(), so the caller must pass a modifiable copy.
+ *
+ * Returns a nonnegative file descriptor on success, otherwise a negative
+ * errno value. */
+static int
+open_collector(char *dst)
+{
+    char *save_ptr = NULL;
+    const char *host_name;
+    const char *port_string;
+    struct sockaddr_in sin;
+    int retval;
+    int fd;
+
+    /* Glibc 2.7 has a bug in strtok_r when compiling with optimization that
+     * can cause segfaults here:
+     * http://sources.redhat.com/bugzilla/show_bug.cgi?id=5614.
+     * Using "::" instead of the obvious ":" works around it. */
+    host_name = strtok_r(dst, "::", &save_ptr);
+    port_string = strtok_r(NULL, "::", &save_ptr);
+    if (!host_name) {
+        ovs_error(0, "%s: bad peer name format", dst);
+        return -EAFNOSUPPORT;
+    }
+    if (!port_string) {
+        ovs_error(0, "%s: bad port format", dst);
+        return -EAFNOSUPPORT;
+    }
+
+    memset(&sin, 0, sizeof sin);
+    sin.sin_family = AF_INET;
+    if (lookup_ip(host_name, &sin.sin_addr)) {
+        return -ENOENT;
+    }
+    sin.sin_port = htons(atoi(port_string));
+
+    fd = socket(AF_INET, SOCK_DGRAM, 0);
+    if (fd < 0) {
+        /* Save errno before logging, because the logging call may change
+         * it and the caller needs the socket() error. */
+        int error = errno;
+        VLOG_ERR("%s: socket: %s", dst, strerror(error));
+        return -error;
+    }
+
+    retval = set_nonblocking(fd);
+    if (retval) {
+        close(fd);
+        return -retval;
+    }
+
+    retval = connect(fd, (struct sockaddr *) &sin, sizeof sin);
+    if (retval < 0) {
+        int error = errno;
+        VLOG_ERR("%s: connect: %s", dst, strerror(error));
+        close(fd);
+        return -error;
+    }
+
+    return fd;
+}
+
+/* Appends a NetFlow v5 record describing 'expired' to the message being
+ * accumulated in 'nf'.  A new message header is started first if no message
+ * is in progress.  Once the message reaches 1400 bytes it is sent with
+ * netflow_run().
+ *
+ * Flows whose Ethernet type is not IP are ignored. */
+void
+netflow_expire(struct netflow *nf, const struct ofexpired *expired)
+{
+    struct netflow_v5_header *nf_hdr;
+    struct netflow_v5_record *nf_rec;
+    struct timeval now;
+
+    /* NetFlow only reports on IP packets. */
+    if (expired->flow.dl_type != htons(ETH_TYPE_IP)) {
+        return;
+    }
+
+    time_timeval(&now);
+
+    if (!nf->packet.size) {
+        /* Start a new message with a zero record count. */
+        nf_hdr = ofpbuf_put_zeros(&nf->packet, sizeof *nf_hdr);
+        nf_hdr->version = htons(NETFLOW_V5_VERSION);
+        nf_hdr->count = htons(0);
+        nf_hdr->sysuptime = htonl(time_msec() - nf->boot_time);
+        nf_hdr->unix_secs = htonl(now.tv_sec);
+        nf_hdr->unix_nsecs = htonl(now.tv_usec * 1000);
+        nf_hdr->flow_seq = htonl(nf->netflow_cnt++);
+        nf_hdr->engine_type = nf->engine_type;
+        nf_hdr->engine_id = nf->engine_id;
+        nf_hdr->sampling_interval = htons(0);
+    }
+
+    /* The header is always at the start of the buffer.  Update the record
+     * count before appending the record itself. */
+    nf_hdr = nf->packet.data;
+    nf_hdr->count = htons(ntohs(nf_hdr->count) + 1);
+
+    nf_rec = ofpbuf_put_zeros(&nf->packet, sizeof *nf_rec);
+    nf_rec->src_addr = expired->flow.nw_src;
+    nf_rec->dst_addr = expired->flow.nw_dst;
+    nf_rec->nexthop = htonl(0);
+    if (nf->add_id_to_iface) {
+        /* Low 7 bits of the engine ID go in the top 7 bits of the 16-bit
+         * interface fields; the input port keeps the low 9 bits. */
+        uint16_t iface = (nf->engine_id & 0x7f) << 9;
+        nf_rec->input = htons(iface | (expired->flow.in_port & 0x1ff));
+        nf_rec->output = htons(iface);
+    } else {
+        nf_rec->input = htons(expired->flow.in_port);
+        nf_rec->output = htons(0);
+    }
+    /* The record's counters are 32 bits wide, so clamp larger values. */
+    nf_rec->packet_count = htonl(MIN(expired->packet_count, UINT32_MAX));
+    nf_rec->byte_count = htonl(MIN(expired->byte_count, UINT32_MAX));
+    nf_rec->init_time = htonl(expired->created - nf->boot_time);
+    nf_rec->used_time = htonl(MAX(expired->created, expired->used)
+                              - nf->boot_time);
+    if (expired->flow.nw_proto == IP_TYPE_ICMP) {
+        /* In NetFlow, the ICMP type and code are concatenated and
+         * placed in the 'dst_port' field. */
+        uint8_t type = ntohs(expired->flow.tp_src);
+        uint8_t code = ntohs(expired->flow.tp_dst);
+        nf_rec->src_port = htons(0);
+        nf_rec->dst_port = htons((type << 8) | code);
+    } else {
+        nf_rec->src_port = expired->flow.tp_src;
+        nf_rec->dst_port = expired->flow.tp_dst;
+    }
+    nf_rec->tcp_flags = expired->tcp_flags;
+    nf_rec->ip_proto = expired->flow.nw_proto;
+    nf_rec->ip_tos = expired->ip_tos;
+
+    /* NetFlow messages are limited to 30 records.  A length of 1400
+     * bytes guarantees that the limit is not exceeded. */
+    if (nf->packet.size >= 1400) {
+        netflow_run(nf);
+    }
+}
+
+/* Sends the message accumulated in 'nf', if there is one, to every
+ * collector socket, then empties the message buffer.  A failed send is
+ * logged (rate-limited) and otherwise ignored. */
+void
+netflow_run(struct netflow *nf)
+{
+    size_t idx;
+
+    if (nf->packet.size == 0) {
+        /* Nothing has been accumulated since the last send. */
+        return;
+    }
+
+    idx = 0;
+    while (idx < nf->n_fds) {
+        int sock = nf->fds[idx++];
+
+        if (send(sock, nf->packet.data, nf->packet.size, 0) != -1) {
+            continue;
+        }
+        VLOG_WARN_RL(&rl, "netflow message send failed: %s",
+                     strerror(errno));
+    }
+    nf->packet.size = 0;
+}
+
+/* Closes every collector socket of 'nf' and releases the descriptor array,
+ * leaving 'nf' with no collectors. */
+static void
+clear_collectors(struct netflow *nf)
+{
+    size_t remaining = nf->n_fds;
+    const int *fdp = nf->fds;
+
+    while (remaining > 0) {
+        close(*fdp);
+        fdp++;
+        remaining--;
+    }
+
+    free(nf->fds);
+    nf->n_fds = 0;
+    nf->fds = NULL;
+}
+
+/* Replaces the collectors of 'nf' with the targets named in 'collectors_',
+ * after sorting them and removing duplicates.  A target that cannot be
+ * opened is logged and skipped.
+ *
+ * Returns 0 if every target was opened, otherwise the positive errno value
+ * of the first failure. */
+int
+netflow_set_collectors(struct netflow *nf, const struct svec *collectors_)
+{
+    struct svec targets;
+    int first_error = 0;
+    size_t idx;
+
+    clear_collectors(nf);
+
+    svec_clone(&targets, collectors_);
+    svec_sort_unique(&targets);
+
+    nf->fds = xmalloc(sizeof *nf->fds * targets.n);
+    for (idx = 0; idx < targets.n; idx++) {
+        const char *target = targets.names[idx];
+        /* open_collector() modifies its argument, so give it a copy. */
+        char *scratch = xstrdup(target);
+        int fd = open_collector(scratch);
+
+        free(scratch);
+        if (fd < 0) {
+            VLOG_WARN("couldn't open connection to collector (%s), "
+                      "ignoring %s\n", strerror(-fd), target);
+            if (first_error == 0) {
+                first_error = -fd;
+            }
+            continue;
+        }
+        nf->fds[nf->n_fds++] = fd;
+    }
+
+    svec_destroy(&targets);
+    return first_error;
+}
+
+/* Stores the engine type, the engine ID, and the 'add_id_to_iface' flag in
+ * 'nf'. */
+void
+netflow_set_engine(struct netflow *nf, uint8_t engine_type,
+                   uint8_t engine_id, bool add_id_to_iface)
+{
+    nf->add_id_to_iface = add_id_to_iface;
+    nf->engine_id = engine_id;
+    nf->engine_type = engine_type;
+}
+
+/* Allocates and returns a new NetFlow exporter with no collectors, engine
+ * type and ID 0, and an empty message buffer.  The current time_msec()
+ * value is recorded as its boot time. */
+struct netflow *
+netflow_create(void)
+{
+    struct netflow *exporter = xmalloc(sizeof *exporter);
+
+    /* Engine identification. */
+    exporter->engine_type = 0;
+    exporter->engine_id = 0;
+    exporter->add_id_to_iface = false;
+
+    /* Timing and sequencing. */
+    exporter->boot_time = time_msec();
+    exporter->netflow_cnt = 0;
+
+    /* Collectors and message buffer. */
+    exporter->fds = NULL;
+    exporter->n_fds = 0;
+    ofpbuf_init(&exporter->packet, 1500);
+
+    return exporter;
+}
+
+/* Releases the message buffer, the collector sockets, and 'nf' itself.
+ * Does nothing if 'nf' is null. */
+void
+netflow_destroy(struct netflow *nf)
+{
+    if (!nf) {
+        return;
+    }
+
+    ofpbuf_uninit(&nf->packet);
+    clear_collectors(nf);
+    free(nf);
+}
diff --git a/secchan/netflow.h b/secchan/netflow.h
new file mode 100644
index 000000000..f37d6eff0
--- /dev/null
+++ b/secchan/netflow.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef NETFLOW_H
+#define NETFLOW_H 1
+
+#include "flow.h"
+
+/* Forward declarations of types used only as pointers below. */
+struct ofexpired;
+struct svec;
+
+/* Creates a NetFlow exporter with no collectors. */
+struct netflow *netflow_create(void);
+/* Destroys an exporter; a null pointer is accepted. */
+void netflow_destroy(struct netflow *);
+/* Replaces the collector set.  Returns 0 on success, otherwise the positive
+ * errno value of the first collector that could not be opened. */
+int netflow_set_collectors(struct netflow *, const struct svec *collectors);
+void netflow_set_engine(struct netflow *nf, uint8_t engine_type,
+ uint8_t engine_id, bool add_id_to_iface);
+/* Adds a record for an expired IP flow to the pending message. */
+void netflow_expire(struct netflow *, const struct ofexpired *);
+/* Sends the pending message, if any, to every collector. */
+void netflow_run(struct netflow *);
+
+#endif /* netflow.h */
diff --git a/secchan/ofproto.c b/secchan/ofproto.c
new file mode 100644
index 000000000..8220fb8df
--- /dev/null
+++ b/secchan/ofproto.c
@@ -0,0 +1,3305 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "ofproto.h"
+#include <errno.h>
+#include <inttypes.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include "classifier.h"
+#include "coverage.h"
+#include "discovery.h"
+#include "dpif.h"
+#include "executer.h"
+#include "fail-open.h"
+#include "in-band.h"
+#include "mac-learning.h"
+#include "netdev.h"
+#include "netflow.h"
+#include "odp-util.h"
+#include "ofp-print.h"
+#include "ofpbuf.h"
+#include "openflow/nicira-ext.h"
+#include "openflow/openflow.h"
+#include "openflow/openflow-mgmt.h"
+#include "openvswitch/datapath-protocol.h"
+#include "packets.h"
+#include "pinsched.h"
+#include "pktbuf.h"
+#include "poll-loop.h"
+#include "port-array.h"
+#include "rconn.h"
+#include "shash.h"
+#include "status.h"
+#include "stp.h"
+#include "svec.h"
+#include "tag.h"
+#include "timeval.h"
+#include "vconn.h"
+#include "vconn-ssl.h"
+#include "xtoxll.h"
+
+#define THIS_MODULE VLM_ofproto
+#include "vlog.h"
+
+/* Datapath port group numbers that refresh_port_group() passes to
+ * dpif_port_group_set(). */
+enum {
+ DP_GROUP_FLOOD = 0, /* Ports whose config lacks OFPPC_NO_FLOOD. */
+ DP_GROUP_ALL = 1 /* Every port in the port array. */
+};
+
+/* Table identifiers.
+ *
+ * NOTE(review): the users of these values are outside this excerpt;
+ * presumably they are the table IDs reported in statistics replies --
+ * confirm. */
+enum {
+ TABLEID_HASH = 0,
+ TABLEID_CLASSIFIER = 1
+};
+
+/* A switch port: the network device behind it plus its OpenFlow
+ * description. */
+struct ofport {
+ struct netdev *netdev; /* Opened by make_ofport(), closed by ofport_free(). */
+ struct ofp_phy_port opp; /* In host byte order. */
+};
+
+static void ofport_free(struct ofport *);
+static void hton_ofp_phy_port(struct ofp_phy_port *);
+
+static int xlate_actions(const union ofp_action *in, size_t n_in,
+ const flow_t *flow, struct ofproto *ofproto,
+ const struct ofpbuf *packet,
+ struct odp_actions *out, tag_type *tags,
+ bool *may_setup_flow);
+
+/* A flow table entry: a classifier rule together with its timeouts,
+ * statistics, OpenFlow actions, and (for exact-match rules) datapath
+ * actions. */
+struct rule {
+ struct cls_rule cr;
+
+ uint16_t idle_timeout; /* In seconds from time of last use. */
+ uint16_t hard_timeout; /* In seconds from time of creation. */
+ long long int used; /* Last-used time (0 if never used). */
+ long long int created; /* Creation time. */
+ uint64_t packet_count; /* Number of packets received. */
+ uint64_t byte_count; /* Number of bytes received. */
+ uint64_t accounted_bytes; /* Number of bytes passed to account_cb. */
+ uint8_t tcp_flags; /* Bitwise-OR of all TCP flags seen. */
+ uint8_t ip_tos; /* Last-seen IP type-of-service. */
+ tag_type tags; /* Tags (set only by hooks). */
+
+ /* If 'super' is non-NULL, this rule is a subrule, that is, it is an
+ * exact-match rule (having cr.wc.wildcards of 0) generated from the
+ * wildcard rule 'super'. In this case, 'list' is an element of the
+ * super-rule's list.
+ *
+ * If 'super' is NULL, this rule is a super-rule, and 'list' is the head of
+ * a list of subrules. A super-rule with no wildcards (where
+ * cr.wc.wildcards is 0) will never have any subrules. */
+ struct rule *super;
+ struct list list;
+
+ /* OpenFlow actions.
+ *
+ * A subrule has no actions (it uses the super-rule's actions). */
+ int n_actions;
+ union ofp_action *actions;
+
+ /* Datapath actions.
+ *
+ * A super-rule with wildcard fields never has ODP actions (since the
+ * datapath only supports exact-match flows). */
+ bool installed; /* Installed in datapath? */
+ bool may_install; /* True ordinarily; false if actions must
+ * be reassessed for every packet. */
+ int n_odp_actions;
+ union odp_action *odp_actions;
+};
+
+/* Returns true if 'rule' should not be shown to the controller. */
+static inline bool
+rule_is_hidden(const struct rule *rule)
+{
+    /* Subrules are merely an implementation detail of their super-rule. */
+    const bool is_subrule = rule->super != NULL;
+
+    /* Priorities above UINT16_MAX are reserved for rules that secchan sets
+     * up itself (e.g. for in-band control), which are hidden on purpose. */
+    const bool is_internal = rule->cr.priority > UINT16_MAX;
+
+    return is_subrule || is_internal;
+}
+
+static struct rule *rule_create(struct rule *super, const union ofp_action *,
+ size_t n_actions, uint16_t idle_timeout,
+ uint16_t hard_timeout);
+static void rule_free(struct rule *);
+static void rule_destroy(struct ofproto *, struct rule *);
+static struct rule *rule_from_cls_rule(const struct cls_rule *);
+static void rule_insert(struct ofproto *, struct rule *,
+ struct ofpbuf *packet, uint16_t in_port);
+static void rule_remove(struct ofproto *, struct rule *);
+static bool rule_make_actions(struct ofproto *, struct rule *,
+ const struct ofpbuf *packet);
+static void rule_install(struct ofproto *, struct rule *,
+ struct rule *displaced_rule);
+static void rule_uninstall(struct ofproto *, struct rule *);
+static void rule_post_uninstall(struct ofproto *, struct rule *);
+
+/* One OpenFlow connection, linked into its ofproto's 'all_conns' list. */
+struct ofconn {
+ struct list node; /* Element of struct ofproto's 'all_conns'. */
+ struct rconn *rconn; /* Underlying connection. */
+ struct pktbuf *pktbuf; /* NULL unless assigned after ofconn_create(). */
+ bool send_flow_exp; /* Initially false. */
+ int miss_send_len; /* Initially 0. */
+
+ struct rconn_packet_counter *packet_in_counter;
+
+ /* Number of OpenFlow messages queued as replies to OpenFlow requests, and
+ * the maximum number before we stop reading OpenFlow requests. */
+#define OFCONN_REPLY_MAX 100
+ struct rconn_packet_counter *reply_counter;
+};
+
+static struct ofconn *ofconn_create(struct ofproto *, struct rconn *);
+static void ofconn_destroy(struct ofconn *, struct ofproto *);
+static void ofconn_run(struct ofconn *, struct ofproto *);
+static void ofconn_wait(struct ofconn *);
+static void queue_tx(struct ofpbuf *msg, const struct ofconn *ofconn,
+ struct rconn_packet_counter *counter);
+
+/* An OpenFlow switch built on top of one datapath.  Created by
+ * ofproto_create() and released by ofproto_destroy(). */
+struct ofproto {
+ /* Settings. */
+ uint64_t datapath_id; /* Datapath ID. */
+ uint64_t fallback_dpid; /* Datapath ID if no better choice found. */
+ uint64_t mgmt_id; /* Management channel identifier. */
+ char *manufacturer; /* Manufacturer. */
+ char *hardware; /* Hardware. */
+ char *software; /* Software version. */
+ char *serial; /* Serial number. */
+
+ /* Datapath. */
+ struct dpif dpif;
+ struct dpifmon *dpifmon;
+ struct port_array ports; /* Index is ODP port nr; ofport->opp.port_no is
+ * OFP port nr. */
+ struct shash port_by_name;
+ uint32_t max_ports;
+
+ /* Configuration.
+ *
+ * Each optional submodule pointer below is NULL while that feature is
+ * disabled. */
+ struct switch_status *switch_status;
+ struct status_category *ss_cat;
+ struct in_band *in_band;
+ struct discovery *discovery;
+ struct fail_open *fail_open;
+ struct pinsched *miss_sched, *action_sched;
+ struct executer *executer;
+ struct netflow *netflow;
+
+ /* Flow table. */
+ struct classifier cls;
+ bool need_revalidate;
+ long long int next_expiration;
+ struct tag_set revalidate_set;
+
+ /* OpenFlow connections. */
+ struct list all_conns;
+ struct ofconn *controller;
+ struct pvconn **listeners;
+ size_t n_listeners;
+ struct pvconn **snoops;
+ size_t n_snoops;
+
+ /* Hooks for ovs-vswitchd. */
+ const struct ofhooks *ofhooks;
+ void *aux;
+
+ /* Used by default ofhooks. */
+ struct mac_learning *ml;
+};
+
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+static const struct ofhooks default_ofhooks;
+
+static uint64_t pick_datapath_id(struct dpif *, uint64_t fallback_dpid);
+static uint64_t pick_fallback_dpid(void);
+static void send_packet_in_miss(struct ofpbuf *, void *ofproto);
+static void send_packet_in_action(struct ofpbuf *, void *ofproto);
+static void update_used(struct ofproto *);
+static void update_stats(struct rule *, const struct odp_flow_stats *);
+static void expire_rule(struct cls_rule *, void *ofproto);
+static bool revalidate_rule(struct ofproto *p, struct rule *rule);
+static void revalidate_cb(struct cls_rule *rule_, void *p_);
+
+static void handle_odp_msg(struct ofproto *, struct ofpbuf *);
+
+static void handle_openflow(struct ofconn *, struct ofproto *,
+ struct ofpbuf *);
+
+static void refresh_port_group(struct ofproto *, unsigned int group);
+static void update_port(struct ofproto *, const char *devname);
+static int init_ports(struct ofproto *);
+static void reinit_ports(struct ofproto *);
+
+/* Creates an OpenFlow switch on top of the datapath named 'datapath'.
+ *
+ * If 'ofhooks' is nonnull, it and 'aux' are used as the switch's hooks;
+ * otherwise the built-in default hooks are used together with a newly
+ * created MAC learning table.
+ *
+ * On success, stores the new switch in '*ofprotop' and returns 0.  On
+ * failure, stores NULL in '*ofprotop' and returns the error value reported
+ * by the step that failed. */
+int
+ofproto_create(const char *datapath, const struct ofhooks *ofhooks, void *aux,
+               struct ofproto **ofprotop)
+{
+    struct dpifmon *dpifmon;
+    struct odp_stats stats;
+    struct ofproto *p;
+    struct dpif dpif;
+    int error;
+
+    *ofprotop = NULL;
+
+    /* Connect to datapath and start listening for messages. */
+    error = dpif_open(datapath, &dpif);
+    if (error) {
+        VLOG_ERR("failed to open datapath %s: %s", datapath, strerror(error));
+        return error;
+    }
+    error = dpif_get_dp_stats(&dpif, &stats);
+    if (error) {
+        VLOG_ERR("failed to obtain stats for datapath %s: %s",
+                 datapath, strerror(error));
+        dpif_close(&dpif);
+        return error;
+    }
+    error = dpif_set_listen_mask(&dpif, ODPL_MISS | ODPL_ACTION);
+    if (error) {
+        VLOG_ERR("failed to listen on datapath %s: %s",
+                 datapath, strerror(error));
+        dpif_close(&dpif);
+        return error;
+    }
+    /* Start from a clean slate: drop any flows left in the datapath and any
+     * messages already queued for us. */
+    dpif_flow_flush(&dpif);
+    dpif_purge(&dpif);
+
+    /* Start monitoring datapath ports for status changes. */
+    error = dpifmon_create(datapath, &dpifmon);
+    if (error) {
+        VLOG_ERR("failed to start monitoring datapath %s: %s",
+                 datapath, strerror(error));
+        dpif_close(&dpif);
+        return error;
+    }
+
+    /* Initialize settings. */
+    p = xcalloc(1, sizeof *p);
+    p->fallback_dpid = pick_fallback_dpid();
+    p->datapath_id = pick_datapath_id(&dpif, p->fallback_dpid);
+    VLOG_INFO("using datapath ID %012"PRIx64, p->datapath_id);
+    p->manufacturer = xstrdup("Nicira Networks, Inc.");
+    p->hardware = xstrdup("Reference Implementation");
+    p->software = xstrdup(VERSION BUILDNR);
+    p->serial = xstrdup("None");
+
+    /* Initialize datapath. */
+    p->dpif = dpif;
+    p->dpifmon = dpifmon;
+    port_array_init(&p->ports);
+    shash_init(&p->port_by_name);
+    p->max_ports = stats.max_ports;
+
+    /* Initialize submodules.  The optional ones start out disabled. */
+    p->switch_status = switch_status_create(p);
+    p->in_band = NULL;
+    p->discovery = NULL;
+    p->fail_open = NULL;
+    p->miss_sched = p->action_sched = NULL;
+    p->executer = NULL;
+    p->netflow = NULL;
+
+    /* Initialize flow table. */
+    classifier_init(&p->cls);
+    p->need_revalidate = false;
+    p->next_expiration = time_msec() + 1000;
+    tag_set_init(&p->revalidate_set);
+
+    /* Initialize OpenFlow connections. */
+    list_init(&p->all_conns);
+    p->controller = ofconn_create(p, rconn_create(15, 15));
+    p->controller->pktbuf = pktbuf_create();
+    p->controller->miss_send_len = OFP_DEFAULT_MISS_SEND_LEN;
+    p->listeners = NULL;
+    p->n_listeners = 0;
+    p->snoops = NULL;
+    p->n_snoops = 0;
+
+    /* Initialize hooks. */
+    if (ofhooks) {
+        p->ofhooks = ofhooks;
+        p->aux = aux;
+        p->ml = NULL;
+    } else {
+        p->ofhooks = &default_ofhooks;
+        p->aux = p;
+        p->ml = mac_learning_create();
+    }
+
+    /* Register switch status category. */
+    p->ss_cat = switch_status_register(p->switch_status, "remote",
+                                       rconn_status_cb, p->controller->rconn);
+
+    /* Almost done...  From here on 'p' owns 'dpif' and 'dpifmon', so a
+     * failure is cleaned up through ofproto_destroy(). */
+    error = init_ports(p);
+    if (error) {
+        ofproto_destroy(p);
+        return error;
+    }
+
+    *ofprotop = p;
+    return 0;
+}
+
+/* Sets the datapath ID of 'p' to 'datapath_id', or to an automatically
+ * picked one if 'datapath_id' is 0.  If the ID actually changes, logs the
+ * new value and reconnects to the controller. */
+void
+ofproto_set_datapath_id(struct ofproto *p, uint64_t datapath_id)
+{
+    const uint64_t previous = p->datapath_id;
+
+    if (datapath_id) {
+        p->datapath_id = datapath_id;
+    } else {
+        p->datapath_id = pick_datapath_id(&p->dpif, p->fallback_dpid);
+    }
+
+    if (p->datapath_id == previous) {
+        return;
+    }
+    VLOG_INFO("datapath ID changed to %012"PRIx64, p->datapath_id);
+    rconn_reconnect(p->controller->rconn);
+}
+
+/* Records 'mgmt_id' as the management channel identifier of 'p'. */
+void
+ofproto_set_mgmt_id(struct ofproto *p, uint64_t mgmt_id)
+{
+ p->mgmt_id = mgmt_id;
+}
+
+/* Sets the controller connection's probe interval.  A nonzero
+ * 'probe_interval' is raised to at least 5; zero is passed through as is.
+ * When fail-open is enabled its trigger duration is updated too: three
+ * times the effective interval, or 15 if the interval is zero. */
+void
+ofproto_set_probe_interval(struct ofproto *p, int probe_interval)
+{
+    if (probe_interval) {
+        probe_interval = MAX(probe_interval, 5);
+    }
+    rconn_set_probe_interval(p->controller->rconn, probe_interval);
+
+    if (p->fail_open) {
+        int trigger_duration = 15;
+
+        if (probe_interval) {
+            trigger_duration = probe_interval * 3;
+        }
+        fail_open_set_trigger_duration(p->fail_open, trigger_duration);
+    }
+}
+
+/* Passes 'max_backoff' on to the controller connection's rconn. */
+void
+ofproto_set_max_backoff(struct ofproto *p, int max_backoff)
+{
+ rconn_set_max_backoff(p->controller->rconn, max_backoff);
+}
+
+/* Replaces the description strings of 'p'.  Each nonnull argument is copied
+ * and takes the place of the corresponding current string; a null argument
+ * leaves that string unchanged. */
+void
+ofproto_set_desc(struct ofproto *p,
+                 const char *manufacturer, const char *hardware,
+                 const char *software, const char *serial)
+{
+    struct {
+        char **field;           /* String owned by 'p'. */
+        const char *value;      /* Requested replacement, or NULL. */
+    } updates[] = {
+        { &p->manufacturer, manufacturer },
+        { &p->hardware, hardware },
+        { &p->software, software },
+        { &p->serial, serial },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof updates / sizeof *updates; i++) {
+        if (updates[i].value) {
+            free(*updates[i].field);
+            *updates[i].field = xstrdup(updates[i].value);
+        }
+    }
+}
+
+/* Enables or disables in-band control on 'p' according to 'in_band'.
+ *
+ * Nothing happens if the requested state is already in effect.  Enabling
+ * returns the result of in_band_create().  Disabling first turns off
+ * discovery, then destroys the in-band state, reconnects to the controller,
+ * and returns 0. */
+int
+ofproto_set_in_band(struct ofproto *p, bool in_band)
+{
+    const bool enabled = p->in_band != NULL;
+
+    if (in_band == enabled) {
+        return 0;
+    }
+    if (in_band) {
+        return in_band_create(p, &p->dpif, p->switch_status,
+                              p->controller->rconn, &p->in_band);
+    }
+
+    ofproto_set_discovery(p, false, NULL, true);
+    in_band_destroy(p->in_band);
+    p->in_band = NULL;
+    rconn_reconnect(p->controller->rconn);
+    return 0;
+}
+
+/* Enables or disables controller discovery on 'p'.
+ *
+ * When discovery is being switched on, in-band control is enabled first and
+ * a discovery object is created from 're' and 'update_resolv_conf'; when it
+ * is being switched off, the discovery object is destroyed.  Either change
+ * disconnects the controller connection.  If discovery is already on, only
+ * its settings are refreshed.
+ *
+ * Returns 0 on success, otherwise the error from the step that failed. */
+int
+ofproto_set_discovery(struct ofproto *p, bool discovery,
+                      const char *re, bool update_resolv_conf)
+{
+    const bool active = p->discovery != NULL;
+
+    if (discovery == active) {
+        if (discovery) {
+            /* Already running: just update its configuration. */
+            discovery_set_update_resolv_conf(p->discovery,
+                                             update_resolv_conf);
+            return discovery_set_accept_controller_re(p->discovery, re);
+        }
+        return 0;
+    }
+
+    if (!discovery) {
+        discovery_destroy(p->discovery);
+        p->discovery = NULL;
+    } else {
+        int error = ofproto_set_in_band(p, true);
+
+        if (!error) {
+            error = discovery_create(re, update_resolv_conf,
+                                     &p->dpif, p->switch_status,
+                                     &p->discovery);
+        }
+        if (error) {
+            return error;
+        }
+    }
+    rconn_disconnect(p->controller->rconn);
+    return 0;
+}
+
+/* Points the controller connection of 'ofproto' at 'controller'.
+ *
+ * Returns EINVAL while discovery is enabled.  A null 'controller'
+ * disconnects.  A name equal to the current connection name is a no-op.
+ * Otherwise returns the result of rconn_connect(). */
+int
+ofproto_set_controller(struct ofproto *ofproto, const char *controller)
+{
+    struct rconn *rconn;
+
+    if (ofproto->discovery) {
+        return EINVAL;
+    }
+
+    rconn = ofproto->controller->rconn;
+    if (!controller) {
+        rconn_disconnect(rconn);
+        return 0;
+    }
+    if (!strcmp(rconn_get_name(rconn), controller)) {
+        return 0;
+    }
+    return rconn_connect(rconn, controller);
+}
+
+/* Closes the '*n_pvconnsp' passive connections in '*pvconnsp', then opens
+ * one for every name in 'svec' and stores the ones that opened back through
+ * 'pvconnsp' and 'n_pvconnsp'.
+ *
+ * Names that fail to open are logged and skipped.  Returns 0 if all opened,
+ * otherwise the error from the first name that failed. */
+static int
+set_pvconns(struct pvconn ***pvconnsp, size_t *n_pvconnsp,
+            const struct svec *svec)
+{
+    struct pvconn **old = *pvconnsp;
+    size_t n_old = *n_pvconnsp;
+    struct pvconn **opened;
+    size_t n_opened = 0;
+    int first_error = 0;
+    size_t i;
+
+    for (i = 0; i < n_old; i++) {
+        pvconn_close(old[i]);
+    }
+    free(old);
+
+    opened = xmalloc(svec->n * sizeof *opened);
+    for (i = 0; i < svec->n; i++) {
+        const char *name = svec->names[i];
+        struct pvconn *pvconn;
+        int error = pvconn_open(name, &pvconn);
+
+        if (error) {
+            VLOG_ERR("failed to listen on %s: %s", name, strerror(error));
+            if (!first_error) {
+                first_error = error;
+            }
+            continue;
+        }
+        opened[n_opened++] = pvconn;
+    }
+
+    *pvconnsp = opened;
+    *n_pvconnsp = n_opened;
+
+    return first_error;
+}
+
+/* Replaces the passive listeners of 'ofproto' with ones opened from the
+ * names in 'listeners'.  Returns the result of set_pvconns(). */
+int
+ofproto_set_listeners(struct ofproto *ofproto, const struct svec *listeners)
+{
+ return set_pvconns(&ofproto->listeners, &ofproto->n_listeners, listeners);
+}
+
+/* Replaces the snoop listeners of 'ofproto' with ones opened from the names
+ * in 'snoops'.  Returns the result of set_pvconns(). */
+int
+ofproto_set_snoops(struct ofproto *ofproto, const struct svec *snoops)
+{
+ return set_pvconns(&ofproto->snoops, &ofproto->n_snoops, snoops);
+}
+
+/* Configures NetFlow on 'ofproto'.
+ *
+ * With a null or empty 'collectors', any existing NetFlow state is destroyed
+ * and 0 is returned.  Otherwise the NetFlow state is created on first use,
+ * the engine parameters are applied, and the result of
+ * netflow_set_collectors() is returned. */
+int
+ofproto_set_netflow(struct ofproto *ofproto, const struct svec *collectors,
+                    uint8_t engine_type, uint8_t engine_id,
+                    bool add_id_to_iface)
+{
+    if (!collectors || !collectors->n) {
+        netflow_destroy(ofproto->netflow);
+        ofproto->netflow = NULL;
+        return 0;
+    }
+
+    if (!ofproto->netflow) {
+        ofproto->netflow = netflow_create();
+    }
+    netflow_set_engine(ofproto->netflow, engine_type, engine_id,
+                       add_id_to_iface);
+    return netflow_set_collectors(ofproto->netflow, collectors);
+}
+
+/* Enables or disables fail-open mode on 'ofproto'.
+ *
+ * When enabling, the trigger duration is three times the controller
+ * connection's probe interval; an existing fail-open object is updated
+ * rather than recreated.  When disabling, the object is destroyed. */
+void
+ofproto_set_failure(struct ofproto *ofproto, bool fail_open)
+{
+    struct rconn *rconn;
+    int trigger_duration;
+
+    if (!fail_open) {
+        fail_open_destroy(ofproto->fail_open);
+        ofproto->fail_open = NULL;
+        return;
+    }
+
+    rconn = ofproto->controller->rconn;
+    trigger_duration = rconn_get_probe_interval(rconn) * 3;
+    if (ofproto->fail_open) {
+        fail_open_set_trigger_duration(ofproto->fail_open,
+                                       trigger_duration);
+    } else {
+        ofproto->fail_open = fail_open_create(ofproto, trigger_duration,
+                                              ofproto->switch_status,
+                                              rconn);
+    }
+}
+
+/* Configures packet-in rate limiting on 'ofproto'.
+ *
+ * A positive 'rate_limit' creates the miss and action schedulers if they do
+ * not exist yet, or else updates their limits.  A 'rate_limit' of zero or
+ * less destroys both schedulers. */
+void
+ofproto_set_rate_limit(struct ofproto *ofproto,
+                       int rate_limit, int burst_limit)
+{
+    if (rate_limit <= 0) {
+        pinsched_destroy(ofproto->miss_sched);
+        ofproto->miss_sched = NULL;
+        pinsched_destroy(ofproto->action_sched);
+        ofproto->action_sched = NULL;
+        return;
+    }
+
+    if (ofproto->miss_sched) {
+        pinsched_set_limits(ofproto->miss_sched, rate_limit, burst_limit);
+        pinsched_set_limits(ofproto->action_sched,
+                            rate_limit, burst_limit);
+    } else {
+        /* Only the miss scheduler is given the switch status object. */
+        ofproto->miss_sched = pinsched_create(rate_limit, burst_limit,
+                                              ofproto->switch_status);
+        ofproto->action_sched = pinsched_create(rate_limit, burst_limit,
+                                                NULL);
+    }
+}
+
+/* Placeholder for spanning tree configuration.  Requesting STP logs a
+ * warning and returns EINVAL; leaving it disabled returns 0. */
+int
+ofproto_set_stp(struct ofproto *ofproto UNUSED, bool enable_stp)
+{
+    if (!enable_stp) {
+        return 0;
+    }
+
+    /* XXX */
+    VLOG_WARN("STP is not yet implemented");
+    return EINVAL;
+}
+
+/* Configures remote command execution on 'ofproto'.
+ *
+ * A null 'command_acl' destroys any existing executer and returns 0.
+ * Otherwise an existing executer has its ACL and directory updated (returns
+ * 0), and a missing one is created (returns the result of
+ * executer_create()). */
+int
+ofproto_set_remote_execution(struct ofproto *ofproto, const char *command_acl,
+                             const char *command_dir)
+{
+    if (!command_acl) {
+        executer_destroy(ofproto->executer);
+        ofproto->executer = NULL;
+        return 0;
+    }
+
+    if (ofproto->executer) {
+        executer_set_acl(ofproto->executer, command_acl, command_dir);
+        return 0;
+    }
+    return executer_create(command_acl, command_dir, &ofproto->executer);
+}
+
+/* Returns the datapath ID currently in use by 'ofproto'. */
+uint64_t
+ofproto_get_datapath_id(const struct ofproto *ofproto)
+{
+ return ofproto->datapath_id;
+}
+
+/* Returns the probe interval of the controller connection's rconn. */
+int
+ofproto_get_probe_interval(const struct ofproto *ofproto)
+{
+ return rconn_get_probe_interval(ofproto->controller->rconn);
+}
+
+/* Returns the maximum backoff of the controller connection's rconn. */
+int
+ofproto_get_max_backoff(const struct ofproto *ofproto)
+{
+ return rconn_get_max_backoff(ofproto->controller->rconn);
+}
+
+/* Returns true if in-band control is currently enabled on 'ofproto'. */
+bool
+ofproto_get_in_band(const struct ofproto *ofproto)
+{
+ return ofproto->in_band != NULL;
+}
+
+/* Returns true if controller discovery is currently enabled on 'ofproto'. */
+bool
+ofproto_get_discovery(const struct ofproto *ofproto)
+{
+ return ofproto->discovery != NULL;
+}
+
+/* Returns the name of the controller connection's rconn.  The string is
+ * whatever rconn_get_name() returns; the caller does not own it. */
+const char *
+ofproto_get_controller(const struct ofproto *ofproto)
+{
+ return rconn_get_name(ofproto->controller->rconn);
+}
+
+/* Appends the name of every passive listener of 'ofproto' to 'listeners'. */
+void
+ofproto_get_listeners(const struct ofproto *ofproto, struct svec *listeners)
+{
+    size_t idx = 0;
+
+    while (idx < ofproto->n_listeners) {
+        struct pvconn *pvconn = ofproto->listeners[idx++];
+
+        svec_add(listeners, pvconn_get_name(pvconn));
+    }
+}
+
+/* Appends the name of every snoop listener of 'ofproto' to 'snoops'. */
+void
+ofproto_get_snoops(const struct ofproto *ofproto, struct svec *snoops)
+{
+    size_t idx = 0;
+
+    while (idx < ofproto->n_snoops) {
+        struct pvconn *pvconn = ofproto->snoops[idx++];
+
+        svec_add(snoops, pvconn_get_name(pvconn));
+    }
+}
+
+/* Destroys 'p' and everything it owns: its flows, OpenFlow connections,
+ * datapath handles, ports, optional submodules, listeners, snoops, MAC
+ * learning table, and description strings.  Does nothing if 'p' is null. */
+void
+ofproto_destroy(struct ofproto *p)
+{
+    struct ofconn *ofconn, *next_ofconn;
+    struct ofport *ofport;
+    unsigned int port_no;
+    size_t i;
+
+    if (!p) {
+        return;
+    }
+
+    ofproto_flush_flows(p);
+    classifier_destroy(&p->cls);
+
+    LIST_FOR_EACH_SAFE (ofconn, next_ofconn, struct ofconn, node,
+                        &p->all_conns) {
+        ofconn_destroy(ofconn, p);
+    }
+
+    dpif_close(&p->dpif);
+    dpifmon_destroy(p->dpifmon);
+    PORT_ARRAY_FOR_EACH (ofport, &p->ports, port_no) {
+        ofport_free(ofport);
+    }
+    shash_destroy(&p->port_by_name);
+
+    switch_status_destroy(p->switch_status);
+    in_band_destroy(p->in_band);
+    discovery_destroy(p->discovery);
+    fail_open_destroy(p->fail_open);
+    pinsched_destroy(p->miss_sched);
+    pinsched_destroy(p->action_sched);
+    executer_destroy(p->executer);
+    netflow_destroy(p->netflow);
+
+    switch_status_unregister(p->ss_cat);
+
+    for (i = 0; i < p->n_listeners; i++) {
+        pvconn_close(p->listeners[i]);
+    }
+    free(p->listeners);
+
+    for (i = 0; i < p->n_snoops; i++) {
+        pvconn_close(p->snoops[i]);
+    }
+    free(p->snoops);
+
+    mac_learning_destroy(p->ml);
+
+    /* The description strings are heap copies made by ofproto_create() or
+     * ofproto_set_desc(), so they must be released along with 'p'. */
+    free(p->manufacturer);
+    free(p->hardware);
+    free(p->software);
+    free(p->serial);
+
+    free(p);
+}
+
+/* Runs one full iteration of 'p': ofproto_run1() followed, if that
+ * succeeded, by ofproto_run2() without forcing revalidation.  Returns the
+ * first nonzero result, or 0. */
+int
+ofproto_run(struct ofproto *p)
+{
+    int error = ofproto_run1(p);
+
+    return error ? error : ofproto_run2(p, false);
+}
+
+/* First half of an ofproto iteration: drains datapath messages and port
+ * change notifications, runs the enabled submodules and every OpenFlow
+ * connection, accepts new listener and snoop connections, and expires flows
+ * once 'next_expiration' has passed.
+ *
+ * Returns ENODEV if dpif_recv() reports that the datapath is gone, otherwise
+ * 0. */
+int
+ofproto_run1(struct ofproto *p)
+{
+ struct ofconn *ofconn, *next_ofconn;
+ char *devname;
+ int error;
+ int i;
+
+ /* Handle at most 50 datapath messages per call. */
+ for (i = 0; i < 50; i++) {
+ struct ofpbuf *buf;
+ int error;
+
+ error = dpif_recv(&p->dpif, &buf);
+ if (error) {
+ if (error == ENODEV) {
+ /* Someone destroyed the datapath behind our back. The caller
+ * better destroy us and give up, because we're just going to
+ * spin from here on out. */
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+ VLOG_ERR_RL(&rl, "dp%u: datapath was destroyed externally",
+ dpif_id(&p->dpif));
+ return ENODEV;
+ }
+ break;
+ }
+
+ handle_odp_msg(p, buf);
+ }
+
+ /* Process port change notifications until the monitor reports EAGAIN.
+ * ENOBUFS triggers a full rescan of the ports; results other than 0,
+ * ENOBUFS, and EAGAIN are ignored and polling continues. */
+ while ((error = dpifmon_poll(p->dpifmon, &devname)) != EAGAIN) {
+ if (error == ENOBUFS) {
+ reinit_ports(p);
+ } else if (!error) {
+ update_port(p, devname);
+ free(devname);
+ }
+ }
+
+ if (p->in_band) {
+ in_band_run(p->in_band);
+ }
+ if (p->discovery) {
+ char *controller_name;
+ if (rconn_is_connectivity_questionable(p->controller->rconn)) {
+ discovery_question_connectivity(p->discovery);
+ }
+ /* A true result means the discovered controller changed: connect to
+ * the new name, or disconnect if there is none. */
+ if (discovery_run(p->discovery, &controller_name)) {
+ if (controller_name) {
+ rconn_connect(p->controller->rconn, controller_name);
+ } else {
+ rconn_disconnect(p->controller->rconn);
+ }
+ }
+ }
+ if (p->fail_open) {
+ fail_open_run(p->fail_open);
+ }
+ /* NOTE(review): the schedulers are NULL when rate limiting is off;
+ * presumably pinsched_run() accepts NULL -- confirm. */
+ pinsched_run(p->miss_sched, send_packet_in_miss, p);
+ pinsched_run(p->action_sched, send_packet_in_action, p);
+ if (p->executer) {
+ executer_run(p->executer);
+ }
+
+ LIST_FOR_EACH_SAFE (ofconn, next_ofconn, struct ofconn, node,
+ &p->all_conns) {
+ ofconn_run(ofconn, p);
+ }
+
+ /* Each accepted listener connection becomes a new ofconn. */
+ for (i = 0; i < p->n_listeners; i++) {
+ struct vconn *vconn;
+ int retval;
+
+ retval = pvconn_accept(p->listeners[i], OFP_VERSION, &vconn);
+ if (!retval) {
+ ofconn_create(p, rconn_new_from_vconn("passive", vconn));
+ } else if (retval != EAGAIN) {
+ VLOG_WARN_RL(&rl, "accept failed (%s)", strerror(retval));
+ }
+ }
+
+ /* Each accepted snoop connection becomes a monitor of the controller
+ * connection. */
+ for (i = 0; i < p->n_snoops; i++) {
+ struct vconn *vconn;
+ int retval;
+
+ retval = pvconn_accept(p->snoops[i], OFP_VERSION, &vconn);
+ if (!retval) {
+ rconn_add_monitor(p->controller->rconn, vconn);
+ } else if (retval != EAGAIN) {
+ VLOG_WARN_RL(&rl, "accept failed (%s)", strerror(retval));
+ }
+ }
+
+ /* Flow expiration is attempted at most once per 1000 ms. */
+ if (time_msec() >= p->next_expiration) {
+ COVERAGE_INC(ofproto_expiration);
+ p->next_expiration = time_msec() + 1000;
+ update_used(p);
+
+ classifier_for_each(&p->cls, CLS_INC_ALL, expire_rule, p);
+
+ /* Let the hook know that we're at a stable point: all outstanding data
+ * in existing flows has been accounted to the account_cb. Thus, the
+ * hook can now reasonably do operations that depend on having accurate
+ * flow volume accounting (currently, that's just bond rebalancing). */
+ if (p->ofhooks->account_checkpoint_cb) {
+ p->ofhooks->account_checkpoint_cb(p->aux);
+ }
+ }
+
+ if (p->netflow) {
+ netflow_run(p->netflow);
+ }
+
+ return 0;
+}
+
+/* Auxiliary data that ofproto_run2() hands to revalidate_cb() for each
+ * exact-match rule it visits. */
+struct revalidate_cbdata {
+ struct ofproto *ofproto;
+ bool revalidate_all; /* Revalidate all exact-match rules? */
+ bool revalidate_subrules; /* Revalidate all exact-match subrules? */
+ struct tag_set revalidate_set; /* Set of tags to revalidate. */
+};
+
+/* Second half of an ofproto iteration: revalidates exact-match rules when
+ * 'revalidate_all' is set, when 'p' is flagged as needing revalidation, or
+ * when its set of tags to revalidate is nonempty.  The tag set is handed
+ * over to the callback data and then reset.  Always returns 0. */
+int
+ofproto_run2(struct ofproto *p, bool revalidate_all)
+{
+    struct revalidate_cbdata cbdata;
+
+    if (!p->need_revalidate && !revalidate_all
+        && tag_set_is_empty(&p->revalidate_set)) {
+        return 0;
+    }
+
+    cbdata.ofproto = p;
+    cbdata.revalidate_all = revalidate_all;
+    cbdata.revalidate_subrules = p->need_revalidate;
+    cbdata.revalidate_set = p->revalidate_set;
+    tag_set_init(&p->revalidate_set);
+    COVERAGE_INC(ofproto_revalidate);
+    classifier_for_each(&p->cls, CLS_INC_EXACT, revalidate_cb, &cbdata);
+    p->need_revalidate = false;
+
+    return 0;
+}
+
+/* Registers with the poll loop everything that should wake the caller for
+ * the next ofproto_run(): datapath and port-monitor activity, every OpenFlow
+ * connection, the enabled submodules, pending revalidation, the next flow
+ * expiration, and the listener and snoop sockets. */
+void
+ofproto_wait(struct ofproto *p)
+{
+ struct ofconn *ofconn;
+ size_t i;
+
+ dpif_recv_wait(&p->dpif);
+ dpifmon_wait(p->dpifmon);
+ LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) {
+ ofconn_wait(ofconn);
+ }
+ if (p->in_band) {
+ in_band_wait(p->in_band);
+ }
+ if (p->discovery) {
+ discovery_wait(p->discovery);
+ }
+ if (p->fail_open) {
+ fail_open_wait(p->fail_open);
+ }
+ /* NOTE(review): the schedulers are NULL when rate limiting is off;
+ * presumably pinsched_wait() accepts NULL -- confirm. */
+ pinsched_wait(p->miss_sched);
+ pinsched_wait(p->action_sched);
+ if (p->executer) {
+ executer_wait(p->executer);
+ }
+ /* Tags queued for revalidation need another pass right away. */
+ if (!tag_set_is_empty(&p->revalidate_set)) {
+ poll_immediate_wake();
+ }
+ if (p->need_revalidate) {
+ /* Shouldn't happen, but if it does just go around again. */
+ VLOG_DBG_RL(&rl, "need revalidate in ofproto_wait_cb()");
+ poll_immediate_wake();
+ } else if (p->next_expiration != LLONG_MAX) {
+ poll_timer_wait(p->next_expiration - time_msec());
+ }
+ for (i = 0; i < p->n_listeners; i++) {
+ pvconn_wait(p->listeners[i]);
+ }
+ for (i = 0; i < p->n_snoops; i++) {
+ pvconn_wait(p->snoops[i]);
+ }
+}
+
+/* Adds 'tag' to the set of tags that the next ofproto_run2() will
+ * revalidate. */
+void
+ofproto_revalidate(struct ofproto *ofproto, tag_type tag)
+{
+ tag_set_add(&ofproto->revalidate_set, tag);
+}
+
+/* Returns a pointer to the set of tags pending revalidation inside
+ * 'ofproto'.  The set remains owned by 'ofproto'. */
+struct tag_set *
+ofproto_get_revalidate_set(struct ofproto *ofproto)
+{
+ return &ofproto->revalidate_set;
+}
+
+/* Returns true if discovery is enabled on 'p' or if the controller
+ * connection's rconn reports itself alive. */
+bool
+ofproto_is_alive(const struct ofproto *p)
+{
+ return p->discovery || rconn_is_alive(p->controller->rconn);
+}
+
+/* Translates the 'n_actions' OpenFlow 'actions' for 'flow' into datapath
+ * actions and executes them on 'packet'.
+ *
+ * Returns the translation error, if any, and otherwise 0.  The result of
+ * executing the actions in the datapath is not reported. */
+int
+ofproto_send_packet(struct ofproto *p, const flow_t *flow,
+                    const union ofp_action *actions, size_t n_actions,
+                    const struct ofpbuf *packet)
+{
+    struct odp_actions odp_actions;
+    int error = xlate_actions(actions, n_actions, flow, p, packet,
+                              &odp_actions, NULL, NULL);
+
+    if (!error) {
+        /* XXX Should we translate the dpif_execute() errno value into an
+         * OpenFlow error code? */
+        dpif_execute(&p->dpif, flow->in_port, odp_actions.actions,
+                     odp_actions.n_actions, packet);
+    }
+    return error;
+}
+
+/* Adds a flow matching 'flow' under 'wildcards' at 'priority' with the given
+ * OpenFlow 'actions' to the flow table of 'p'.  A negative 'idle_timeout'
+ * selects an idle timeout of 5; the hard timeout is always 0. */
+void
+ofproto_add_flow(struct ofproto *p,
+                 const flow_t *flow, uint32_t wildcards, unsigned int priority,
+                 const union ofp_action *actions, size_t n_actions,
+                 int idle_timeout)
+{
+    struct rule *rule;
+
+    if (idle_timeout < 0) {
+        idle_timeout = 5;       /* XXX */
+    }
+
+    rule = rule_create(NULL, actions, n_actions, idle_timeout, 0);
+    cls_rule_from_flow(&rule->cr, flow, wildcards, priority);
+    rule_insert(p, rule, NULL, 0);
+}
+
+/* Removes from 'ofproto' the rule that matches exactly 'flow', 'wildcards',
+ * and 'priority', if there is one. */
+void
+ofproto_delete_flow(struct ofproto *ofproto, const flow_t *flow,
+                    uint32_t wildcards, unsigned int priority)
+{
+    const struct cls_rule *cr;
+    struct rule *rule;
+
+    cr = classifier_find_rule_exactly(&ofproto->cls, flow, wildcards,
+                                      priority);
+    rule = rule_from_cls_rule(cr);
+    if (!rule) {
+        return;
+    }
+    rule_remove(ofproto, rule);
+}
+
+/* classifier_for_each() callback used by ofproto_flush_flows(): removes the
+ * rule behind 'rule_' from the ofproto passed as 'ofproto_'. */
+static void
+destroy_rule(struct cls_rule *rule_, void *ofproto_)
+{
+ struct rule *rule = rule_from_cls_rule(rule_);
+ struct ofproto *ofproto = ofproto_;
+
+ /* Mark the flow as not installed, even though it might really be
+ * installed, so that rule_remove() doesn't bother trying to uninstall it.
+ * There is no point in uninstalling it individually since we are about to
+ * blow away all the flows with dpif_flow_flush(). */
+ rule->installed = false;
+
+ rule_remove(ofproto, rule);
+}
+
+/* Removes every rule from the flow table of 'ofproto', flushes all flows
+ * from the datapath, and then tells the in-band and fail-open modules (if
+ * enabled) that the flush happened. */
+void
+ofproto_flush_flows(struct ofproto *ofproto)
+{
+ COVERAGE_INC(ofproto_flush);
+ classifier_for_each(&ofproto->cls, CLS_INC_ALL, destroy_rule, ofproto);
+ dpif_flow_flush(&ofproto->dpif);
+ if (ofproto->in_band) {
+ in_band_flushed(ofproto->in_band);
+ }
+ if (ofproto->fail_open) {
+ fail_open_flushed(ofproto->fail_open);
+ }
+}
+
+/* Resynchronizes the ports of 'p' with the datapath by calling
+ * update_port() on the union of the device names that 'p' already knows and
+ * those that the datapath currently lists.
+ *
+ * If the datapath's port list cannot be obtained, a warning is logged and
+ * only the already-known names are refreshed. */
+static void
+reinit_ports(struct ofproto *p)
+{
+    struct svec devnames;
+    struct ofport *ofport;
+    unsigned int port_no;
+    struct odp_port *odp_ports = NULL;
+    size_t n_odp_ports = 0;
+    size_t i;
+    int error;
+
+    svec_init(&devnames);
+    PORT_ARRAY_FOR_EACH (ofport, &p->ports, port_no) {
+        svec_add(&devnames, (char *) ofport->opp.name);
+    }
+
+    /* Only trust (and free) the output parameters when the call succeeded,
+     * since nothing here shows what they hold after a failure. */
+    error = dpif_port_list(&p->dpif, &odp_ports, &n_odp_ports);
+    if (!error) {
+        for (i = 0; i < n_odp_ports; i++) {
+            svec_add(&devnames, odp_ports[i].devname);
+        }
+        free(odp_ports);
+    } else {
+        VLOG_WARN_RL(&rl, "failed to list datapath ports (%s)",
+                     strerror(error));
+    }
+
+    /* Names known on both sides appear twice; visit each only once. */
+    svec_sort_unique(&devnames);
+    for (i = 0; i < devnames.n; i++) {
+        update_port(p, devnames.names[i]);
+    }
+    svec_destroy(&devnames);
+}
+
+/* Recomputes the membership of datapath port group 'group', which must be
+ * DP_GROUP_ALL or DP_GROUP_FLOOD, and installs it in the datapath.
+ * DP_GROUP_ALL contains every port; DP_GROUP_FLOOD leaves out ports whose
+ * config has OFPPC_NO_FLOOD set. */
+static void
+refresh_port_group(struct ofproto *p, unsigned int group)
+{
+    const bool include_all = group == DP_GROUP_ALL;
+    struct ofport *port;
+    unsigned int port_no;
+    uint16_t *members;
+    size_t n_members = 0;
+
+    assert(group == DP_GROUP_ALL || group == DP_GROUP_FLOOD);
+
+    members = xmalloc(port_array_count(&p->ports) * sizeof *members);
+    PORT_ARRAY_FOR_EACH (port, &p->ports, port_no) {
+        if (include_all || !(port->opp.config & OFPPC_NO_FLOOD)) {
+            members[n_members++] = port_no;
+        }
+    }
+    dpif_port_group_set(&p->dpif, group, members, n_members);
+    free(members);
+}
+
+/* Recomputes both datapath port groups, DP_GROUP_FLOOD and DP_GROUP_ALL. */
+static void
+refresh_port_groups(struct ofproto *p)
+{
+ refresh_port_group(p, DP_GROUP_FLOOD);
+ refresh_port_group(p, DP_GROUP_ALL);
+}
+
+/* Builds a new 'struct ofport' describing datapath port 'odp_port': opens
+ * its network device and fills in the OpenFlow port number, Ethernet
+ * address, name, config, state, and feature bitmaps.
+ *
+ * Returns the new port, which the caller releases with ofport_free(), or
+ * NULL (after logging a warning) if the network device cannot be opened. */
+static struct ofport *
+make_ofport(const struct odp_port *odp_port)
+{
+ enum netdev_flags flags;
+ struct ofport *ofport;
+ struct netdev *netdev;
+ bool carrier;
+ int error;
+
+ error = netdev_open(odp_port->devname, NETDEV_ETH_TYPE_NONE, &netdev);
+ if (error) {
+ VLOG_WARN_RL(&rl, "ignoring port %s (%"PRIu16") because netdev %s "
+ "cannot be opened (%s)",
+ odp_port->devname, odp_port->port,
+ odp_port->devname, strerror(error));
+ return NULL;
+ }
+
+ ofport = xmalloc(sizeof *ofport);
+ ofport->netdev = netdev;
+ ofport->opp.port_no = odp_port_to_ofp_port(odp_port->port);
+ memcpy(ofport->opp.hw_addr, netdev_get_etheraddr(netdev), ETH_ALEN);
+ /* Copy no more than fits in either buffer, then force termination. */
+ memcpy(ofport->opp.name, odp_port->devname,
+ MIN(sizeof ofport->opp.name, sizeof odp_port->devname));
+ ofport->opp.name[sizeof ofport->opp.name - 1] = '\0';
+
+ /* NOTE(review): the results of netdev_get_flags() and
+ * netdev_get_carrier() are not checked; this relies on both setting
+ * their output arguments even on failure -- confirm in netdev.c. */
+ netdev_get_flags(netdev, &flags);
+ ofport->opp.config = flags & NETDEV_UP ? 0 : OFPPC_PORT_DOWN;
+
+ netdev_get_carrier(netdev, &carrier);
+ ofport->opp.state = carrier ? 0 : OFPPS_LINK_DOWN;
+
+ netdev_get_features(netdev,
+ &ofport->opp.curr, &ofport->opp.advertised,
+ &ofport->opp.supported, &ofport->opp.peer);
+ return ofport;
+}
+
+/* Returns true, after logging a warning, if 'p' already has a port with the
+ * port number or the device name of 'odp_port'.  Returns false otherwise. */
+static bool
+ofport_conflicts(const struct ofproto *p, const struct odp_port *odp_port)
+{
+    if (port_array_get(&p->ports, odp_port->port)) {
+        VLOG_WARN_RL(&rl, "ignoring duplicate port %"PRIu16" in datapath",
+                     odp_port->port);
+        return true;
+    }
+
+    if (shash_find(&p->port_by_name, odp_port->devname)) {
+        VLOG_WARN_RL(&rl, "ignoring duplicate device %s in datapath",
+                     odp_port->devname);
+        return true;
+    }
+
+    return false;
+}
+
+/* Returns nonzero if the OpenFlow descriptions of 'a_' and 'b_' agree in
+ * every field, zero otherwise.  The 'netdev' members are not compared. */
+static int
+ofport_equal(const struct ofport *a_, const struct ofport *b_)
+{
+ const struct ofp_phy_port *a = &a_->opp;
+ const struct ofp_phy_port *b = &b_->opp;
+
+ /* The field list below must be revisited if the structure's size ever
+ * changes, so fail the build when it does. */
+ BUILD_ASSERT_DECL(sizeof *a == 48); /* Detect ofp_phy_port changes. */
+ return (a->port_no == b->port_no
+ && !memcmp(a->hw_addr, b->hw_addr, sizeof a->hw_addr)
+ && !strcmp((char *) a->name, (char *) b->name)
+ && a->state == b->state
+ && a->config == b->config
+ && a->curr == b->curr
+ && a->advertised == b->advertised
+ && a->supported == b->supported
+ && a->peer == b->peer);
+}
+
+/* Queues an OFPT_PORT_STATUS message with the given 'reason' describing
+ * 'ofport' on every OpenFlow connection of 'p', then invokes the
+ * port_changed_cb hook if one is set. */
+static void
+send_port_status(struct ofproto *p, const struct ofport *ofport,
+ uint8_t reason)
+{
+ /* XXX Should limit the number of queued port status change messages. */
+ struct ofconn *ofconn;
+ LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) {
+ struct ofp_port_status *ops;
+ struct ofpbuf *b;
+
+ ops = make_openflow_xid(sizeof *ops, OFPT_PORT_STATUS, 0, &b);
+ ops->reason = reason;
+ /* 'ofport->opp' is kept in host byte order, so convert the copy that
+ * goes into the message. */
+ ops->desc = ofport->opp;
+ hton_ofp_phy_port(&ops->desc);
+ queue_tx(b, ofconn, NULL);
+ }
+ if (p->ofhooks->port_changed_cb) {
+ p->ofhooks->port_changed_cb(reason, &ofport->opp, p->aux);
+ }
+}
+
+/* Registers 'ofport' in both port indexes of 'p': the port array, keyed by
+ * ODP port number, and the name table, keyed by device name. */
+static void
+ofport_install(struct ofproto *p, struct ofport *ofport)
+{
+ port_array_set(&p->ports, ofp_port_to_odp_port(ofport->opp.port_no),
+ ofport);
+ shash_add(&p->port_by_name, (char *) ofport->opp.name, ofport);
+}
+
+/* Unregisters 'ofport' from both port indexes of 'p' without freeing it.
+ *
+ * NOTE(review): the shash_find() result is passed straight to
+ * shash_delete(), so this assumes the name is present in the table. */
+static void
+ofport_remove(struct ofproto *p, struct ofport *ofport)
+{
+ port_array_set(&p->ports, ofp_port_to_odp_port(ofport->opp.port_no), NULL);
+ shash_delete(&p->port_by_name,
+ shash_find(&p->port_by_name, (char *) ofport->opp.name));
+}
+
+/* Closes the network device of 'ofport' and frees 'ofport'.  Does nothing
+ * if 'ofport' is null. */
+static void
+ofport_free(struct ofport *ofport)
+{
+    if (!ofport) {
+        return;
+    }
+
+    netdev_close(ofport->netdev);
+    free(ofport);
+}
+
+/* Brings the state that 'p' keeps for the device named 'devname' in line
+ * with the datapath.  Depending on what the datapath reports, the port is
+ * added, replaced (if its description changed), or deleted, a matching port
+ * status message is sent, and the port groups are refreshed. */
+static void
+update_port(struct ofproto *p, const char *devname)
+{
+ struct odp_port odp_port;
+ struct ofport *ofport;
+ int error;
+
+ COVERAGE_INC(ofproto_update_port);
+ ofport = shash_find_data(&p->port_by_name, devname);
+ error = dpif_port_query_by_name(&p->dpif, devname, &odp_port);
+ if (!error) {
+ if (!ofport) {
+ /* New port. */
+ if (!ofport_conflicts(p, &odp_port)) {
+ ofport = make_ofport(&odp_port);
+ if (ofport) {
+ ofport_install(p, ofport);
+ send_port_status(p, ofport, OFPPR_ADD);
+ }
+ }
+ } else {
+ /* Modified port. */
+ struct ofport *new_ofport = make_ofport(&odp_port);
+ if (!new_ofport) {
+ return;
+ }
+
+ /* Keep only OFPPC_PORT_DOWN from the freshly probed config and carry
+ * every other config bit over from the existing port. */
+ new_ofport->opp.config &= OFPPC_PORT_DOWN;
+ new_ofport->opp.config |= ofport->opp.config & ~OFPPC_PORT_DOWN;
+ if (ofport_equal(ofport, new_ofport)) {
+ /* False alarm--no change. */
+ ofport_free(new_ofport);
+ } else {
+ ofport_remove(p, ofport);
+ ofport_install(p, new_ofport);
+ ofport_free(ofport);
+ send_port_status(p, new_ofport, OFPPR_MODIFY);
+ }
+ }
+ } else if (error == ENOENT || error == ENODEV) {
+ /* Deleted port. */
+ if (ofport) {
+ /* Send the notification while 'ofport' is still valid. */
+ send_port_status(p, ofport, OFPPR_DELETE);
+ ofport_remove(p, ofport);
+ ofport_free(ofport);
+ }
+ } else {
+ VLOG_WARN_RL(&rl, "dpif_port_query_by_name returned unexpected error "
+ "%s", strerror(error));
+ return;
+ }
+ refresh_port_groups(p);
+}
+
+ /* Populates 'p''s port tables from the list returned by dpif_port_list(),
+  * skipping ports that conflict with ones already installed or for which
+  * make_ofport() fails, then refreshes the port groups.
+  *
+  * Returns 0 on success, otherwise the error from dpif_port_list(). */
+ static int
+ init_ports(struct ofproto *p)
+ {
+     struct odp_port *port_list;
+     size_t n_listed;
+     size_t idx;
+     int retval = dpif_port_list(&p->dpif, &port_list, &n_listed);
+
+     if (retval) {
+         return retval;
+     }
+
+     for (idx = 0; idx < n_listed; idx++) {
+         struct ofport *ofport;
+
+         if (ofport_conflicts(p, &port_list[idx])) {
+             continue;
+         }
+         ofport = make_ofport(&port_list[idx]);
+         if (ofport) {
+             ofport_install(p, ofport);
+         }
+     }
+     free(port_list);
+     refresh_port_groups(p);
+     return 0;
+ }
+
+ /* Allocates a new ofconn that wraps 'rconn', with no packet buffer, flow
+  * expiration messages disabled, a miss_send_len of 0, and fresh packet-in
+  * and reply counters.  Appends it to 'p->all_conns' and returns it. */
+ static struct ofconn *
+ ofconn_create(struct ofproto *p, struct rconn *rconn)
+ {
+     struct ofconn *conn = xmalloc(sizeof *conn);
+
+     conn->rconn = rconn;
+     conn->pktbuf = NULL;
+     conn->send_flow_exp = false;
+     conn->miss_send_len = 0;
+     conn->packet_in_counter = rconn_packet_counter_create();
+     conn->reply_counter = rconn_packet_counter_create();
+     list_push_back(&p->all_conns, &conn->node);
+     return conn;
+ }
+
+ /* Tears down 'ofconn': tells 'p''s executer (if there is one) that the
+  * connection is closing, unlinks 'ofconn' from the list it is on, and then
+  * destroys its rconn, both packet counters, and its packet buffer before
+  * freeing 'ofconn' itself. */
+ static void
+ ofconn_destroy(struct ofconn *ofconn, struct ofproto *p)
+ {
+ if (p->executer) {
+ /* Must happen while 'ofconn->rconn' still exists. */
+ executer_rconn_closing(p->executer, ofconn->rconn);
+ }
+
+ list_remove(&ofconn->node);
+ rconn_destroy(ofconn->rconn);
+ rconn_packet_counter_destroy(ofconn->packet_in_counter);
+ rconn_packet_counter_destroy(ofconn->reply_counter);
+ pktbuf_destroy(ofconn->pktbuf);
+ free(ofconn);
+ }
+
+ /* Runs 'ofconn''s rconn and, while fewer than OFCONN_REPLY_MAX replies are
+  * outstanding, receives and handles up to 50 OpenFlow messages.  Afterward,
+  * destroys 'ofconn' if its rconn is no longer alive, unless 'ofconn' is
+  * 'p''s controller connection. */
+ static void
+ ofconn_run(struct ofconn *ofconn, struct ofproto *p)
+ {
+     rconn_run(ofconn->rconn);
+
+     if (rconn_packet_counter_read(ofconn->reply_counter) < OFCONN_REPLY_MAX) {
+         /* Cap the work done per call so that other tasks are not starved. */
+         int n_handled = 0;
+
+         while (n_handled < 50) {
+             struct ofpbuf *msg = rconn_recv(ofconn->rconn);
+
+             if (!msg) {
+                 break;
+             }
+             handle_openflow(ofconn, p, msg);
+             ofpbuf_delete(msg);
+             n_handled++;
+         }
+     }
+
+     if (ofconn == p->controller || rconn_is_alive(ofconn->rconn)) {
+         return;
+     }
+     ofconn_destroy(ofconn, p);
+ }
+
+ /* Registers the wakeups that ofconn_run() needs for 'ofconn'.  Waiting for
+  * received messages is skipped, and the "stuck" coverage counter bumped,
+  * when OFCONN_REPLY_MAX or more replies are still outstanding. */
+ static void
+ ofconn_wait(struct ofconn *ofconn)
+ {
+     rconn_run_wait(ofconn->rconn);
+     if (rconn_packet_counter_read(ofconn->reply_counter) >= OFCONN_REPLY_MAX) {
+         COVERAGE_INC(ofproto_ofconn_stuck);
+         return;
+     }
+     rconn_recv_wait(ofconn->rconn);
+ }
+
+ /* Allocates and returns a zero-initialized rule with the given timeouts and
+  * a private copy of the 'n_actions' OpenFlow actions in 'actions'.  Its
+  * 'used' and 'created' times are both set to the current time.
+  *
+  * If 'super' is nonnull the new rule is appended to 'super''s list of
+  * subrules; otherwise its list is initialized as empty.
+  *
+  * The caller must initialize the 'cr' member of the returned rule. */
+ static struct rule *
+ rule_create(struct rule *super,
+             const union ofp_action *actions, size_t n_actions,
+             uint16_t idle_timeout, uint16_t hard_timeout)
+ {
+     struct rule *new_rule = xcalloc(1, sizeof *new_rule);
+
+     new_rule->actions = xmemdup(actions, n_actions * sizeof *actions);
+     new_rule->n_actions = n_actions;
+     new_rule->hard_timeout = hard_timeout;
+     new_rule->idle_timeout = idle_timeout;
+     new_rule->used = new_rule->created = time_msec();
+
+     new_rule->super = super;
+     if (!super) {
+         list_init(&new_rule->list);
+     } else {
+         list_push_back(&super->list, &new_rule->list);
+     }
+     return new_rule;
+ }
+
+ /* Returns the rule that embeds 'cls_rule' as its 'cr' member, or a null
+  * pointer if 'cls_rule' is null. */
+ static struct rule *
+ rule_from_cls_rule(const struct cls_rule *cls_rule)
+ {
+     if (!cls_rule) {
+         return NULL;
+     }
+     return CONTAINER_OF(cls_rule, struct rule, cr);
+ }
+
+ /* Frees 'rule' along with its OpenFlow and ODP action arrays.  Does not
+  * touch any list or classifier that 'rule' may be part of. */
+ static void
+ rule_free(struct rule *rule)
+ {
+     free(rule->odp_actions);
+     free(rule->actions);
+     free(rule);
+ }
+
+ /* Destroys 'rule'.
+  *
+  * If 'rule' is a subrule, it is first removed from its super-rule's list of
+  * subrules.  If 'rule' is a super-rule, each of its subrules is revalidated,
+  * which destroys any that no longer have a super-rule (probably all of
+  * them).
+  *
+  * Before calling this function, the caller must have removed 'rule' from the
+  * classifier.  If 'rule' is an exact-match rule, the caller is also
+  * responsible for ensuring that it has been uninstalled from the datapath. */
+ static void
+ rule_destroy(struct ofproto *ofproto, struct rule *rule)
+ {
+     if (rule->super) {
+         list_remove(&rule->list);
+     } else {
+         struct rule *child, *next_child;
+
+         /* The _SAFE variant is needed because revalidation may destroy
+          * 'child'. */
+         LIST_FOR_EACH_SAFE (child, next_child, struct rule, list,
+                             &rule->list) {
+             revalidate_rule(ofproto, child);
+         }
+     }
+     rule_free(rule);
+ }
+
+ /* Returns true if 'rule' has an OFPAT_OUTPUT action whose port equals
+  * 'out_port', or if 'out_port' is OFPP_NONE (which matches every rule).
+  *
+  * 'out_port' is compared in network byte order. */
+ static bool
+ rule_has_out_port(const struct rule *rule, uint16_t out_port)
+ {
+     const union ofp_action *action;
+     struct actions_iterator iter;
+
+     if (out_port == htons(OFPP_NONE)) {
+         return true;
+     }
+
+     action = actions_first(&iter, rule->actions, rule->n_actions);
+     while (action) {
+         if (action->type == htons(OFPAT_OUTPUT)
+             && action->output.port == out_port) {
+             return true;
+         }
+         action = actions_next(&iter);
+     }
+     return false;
+ }
+
+ /* Executes the actions indicated by 'rule' on 'packet', which is in flow
+ * 'flow' and is considered to have arrived on ODP port 'in_port'.
+ *
+ * The flow that 'packet' actually contains does not need to actually match
+ * 'rule'; the actions in 'rule' will be applied to it either way. Likewise,
+ * the packet and byte counters for 'rule' will be credited for the packet sent
+ * out whether or not the packet actually matches 'rule'.
+ *
+ * If 'rule' is an exact-match rule and 'flow' actually equals the rule's flow,
+ * the caller must already have accurately composed ODP actions for it given
+ * 'packet' using rule_make_actions(). If 'rule' is a wildcard rule, or if
+ * 'rule' is an exact-match rule but 'flow' is not the rule's flow, then this
+ * function will compose a set of ODP actions based on 'rule''s OpenFlow
+ * actions and apply them to 'packet'. */
+ static void
+ rule_execute(struct ofproto *ofproto, struct rule *rule,
+ struct ofpbuf *packet, const flow_t *flow)
+ {
+ const union odp_action *actions;
+ size_t n_actions;
+ struct odp_actions a;
+
+ /* Grab or compose the ODP actions.
+ *
+ * The special case for an exact-match 'rule' where 'flow' is not the
+ * rule's flow is important to avoid, e.g., sending a packet out its input
+ * port simply because the ODP actions were composed for the wrong
+ * scenario. */
+ if (rule->cr.wc.wildcards || !flow_equal(flow, &rule->cr.flow)) {
+ /* A subrule is created without OpenFlow actions of its own (see
+  * rule_create_subrule()), so translate its super-rule's actions. */
+ struct rule *super = rule->super ? rule->super : rule;
+ if (xlate_actions(super->actions, super->n_actions, flow, ofproto,
+ packet, &a, NULL, 0)) {
+ /* Translation failed: nothing is sent and nothing is credited. */
+ return;
+ }
+ actions = a.actions;
+ n_actions = a.n_actions;
+ } else {
+ actions = rule->odp_actions;
+ n_actions = rule->n_odp_actions;
+ }
+
+ /* Execute the ODP actions. */
+ if (!dpif_execute(&ofproto->dpif, flow->in_port,
+ actions, n_actions, packet)) {
+ /* Credit 'rule' only when dpif_execute() reports success. */
+ struct odp_flow_stats stats;
+ flow_extract_stats(flow, packet, &stats);
+ update_stats(rule, &stats);
+ rule->used = time_msec();
+ }
+ }
+
+ /* Inserts 'rule' into 'p''s classifier, then destroys whatever rule that
+  * insertion displaced, if any.
+  *
+  * If 'packet' is nonnull, it is treated as having arrived on 'in_port': the
+  * rule's actions are executed on it and it is credited to 'rule'.
+  *
+  * An exact-match 'rule' then has its ODP actions composed and is passed to
+  * rule_install(); a wildcarded 'rule' instead marks 'p' as needing
+  * revalidation. */
+ static void
+ rule_insert(struct ofproto *p, struct rule *rule, struct ofpbuf *packet,
+ uint16_t in_port)
+ {
+ struct rule *displaced_rule;
+
+ /* Insert the rule in the classifier. */
+ displaced_rule = rule_from_cls_rule(classifier_insert(&p->cls, &rule->cr));
+ if (!rule->cr.wc.wildcards) {
+ /* rule_execute() below expects an exact-match rule's ODP actions to be
+  * composed already. */
+ rule_make_actions(p, rule, packet);
+ }
+
+ /* Send the packet and credit it to the rule. */
+ if (packet) {
+ flow_t flow;
+ flow_extract(packet, in_port, &flow);
+ rule_execute(p, rule, packet, &flow);
+ }
+
+ /* Install the rule in the datapath only after sending the packet, to
+ * avoid packet reordering. */
+ if (rule->cr.wc.wildcards) {
+ COVERAGE_INC(ofproto_add_wc_flow);
+ p->need_revalidate = true;
+ } else {
+ rule_install(p, rule, displaced_rule);
+ }
+
+ /* Free the rule that was displaced, if any. */
+ if (displaced_rule) {
+ rule_destroy(p, displaced_rule);
+ }
+ }
+
+ /* Creates an exact-match subrule of 'rule' for 'flow', inheriting 'rule''s
+  * timeouts and carrying no OpenFlow actions of its own, inserts it into
+  * 'ofproto''s classifier, and returns it.
+  *
+  * The subrule's priority is the larger of 'rule''s priority and
+  * UINT16_MAX. */
+ static struct rule *
+ rule_create_subrule(struct ofproto *ofproto, struct rule *rule,
+                     const flow_t *flow)
+ {
+     struct rule *child;
+
+     child = rule_create(rule, NULL, 0,
+                         rule->idle_timeout, rule->hard_timeout);
+     COVERAGE_INC(ofproto_subrule_create);
+
+     cls_rule_from_flow(&child->cr, flow, 0,
+                        (rule->cr.priority <= UINT16_MAX ? UINT16_MAX
+                         : rule->cr.priority));
+     classifier_insert_exact(&ofproto->cls, &child->cr);
+
+     return child;
+ }
+
+ /* Takes 'rule' out of service and destroys it: an exact-match rule is
+  * uninstalled, a wildcarded rule marks 'ofproto' as needing revalidation;
+  * either way the rule is then removed from the classifier and destroyed. */
+ static void
+ rule_remove(struct ofproto *ofproto, struct rule *rule)
+ {
+     if (!rule->cr.wc.wildcards) {
+         rule_uninstall(ofproto, rule);
+     } else {
+         COVERAGE_INC(ofproto_del_wc_flow);
+         ofproto->need_revalidate = true;
+     }
+     classifier_remove(&ofproto->cls, &rule->cr);
+     rule_destroy(ofproto, rule);
+ }
+
+ /* Recomposes the ODP actions for exact-match 'rule' by translating the
+  * OpenFlow actions of its super-rule (or its own, if it has none) against
+  * the rule's flow and, optionally, 'packet'.  Also refreshes 'rule->tags'
+  * and 'rule->may_install' from the translation.
+  *
+  * Returns true if the actions changed, false otherwise. */
+ static bool
+ rule_make_actions(struct ofproto *p, struct rule *rule,
+ const struct ofpbuf *packet)
+ {
+ const struct rule *super;
+ struct odp_actions a;
+ size_t actions_len;
+
+ assert(!rule->cr.wc.wildcards);
+
+ super = rule->super ? rule->super : rule;
+ rule->tags = 0;
+ xlate_actions(super->actions, super->n_actions, &rule->cr.flow, p,
+ packet, &a, &rule->tags, &rule->may_install);
+
+ actions_len = a.n_actions * sizeof *a.actions;
+ if (rule->n_odp_actions != a.n_actions
+ || memcmp(rule->odp_actions, a.actions, actions_len)) {
+ /* NOTE(review): this counter is named "unchanged" but is bumped on the
+  * branch where the actions DID change -- confirm which was meant. */
+ COVERAGE_INC(ofproto_odp_unchanged);
+ free(rule->odp_actions);
+ rule->n_odp_actions = a.n_actions;
+ rule->odp_actions = xmemdup(a.actions, actions_len);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /* Fills in 'put' from 'rule' (flow key, ODP actions, zeroed statistics) and
+  * 'flags', then submits it with dpif_flow_put().  Returns that call's
+  * result. */
+ static int
+ do_put_flow(struct ofproto *ofproto, struct rule *rule, int flags,
+             struct odp_flow_put *put)
+ {
+     put->flags = flags;
+     put->flow.key = rule->cr.flow;
+     put->flow.n_actions = rule->n_odp_actions;
+     put->flow.actions = rule->odp_actions;
+     memset(&put->flow.stats, 0, sizeof put->flow.stats);
+
+     return dpif_flow_put(&ofproto->dpif, put);
+ }
+
+ /* Installs exact-match 'rule' via do_put_flow() if 'rule->may_install' is
+  * set.
+  *
+  * 'displaced_rule', if nonnull, is the rule that 'rule' replaced in the
+  * classifier.  On a successful put, the statistics reported back in 'put'
+  * are credited to 'displaced_rule' before rule_post_uninstall() finalizes
+  * it.  This mirrors rule_uninstall(), which credits the statistics returned
+  * by the datapath to the rule being taken out.
+  * NOTE(review): this assumes the ODPPF_ZERO_STATS put reports the replaced
+  * flow's counters -- confirm against the datapath interface.
+  *
+  * If 'rule' may not be installed, 'displaced_rule' is uninstalled
+  * instead. */
+ static void
+ rule_install(struct ofproto *p, struct rule *rule, struct rule *displaced_rule)
+ {
+     assert(!rule->cr.wc.wildcards);
+
+     if (rule->may_install) {
+         struct odp_flow_put put;
+         if (!do_put_flow(p, rule,
+                          ODPPF_CREATE | ODPPF_MODIFY | ODPPF_ZERO_STATS,
+                          &put)) {
+             rule->installed = true;
+             if (displaced_rule) {
+                 /* These counters predate 'rule', so they belong to the rule
+                  * being replaced, not to 'rule'. */
+                 update_stats(displaced_rule, &put.flow.stats);
+                 rule_post_uninstall(p, displaced_rule);
+             }
+         }
+     } else if (displaced_rule) {
+         rule_uninstall(p, displaced_rule);
+     }
+ }
+
+ /* Puts 'rule' back into the datapath.  A rule already marked installed is
+  * re-put without zeroing its statistics; otherwise this defers to
+  * rule_install(). */
+ static void
+ rule_reinstall(struct ofproto *ofproto, struct rule *rule)
+ {
+     struct odp_flow_put put;
+
+     if (!rule->installed) {
+         rule_install(ofproto, rule, NULL);
+         return;
+     }
+
+     COVERAGE_INC(ofproto_dp_missed);
+     do_put_flow(ofproto, rule, ODPPF_CREATE | ODPPF_MODIFY, &put);
+ }
+
+ /* Recomposes 'rule''s ODP actions and brings its installed state in line
+  * with the result: uninstalls it if it may no longer be installed, installs
+  * it if it is not yet installed, or re-puts it if it is installed and its
+  * actions changed. */
+ static void
+ rule_update_actions(struct ofproto *ofproto, struct rule *rule)
+ {
+     bool changed = rule_make_actions(ofproto, rule, NULL);
+
+     if (!rule->may_install) {
+         rule_uninstall(ofproto, rule);
+         return;
+     }
+     if (!rule->installed) {
+         rule_install(ofproto, rule, NULL);
+         return;
+     }
+     if (changed) {
+         /* XXX should really do rule_post_uninstall() for the *old* set
+          * of actions, and distinguish the old stats from the new. */
+         struct odp_flow_put put;
+
+         do_put_flow(ofproto, rule, ODPPF_CREATE | ODPPF_MODIFY, &put);
+     }
+ }
+
+ /* Reports to the account_flow_cb hook, if 'ofproto' has one, the bytes that
+  * 'rule' has carried since the last report.  The total is
+  * 'rule->byte_count' plus 'extra_bytes'; nothing is reported unless it
+  * exceeds 'rule->accounted_bytes', which is then advanced to the total. */
+ static void
+ rule_account(struct ofproto *ofproto, struct rule *rule, uint64_t extra_bytes)
+ {
+     uint64_t total = rule->byte_count + extra_bytes;
+
+     if (!ofproto->ofhooks->account_flow_cb
+         || total <= rule->accounted_bytes) {
+         return;
+     }
+
+     ofproto->ofhooks->account_flow_cb(
+         &rule->cr.flow, rule->odp_actions, rule->n_odp_actions,
+         total - rule->accounted_bytes, ofproto->aux);
+     rule->accounted_bytes = total;
+ }
+
+ /* Removes exact-match 'rule' from the datapath if it is marked installed.
+  * Statistics returned by a successful deletion are credited to 'rule'
+  * before rule_post_uninstall() runs.  The rule is marked not installed
+  * whether or not the deletion succeeded. */
+ static void
+ rule_uninstall(struct ofproto *p, struct rule *rule)
+ {
+     struct odp_flow doomed;
+
+     assert(!rule->cr.wc.wildcards);
+     if (!rule->installed) {
+         return;
+     }
+
+     doomed.key = rule->cr.flow;
+     doomed.actions = NULL;
+     doomed.n_actions = 0;
+     if (!dpif_flow_del(&p->dpif, &doomed)) {
+         update_stats(rule, &doomed.stats);
+     }
+     rule->installed = false;
+
+     rule_post_uninstall(p, rule);
+ }
+
+ /* Finalizes the statistics of 'rule' after it has left the datapath:
+  * reports any unaccounted bytes through rule_account(), emits a NetFlow
+  * expiration record if 'ofproto' has NetFlow configured, folds the counters
+  * into the super-rule (if 'rule' is a subrule), and finally zeroes 'rule''s
+  * own counters. */
+ static void
+ rule_post_uninstall(struct ofproto *ofproto, struct rule *rule)
+ {
+ struct rule *super = rule->super;
+
+ rule_account(ofproto, rule, 0);
+ if (ofproto->netflow) {
+ struct ofexpired expired;
+ expired.flow = rule->cr.flow;
+ expired.packet_count = rule->packet_count;
+ expired.byte_count = rule->byte_count;
+ expired.used = rule->used;
+ expired.created = rule->created;
+ expired.tcp_flags = rule->tcp_flags;
+ expired.ip_tos = rule->ip_tos;
+ netflow_expire(ofproto->netflow, &expired);
+ }
+ if (super) {
+ super->packet_count += rule->packet_count;
+ super->byte_count += rule->byte_count;
+ super->tcp_flags |= rule->tcp_flags;
+ /* Propagate the TOS only if this subrule actually saw packets. */
+ if (rule->packet_count) {
+ super->ip_tos = rule->ip_tos;
+ }
+ }
+
+ /* Reset counters to prevent double counting if the rule ever gets
+ * reinstalled. */
+ rule->packet_count = 0;
+ rule->byte_count = 0;
+ rule->accounted_bytes = 0;
+ rule->tcp_flags = 0;
+ rule->ip_tos = 0;
+ }
+
+ /* Fixes up the OpenFlow length field of 'msg' and hands it to 'ofconn''s
+  * rconn for transmission, tracked by 'counter' (which callers in this file
+  * sometimes pass as null).  If rconn_send() reports failure, 'msg' is
+  * deleted here. */
+ static void
+ queue_tx(struct ofpbuf *msg, const struct ofconn *ofconn,
+          struct rconn_packet_counter *counter)
+ {
+     update_openflow_length(msg);
+     if (!rconn_send(ofconn->rconn, msg, counter)) {
+         return;
+     }
+     ofpbuf_delete(msg);
+ }
+
+static void
+send_error(const struct ofconn *ofconn, const struct ofp_header *oh,
+ int error, const void *data, size_t len)
+{
+ struct ofpbuf *buf;
+ struct ofp_error_msg *oem;
+
+ if (!(error >> 16)) {
+ VLOG_WARN_RL(&rl, "not sending bad error code %d to controller",
+ error);
+ return;
+ }
+
+ COVERAGE_INC(ofproto_error);
+ oem = make_openflow_xid(len + sizeof *oem, OFPT_ERROR,
+ oh ? oh->xid : 0, &buf);
+ oem->type = htons((unsigned int) error >> 16);
+ oem->code = htons(error & 0xffff);
+ memcpy(oem->data, data, len);
+ queue_tx(buf, ofconn, ofconn->reply_counter);
+}
+
+static void
+send_error_oh(const struct ofconn *ofconn, const struct ofp_header *oh,
+ int error)
+{
+ size_t oh_length = ntohs(oh->length);
+ send_error(ofconn, oh, error, oh, MIN(oh_length, 64));
+}
+
+ /* Converts the multi-byte integer members of 'opp' from host to network
+  * byte order in place: 'port_no' (16 bits) and 'config', 'state', 'curr',
+  * 'advertised', 'supported', and 'peer' (32 bits).  No other member is
+  * modified. */
+ static void
+ hton_ofp_phy_port(struct ofp_phy_port *opp)
+ {
+     /* 16-bit member. */
+     opp->port_no = htons(opp->port_no);
+
+     /* 32-bit members. */
+     opp->state = htonl(opp->state);
+     opp->config = htonl(opp->config);
+     opp->peer = htonl(opp->peer);
+     opp->supported = htonl(opp->supported);
+     opp->advertised = htonl(opp->advertised);
+     opp->curr = htonl(opp->curr);
+ }
+
+ /* Answers the echo request 'oh' by queueing the reply built by
+  * make_echo_reply() on 'ofconn'.  Always returns 0. */
+ static int
+ handle_echo_request(struct ofconn *ofconn, struct ofp_header *oh)
+ {
+     struct ofpbuf *reply = make_echo_reply(oh);
+
+     queue_tx(reply, ofconn, ofconn->reply_counter);
+     return 0;
+ }
+
+ /* Replies to the features request 'oh' on 'ofconn' with an
+  * OFPT_FEATURES_REPLY that describes 'p': its datapath id, packet buffer
+  * capacity, table count, capabilities, supported actions, and one
+  * network byte order ofp_phy_port per installed port.  Always returns 0. */
+ static int
+ handle_features_request(struct ofproto *p, struct ofconn *ofconn,
+                         struct ofp_header *oh)
+ {
+     const uint32_t supported_actions = ((1u << OFPAT_OUTPUT) |
+                                         (1u << OFPAT_SET_VLAN_VID) |
+                                         (1u << OFPAT_SET_VLAN_PCP) |
+                                         (1u << OFPAT_STRIP_VLAN) |
+                                         (1u << OFPAT_SET_DL_SRC) |
+                                         (1u << OFPAT_SET_DL_DST) |
+                                         (1u << OFPAT_SET_NW_SRC) |
+                                         (1u << OFPAT_SET_NW_DST) |
+                                         (1u << OFPAT_SET_TP_SRC) |
+                                         (1u << OFPAT_SET_TP_DST));
+     struct ofp_switch_features *osf;
+     struct ofpbuf *reply;
+     struct ofport *port;
+     unsigned int port_no;
+
+     osf = make_openflow_xid(sizeof *osf, OFPT_FEATURES_REPLY, oh->xid,
+                             &reply);
+     osf->datapath_id = htonll(p->datapath_id);
+     osf->n_buffers = htonl(pktbuf_capacity());
+     osf->n_tables = 2;
+     osf->capabilities = htonl(OFPC_FLOW_STATS | OFPC_TABLE_STATS |
+                               OFPC_PORT_STATS | OFPC_MULTI_PHY_TX);
+     osf->actions = htonl(supported_actions);
+
+     PORT_ARRAY_FOR_EACH (port, &p->ports, port_no) {
+         /* Append a copy of the port description, then byte-swap the copy. */
+         struct ofp_phy_port *copy = ofpbuf_put(reply, &port->opp,
+                                                sizeof port->opp);
+         hton_ofp_phy_port(copy);
+     }
+
+     queue_tx(reply, ofconn, ofconn->reply_counter);
+     return 0;
+ }
+
+ /* Replies to the get-config request 'oh' on 'ofconn' with an
+  * OFPT_GET_CONFIG_REPLY that reports the datapath's fragment handling mode,
+  * whether 'ofconn' has flow expiration messages enabled, and 'ofconn''s
+  * miss_send_len.  Always returns 0. */
+ static int
+ handle_get_config_request(struct ofproto *p, struct ofconn *ofconn,
+                           struct ofp_header *oh)
+ {
+     struct ofpbuf *buf;
+     struct ofp_switch_config *osc;
+     uint16_t flags;
+     /* Start from a defined value: the query's return value is not checked,
+      * so without this 'drop_frags' would be read uninitialized if the
+      * query failed without storing a result. */
+     bool drop_frags = false;
+
+     /* Figure out flags. */
+     dpif_get_drop_frags(&p->dpif, &drop_frags);
+     flags = drop_frags ? OFPC_FRAG_DROP : OFPC_FRAG_NORMAL;
+     if (ofconn->send_flow_exp) {
+         flags |= OFPC_SEND_FLOW_EXP;
+     }
+
+     /* Send reply. */
+     osc = make_openflow_xid(sizeof *osc, OFPT_GET_CONFIG_REPLY, oh->xid, &buf);
+     osc->flags = htons(flags);
+     osc->miss_send_len = htons(ofconn->miss_send_len);
+     queue_tx(buf, ofconn, ofconn->reply_counter);
+
+     return 0;
+ }
+
+ /* Applies the OFPT_SET_CONFIG message 'osc' received on 'ofconn'.
+  *
+  * Updates whether 'ofconn' wants flow expiration messages and its
+  * miss_send_len, creating the connection's packet buffer when
+  * miss_send_len becomes nonzero and destroying it when it returns to zero.
+  * The fragment handling mode is changed only if 'ofconn' is 'p''s
+  * controller connection.
+  *
+  * Returns 0 on success, otherwise the error from check_ofp_message(). */
+ static int
+ handle_set_config(struct ofproto *p, struct ofconn *ofconn,
+                   struct ofp_switch_config *osc)
+ {
+     uint16_t miss_send_len;
+     uint16_t flags;
+     int error;
+
+     error = check_ofp_message(&osc->header, OFPT_SET_CONFIG, sizeof *osc);
+     if (error) {
+         return error;
+     }
+     flags = ntohs(osc->flags);
+     miss_send_len = ntohs(osc->miss_send_len);
+
+     ofconn->send_flow_exp = (flags & OFPC_SEND_FLOW_EXP) != 0;
+
+     if (ofconn == p->controller) {
+         switch (flags & OFPC_FRAG_MASK) {
+         case OFPC_FRAG_NORMAL:
+             dpif_set_drop_frags(&p->dpif, false);
+             break;
+         case OFPC_FRAG_DROP:
+             dpif_set_drop_frags(&p->dpif, true);
+             break;
+         default:
+             /* Log the host byte order value that was actually tested. */
+             VLOG_WARN_RL(&rl, "requested bad fragment mode (flags=%"PRIx16")",
+                          flags);
+             break;
+         }
+     }
+
+     if ((miss_send_len != 0) != (ofconn->miss_send_len != 0)) {
+         if (miss_send_len != 0) {
+             ofconn->pktbuf = pktbuf_create();
+         } else {
+             /* Clear the pointer so that later users of 'ofconn->pktbuf'
+              * (packet-out handling, ofconn_destroy()) never see a freed
+              * buffer. */
+             pktbuf_destroy(ofconn->pktbuf);
+             ofconn->pktbuf = NULL;
+         }
+     }
+
+     ofconn->miss_send_len = miss_send_len;
+
+     return 0;
+ }
+
+ /* Appends to 'actions' an ODPAT_OUTPUT_GROUP action that targets port group
+  * 'group'. */
+ static void
+ add_output_group_action(struct odp_actions *actions, uint16_t group)
+ {
+     union odp_action *a = odp_actions_add(actions, ODPAT_OUTPUT_GROUP);
+
+     a->output_group.group = group;
+ }
+
+ /* Appends to 'actions' an ODPAT_CONTROLLER action.  Its argument is the
+  * host byte order 'max_len' from 'oao', or UINT32_MAX when 'max_len' is
+  * zero. */
+ static void
+ add_controller_action(struct odp_actions *actions,
+                       const struct ofp_action_output *oao)
+ {
+     union odp_action *a = odp_actions_add(actions, ODPAT_CONTROLLER);
+
+     if (oao->max_len) {
+         a->controller.arg = ntohs(oao->max_len);
+     } else {
+         a->controller.arg = UINT32_MAX;
+     }
+ }
+
+ /* State threaded through the translation of OpenFlow actions into datapath
+  * actions. */
+ struct action_xlate_ctx {
+     /* Input. */
+
+     /* Flow to which these actions correspond. */
+     const flow_t *flow;
+
+     /* Recursion level, via xlate_table_action. */
+     int recurse;
+
+     struct ofproto *ofproto;
+
+     /* The packet corresponding to 'flow', or a null pointer if we are
+      * revalidating without a packet to refer to. */
+     const struct ofpbuf *packet;
+
+     /* Output. */
+
+     /* Datapath actions. */
+     struct odp_actions *out;
+
+     /* Tags associated with OFPP_NORMAL actions. */
+     tag_type *tags;
+
+     /* True ordinarily; false if the actions must be reassessed for every
+      * packet. */
+     bool may_setup_flow;
+ };
+
+ /* Declared ahead of its definition because xlate_table_action() calls it
+  * and is itself reached from it through xlate_output_action(). */
+ static void do_xlate_actions(const union ofp_action *in, size_t n_in,
+ struct action_xlate_ctx *ctx);
+
+ /* Appends an ODPAT_OUTPUT action for ODP port 'port' to 'ctx->out', unless
+  * 'port' is a known port configured with OFPPC_NO_FWD.  Ports that the
+  * ofproto does not know about are output to unconditionally. */
+ static void
+ add_output_action(struct action_xlate_ctx *ctx, uint16_t port)
+ {
+     const struct ofport *ofport = port_array_get(&ctx->ofproto->ports, port);
+
+     if (ofport && (ofport->opp.config & OFPPC_NO_FWD)) {
+         return;
+     }
+     odp_actions_add(ctx->out, ODPAT_OUTPUT)->output.port = port;
+ }
+
+ /* Looks up 'flow' in 'ofproto''s classifier and returns the matching rule,
+  * or a null pointer if nothing matches.
+  *
+  * The rule found might not be valid, since we could be in need of
+  * revalidation: a subrule found while 'ofproto->need_revalidate' is set is
+  * revalidated first, and a null pointer is returned if that fails. */
+ static struct rule *
+ lookup_valid_rule(struct ofproto *ofproto, const flow_t *flow)
+ {
+     struct rule *found
+         = rule_from_cls_rule(classifier_lookup(&ofproto->cls, flow));
+
+     /* Only subrules can be stale, and only while revalidation is pending. */
+     if (!found || !found->super || !ofproto->need_revalidate) {
+         return found;
+     }
+     if (revalidate_rule(ofproto, found)) {
+         return found;
+     }
+
+     COVERAGE_INC(ofproto_invalidated);
+     return NULL;
+ }
+
+ /* Translates the actions of whatever rule matches 'ctx->flow' with its
+  * input port replaced by 'in_port', appending the result to 'ctx'.  A
+  * matching subrule is translated using its super-rule's actions.
+  *
+  * Does nothing if 'ctx' is already inside such a lookup (recursion is
+  * limited to one level) or if no valid rule matches. */
+ static void
+ xlate_table_action(struct action_xlate_ctx *ctx, uint16_t in_port)
+ {
+     struct rule *match;
+     flow_t lookup_flow;
+
+     if (ctx->recurse) {
+         return;
+     }
+
+     lookup_flow = *ctx->flow;
+     lookup_flow.in_port = in_port;
+
+     match = lookup_valid_rule(ctx->ofproto, &lookup_flow);
+     if (!match) {
+         return;
+     }
+     if (match->super) {
+         match = match->super;
+     }
+
+     ctx->recurse++;
+     do_xlate_actions(match->actions, match->n_actions, ctx);
+     ctx->recurse--;
+ }
+
+ /* Translates the OpenFlow output action 'oao' into datapath actions on
+  * 'ctx', dispatching on its (host byte order) destination port.  The
+  * reserved ports each get dedicated handling; any other port is output to
+  * directly, unless it is the flow's own input port. */
+ static void
+ xlate_output_action(struct action_xlate_ctx *ctx,
+                     const struct ofp_action_output *oao)
+ {
+     const uint16_t ofp_port = ntohs(oao->port);
+
+     switch (ofp_port) {
+     case OFPP_IN_PORT:
+         add_output_action(ctx, ctx->flow->in_port);
+         break;
+
+     case OFPP_TABLE:
+         xlate_table_action(ctx, ctx->flow->in_port);
+         break;
+
+     case OFPP_NORMAL:
+         /* A false return from the hook means that no flow may be set up. */
+         if (!ctx->ofproto->ofhooks->normal_cb(ctx->flow, ctx->packet,
+                                               ctx->out, ctx->tags,
+                                               ctx->ofproto->aux)) {
+             COVERAGE_INC(ofproto_uninstallable);
+             ctx->may_setup_flow = false;
+         }
+         break;
+
+     case OFPP_FLOOD:
+         add_output_group_action(ctx->out, DP_GROUP_FLOOD);
+         break;
+
+     case OFPP_ALL:
+         add_output_group_action(ctx->out, DP_GROUP_ALL);
+         break;
+
+     case OFPP_CONTROLLER:
+         add_controller_action(ctx->out, oao);
+         break;
+
+     case OFPP_LOCAL:
+         add_output_action(ctx, ODPP_LOCAL);
+         break;
+
+     default: {
+         uint16_t odp_port = ofp_port_to_odp_port(ofp_port);
+
+         if (odp_port != ctx->flow->in_port) {
+             add_output_action(ctx, odp_port);
+         }
+         break;
+     }
+     }
+ }
+
+ /* Translates the Nicira vendor action 'nah' on 'ctx'.  NXAST_RESUBMIT
+  * triggers a table lookup with the action's input port; any other subtype
+  * is logged at debug level and ignored.
+  *
+  * Asserts that 'nah' carries the Nicira vendor id. */
+ static void
+ xlate_nicira_action(struct action_xlate_ctx *ctx,
+                     const struct nx_action_header *nah)
+ {
+     int subtype = ntohs(nah->subtype);
+
+     assert(nah->vendor == htonl(NX_VENDOR_ID));
+
+     if (subtype == NXAST_RESUBMIT) {
+         const struct nx_action_resubmit *nar
+             = (const struct nx_action_resubmit *) nah;
+
+         xlate_table_action(ctx, ofp_port_to_odp_port(ntohs(nar->in_port)));
+     } else {
+         VLOG_DBG_RL(&rl, "unknown Nicira action type %"PRIu16, subtype);
+     }
+ }
+
+ /* Translates the 'n_in' OpenFlow actions in 'in' into datapath actions
+  * appended to 'ctx->out'.
+  *
+  * Nothing is emitted if the flow's input port is configured to drop it:
+  * OFPPC_NO_RECV_STP applies to flows addressed to the STP multicast
+  * address and OFPPC_NO_RECV to all other flows.
+  *
+  * Every action advertised in the features reply is handled here, including
+  * OFPAT_SET_NW_DST and OFPAT_SET_TP_DST.  Unknown action types are logged
+  * at debug level and skipped. */
+ static void
+ do_xlate_actions(const union ofp_action *in, size_t n_in,
+                  struct action_xlate_ctx *ctx)
+ {
+     struct actions_iterator iter;
+     const union ofp_action *ia;
+     const struct ofport *port;
+
+     port = port_array_get(&ctx->ofproto->ports, ctx->flow->in_port);
+     if (port && port->opp.config & (OFPPC_NO_RECV | OFPPC_NO_RECV_STP) &&
+         port->opp.config & (eth_addr_equals(ctx->flow->dl_dst, stp_eth_addr)
+                             ? OFPPC_NO_RECV_STP : OFPPC_NO_RECV)) {
+         /* Drop this flow. */
+         return;
+     }
+
+     for (ia = actions_first(&iter, in, n_in); ia; ia = actions_next(&iter)) {
+         uint16_t type = ntohs(ia->type);
+         union odp_action *oa;
+
+         switch (type) {
+         case OFPAT_OUTPUT:
+             xlate_output_action(ctx, &ia->output);
+             break;
+
+         case OFPAT_SET_VLAN_VID:
+             oa = odp_actions_add(ctx->out, ODPAT_SET_VLAN_VID);
+             oa->vlan_vid.vlan_vid = ia->vlan_vid.vlan_vid;
+             break;
+
+         case OFPAT_SET_VLAN_PCP:
+             oa = odp_actions_add(ctx->out, ODPAT_SET_VLAN_PCP);
+             oa->vlan_pcp.vlan_pcp = ia->vlan_pcp.vlan_pcp;
+             break;
+
+         case OFPAT_STRIP_VLAN:
+             odp_actions_add(ctx->out, ODPAT_STRIP_VLAN);
+             break;
+
+         case OFPAT_SET_DL_SRC:
+             oa = odp_actions_add(ctx->out, ODPAT_SET_DL_SRC);
+             memcpy(oa->dl_addr.dl_addr,
+                    ((struct ofp_action_dl_addr *) ia)->dl_addr, ETH_ADDR_LEN);
+             break;
+
+         case OFPAT_SET_DL_DST:
+             oa = odp_actions_add(ctx->out, ODPAT_SET_DL_DST);
+             memcpy(oa->dl_addr.dl_addr,
+                    ((struct ofp_action_dl_addr *) ia)->dl_addr, ETH_ADDR_LEN);
+             break;
+
+         case OFPAT_SET_NW_SRC:
+             oa = odp_actions_add(ctx->out, ODPAT_SET_NW_SRC);
+             oa->nw_addr.nw_addr = ia->nw_addr.nw_addr;
+             break;
+
+         case OFPAT_SET_NW_DST:
+             /* Same action layout as OFPAT_SET_NW_SRC. */
+             oa = odp_actions_add(ctx->out, ODPAT_SET_NW_DST);
+             oa->nw_addr.nw_addr = ia->nw_addr.nw_addr;
+             break;
+
+         case OFPAT_SET_TP_SRC:
+             oa = odp_actions_add(ctx->out, ODPAT_SET_TP_SRC);
+             oa->tp_port.tp_port = ia->tp_port.tp_port;
+             break;
+
+         case OFPAT_SET_TP_DST:
+             /* Same action layout as OFPAT_SET_TP_SRC. */
+             oa = odp_actions_add(ctx->out, ODPAT_SET_TP_DST);
+             oa->tp_port.tp_port = ia->tp_port.tp_port;
+             break;
+
+         case OFPAT_VENDOR:
+             xlate_nicira_action(ctx, (const struct nx_action_header *) ia);
+             break;
+
+         default:
+             VLOG_DBG_RL(&rl, "unknown action type %"PRIu16, type);
+             break;
+         }
+     }
+ }
+
+ /* Translates the 'n_in' OpenFlow actions in 'in', as applied to 'flow' (and
+  * to 'packet', which may be null), into datapath actions stored in 'out'.
+  *
+  * If 'tags' is nonnull, it receives the tags gathered during translation.
+  * If 'may_setup_flow' is nonnull, it receives whether a flow may be set up
+  * from the result.
+  *
+  * Returns 0 on success.  If the translation overflows 'out', resets 'out'
+  * to empty and returns an OFPET_BAD_ACTION/OFPBAC_TOO_MANY error. */
+ static int
+ xlate_actions(const union ofp_action *in, size_t n_in,
+               const flow_t *flow, struct ofproto *ofproto,
+               const struct ofpbuf *packet,
+               struct odp_actions *out, tag_type *tags, bool *may_setup_flow)
+ {
+     struct action_xlate_ctx ctx;
+     tag_type no_tags = 0;
+
+     COVERAGE_INC(ofproto_ofp2odp);
+     odp_actions_init(out);
+
+     ctx.ofproto = ofproto;
+     ctx.flow = flow;
+     ctx.packet = packet;
+     ctx.recurse = 0;
+     ctx.out = out;
+     ctx.may_setup_flow = true;
+     /* Give the translation somewhere to write tags even when the caller
+      * does not want them. */
+     if (tags) {
+         ctx.tags = tags;
+     } else {
+         ctx.tags = &no_tags;
+     }
+
+     do_xlate_actions(in, n_in, &ctx);
+
+     if (may_setup_flow) {
+         *may_setup_flow = ctx.may_setup_flow;
+     }
+     if (!odp_actions_overflow(out)) {
+         return 0;
+     }
+     odp_actions_init(out);
+     return ofp_mkerr(OFPET_BAD_ACTION, OFPBAC_TOO_MANY);
+ }
+
+ /* Handles the OFPT_PACKET_OUT message 'oh' received on 'ofconn'.
+  *
+  * The packet to send is either the payload carried in the message or, when
+  * the message names a buffer id other than UINT32_MAX, the packet retrieved
+  * from 'ofconn''s packet buffer.  The message's actions are translated for
+  * that packet and executed on the datapath.
+  *
+  * Returns 0 on success, otherwise an error from validation, buffer
+  * retrieval, or action translation.  A retrieved buffer is released on
+  * every path that gets past retrieval. */
+ static int
+ handle_packet_out(struct ofproto *p, struct ofconn *ofconn,
+                   struct ofp_header *oh)
+ {
+     struct ofp_packet_out *opo;
+     struct ofpbuf payload, *buffer;
+     struct odp_actions actions;
+     int n_actions;
+     uint16_t in_port;
+     flow_t flow;
+     int error;
+
+     error = check_ofp_packet_out(oh, &payload, &n_actions, p->max_ports);
+     if (error) {
+         return error;
+     }
+     opo = (struct ofp_packet_out *) oh;
+
+     COVERAGE_INC(ofproto_packet_out);
+     if (opo->buffer_id != htonl(UINT32_MAX)) {
+         /* 'in_port' receives the buffered packet's port but is not used
+          * below: the flow is built from the port named in the message. */
+         error = pktbuf_retrieve(ofconn->pktbuf, ntohl(opo->buffer_id),
+                                 &buffer, &in_port);
+         if (error) {
+             return error;
+         }
+         payload = *buffer;
+     } else {
+         buffer = NULL;
+     }
+
+     flow_extract(&payload, ofp_port_to_odp_port(ntohs(opo->in_port)), &flow);
+     error = xlate_actions((const union ofp_action *) opo->actions, n_actions,
+                           &flow, p, &payload, &actions, NULL, NULL);
+     if (!error) {
+         dpif_execute(&p->dpif, flow.in_port, actions.actions,
+                      actions.n_actions, &payload);
+     }
+
+     /* Release the retrieved buffer whether or not translation succeeded, so
+      * that it is not leaked on the error path.  'buffer' is null when the
+      * packet came inline in the message. */
+     ofpbuf_delete(buffer);
+
+     return error;
+ }
+
+ /* Applies to 'port' the bits of 'config' selected by 'mask' that differ
+  * from the port's current configuration.
+  *
+  * A change to OFPPC_PORT_DOWN is applied to the network device's NETDEV_UP
+  * flag; this function does not itself update that bit in
+  * 'port->opp.config'.  Changes to the receive/forward bits flip them in
+  * 'port->opp.config' and schedule revalidation.  A change to
+  * OFPPC_NO_FLOOD refreshes the flood port group, and a change to
+  * OFPPC_NO_PACKET_IN is simply recorded. */
+ static void
+ update_port_config(struct ofproto *p, struct ofport *port,
+ uint32_t config, uint32_t mask)
+ {
+ /* Narrow 'mask' to the requested bits whose value would actually change. */
+ mask &= config ^ port->opp.config;
+ if (mask & OFPPC_PORT_DOWN) {
+ if (config & OFPPC_PORT_DOWN) {
+ netdev_turn_flags_off(port->netdev, NETDEV_UP, true);
+ } else {
+ netdev_turn_flags_on(port->netdev, NETDEV_UP, true);
+ }
+ }
+ #define REVALIDATE_BITS (OFPPC_NO_RECV | OFPPC_NO_RECV_STP | OFPPC_NO_FWD)
+ if (mask & REVALIDATE_BITS) {
+ COVERAGE_INC(ofproto_costly_flags);
+ /* 'mask' holds only differing bits, so XOR sets them to the new value. */
+ port->opp.config ^= mask & REVALIDATE_BITS;
+ p->need_revalidate = true;
+ }
+ #undef REVALIDATE_BITS
+ if (mask & OFPPC_NO_FLOOD) {
+ port->opp.config ^= OFPPC_NO_FLOOD;
+ refresh_port_group(p, DP_GROUP_FLOOD);
+ }
+ if (mask & OFPPC_NO_PACKET_IN) {
+ port->opp.config ^= OFPPC_NO_PACKET_IN;
+ }
+ }
+
+ /* Handles the OFPT_PORT_MOD message 'oh': applies the requested config
+  * change to the named port and, if the message carries a nonzero
+  * 'advertise' field, updates the network device's advertisements.
+  *
+  * Returns 0 on success, the error from check_ofp_message() for a malformed
+  * message, OFPPMFC_BAD_PORT if the port does not exist, or
+  * OFPPMFC_BAD_HW_ADDR if the message's hardware address does not match the
+  * port's. */
+ static int
+ handle_port_mod(struct ofproto *p, struct ofp_header *oh)
+ {
+     const struct ofp_port_mod *opm;
+     struct ofport *port;
+     int error = check_ofp_message(oh, OFPT_PORT_MOD, sizeof *opm);
+
+     if (error) {
+         return error;
+     }
+     opm = (struct ofp_port_mod *) oh;
+
+     port = port_array_get(&p->ports,
+                           ofp_port_to_odp_port(ntohs(opm->port_no)));
+     if (!port) {
+         return ofp_mkerr(OFPET_PORT_MOD_FAILED, OFPPMFC_BAD_PORT);
+     }
+     if (memcmp(port->opp.hw_addr, opm->hw_addr, OFP_ETH_ALEN)) {
+         return ofp_mkerr(OFPET_PORT_MOD_FAILED, OFPPMFC_BAD_HW_ADDR);
+     }
+
+     update_port_config(p, port, ntohl(opm->config), ntohl(opm->mask));
+     if (opm->advertise) {
+         netdev_set_advertisements(port->netdev, ntohl(opm->advertise));
+     }
+     return 0;
+ }
+
+ /* Allocates and returns a buffer holding the header of an OFPT_STATS_REPLY
+  * with transaction id 'xid', statistics type 'type', and no flags set.  The
+  * buffer is initially sized for 'body_len' bytes of body, capped at
+  * UINT16_MAX bytes in total.
+  *
+  * 'type' is stored as given, without byte swapping; callers in this file
+  * pass a type field copied from another message. */
+ static struct ofpbuf *
+ make_stats_reply(uint32_t xid, uint16_t type, size_t body_len)
+ {
+     struct ofp_stats_reply *osr;
+     struct ofpbuf *reply;
+     size_t initial_size = sizeof *osr + body_len;
+
+     if (initial_size > UINT16_MAX) {
+         initial_size = UINT16_MAX;
+     }
+
+     reply = ofpbuf_new(initial_size);
+     osr = put_openflow_xid(sizeof *osr, OFPT_STATS_REPLY, xid, reply);
+     osr->flags = htons(0);
+     osr->type = type;
+     return reply;
+ }
+
+ /* Starts a statistics reply to 'request', reusing its transaction id and
+  * statistics type, with room for roughly 'body_len' bytes of body. */
+ static struct ofpbuf *
+ start_stats_reply(const struct ofp_stats_request *request, size_t body_len)
+ {
+     uint32_t xid = request->header.xid;
+     uint16_t type = request->type;
+
+     return make_stats_reply(xid, type, body_len);
+ }
+
+ /* Reserves 'nbytes' of uninitialized space at the tail of the statistics
+  * reply '*msgp' and returns a pointer to it.
+  *
+  * If adding 'nbytes' would push the reply past UINT16_MAX bytes, the
+  * current reply is flagged OFPSF_REPLY_MORE and queued on 'ofconn', and
+  * '*msgp' is replaced by a fresh reply with the same transaction id and
+  * type, from which the space is then reserved.
+  *
+  * 'nbytes' must fit in a single reply alongside the reply header. */
+ static void *
+ append_stats_reply(size_t nbytes, struct ofconn *ofconn, struct ofpbuf **msgp)
+ {
+ struct ofpbuf *msg = *msgp;
+ assert(nbytes <= UINT16_MAX - sizeof(struct ofp_stats_reply));
+ if (nbytes + msg->size > UINT16_MAX) {
+ struct ofp_stats_reply *reply = msg->data;
+ reply->flags = htons(OFPSF_REPLY_MORE);
+ /* Build the continuation before queueing 'msg', because 'reply' points
+  * into 'msg'. */
+ *msgp = make_stats_reply(reply->header.xid, reply->type, nbytes);
+ queue_tx(msg, ofconn, ofconn->reply_counter);
+ }
+ return ofpbuf_put_uninit(*msgp, nbytes);
+ }
+
+ /* Replies to the description statistics 'request' on 'ofconn' with 'p''s
+  * manufacturer, hardware, software, and serial number strings.
+  *
+  * Each field is filled with strncpy(), which zero-pads short strings, and
+  * then has its last byte forced to a null so that the field is terminated
+  * even when the source string is as long as the field or longer.
+  *
+  * Always returns 0. */
+ static int
+ handle_desc_stats_request(struct ofproto *p, struct ofconn *ofconn,
+                           struct ofp_stats_request *request)
+ {
+     struct ofp_desc_stats *ods;
+     struct ofpbuf *msg;
+
+     msg = start_stats_reply(request, sizeof *ods);
+     ods = append_stats_reply(sizeof *ods, ofconn, &msg);
+
+     strncpy(ods->mfr_desc, p->manufacturer, sizeof ods->mfr_desc);
+     ods->mfr_desc[sizeof ods->mfr_desc - 1] = '\0';
+
+     strncpy(ods->hw_desc, p->hardware, sizeof ods->hw_desc);
+     ods->hw_desc[sizeof ods->hw_desc - 1] = '\0';
+
+     strncpy(ods->sw_desc, p->software, sizeof ods->sw_desc);
+     ods->sw_desc[sizeof ods->sw_desc - 1] = '\0';
+
+     strncpy(ods->serial_num, p->serial, sizeof ods->serial_num);
+     ods->serial_num[sizeof ods->serial_num - 1] = '\0';
+
+     queue_tx(msg, ofconn, ofconn->reply_counter);
+
+     return 0;
+ }
+
+ /* Callback taking a classifier rule and a pointer to an int counter:
+  * increments the counter if 'cls_rule' belongs to a subrule (a rule that
+  * has a super-rule). */
+ static void
+ count_subrules(struct cls_rule *cls_rule, void *n_subrules_)
+ {
+     int *counter = n_subrules_;
+
+     if (rule_from_cls_rule(cls_rule)->super) {
+         ++*counter;
+     }
+ }
+
+ /* Replies to the table statistics 'request' on 'ofconn' with two table
+  * entries: "hash", covering exact-match rules (subrules excluded) with
+  * lookup statistics taken from the datapath, and "classifier", covering
+  * wildcarded rules.  Always returns 0. */
+ static int
+ handle_table_stats_request(struct ofproto *p, struct ofconn *ofconn,
+ struct ofp_stats_request *request)
+ {
+ struct ofp_table_stats *ots;
+ struct ofpbuf *msg;
+ struct odp_stats dpstats;
+ int n_exact, n_subrules, n_wild;
+
+ msg = start_stats_reply(request, sizeof *ots * 2);
+
+ /* Count rules of various kinds. */
+ n_subrules = 0;
+ classifier_for_each(&p->cls, CLS_INC_EXACT, count_subrules, &n_subrules);
+ n_exact = classifier_count_exact(&p->cls) - n_subrules;
+ n_wild = classifier_count(&p->cls) - classifier_count_exact(&p->cls);
+
+ /* Hash table. */
+ /* NOTE(review): the return value is ignored; if this call can fail without
+  * filling 'dpstats', the fields read below are indeterminate -- confirm. */
+ dpif_get_dp_stats(&p->dpif, &dpstats);
+ ots = append_stats_reply(sizeof *ots, ofconn, &msg);
+ memset(ots, 0, sizeof *ots);
+ ots->table_id = TABLEID_HASH;
+ strcpy(ots->name, "hash");
+ ots->wildcards = htonl(0);
+ ots->max_entries = htonl(dpstats.max_capacity);
+ ots->active_count = htonl(n_exact);
+ ots->lookup_count = htonll(dpstats.n_frags + dpstats.n_hit +
+ dpstats.n_missed);
+ ots->matched_count = htonll(dpstats.n_hit); /* XXX */
+
+ /* Classifier table. */
+ ots = append_stats_reply(sizeof *ots, ofconn, &msg);
+ memset(ots, 0, sizeof *ots);
+ ots->table_id = TABLEID_CLASSIFIER;
+ strcpy(ots->name, "classifier");
+ ots->wildcards = htonl(OFPFW_ALL);
+ ots->max_entries = htonl(65536);
+ ots->active_count = htonl(n_wild);
+ ots->lookup_count = htonll(0); /* XXX */
+ ots->matched_count = htonll(0); /* XXX */
+
+ queue_tx(msg, ofconn, ofconn->reply_counter);
+ return 0;
+ }
+
+ /* Replies to the port statistics 'request' on 'ofconn' with one
+  * ofp_port_stats entry per port installed in 'p', filled from the port's
+  * network device statistics and converted to network byte order.  Always
+  * returns 0. */
+ static int
+ handle_port_stats_request(struct ofproto *p, struct ofconn *ofconn,
+ struct ofp_stats_request *request)
+ {
+ struct ofp_port_stats *ops;
+ struct ofpbuf *msg;
+ struct ofport *port;
+ unsigned int port_no;
+
+ /* Initial sizing for 16 entries; append_stats_reply() is used for each
+  * entry regardless of how many ports there are. */
+ msg = start_stats_reply(request, sizeof *ops * 16);
+ PORT_ARRAY_FOR_EACH (port, &p->ports, port_no) {
+ struct netdev_stats stats;
+
+ /* Intentionally ignore return value, since errors will set 'stats' to
+ * all-1s, which is correct for OpenFlow, and netdev_get_stats() will
+ * log errors. */
+ netdev_get_stats(port->netdev, &stats);
+
+ ops = append_stats_reply(sizeof *ops, ofconn, &msg);
+ /* 'port_no' is the ODP port number that indexes 'p->ports'. */
+ ops->port_no = htons(odp_port_to_ofp_port(port_no));
+ memset(ops->pad, 0, sizeof ops->pad);
+ ops->rx_packets = htonll(stats.rx_packets);
+ ops->tx_packets = htonll(stats.tx_packets);
+ ops->rx_bytes = htonll(stats.rx_bytes);
+ ops->tx_bytes = htonll(stats.tx_bytes);
+ ops->rx_dropped = htonll(stats.rx_dropped);
+ ops->tx_dropped = htonll(stats.tx_dropped);
+ ops->rx_errors = htonll(stats.rx_errors);
+ ops->tx_errors = htonll(stats.tx_errors);
+ ops->rx_frame_err = htonll(stats.rx_frame_errors);
+ ops->rx_over_err = htonll(stats.rx_over_errors);
+ ops->rx_crc_err = htonll(stats.rx_crc_errors);
+ ops->collisions = htonll(stats.collisions);
+ }
+
+ queue_tx(msg, ofconn, ofconn->reply_counter);
+ return 0;
+ }
+
+ /* State handed to flow_stats_cb() for each rule it visits. */
+ struct flow_stats_cbdata {
+ struct ofproto *ofproto; /* Switch whose datapath is queried for stats. */
+ struct ofconn *ofconn; /* Connection on which replies are queued. */
+ uint16_t out_port; /* Output port filter, as taken from the request. */
+ struct ofpbuf *msg; /* Reply currently being assembled. */
+ };
+
+ /* Computes the total packet and byte counts for 'rule' and stores them in
+  * '*packet_countp' and '*byte_countp'.
+  *
+  * The totals start from the counters already recorded in 'rule'.  The
+  * datapath is then asked about the rule's own flow (for an exact-match
+  * rule) or the flows of all of its subrules (for a wildcarded rule), and
+  * the reported counts are added in.  If the datapath query fails, only the
+  * recorded counters are returned. */
+ static void
+ query_stats(struct ofproto *p, struct rule *rule,
+             uint64_t *packet_countp, uint64_t *byte_countp)
+ {
+     const bool is_wild = rule->cr.wc.wildcards != 0;
+     size_t n_flows = is_wild ? list_size(&rule->list) : 1;
+     struct odp_flow *flows = xcalloc(1, n_flows * sizeof *flows);
+     uint64_t n_packets = rule->packet_count;
+     uint64_t n_bytes = rule->byte_count;
+     size_t i;
+
+     /* Collect the flow keys to ask the datapath about. */
+     if (is_wild) {
+         struct rule *subrule;
+
+         i = 0;
+         LIST_FOR_EACH (subrule, struct rule, list, &rule->list) {
+             flows[i++].key = subrule->cr.flow;
+         }
+     } else {
+         flows[0].key = rule->cr.flow;
+     }
+
+     if (!dpif_flow_get_multiple(&p->dpif, flows, n_flows)) {
+         for (i = 0; i < n_flows; i++) {
+             n_packets += flows[i].stats.n_packets;
+             n_bytes += flows[i].stats.n_bytes;
+         }
+     }
+     free(flows);
+
+     *packet_countp = n_packets;
+     *byte_countp = n_bytes;
+ }
+
+/* classifier_for_each_match() callback used for OFPST_FLOW requests.
+ *
+ * Appends one ofp_flow_stats entry describing 'rule_' to the reply tracked by
+ * 'cbdata_' (a struct flow_stats_cbdata), unless the rule is hidden or fails
+ * the rule_has_out_port() check against the request's 'out_port'. */
+static void
+flow_stats_cb(struct cls_rule *rule_, void *cbdata_)
+{
+    struct flow_stats_cbdata *aux = cbdata_;
+    struct rule *rule = rule_from_cls_rule(rule_);
+    uint64_t n_packets, n_bytes;
+    struct ofp_flow_stats *entry;
+    size_t actions_size, entry_size;
+
+    if (rule_is_hidden(rule)) {
+        return;
+    }
+    if (!rule_has_out_port(rule, aux->out_port)) {
+        return;
+    }
+
+    /* The entry is variable-length: fixed part plus the rule's actions. */
+    actions_size = rule->n_actions * sizeof *rule->actions;
+    entry_size = offsetof(struct ofp_flow_stats, actions) + actions_size;
+
+    query_stats(aux->ofproto, rule, &n_packets, &n_bytes);
+
+    entry = append_stats_reply(entry_size, aux->ofconn, &aux->msg);
+    entry->length = htons(entry_size);
+    if (rule->cr.wc.wildcards) {
+        entry->table_id = TABLEID_CLASSIFIER;
+    } else {
+        entry->table_id = TABLEID_HASH;
+    }
+    entry->pad = 0;
+    flow_to_match(&rule->cr.flow, rule->cr.wc.wildcards, &entry->match);
+    entry->duration = htonl((time_msec() - rule->created) / 1000);
+    entry->priority = htons(rule->cr.priority);
+    entry->idle_timeout = htons(rule->idle_timeout);
+    entry->hard_timeout = htons(rule->hard_timeout);
+    memset(entry->pad2, 0, sizeof entry->pad2);
+    entry->packet_count = htonll(n_packets);
+    entry->byte_count = htonll(n_bytes);
+    memcpy(entry->actions, rule->actions, actions_size);
+}
+
+/* Maps the 'table_id' of a stats request to the CLS_INC_* value passed to
+ * classifier_for_each_match(): TABLEID_HASH selects CLS_INC_EXACT,
+ * TABLEID_CLASSIFIER selects CLS_INC_WILD, 0xff selects CLS_INC_ALL, and any
+ * other value yields 0. */
+static int
+table_id_to_include(uint8_t table_id)
+{
+    if (table_id == TABLEID_HASH) {
+        return CLS_INC_EXACT;
+    }
+    if (table_id == TABLEID_CLASSIFIER) {
+        return CLS_INC_WILD;
+    }
+    if (table_id == 0xff) {
+        return CLS_INC_ALL;
+    }
+    return 0;
+}
+
+/* Handles an OFPST_FLOW stats request 'osr' received on 'ofconn'.
+ *
+ * 'arg_size' is the size of the request body; it must equal the size of
+ * struct ofp_flow_stats_request, otherwise an OFPBRC_BAD_LENGTH error code is
+ * returned.  On success, queues a reply containing one entry per rule
+ * accepted by flow_stats_cb() and returns 0. */
+static int
+handle_flow_stats_request(struct ofproto *p, struct ofconn *ofconn,
+                          const struct ofp_stats_request *osr,
+                          size_t arg_size)
+{
+    struct flow_stats_cbdata aux;
+    struct ofp_flow_stats_request *request;
+    struct cls_rule target;
+    int include;
+
+    if (arg_size != sizeof *request) {
+        return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH);
+    }
+    request = (struct ofp_flow_stats_request *) osr->body;
+
+    COVERAGE_INC(ofproto_flows_req);
+    aux.ofproto = p;
+    aux.ofconn = ofconn;
+    aux.out_port = request->out_port;
+    aux.msg = start_stats_reply(osr, 1024);
+
+    cls_rule_from_match(&target, &request->match, 0);
+    include = table_id_to_include(request->table_id);
+    classifier_for_each_match(&p->cls, &target, include, flow_stats_cb, &aux);
+
+    queue_tx(aux.msg, ofconn, ofconn->reply_counter);
+    return 0;
+}
+
+/* Running totals accumulated by aggregate_stats_cb() while the classifier is
+ * iterated on behalf of an OFPST_AGGREGATE request. */
+struct aggregate_stats_cbdata {
+ struct ofproto *ofproto; /* Passed through to query_stats(). */
+ uint16_t out_port; /* Copied unconverted from the request's 'out_port'. */
+ uint64_t packet_count; /* Sum of packet counts of the rules counted. */
+ uint64_t byte_count; /* Sum of byte counts of the rules counted. */
+ uint32_t n_flows; /* Number of rules counted. */
+};
+
+/* classifier_for_each_match() callback used for OFPST_AGGREGATE requests.
+ *
+ * Adds the statistics of 'rule_' to the totals in 'cbdata_' (a struct
+ * aggregate_stats_cbdata), unless the rule is hidden or fails the
+ * rule_has_out_port() check against the request's 'out_port'. */
+static void
+aggregate_stats_cb(struct cls_rule *rule_, void *cbdata_)
+{
+    struct aggregate_stats_cbdata *totals = cbdata_;
+    struct rule *rule = rule_from_cls_rule(rule_);
+    uint64_t n_packets, n_bytes;
+
+    if (!rule_is_hidden(rule) && rule_has_out_port(rule, totals->out_port)) {
+        query_stats(totals->ofproto, rule, &n_packets, &n_bytes);
+
+        totals->n_flows++;
+        totals->packet_count += n_packets;
+        totals->byte_count += n_bytes;
+    }
+}
+
+/* Handles an OFPST_AGGREGATE stats request 'osr' received on 'ofconn'.
+ *
+ * 'arg_size' is the size of the request body; it must equal the size of
+ * struct ofp_aggregate_stats_request, otherwise an OFPBRC_BAD_LENGTH error
+ * code is returned.  On success, queues a reply carrying the flow, packet and
+ * byte totals gathered by aggregate_stats_cb() and returns 0. */
+static int
+handle_aggregate_stats_request(struct ofproto *p, struct ofconn *ofconn,
+                               const struct ofp_stats_request *osr,
+                               size_t arg_size)
+{
+    struct aggregate_stats_cbdata totals;
+    struct ofp_aggregate_stats_request *request;
+    struct ofp_aggregate_stats_reply *reply;
+    struct cls_rule target;
+    struct ofpbuf *msg;
+    int include;
+
+    if (arg_size != sizeof *request) {
+        return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH);
+    }
+    request = (struct ofp_aggregate_stats_request *) osr->body;
+
+    COVERAGE_INC(ofproto_agg_request);
+
+    /* Accumulate totals over every rule that the callback accepts. */
+    totals.ofproto = p;
+    totals.out_port = request->out_port;
+    totals.packet_count = 0;
+    totals.byte_count = 0;
+    totals.n_flows = 0;
+    cls_rule_from_match(&target, &request->match, 0);
+    include = table_id_to_include(request->table_id);
+    classifier_for_each_match(&p->cls, &target, include,
+                              aggregate_stats_cb, &totals);
+
+    /* Compose and queue the single-entry reply. */
+    msg = start_stats_reply(osr, sizeof *reply);
+    reply = append_stats_reply(sizeof *reply, ofconn, &msg);
+    reply->packet_count = htonll(totals.packet_count);
+    reply->byte_count = htonll(totals.byte_count);
+    reply->flow_count = htonl(totals.n_flows);
+    queue_tx(msg, ofconn, ofconn->reply_counter);
+    return 0;
+}
+
+/* Handles the OFPT_STATS_REQUEST message 'oh' received on 'ofconn'.
+ *
+ * After check_ofp_message_array() has accepted the message, dispatches on the
+ * request's stats type to the matching handler.  Returns that handler's
+ * result, or an ofp_mkerr() error code if the message is malformed
+ * (whatever check_ofp_message_array() reports), is a vendor stats request
+ * (OFPBRC_BAD_VENDOR), or has an unrecognized type (OFPBRC_BAD_STAT). */
+static int
+handle_stats_request(struct ofproto *p, struct ofconn *ofconn,
+                     struct ofp_header *oh)
+{
+    struct ofp_stats_request *osr = (struct ofp_stats_request *) oh;
+    size_t arg_size;
+    int retval;
+
+    retval = check_ofp_message_array(oh, OFPT_STATS_REQUEST, sizeof *osr,
+                                     1, &arg_size);
+    if (retval) {
+        return retval;
+    }
+
+    switch (ntohs(osr->type)) {
+    case OFPST_DESC:
+        retval = handle_desc_stats_request(p, ofconn, osr);
+        break;
+
+    case OFPST_FLOW:
+        retval = handle_flow_stats_request(p, ofconn, osr, arg_size);
+        break;
+
+    case OFPST_AGGREGATE:
+        retval = handle_aggregate_stats_request(p, ofconn, osr, arg_size);
+        break;
+
+    case OFPST_TABLE:
+        retval = handle_table_stats_request(p, ofconn, osr);
+        break;
+
+    case OFPST_PORT:
+        retval = handle_port_stats_request(p, ofconn, osr);
+        break;
+
+    case OFPST_VENDOR:
+        retval = ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_VENDOR);
+        break;
+
+    default:
+        retval = ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_STAT);
+        break;
+    }
+    return retval;
+}
+
+/* Converts a time expressed as 'sec' seconds plus 'nsec' nanoseconds into
+ * milliseconds.
+ *
+ * A 'sec' of zero yields 0 regardless of 'nsec'; callers in this file treat
+ * a zero time as "never used". */
+static long long int
+msec_from_nsec(uint64_t sec, uint32_t nsec)
+{
+    if (sec == 0) {
+        return 0;
+    }
+    return sec * 1000 + nsec / 1000000;
+}
+
+/* Advances 'rule->used' to the last-used time reported in 'stats', if that
+ * time is later than the one already recorded.  'rule->used' never moves
+ * backward. */
+static void
+update_time(struct rule *rule, const struct odp_flow_stats *stats)
+{
+    long long int reported;
+
+    reported = msec_from_nsec(stats->used_sec, stats->used_nsec);
+    if (rule->used < reported) {
+        rule->used = reported;
+    }
+}
+
+/* Folds the datapath statistics in 'stats' into 'rule': refreshes the
+ * last-used time, adds the packet and byte counts, and ORs in the TCP flags.
+ * The IP type-of-service is replaced only when 'stats' covers at least one
+ * packet. */
+static void
+update_stats(struct rule *rule, const struct odp_flow_stats *stats)
+{
+    update_time(rule, stats);
+
+    rule->tcp_flags |= stats->tcp_flags;
+    rule->byte_count += stats->n_bytes;
+    rule->packet_count += stats->n_packets;
+    if (stats->n_packets != 0) {
+        rule->ip_tos = stats->ip_tos;
+    }
+}
+
+/* Implements OFPFC_ADD: creates a rule from flow mod 'ofm', which carries
+ * 'n_actions' actions, and inserts it into 'p'.
+ *
+ * If 'ofm' names a buffered packet (its buffer_id is not UINT32_MAX), that
+ * packet is retrieved from 'ofconn''s packet buffer and handed to
+ * rule_insert() together with the port it arrived on.  The rule is inserted
+ * even if retrieving the packet fails.
+ *
+ * Returns 0 on success, otherwise the error reported by pktbuf_retrieve(). */
+static int
+add_flow(struct ofproto *p, struct ofconn *ofconn,
+         struct ofp_flow_mod *ofm, size_t n_actions)
+{
+    struct ofpbuf *packet = NULL;
+    /* Defined placeholder so that rule_insert() is never passed an
+     * indeterminate value when there is no buffered packet.
+     * NOTE(review): presumably rule_insert() ignores 'in_port' when 'packet'
+     * is NULL -- confirm. */
+    uint16_t in_port = UINT16_MAX;
+    struct rule *rule;
+    int error = 0;
+
+    rule = rule_create(NULL, (const union ofp_action *) ofm->actions,
+                       n_actions, ntohs(ofm->idle_timeout),
+                       ntohs(ofm->hard_timeout));
+    cls_rule_from_match(&rule->cr, &ofm->match, ntohs(ofm->priority));
+
+    if (ofm->buffer_id != htonl(UINT32_MAX)) {
+        error = pktbuf_retrieve(ofconn->pktbuf, ntohl(ofm->buffer_id),
+                                &packet, &in_port);
+    }
+
+    rule_insert(p, rule, packet, in_port);
+    ofpbuf_delete(packet);
+    return error;
+}
+
+/* Applies 'command' (OFPFC_DELETE, or anything else for "modify") from flow
+ * mod 'ofm' to the single rule 'rule'.  Hidden rules are left untouched.
+ *
+ * For a delete, the rule is removed.  For a modify, the rule's actions are
+ * replaced by the 'n_actions' actions in 'ofm' unless they are already
+ * identical; afterwards a rule with wildcards flags 'p' for revalidation,
+ * while a rule without wildcards has its actions updated immediately.
+ *
+ * Always returns 0. */
+static int
+modify_flow(struct ofproto *p, const struct ofp_flow_mod *ofm,
+            size_t n_actions, uint16_t command, struct rule *rule)
+{
+    size_t new_len;
+
+    if (rule_is_hidden(rule)) {
+        return 0;
+    }
+
+    if (command == OFPFC_DELETE) {
+        rule_remove(p, rule);
+        return 0;
+    }
+
+    /* Nothing to do if the new actions equal the current ones. */
+    new_len = n_actions * sizeof *rule->actions;
+    if (n_actions == rule->n_actions
+        && memcmp(ofm->actions, rule->actions, new_len) == 0) {
+        return 0;
+    }
+
+    free(rule->actions);
+    rule->actions = xmemdup(ofm->actions, new_len);
+    rule->n_actions = n_actions;
+
+    if (!rule->cr.wc.wildcards) {
+        rule_update_actions(p, rule);
+    } else {
+        COVERAGE_INC(ofproto_mod_wc_flow);
+        p->need_revalidate = true;
+    }
+    return 0;
+}
+
+/* Implements the "strict" flow mod commands: applies 'command' to the one
+ * rule, if any, whose flow, wildcards and priority exactly equal those in
+ * 'ofm'.
+ *
+ * For OFPFC_DELETE, a rule that fails the rule_has_out_port() check against
+ * 'ofm->out_port' (when that is not OFPP_NONE) is left alone.
+ *
+ * Always returns 0. */
+static int
+modify_flows_strict(struct ofproto *p, const struct ofp_flow_mod *ofm,
+                    size_t n_actions, uint16_t command)
+{
+    struct rule *match;
+    uint32_t wildcards;
+    flow_t flow;
+
+    flow_from_match(&flow, &wildcards, &ofm->match);
+    match = rule_from_cls_rule(
+        classifier_find_rule_exactly(&p->cls, &flow, wildcards,
+                                     ntohs(ofm->priority)));
+    if (!match) {
+        return 0;
+    }
+
+    if (command == OFPFC_DELETE
+        && ofm->out_port != htons(OFPP_NONE)
+        && !rule_has_out_port(match, ofm->out_port)) {
+        return 0;
+    }
+
+    modify_flow(p, ofm, n_actions, command, match);
+    return 0;
+}
+
+/* Auxiliary data handed to modify_flows_cb() by modify_flows_loose(). */
+struct modify_flows_cbdata {
+ struct ofproto *ofproto; /* Passed through to modify_flow(). */
+ const struct ofp_flow_mod *ofm; /* Flow mod being applied. */
+ uint16_t out_port; /* Output port filter, or htons(OFPP_NONE) for none. */
+ size_t n_actions; /* Number of actions in 'ofm'. */
+ uint16_t command; /* Command passed through to modify_flow(). */
+};
+
+/* classifier_for_each_match() callback used by modify_flows_loose().
+ *
+ * Applies the flow mod described by 'cbdata_' (a struct modify_flows_cbdata)
+ * to 'rule_', unless an output port filter is set and the rule fails the
+ * rule_has_out_port() check against it. */
+static void
+modify_flows_cb(struct cls_rule *rule_, void *cbdata_)
+{
+    struct modify_flows_cbdata *aux = cbdata_;
+    struct rule *rule = rule_from_cls_rule(rule_);
+
+    if (aux->out_port == htons(OFPP_NONE)
+        || rule_has_out_port(rule, aux->out_port)) {
+        modify_flow(aux->ofproto, aux->ofm, aux->n_actions,
+                    aux->command, rule);
+    }
+}
+
+/* Implements the non-strict flow mod commands: applies 'command' to every
+ * rule in 'p''s classifier that classifier_for_each_match() reports for the
+ * match in 'ofm'.
+ *
+ * The output port filter from 'ofm' is honored only for OFPFC_DELETE; for
+ * any other command no filter is applied.
+ *
+ * Always returns 0. */
+static int
+modify_flows_loose(struct ofproto *p, const struct ofp_flow_mod *ofm,
+                   size_t n_actions, uint16_t command)
+{
+    struct modify_flows_cbdata aux;
+    struct cls_rule target;
+
+    aux.ofproto = p;
+    aux.ofm = ofm;
+    if (command == OFPFC_DELETE) {
+        aux.out_port = ofm->out_port;
+    } else {
+        aux.out_port = htons(OFPP_NONE);
+    }
+    aux.n_actions = n_actions;
+    aux.command = command;
+
+    cls_rule_from_match(&target, &ofm->match, 0);
+    classifier_for_each_match(&p->cls, &target, CLS_INC_ALL,
+                              modify_flows_cb, &aux);
+    return 0;
+}
+
+/* Handles the OFPT_FLOW_MOD message 'ofm' received on 'ofconn'.
+ *
+ * The message is modified in place: its match is normalized, and if the
+ * normalized match has no wildcards its priority is forced to UINT16_MAX.
+ * The actions are validated for every command before the command is
+ * dispatched.
+ *
+ * Returns 0 on success, otherwise an ofp_mkerr() error code (for example
+ * OFPFMFC_BAD_COMMAND for an unknown command). */
+static int
+handle_flow_mod(struct ofproto *p, struct ofconn *ofconn,
+                struct ofp_flow_mod *ofm)
+{
+    size_t n_actions;
+    int retval;
+
+    retval = check_ofp_message_array(&ofm->header, OFPT_FLOW_MOD, sizeof *ofm,
+                                     sizeof *ofm->actions, &n_actions);
+    if (retval) {
+        return retval;
+    }
+
+    normalize_match(&ofm->match);
+    if (ofm->match.wildcards == 0) {
+        ofm->priority = htons(UINT16_MAX);
+    }
+
+    retval = validate_actions((const union ofp_action *) ofm->actions,
+                              n_actions, p->max_ports);
+    if (retval) {
+        return retval;
+    }
+
+    switch (ntohs(ofm->command)) {
+    case OFPFC_ADD:
+        retval = add_flow(p, ofconn, ofm, n_actions);
+        break;
+
+    case OFPFC_MODIFY:
+        retval = modify_flows_loose(p, ofm, n_actions, OFPFC_MODIFY);
+        break;
+
+    case OFPFC_MODIFY_STRICT:
+        retval = modify_flows_strict(p, ofm, n_actions, OFPFC_MODIFY);
+        break;
+
+    case OFPFC_DELETE:
+        retval = modify_flows_loose(p, ofm, n_actions, OFPFC_DELETE);
+        break;
+
+    case OFPFC_DELETE_STRICT:
+        retval = modify_flows_strict(p, ofm, n_actions, OFPFC_DELETE);
+        break;
+
+    default:
+        retval = ofp_mkerr(OFPET_FLOW_MOD_FAILED, OFPFMFC_BAD_COMMAND);
+        break;
+    }
+    return retval;
+}
+
+/* Queues on 'ofconn' an OFMPT_CAPABILITY_REPLY management message with
+ * transaction id 'xid'.  The reply carries 'p''s management id and a fixed
+ * capability string (without its terminating null byte) appended after the
+ * fixed-size part of the message. */
+static void
+send_capability_reply(struct ofproto *p, struct ofconn *ofconn, uint32_t xid)
+{
+    char text[] = "com.nicira.mgmt.manager=false\n";
+    struct ofmp_capability_reply *reply;
+    struct ofpbuf *buf;
+
+    reply = make_openflow_xid(sizeof(*reply), OFPT_VENDOR, xid, &buf);
+
+    /* Nicira vendor header, then the management header. */
+    reply->header.header.vendor = htonl(NX_VENDOR_ID);
+    reply->header.header.subtype = htonl(NXT_MGMT);
+    reply->header.type = htons(OFMPT_CAPABILITY_REPLY);
+
+    reply->format = htonl(OFMPCOF_SIMPLE);
+    reply->mgmt_id = htonll(p->mgmt_id);
+
+    ofpbuf_put(buf, text, strlen(text));
+
+    queue_tx(buf, ofconn, ofconn->reply_counter);
+}
+
+/* Handles the Nicira management (NXT_MGMT) message 'ofmph' received on
+ * 'ofconn'.
+ *
+ * Only OFMPT_CAPABILITY_REQUEST with the OFMPCAF_SIMPLE format is supported;
+ * it is answered with send_capability_reply().
+ *
+ * Returns 0 on success, otherwise an ofp_mkerr() error code:
+ * OFPBRC_BAD_LENGTH if the message is too short for its type,
+ * OFPBRC_BAD_SUBTYPE for an unsupported format or message type. */
+static int
+handle_ofmp(struct ofproto *p, struct ofconn *ofconn,
+            struct ofmp_header *ofmph)
+{
+    size_t msg_len = ntohs(ofmph->header.header.length);
+    if (msg_len < sizeof(*ofmph)) {
+        /* 'msg_len' is a size_t, so it must be printed with %zu. */
+        VLOG_WARN_RL(&rl, "dropping short managment message: %zu\n", msg_len);
+        return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH);
+    }
+
+    if (ofmph->type == htons(OFMPT_CAPABILITY_REQUEST)) {
+        struct ofmp_capability_request *ofmpcr;
+
+        if (msg_len < sizeof(struct ofmp_capability_request)) {
+            VLOG_WARN_RL(&rl, "dropping short capability request: %zu\n",
+                         msg_len);
+            return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH);
+        }
+
+        ofmpcr = (struct ofmp_capability_request *)ofmph;
+        if (ofmpcr->format != htonl(OFMPCAF_SIMPLE)) {
+            /* xxx Find a better type than bad subtype */
+            return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE);
+        }
+
+        /* The xid is passed through without byte-order conversion. */
+        send_capability_reply(p, ofconn, ofmph->header.header.xid);
+        return 0;
+    } else {
+        return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE);
+    }
+}
+
+/* Handles the OFPT_VENDOR message 'msg' received on 'ofconn'.
+ *
+ * Only messages carrying the Nicira vendor id are accepted.  The message is
+ * dispatched on its Nicira subtype; NXT_COMMAND_REQUEST is honored only if
+ * 'p' has an executer.
+ *
+ * Returns the subtype handler's result, or an ofp_mkerr() error code:
+ * OFPBRC_BAD_LENGTH for a message too short for its header,
+ * OFPBRC_BAD_VENDOR for a foreign vendor id, OFPBRC_BAD_SUBTYPE for a
+ * subtype that is unknown or not supported. */
+static int
+handle_vendor(struct ofproto *p, struct ofconn *ofconn, void *msg)
+{
+    struct ofp_vendor_header *ovh = msg;
+    struct nicira_header *nh = msg;
+
+    /* The vendor id may be examined only once the generic vendor header is
+     * known to be complete; the Nicira header only after the vendor id has
+     * been checked. */
+    if (ntohs(ovh->header.length) < sizeof(struct ofp_vendor_header)) {
+        return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH);
+    }
+    if (ovh->vendor != htonl(NX_VENDOR_ID)) {
+        return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_VENDOR);
+    }
+    if (ntohs(ovh->header.length) < sizeof(struct nicira_header)) {
+        return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_LENGTH);
+    }
+
+    switch (ntohl(nh->subtype)) {
+    case NXT_STATUS_REQUEST:
+        return switch_status_handle_request(p->switch_status, ofconn->rconn,
+                                            msg);
+
+    case NXT_COMMAND_REQUEST:
+        if (p->executer) {
+            return executer_handle_request(p->executer, ofconn->rconn, msg);
+        }
+        break;
+
+    case NXT_MGMT:
+        return handle_ofmp(p, ofconn, msg);
+
+    case NXT_ACT_SET_CONFIG:
+    case NXT_ACT_GET_CONFIG:
+        /* XXX Rejected as a bad subtype below. */
+        break;
+    }
+
+    return ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE);
+}
+
+/* Dispatches the OpenFlow message in 'ofp_msg', received on 'ofconn', to the
+ * handler for its message type.
+ *
+ * Echo replies are accepted and ignored.  Unrecognized message types are
+ * rejected with OFPBRC_BAD_TYPE.  Whenever a handler reports a nonzero
+ * error, an error message for it is sent back through send_error_oh(). */
+static void
+handle_openflow(struct ofconn *ofconn, struct ofproto *p,
+                struct ofpbuf *ofp_msg)
+{
+    struct ofp_header *oh = ofp_msg->data;
+    int error;
+
+    COVERAGE_INC(ofproto_recv_openflow);
+    switch (oh->type) {
+    case OFPT_ECHO_REQUEST:
+        error = handle_echo_request(ofconn, oh);
+        break;
+
+    case OFPT_ECHO_REPLY:
+        error = 0;
+        break;
+
+    case OFPT_FEATURES_REQUEST:
+        error = handle_features_request(p, ofconn, oh);
+        break;
+
+    case OFPT_GET_CONFIG_REQUEST:
+        error = handle_get_config_request(p, ofconn, oh);
+        break;
+
+    case OFPT_SET_CONFIG:
+        error = handle_set_config(p, ofconn, ofp_msg->data);
+        break;
+
+    case OFPT_PACKET_OUT:
+        error = handle_packet_out(p, ofconn, ofp_msg->data);
+        break;
+
+    case OFPT_PORT_MOD:
+        error = handle_port_mod(p, oh);
+        break;
+
+    case OFPT_FLOW_MOD:
+        error = handle_flow_mod(p, ofconn, ofp_msg->data);
+        break;
+
+    case OFPT_STATS_REQUEST:
+        error = handle_stats_request(p, ofconn, oh);
+        break;
+
+    case OFPT_VENDOR:
+        error = handle_vendor(p, ofconn, ofp_msg->data);
+        break;
+
+    default:
+        /* The message is logged at debug level, so guard the (comparatively
+         * expensive) formatting with the debug level too.  Guarding with the
+         * warning level formatted the message only to throw it away whenever
+         * warnings were enabled but debugging was not. */
+        if (VLOG_IS_DBG_ENABLED()) {
+            char *s = ofp_to_string(oh, ntohs(oh->length), 2);
+            VLOG_DBG_RL(&rl, "OpenFlow message ignored: %s", s);
+            free(s);
+        }
+        error = ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_TYPE);
+        break;
+    }
+
+    if (error) {
+        send_error_oh(ofconn, ofp_msg->data, error);
+    }
+}
+
+/* Processes 'packet', a message received from the datapath whose data starts
+ * with a struct odp_msg header followed by the frame itself.
+ *
+ * - A message of type _ODPL_ACTION_NR is passed to the 'action_sched' packet
+ * scheduler, which calls send_packet_in_action() for it.
+ *
+ * - Otherwise the frame's flow is looked up. If no rule is found, the
+ * message goes to the 'miss_sched' scheduler (send_packet_in_miss()),
+ * unless the input port has OFPPC_NO_PACKET_IN set, in which case it is
+ * dropped.
+ *
+ * - If a rule is found, the frame is executed against it and the rule is
+ * reinstalled.
+ *
+ * Every path either hands 'packet' to pinsched_send() or deletes it, so the
+ * caller must not use 'packet' afterward. (Both scheduler callbacks named
+ * above delete the packet they are given.) */
+static void
+handle_odp_msg(struct ofproto *p, struct ofpbuf *packet)
+{
+ struct odp_msg *msg = packet->data;
+ uint16_t in_port = odp_port_to_ofp_port(msg->port);
+ struct rule *rule;
+ struct ofpbuf payload;
+ flow_t flow;
+
+ /* Handle controller actions. */
+ if (msg->type == _ODPL_ACTION_NR) {
+ COVERAGE_INC(ofproto_ctlr_action);
+ pinsched_send(p->action_sched, in_port, packet,
+ send_packet_in_action, p);
+ return;
+ }
+
+ /* 'payload' is a view of the frame inside 'packet', just past the odp_msg
+ * header. Only its 'data' and 'size' members are initialized. */
+ payload.data = msg + 1;
+ payload.size = msg->length - sizeof *msg;
+ flow_extract(&payload, msg->port, &flow);
+
+ rule = lookup_valid_rule(p, &flow);
+ if (!rule) {
+ /* Don't send a packet-in if OFPPC_NO_PACKET_IN asserted. */
+ struct ofport *port = port_array_get(&p->ports, msg->port);
+ if (port) {
+ if (port->opp.config & OFPPC_NO_PACKET_IN) {
+ COVERAGE_INC(ofproto_no_packet_in);
+ /* XXX install 'drop' flow entry */
+ ofpbuf_delete(packet);
+ return;
+ }
+ } else {
+ /* An unknown port is only logged; the packet-in is still sent. */
+ VLOG_WARN_RL(&rl, "packet-in on unknown port %"PRIu16, msg->port);
+ }
+
+ COVERAGE_INC(ofproto_packet_in);
+ pinsched_send(p->miss_sched, in_port, packet, send_packet_in_miss, p);
+ return;
+ }
+
+ if (rule->cr.wc.wildcards) {
+ /* From here on 'rule' is the subrule created for this exact flow, not
+ * the wildcarded rule that was looked up. */
+ rule = rule_create_subrule(p, rule, &flow);
+ rule_make_actions(p, rule, packet);
+ } else {
+ if (!rule->may_install) {
+ /* The rule is not installable, that is, we need to process every
+ * packet, so process the current packet and set its actions into
+ * 'rule'. */
+ rule_make_actions(p, rule, packet);
+ } else {
+ /* XXX revalidate rule if it needs it */
+ }
+ }
+
+ rule_execute(p, rule, &payload, &flow);
+ rule_reinstall(p, rule);
+ ofpbuf_delete(packet);
+}
+
+/* Classifier iteration callback that revalidates 'sub_' when the request in
+ * 'cbdata_' (a struct revalidate_cbdata) covers it: either everything is
+ * being revalidated, or all subrules are and 'sub_' has a super-rule, or the
+ * rule's tags intersect the set of tags to revalidate. */
+static void
+revalidate_cb(struct cls_rule *sub_, void *cbdata_)
+{
+    struct revalidate_cbdata *request = cbdata_;
+    struct rule *sub = rule_from_cls_rule(sub_);
+    bool wanted;
+
+    /* The tests are tried in this order and stop at the first that holds. */
+    wanted = request->revalidate_all;
+    if (!wanted) {
+        wanted = request->revalidate_subrules && sub->super;
+    }
+    if (!wanted) {
+        wanted = tag_set_intersects(&request->revalidate_set, sub->tags);
+    }
+
+    if (wanted) {
+        revalidate_rule(request->ofproto, sub);
+    }
+}
+
+/* Brings 'rule' up to date with the current contents of 'p''s classifier.
+ *
+ * For a subrule (a rule with a super-rule), the wildcarded rule that now
+ * covers the subrule's flow is looked up again. If there is none, the
+ * subrule is removed. If it differs from the recorded super-rule, the
+ * subrule is moved to the new super-rule's list and inherits its timeouts
+ * and creation time.
+ *
+ * Returns false if 'rule' was removed (the caller must not use it afterward),
+ * true otherwise, in which case the rule's actions have been updated. */
+static bool
+revalidate_rule(struct ofproto *p, struct rule *rule)
+{
+ const flow_t *flow = &rule->cr.flow;
+
+ COVERAGE_INC(ofproto_revalidate_rule);
+ if (rule->super) {
+ struct rule *super;
+ super = rule_from_cls_rule(classifier_lookup_wild(&p->cls, flow));
+ if (!super) {
+ /* No wildcarded rule covers this flow any longer. */
+ rule_remove(p, rule);
+ return false;
+ } else if (super != rule->super) {
+ COVERAGE_INC(ofproto_revalidate_moved);
+ /* Re-parent the subrule under the new super-rule. */
+ list_remove(&rule->list);
+ list_push_back(&super->list, &rule->list);
+ rule->super = super;
+ rule->hard_timeout = super->hard_timeout;
+ rule->idle_timeout = super->idle_timeout;
+ rule->created = super->created;
+ /* The last-used time starts over under the new super-rule. */
+ rule->used = 0;
+ }
+ }
+
+ rule_update_actions(p, rule);
+ return true;
+}
+
+/* Composes and returns an OFPT_FLOW_EXPIRED message describing 'rule'.
+ *
+ * 'now' is the current time in milliseconds, used to compute how long the
+ * rule existed (reported in whole seconds); 'reason' is stored in the
+ * message's 'reason' field.  The caller owns the returned buffer.
+ *
+ * All multi-byte fields are written in network byte order, as is done for
+ * the same quantities in flow_stats_cb(). */
+static struct ofpbuf *
+compose_flow_exp(const struct rule *rule, long long int now, uint8_t reason)
+{
+    struct ofp_flow_expired *ofe;
+    struct ofpbuf *buf;
+
+    ofe = make_openflow(sizeof *ofe, OFPT_FLOW_EXPIRED, &buf);
+    flow_to_match(&rule->cr.flow, rule->cr.wc.wildcards, &ofe->match);
+    ofe->priority = htons(rule->cr.priority);
+    ofe->reason = reason;
+    ofe->duration = htonl((now - rule->created) / 1000);
+    ofe->packet_count = htonll(rule->packet_count);
+    ofe->byte_count = htonll(rule->byte_count);
+
+    return buf;
+}
+
+/* Sends a flow expiration message for 'rule' to every connected OpenFlow
+ * connection of 'p' that asked for flow expirations.
+ *
+ * 'now' is the current time in milliseconds and 'reason' the expiration
+ * reason; both are passed to compose_flow_exp().  The message is composed at
+ * most once: all recipients except the last get a clone and the last one
+ * gets the original buffer. */
+static void
+send_flow_exp(struct ofproto *p, struct rule *rule,
+              long long int now, uint8_t reason)
+{
+    struct ofconn *ofconn;
+    struct ofconn *prev = NULL;
+    struct ofpbuf *buf = NULL;
+
+    /* We limit the maximum number of queued flow expirations by accounting
+     * them under the counter for replies.  That works because preventing
+     * OpenFlow requests from being processed also prevents new flows from
+     * being added (and expiring).  (It also prevents processing OpenFlow
+     * requests that would not add new flows, so it is imperfect.) */
+
+    LIST_FOR_EACH (ofconn, struct ofconn, node, &p->all_conns) {
+        if (ofconn->send_flow_exp && rconn_is_connected(ofconn->rconn)) {
+            if (prev) {
+                /* Account the message to the connection it is sent on. */
+                queue_tx(ofpbuf_clone(buf), prev, prev->reply_counter);
+            } else {
+                buf = compose_flow_exp(rule, now, reason);
+            }
+            prev = ofconn;
+        }
+    }
+    if (prev) {
+        /* 'ofconn' no longer refers to a real connection once the loop has
+         * run to completion, so only 'prev' may be used here. */
+        queue_tx(buf, prev, prev->reply_counter);
+    }
+}
+
+/* Takes the idle rule 'rule' out of service: a subrule is removed
+ * altogether, whereas a rule without a super-rule is only uninstalled.
+ *
+ * 'rule' must be installed and must not have wildcards. */
+static void
+uninstall_idle_flow(struct ofproto *ofproto, struct rule *rule)
+{
+    assert(rule->installed);
+    assert(!rule->cr.wc.wildcards);
+
+    if (!rule->super) {
+        rule_uninstall(ofproto, rule);
+        return;
+    }
+    rule_remove(ofproto, rule);
+}
+
+/* Classifier iteration callback that expires 'cls_rule' if its hard or idle
+ * timeout has passed. 'p_' is the struct ofproto that owns the rule.
+ *
+ * A rule that has not expired but is installed and has gone unused for at
+ * least 5000 ms is passed to uninstall_idle_flow(). A rule that has expired
+ * is removed (together with its subrules, if it has wildcards) after a flow
+ * expiration message has been sent for it. */
+static void
+expire_rule(struct cls_rule *cls_rule, void *p_)
+{
+ struct ofproto *p = p_;
+ struct rule *rule = rule_from_cls_rule(cls_rule);
+ long long int hard_expire, idle_expire, expire, now;
+
+ /* Timeouts are multiplied by 1000 to compare against millisecond times;
+ * a timeout of 0 means "never". */
+ hard_expire = (rule->hard_timeout
+ ? rule->created + rule->hard_timeout * 1000
+ : LLONG_MAX);
+ /* The idle timeout is honored only for subrules and for rules whose list
+ * is empty. */
+ idle_expire = (rule->idle_timeout
+ && (rule->super || list_is_empty(&rule->list))
+ ? rule->used + rule->idle_timeout * 1000
+ : LLONG_MAX);
+ expire = MIN(hard_expire, idle_expire);
+ if (expire == LLONG_MAX) {
+ /* No timeout applies; the rule may still be uninstalled if idle. */
+ if (rule->installed && time_msec() >= rule->used + 5000) {
+ uninstall_idle_flow(p, rule);
+ }
+ return;
+ }
+
+ now = time_msec();
+ if (now < expire) {
+ /* Not expired yet; the rule may still be uninstalled if idle. */
+ if (rule->installed && now >= rule->used + 5000) {
+ uninstall_idle_flow(p, rule);
+ }
+ return;
+ }
+
+ COVERAGE_INC(ofproto_expired);
+ if (rule->cr.wc.wildcards) {
+ /* Update stats. (This code will be a no-op if the rule expired
+ * due to an idle timeout, because in that case the rule has no
+ * subrules left.) */
+ /* NOTE(review): the loop removes each subrule; presumably
+ * rule_remove() credits the subrule's statistics to 'rule' -- confirm. */
+ struct rule *subrule, *next;
+ LIST_FOR_EACH_SAFE (subrule, next, struct rule, list, &rule->list) {
+ rule_remove(p, subrule);
+ }
+ }
+
+ /* If both timeouts have passed, the hard timeout is the reported reason. */
+ send_flow_exp(p, rule, now,
+ (now >= hard_expire
+ ? OFPER_HARD_TIMEOUT : OFPER_IDLE_TIMEOUT));
+ rule_remove(p, rule);
+}
+
+/* Refreshes rule usage from the datapath: lists every flow in 'p''s datapath
+ * and, for each one that corresponds to an installed rule, updates the
+ * rule's last-used time and accounts the flow's byte count to it.  Flows
+ * without a corresponding installed rule are deleted from the datapath.
+ *
+ * Does nothing if the datapath's flows cannot be listed. */
+static void
+update_used(struct ofproto *p)
+{
+    struct odp_flow *flows;
+    size_t n_flows;
+    size_t idx;
+
+    if (dpif_flow_list_all(&p->dpif, &flows, &n_flows)) {
+        return;
+    }
+
+    for (idx = 0; idx < n_flows; idx++) {
+        struct odp_flow *flow = &flows[idx];
+        struct rule *rule = rule_from_cls_rule(
+            classifier_find_rule_exactly(&p->cls, &flow->key, 0, UINT16_MAX));
+
+        if (rule && rule->installed) {
+            update_time(rule, &flow->stats);
+            rule_account(p, rule, flow->stats.n_bytes);
+        } else {
+            COVERAGE_INC(ofproto_unexpected_rule);
+            dpif_flow_del(&p->dpif, flow);
+        }
+    }
+    free(flows);
+}
+
+/* Composes an OFPT_PACKET_IN message for 'packet' and sends it on 'ofconn'.
+ *
+ * 'packet' must start with a struct odp_msg header followed by the frame.
+ * 'buffer_id' is stored in the message. At most 'send_len' bytes of the
+ * frame are included, while the message's 'total_len' always reports the
+ * full frame length. 'packet' itself is not modified or freed. */
+static void
+do_send_packet_in(struct ofconn *ofconn, uint32_t buffer_id,
+ const struct ofpbuf *packet, int send_len)
+{
+ struct ofp_packet_in *opi;
+ struct ofpbuf payload, *buf;
+ struct odp_msg *msg;
+
+ /* 'payload' is a view of the frame just past the odp_msg header. */
+ msg = packet->data;
+ payload.data = msg + 1;
+ payload.size = msg->length - sizeof *msg;
+
+ /* NOTE(review): 'send_len' is an int while 'payload.size' is presumably a
+ * size_t, so a negative 'send_len' (send_packet_in_miss() passes
+ * UINT32_MAX converted to int) would compare as a huge unsigned value and
+ * select the whole frame -- confirm MIN()'s definition. */
+ send_len = MIN(send_len, payload.size);
+ buf = ofpbuf_new(sizeof *opi + send_len);
+ opi = put_openflow_xid(offsetof(struct ofp_packet_in, data),
+ OFPT_PACKET_IN, 0, buf);
+ opi->buffer_id = htonl(buffer_id);
+ opi->total_len = htons(payload.size);
+ opi->in_port = htons(odp_port_to_ofp_port(msg->port));
+ opi->reason = msg->type == _ODPL_ACTION_NR ? OFPR_ACTION : OFPR_NO_MATCH;
+ /* 'send_len' was already clamped above, so this MIN() is redundant. */
+ ofpbuf_put(buf, payload.data, MIN(send_len, payload.size));
+ update_openflow_length(buf);
+ rconn_send_with_limit(ofconn->rconn, buf, ofconn->packet_in_counter, 100);
+}
+
+/* Packet scheduler callback for packets sent to the controller by an
+ * action.  'p_' is the struct ofproto.
+ *
+ * Sends a packet-in for 'packet', without a buffer id (UINT32_MAX) and with
+ * the length limit taken from the odp_msg's 'arg', to the controller
+ * connection and to every other connection with a nonzero 'miss_send_len'.
+ * Deletes 'packet' afterward. */
+static void
+send_packet_in_action(struct ofpbuf *packet, void *p_)
+{
+    struct ofproto *ofproto = p_;
+    struct odp_msg *msg = packet->data;
+    struct ofconn *conn;
+
+    LIST_FOR_EACH (conn, struct ofconn, node, &ofproto->all_conns) {
+        if (conn == ofproto->controller || conn->miss_send_len) {
+            do_send_packet_in(conn, UINT32_MAX, packet, msg->arg);
+        }
+    }
+    ofpbuf_delete(packet);
+}
+
+/* Packet scheduler callback for packets that matched no rule.  'p_' is the
+ * struct ofproto.
+ *
+ * For every connection with a nonzero 'miss_send_len', saves the frame in
+ * that connection's packet buffer and sends a packet-in.  If the frame got a
+ * buffer id, the packet-in is limited to 'miss_send_len' bytes of the frame;
+ * if it did not (buffer id UINT32_MAX), UINT32_MAX is passed as the limit.
+ * Deletes 'packet' afterward. */
+static void
+send_packet_in_miss(struct ofpbuf *packet, void *p_)
+{
+    struct ofproto *ofproto = p_;
+    struct odp_msg *msg = packet->data;
+    struct ofconn *conn;
+    struct ofpbuf frame;
+
+    /* View of the frame just past the odp_msg header. */
+    frame.data = msg + 1;
+    frame.size = msg->length - sizeof *msg;
+
+    LIST_FOR_EACH (conn, struct ofconn, node, &ofproto->all_conns) {
+        if (conn->miss_send_len) {
+            uint32_t id = pktbuf_save(conn->pktbuf, &frame, msg->port);
+            int limit = (id != UINT32_MAX ? conn->miss_send_len
+                         : UINT32_MAX);
+            do_send_packet_in(conn, id, packet, limit);
+        }
+    }
+    ofpbuf_delete(packet);
+}
+
+/* Chooses a datapath id for 'dpif': the Ethernet address of the network
+ * device named by dpif_get_name(), converted to a 64-bit integer.
+ *
+ * Returns 'fallback_dpid' if the name cannot be obtained, or (after logging
+ * a warning) if the device's Ethernet address cannot be obtained. */
+static uint64_t
+pick_datapath_id(struct dpif *dpif, uint64_t fallback_dpid)
+{
+    char local_name[IF_NAMESIZE];
+    uint8_t ea[ETH_ADDR_LEN];
+    int error;
+
+    error = dpif_get_name(dpif, local_name, sizeof local_name);
+    if (error) {
+        return fallback_dpid;
+    }
+
+    error = netdev_nodev_get_etheraddr(local_name, ea);
+    if (error) {
+        VLOG_WARN("could not get MAC address for %s (%s)",
+                  local_name, strerror(error));
+        return fallback_dpid;
+    }
+
+    return eth_addr_to_uint64(ea);
+}
+
+/* Returns a datapath id derived from a random Ethernet address whose first
+ * three bytes are replaced by the Nicira OUI, 00:23:20. */
+static uint64_t
+pick_fallback_dpid(void)
+{
+    static const uint8_t nicira_oui[3] = { 0x00, 0x23, 0x20 };
+    uint8_t ea[ETH_ADDR_LEN];
+
+    eth_addr_random(ea);
+    memcpy(ea, nicira_oui, sizeof nicira_oui); /* Set Nicira OUI. */
+    return eth_addr_to_uint64(ea);
+}
+
+/* Default 'normal_cb' hook (see default_ofhooks): a MAC-learning switch.
+ *
+ * Learns the source MAC of 'flow' on its input port (only when 'packet' is
+ * nonnull), then appends to 'actions' either an output to the port learned
+ * for the destination MAC or, if none is known, a flood. Nothing is
+ * appended, so the frame is dropped, for reserved multicast destinations and
+ * when the learned port is the input port. 'tags' is passed to the MAC
+ * learning lookup. 'ofproto_' is the struct ofproto.
+ *
+ * Always returns true. */
+static bool
+default_normal_ofhook_cb(const flow_t *flow, const struct ofpbuf *packet,
+ struct odp_actions *actions, tag_type *tags,
+ void *ofproto_)
+{
+ struct ofproto *ofproto = ofproto_;
+ int out_port;
+
+ /* Drop frames for reserved multicast addresses. */
+ if (eth_addr_is_reserved(flow->dl_dst)) {
+ return true;
+ }
+
+ /* Learn source MAC (but don't try to learn from revalidation). */
+ if (packet != NULL) {
+ tag_type rev_tag = mac_learning_learn(ofproto->ml, flow->dl_src,
+ 0, flow->in_port);
+ if (rev_tag) {
+ /* The log messages here could actually be useful in debugging,
+ * so keep the rate limit relatively high. */
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30, 300);
+ VLOG_DBG_RL(&rl, "learned that "ETH_ADDR_FMT" is on port %"PRIu16,
+ ETH_ADDR_ARGS(flow->dl_src), flow->in_port);
+ /* A nonzero tag means flows tagged with it must be revalidated. */
+ ofproto_revalidate(ofproto, rev_tag);
+ }
+ }
+
+ /* Determine output port. */
+ out_port = mac_learning_lookup_tag(ofproto->ml, flow->dl_dst, 0, tags);
+ if (out_port < 0) {
+ /* Negative result: destination not learned, so flood. */
+ add_output_group_action(actions, DP_GROUP_FLOOD);
+ } else if (out_port != flow->in_port) {
+ odp_actions_add(actions, ODPAT_OUTPUT)->output.port = out_port;
+ } else {
+ /* Drop. */
+ }
+
+ return true;
+}
+
+/* Hook table that supplies only 'normal_cb', implemented by
+ * default_normal_ofhook_cb(); every other hook is left unset. */
+static const struct ofhooks default_ofhooks = {
+    .port_changed_cb = NULL,
+    .normal_cb = default_normal_ofhook_cb,
+    .account_flow_cb = NULL,
+    .account_checkpoint_cb = NULL
+};
diff --git a/secchan/ofproto.h b/secchan/ofproto.h
new file mode 100644
index 000000000..6272d2796
--- /dev/null
+++ b/secchan/ofproto.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef OFPROTO_H
+#define OFPROTO_H 1
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "flow.h"
+#include "tag.h"
+
+struct odp_actions;
+struct ofhooks;
+struct ofproto;
+struct svec;
+
+/* Description of a flow together with its accumulated statistics.
+ * NOTE(review): judging by the name and the field comments this describes a
+ * flow that has expired -- confirm against the code that fills it in. */
+struct ofexpired {
+ flow_t flow; /* The flow being described. */
+ uint64_t packet_count; /* Packets from *expired* subrules. */
+ uint64_t byte_count; /* Bytes from *expired* subrules. */
+ long long int used; /* Last-used time (0 if never used). */
+ long long int created; /* Creation time. */
+ uint8_t tcp_flags; /* Bitwise-OR of all TCP flags seen. */
+ uint8_t ip_tos; /* Last-seen IP type-of-service. */
+};
+
+int ofproto_create(const char *datapath, const struct ofhooks *, void *aux,
+ struct ofproto **ofprotop);
+void ofproto_destroy(struct ofproto *);
+int ofproto_run(struct ofproto *);
+int ofproto_run1(struct ofproto *);
+int ofproto_run2(struct ofproto *, bool revalidate_all);
+void ofproto_wait(struct ofproto *);
+bool ofproto_is_alive(const struct ofproto *);
+
+/* Configuration. */
+void ofproto_set_datapath_id(struct ofproto *, uint64_t datapath_id);
+void ofproto_set_mgmt_id(struct ofproto *, uint64_t mgmt_id);
+void ofproto_set_probe_interval(struct ofproto *, int probe_interval);
+void ofproto_set_max_backoff(struct ofproto *, int max_backoff);
+void ofproto_set_desc(struct ofproto *,
+ const char *manufacturer, const char *hardware,
+ const char *software, const char *serial);
+int ofproto_set_in_band(struct ofproto *, bool in_band);
+int ofproto_set_discovery(struct ofproto *, bool discovery,
+ const char *accept_controller_re,
+ bool update_resolv_conf);
+int ofproto_set_controller(struct ofproto *, const char *controller);
+int ofproto_set_listeners(struct ofproto *, const struct svec *listeners);
+int ofproto_set_snoops(struct ofproto *, const struct svec *snoops);
+int ofproto_set_netflow(struct ofproto *, const struct svec *collectors,
+ uint8_t engine_type, uint8_t engine_id, bool add_id_to_iface);
+void ofproto_set_failure(struct ofproto *, bool fail_open);
+void ofproto_set_rate_limit(struct ofproto *, int rate_limit, int burst_limit);
+int ofproto_set_stp(struct ofproto *, bool enable_stp);
+int ofproto_set_remote_execution(struct ofproto *, const char *command_acl,
+ const char *command_dir);
+
+/* Configuration querying. */
+uint64_t ofproto_get_datapath_id(const struct ofproto *);
+int ofproto_get_probe_interval(const struct ofproto *);
+int ofproto_get_max_backoff(const struct ofproto *);
+bool ofproto_get_in_band(const struct ofproto *);
+bool ofproto_get_discovery(const struct ofproto *);
+const char *ofproto_get_controller(const struct ofproto *);
+void ofproto_get_listeners(const struct ofproto *, struct svec *);
+void ofproto_get_snoops(const struct ofproto *, struct svec *);
+
+/* Functions for use by ofproto implementation modules, not by clients. */
+int ofproto_send_packet(struct ofproto *, const flow_t *,
+ const union ofp_action *, size_t n_actions,
+ const struct ofpbuf *);
+void ofproto_add_flow(struct ofproto *, const flow_t *, uint32_t wildcards,
+ unsigned int priority,
+ const union ofp_action *, size_t n_actions,
+ int idle_timeout);
+void ofproto_delete_flow(struct ofproto *, const flow_t *, uint32_t wildcards,
+ unsigned int priority);
+void ofproto_flush_flows(struct ofproto *);
+
+/* Hooks for ovs-vswitchd. */
+/* Table of callbacks that a client supplies to ofproto_create(). Each
+ * callback receives an 'aux' pointer as its last argument.
+ * NOTE(review): presumably 'aux' is the value given to ofproto_create() and
+ * a null callback means "no hook" -- confirm in ofproto.c. */
+struct ofhooks {
+ /* NOTE(review): presumably called when a port is added, removed or
+ * modified -- confirm. */
+ void (*port_changed_cb)(enum ofp_port_reason, const struct ofp_phy_port *,
+ void *aux);
+ /* Composes the datapath actions for a flow. 'packet' may be null (the
+ * default implementation in ofproto.c treats a null 'packet' as a
+ * revalidation and skips MAC learning). */
+ bool (*normal_cb)(const flow_t *, const struct ofpbuf *packet,
+ struct odp_actions *, tag_type *, void *aux);
+ /* Receives a flow, its datapath actions and a byte count. */
+ void (*account_flow_cb)(const flow_t *, const union odp_action *,
+ size_t n_actions, unsigned long long int n_bytes,
+ void *aux);
+ /* Takes no argument other than 'aux'. */
+ void (*account_checkpoint_cb)(void *aux);
+};
+void ofproto_revalidate(struct ofproto *, tag_type);
+struct tag_set *ofproto_get_revalidate_set(struct ofproto *);
+
+#endif /* ofproto.h */
diff --git a/secchan/pinsched.c b/secchan/pinsched.c
new file mode 100644
index 000000000..910b7a2e9
--- /dev/null
+++ b/secchan/pinsched.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "pinsched.h"
+#include <arpa/inet.h>
+#include <stdlib.h>
+#include "ofpbuf.h"
+#include "openflow/openflow.h"
+#include "poll-loop.h"
+#include "port-array.h"
+#include "queue.h"
+#include "random.h"
+#include "rconn.h"
+#include "status.h"
+#include "timeval.h"
+#include "vconn.h"
+
+/* Packet-in scheduler state: a token bucket that rate-limits packets, plus
+ * one queue per port holding the packets that are waiting for tokens. */
+struct pinsched {
+ /* Client-supplied parameters. */
+ int rate_limit; /* Packets added to bucket per second. */
+ int burst_limit; /* Maximum token bucket size, in packets. */
+
+ /* One queue per physical port. */
+ struct port_array queues; /* Array of "struct ovs_queue *". */
+ int n_queued; /* Sum over queues[*].n. */
+ unsigned int last_tx_port; /* Last port checked in round-robin. */
+
+ /* Token bucket.
+ *
+ * It costs 1000 tokens to send a single packet_in message. A single token
+ * per message would be more straightforward, but this choice lets us avoid
+ * round-off error in refill_bucket()'s calculation of how many tokens to
+ * add to the bucket, since no division step is needed. */
+ long long int last_fill; /* Time at which we last added tokens. */
+ int tokens; /* Current number of tokens. */
+
+ /* Transmission queue.
+ *
+ * Within pinsched.c this member is only ever set to 0, by
+ * pinsched_create(); nothing else in that file reads or updates it. */
+ int n_txq; /* No. of packets waiting in rconn for tx. */
+
+ /* Statistics reporting. */
+ unsigned long long n_normal; /* # txed w/o rate limit queuing. */
+ unsigned long long n_limited; /* # queued for rate limiting. */
+ unsigned long long n_queue_dropped; /* # dropped due to queue overflow. */
+
+ /* Switch status. Stays null (from xcalloc()) when pinsched_create() is
+ * given a null 'ss'. */
+ struct status_category *ss_cat;
+};
+
+/* Pops and returns the packet at the head of 'q', which must be the queue
+ * that 'ps' keeps for 'port_no'. If this leaves 'q' empty, frees 'q' and
+ * clears its slot in the port array, so that only nonempty queues are ever
+ * stored there. */
+static struct ofpbuf *
+dequeue_packet(struct pinsched *ps, struct ovs_queue *q,
+               unsigned int port_no)
+{
+    struct ofpbuf *head = queue_pop_head(q);
+
+    ps->n_queued--;
+    if (q->n == 0) {
+        free(q);
+        port_array_set(&ps->queues, port_no, NULL);
+    }
+    return head;
+}
+
+/* Drops one packet from the longest queue in 'ps' and counts the drop in
+ * 'n_queue_dropped'. When several queues tie for longest, one of them is
+ * picked uniformly at random.
+ *
+ * 'ps' must hold at least one queue: the first queue found is dereferenced
+ * without a null check. */
+static void
+drop_packet(struct pinsched *ps)
+{
+    struct ovs_queue *longest;    /* Queue currently selected as longest. */
+    int n_longest;                /* # of queues of same length as 'longest'. */
+    unsigned int longest_port_no; /* Port number whose queue is 'longest'. */
+    unsigned int port_no;
+    struct ovs_queue *q;
+
+    ps->n_queue_dropped++;
+
+    longest = port_array_first(&ps->queues, &port_no);
+    longest_port_no = port_no;
+    n_longest = 1;
+    while ((q = port_array_next(&ps->queues, &port_no)) != NULL) {
+        if (longest->n < q->n) {
+            /* Strictly longer queue: it becomes the sole candidate. Record
+             * its port number too, so that 'longest' and 'longest_port_no'
+             * always describe the same queue when they are handed to
+             * dequeue_packet() below. */
+            longest = q;
+            longest_port_no = port_no;
+            n_longest = 1;
+        } else if (longest->n == q->n) {
+            n_longest++;
+
+            /* Randomly select one of the longest queues, with a uniform
+             * distribution (Knuth algorithm 3.4.2R). */
+            if (!random_range(n_longest)) {
+                longest = q;
+                longest_port_no = port_no;
+            }
+        }
+    }
+
+    /* FIXME: do we want to pop the tail instead? */
+    ofpbuf_delete(dequeue_packet(ps, longest, longest_port_no));
+}
+
+/* Removes and returns the next packet to transmit. Ports are served in
+ * round-robin order: the search resumes after 'ps->last_tx_port' and wraps
+ * around to the first queue when no later port has one. */
+static struct ofpbuf *
+get_tx_packet(struct pinsched *ps)
+{
+    struct ovs_queue *next;
+
+    next = port_array_next(&ps->queues, &ps->last_tx_port);
+    if (next == NULL) {
+        next = port_array_first(&ps->queues, &ps->last_tx_port);
+    }
+    return dequeue_packet(ps, next, ps->last_tx_port);
+}
+
+/* Credits 'ps' with the tokens earned since 'last_fill': 'rate_limit' tokens
+ * per elapsed millisecond. The bucket is only updated once it would hold at
+ * least one packet's worth (1000 tokens); until then 'last_fill' is left
+ * alone so that the elapsed time keeps accumulating. The new total is
+ * capped at 'burst_limit' packets' worth. */
+static void
+refill_bucket(struct pinsched *ps)
+{
+    long long int now = time_msec();
+    long long int elapsed = now - ps->last_fill;
+    long long int total = elapsed * ps->rate_limit + ps->tokens;
+
+    if (total < 1000) {
+        return;
+    }
+    ps->last_fill = now;
+    ps->tokens = MIN(total, ps->burst_limit * 1000);
+}
+
+/* Tries to take one packet's worth of tokens (1000) out of 'ps'. Returns
+ * true on success. Returns false, leaving the bucket untouched, if fewer
+ * than 1000 tokens are available. */
+static bool
+get_token(struct pinsched *ps)
+{
+    if (ps->tokens < 1000) {
+        return false;
+    }
+    ps->tokens -= 1000;
+    return true;
+}
+
+/* Hands 'packet', received on 'port_no', to the scheduler 'ps'.
+ *
+ * If 'ps' is null, or if nothing is queued and a token is available, the
+ * packet is passed straight to 'cb' along with 'aux'. Otherwise the packet
+ * is appended to the queue for 'port_no', after first dropping a packet from
+ * the longest queue if 'burst_limit' packets are already queued. */
+void
+pinsched_send(struct pinsched *ps, uint16_t port_no,
+              struct ofpbuf *packet, pinsched_tx_cb *cb, void *aux)
+{
+    struct ovs_queue *port_queue;
+
+    if (ps == NULL) {
+        /* No rate limiting at all. */
+        cb(packet, aux);
+        return;
+    }
+
+    if (ps->n_queued == 0 && get_token(ps)) {
+        /* Common case: the rate limit is not constraining us, so the packet
+         * takes the normal path. */
+        ps->n_normal++;
+        cb(packet, aux);
+        return;
+    }
+
+    /* Rate limited: park the packet until the periodic callback drains it.
+     *
+     * The buffer comes from dpif_recv() and usually has far more space
+     * allocated than it has content. It will sit in a queue for a while, so
+     * give the unused space back first. */
+    ofpbuf_trim(packet);
+
+    /* Make room before looking up the queue, because dropping a packet may
+     * free the very queue that belongs to 'port_no'. */
+    if (ps->n_queued >= ps->burst_limit) {
+        drop_packet(ps);
+    }
+
+    port_queue = port_array_get(&ps->queues, port_no);
+    if (port_queue == NULL) {
+        port_queue = xmalloc(sizeof *port_queue);
+        queue_init(port_queue);
+        port_array_set(&ps->queues, port_no, port_queue);
+    }
+    queue_push_tail(port_queue, packet);
+    ps->n_limited++;
+    ps->n_queued++;
+}
+
+/* Status callback registered by pinsched_create(). Appends the three packet
+ * counters of the scheduler passed as 'ps_' to the reply 'sr'. */
+static void
+pinsched_status_cb(struct status_reply *sr, void *ps_)
+{
+    const struct pinsched *sched = ps_;
+
+    status_reply_put(sr, "normal=%llu", sched->n_normal);
+    status_reply_put(sr, "limited=%llu", sched->n_limited);
+    status_reply_put(sr, "queue-dropped=%llu", sched->n_queue_dropped);
+}
+
+/* Refills the token bucket of 'ps' and then transmits queued packets, in
+ * round-robin order across ports, by passing each one to 'cb' along with
+ * 'aux'. Stops when the queues are empty, when the bucket runs out of
+ * tokens, or after 50 packets, whichever comes first. Does nothing if 'ps'
+ * is null. */
+void
+pinsched_run(struct pinsched *ps, pinsched_tx_cb *cb, void *aux)
+{
+    if (ps) {
+        int i;
+
+        /* Drain some packets out of the bucket if possible, but limit the
+         * number of iterations to allow other code to get work done too.
+         *
+         * The iteration limit is tested before get_token(), because
+         * get_token() removes tokens as a side effect: testing it first
+         * would throw away one packet's worth of tokens every time the
+         * limit stops the loop. */
+        refill_bucket(ps);
+        for (i = 0; i < 50 && ps->n_queued && get_token(ps); i++) {
+            cb(get_tx_packet(ps), aux);
+        }
+    }
+}
+
+/* Arranges for the poll loop to wake up when 'ps' may be able to transmit a
+ * queued packet. Does nothing if 'ps' is null or has nothing queued. */
+void
+pinsched_wait(struct pinsched *ps)
+{
+    if (!ps || !ps->n_queued) {
+        return;
+    }
+
+    if (ps->tokens < 1000) {
+        /* The bucket has to re-fill first. Calculating the exact amount of
+         * time needed would give smoother output. */
+        poll_timer_wait(TIME_UPDATE_INTERVAL / 2);
+    } else {
+        /* More packets can go out as soon as we are called again. */
+        poll_immediate_wake();
+    }
+}
+
+/* Creates and returns a scheduler for sending packet-in messages.
+ *
+ * 'rate_limit' and 'burst_limit' are interpreted as by
+ * pinsched_set_limits(), which substitutes defaults for nonpositive values.
+ * If 'ss' is nonnull, a "rate-limit" status category that reports the
+ * scheduler's counters is registered with it. */
+struct pinsched *
+pinsched_create(int rate_limit, int burst_limit, struct switch_status *ss)
+{
+    struct pinsched *ps;
+
+    ps = xcalloc(1, sizeof *ps);
+    port_array_init(&ps->queues);
+    ps->n_queued = 0;
+    ps->last_tx_port = PORT_ARRAY_SIZE;
+    ps->last_fill = time_msec();
+    ps->n_txq = 0;
+    ps->n_normal = 0;
+    ps->n_limited = 0;
+    ps->n_queue_dropped = 0;
+    pinsched_set_limits(ps, rate_limit, burst_limit);
+
+    /* Start with a tenth of a second's worth of tokens. This is computed
+     * from the limits as normalized by pinsched_set_limits(), not from the
+     * raw arguments, so that a nonpositive 'rate_limit' (meaning "use the
+     * default") does not yield a zero or negative balance. The arithmetic
+     * is done in long long so that a large rate cannot overflow, and the
+     * result is capped at the bucket's capacity, which fits in an int. */
+    ps->tokens = MIN(ps->rate_limit * 100LL, ps->burst_limit * 1000LL);
+
+    if (ss) {
+        ps->ss_cat = switch_status_register(ss, "rate-limit",
+                                            pinsched_status_cb, ps);
+    }
+
+    return ps;
+}
+
+/* Destroys 'ps': destroys and frees every per-port queue, destroys the port
+ * array, unregisters the status category, and frees 'ps' itself. Does
+ * nothing if 'ps' is null. */
+void
+pinsched_destroy(struct pinsched *ps)
+{
+    struct ovs_queue *q;
+    unsigned int port;
+
+    if (ps == NULL) {
+        return;
+    }
+
+    PORT_ARRAY_FOR_EACH (q, &ps->queues, port) {
+        queue_destroy(q);
+        free(q);
+    }
+    port_array_destroy(&ps->queues);
+    switch_status_unregister(ps->ss_cat);
+    free(ps);
+}
+
+/* Changes the limits of 'ps'.
+ *
+ * A nonpositive 'rate_limit' selects the default of 1000. A nonpositive
+ * 'burst_limit' selects one quarter of the resulting rate. The burst limit
+ * is then clamped to the range [1, INT_MAX / 1000]. If more packets are
+ * queued than the new burst limit allows, the excess is dropped. */
+void
+pinsched_set_limits(struct pinsched *ps, int rate_limit, int burst_limit)
+{
+    int rate = rate_limit > 0 ? rate_limit : 1000;
+    int burst = burst_limit > 0 ? burst_limit : rate / 4;
+
+    burst = MAX(burst, 1);
+    burst = MIN(burst, INT_MAX / 1000);
+
+    ps->rate_limit = rate;
+    ps->burst_limit = burst;
+    while (ps->n_queued > burst) {
+        drop_packet(ps);
+    }
+}
diff --git a/secchan/pinsched.h b/secchan/pinsched.h
new file mode 100644
index 000000000..e439450fc
--- /dev/null
+++ b/secchan/pinsched.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* Include guard. The macro defined here must be the same one that is tested
+ * by the #ifndef, otherwise a second inclusion is not suppressed. */
+#ifndef PINSCHED_H
+#define PINSCHED_H 1
+
+#include <stdint.h>
+
+struct ofpbuf;
+struct switch_status;
+
+/* Callback through which the scheduler hands over a packet for
+ * transmission, together with the caller's 'aux' pointer. */
+typedef void pinsched_tx_cb(struct ofpbuf *, void *aux);
+struct pinsched *pinsched_create(int rate_limit, int burst_limit,
+ struct switch_status *);
+void pinsched_set_limits(struct pinsched *, int rate_limit, int burst_limit);
+void pinsched_destroy(struct pinsched *);
+void pinsched_send(struct pinsched *, uint16_t port_no, struct ofpbuf *,
+ pinsched_tx_cb *, void *aux);
+void pinsched_run(struct pinsched *, pinsched_tx_cb *, void *aux);
+void pinsched_wait(struct pinsched *);
+
+#endif /* pinsched.h */
diff --git a/secchan/pktbuf.c b/secchan/pktbuf.c
new file mode 100644
index 000000000..126c1314c
--- /dev/null
+++ b/secchan/pktbuf.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "pktbuf.h"
+#include <inttypes.h>
+#include <stdlib.h>
+#include "coverage.h"
+#include "ofpbuf.h"
+#include "timeval.h"
+#include "util.h"
+#include "vconn.h"
+
+#define THIS_MODULE VLM_pktbuf
+#include "vlog.h"
+
+/* Buffers are identified by a 32-bit opaque ID. We divide the ID
+ * into a buffer number (low bits) and a cookie (high bits). The buffer number
+ * is an index into an array of buffers. The cookie distinguishes between
+ * different packets that have occupied a single buffer. Thus, the more
+ * buffers we have, the lower-quality the cookie... */
+#define PKTBUF_BITS 8
+#define PKTBUF_MASK (PKTBUF_CNT - 1)
+#define PKTBUF_CNT (1u << PKTBUF_BITS)
+
+#define COOKIE_BITS (32 - PKTBUF_BITS)
+#define COOKIE_MAX ((1u << COOKIE_BITS) - 1)
+
+#define OVERWRITE_MSECS 5000
+
+/* One buffer slot. */
+struct packet {
+ /* Copy of the saved packet made by pktbuf_save(), or null when the slot
+ * is empty (initially, or after pktbuf_retrieve() or pktbuf_discard()). */
+ struct ofpbuf *buffer;
+ /* Incremented by each successful pktbuf_save() into this slot and stored
+ * in the high bits of the returned buffer ID. */
+ uint32_t cookie;
+ /* time_msec() value before which pktbuf_save() refuses to overwrite a
+ * still-occupied slot. */
+ long long int timeout;
+ /* Input port passed to pktbuf_save() along with the packet. */
+ uint16_t in_port;
+};
+
+/* A fixed-size set of PKTBUF_CNT packet buffer slots. */
+struct pktbuf {
+ /* The slots, indexed by the low PKTBUF_BITS bits of a buffer ID. */
+ struct packet packets[PKTBUF_CNT];
+ /* Index of the slot that the next pktbuf_save() will try to use. It
+ * advances by one, wrapping around, on every call. */
+ unsigned int buffer_idx;
+};
+
+/* Returns the number of buffer slots in every pktbuf. */
+int
+pktbuf_capacity(void)
+{
+    const int n_slots = PKTBUF_CNT;
+
+    return n_slots;
+}
+
+/* Allocates and returns a new pktbuf whose memory is zero-filled, so that
+ * every slot starts out empty. */
+struct pktbuf *
+pktbuf_create(void)
+{
+    struct pktbuf *pb = xcalloc(1, sizeof *pb);
+
+    return pb;
+}
+
+/* Frees 'pb' along with any packets still buffered in it. Does nothing if
+ * 'pb' is null. */
+void
+pktbuf_destroy(struct pktbuf *pb)
+{
+    struct packet *slot;
+
+    if (pb == NULL) {
+        return;
+    }
+
+    for (slot = pb->packets; slot < &pb->packets[PKTBUF_CNT]; slot++) {
+        ofpbuf_delete(slot->buffer);
+    }
+    free(pb);
+}
+
+/* Saves a copy of 'buffer', received on 'in_port', in the next slot of 'pb'
+ * and returns the ID under which it can later be retrieved.
+ *
+ * Slots are used in rotation. If the next slot still holds a packet whose
+ * timeout has not yet passed, nothing is saved and UINT32_MAX is returned;
+ * the rotation advances regardless. A held packet whose timeout has passed
+ * is deleted and replaced. */
+uint32_t
+pktbuf_save(struct pktbuf *pb, struct ofpbuf *buffer, uint16_t in_port)
+{
+    const unsigned int idx = pb->buffer_idx;
+    struct packet *slot = &pb->packets[idx];
+
+    pb->buffer_idx = (idx + 1) & PKTBUF_MASK;
+    if (slot->buffer != NULL) {
+        if (slot->timeout > time_msec()) {
+            return UINT32_MAX;
+        }
+        ofpbuf_delete(slot->buffer);
+    }
+
+    /* Don't use maximum cookie value since all-1-bits ID is special. */
+    slot->cookie++;
+    if (slot->cookie >= COOKIE_MAX) {
+        slot->cookie = 0;
+    }
+    slot->in_port = in_port;
+    slot->buffer = ofpbuf_clone(buffer);
+    slot->timeout = time_msec() + OVERWRITE_MSECS;
+    return idx | (slot->cookie << PKTBUF_BITS);
+}
+
+/* Retrieves the packet that pktbuf_save() stored in 'pb' under 'id'.
+ *
+ * On success, returns 0, stores the packet in '*bufferp' and its input port
+ * in '*in_port', and empties the slot.
+ *
+ * On failure, returns an OpenFlow error code built by ofp_mkerr(), and sets
+ * '*bufferp' to null and '*in_port' to all-one-bits. Failure occurs if 'pb'
+ * is null, if the cookie in 'id' does not match the slot's cookie, or if
+ * the slot is empty. The outputs are set on every failure path, including
+ * the null-'pb' one, so callers never see indeterminate values. */
+int
+pktbuf_retrieve(struct pktbuf *pb, uint32_t id, struct ofpbuf **bufferp,
+                uint16_t *in_port)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 20);
+    struct packet *p;
+    int error;
+
+    if (!pb) {
+        VLOG_WARN_RL(&rl, "attempt to send buffered packet via connection "
+                     "without buffers");
+        error = ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_COOKIE);
+        goto error;
+    }
+
+    p = &pb->packets[id & PKTBUF_MASK];
+    if (p->cookie == id >> PKTBUF_BITS) {
+        struct ofpbuf *buffer = p->buffer;
+        if (buffer) {
+            *bufferp = buffer;
+            *in_port = p->in_port;
+            p->buffer = NULL;
+            COVERAGE_INC(pktbuf_retrieved);
+            return 0;
+        } else {
+            COVERAGE_INC(pktbuf_reuse_error);
+            VLOG_WARN_RL(&rl, "attempt to reuse buffer %08"PRIx32, id);
+            error = ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BUFFER_EMPTY);
+        }
+    } else {
+        COVERAGE_INC(pktbuf_bad_cookie);
+        VLOG_WARN_RL(&rl, "cookie mismatch: %08"PRIx32" != %08"PRIx32,
+                     id, (id & PKTBUF_MASK) | (p->cookie << PKTBUF_BITS));
+        error = ofp_mkerr(OFPET_BAD_REQUEST, OFPBRC_BAD_COOKIE);
+    }
+
+error:
+    *bufferp = NULL;
+    *in_port = -1;              /* Converts to 65535 in a uint16_t. */
+    return error;
+}
+
+/* Deletes the packet stored in 'pb' under 'id' and empties its slot. Does
+ * nothing if the cookie in 'id' does not match the slot's current cookie. */
+void
+pktbuf_discard(struct pktbuf *pb, uint32_t id)
+{
+    struct packet *slot = &pb->packets[id & PKTBUF_MASK];
+
+    if (slot->cookie != id >> PKTBUF_BITS) {
+        return;
+    }
+    ofpbuf_delete(slot->buffer);
+    slot->buffer = NULL;
+}
diff --git a/secchan/pktbuf.h b/secchan/pktbuf.h
new file mode 100644
index 000000000..5ff7cf069
--- /dev/null
+++ b/secchan/pktbuf.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef PKTBUF_H
+#define PKTBUF_H 1
+
+#include <stdint.h>
+
+struct pktbuf;
+struct ofpbuf;
+
+int pktbuf_capacity(void);
+
+struct pktbuf *pktbuf_create(void);
+void pktbuf_destroy(struct pktbuf *);
+uint32_t pktbuf_save(struct pktbuf *, struct ofpbuf *buffer, uint16_t in_port);
+int pktbuf_retrieve(struct pktbuf *, uint32_t id, struct ofpbuf **bufferp,
+ uint16_t *in_port);
+void pktbuf_discard(struct pktbuf *, uint32_t id);
+
+#endif /* pktbuf.h */
diff --git a/secchan/secchan.8.in b/secchan/secchan.8.in
new file mode 100644
index 000000000..b40842a96
--- /dev/null
+++ b/secchan/secchan.8.in
@@ -0,0 +1,463 @@
+.TH secchan 8 "March 2009" "Open vSwitch" "Open vSwitch Manual"
+.ds PN secchan
+
+.SH NAME
+secchan \- OpenFlow switch implementation
+
+.SH SYNOPSIS
+.B secchan
+[\fIoptions\fR] \fIdatapath\fR [\fIcontroller\fR]
+
+.SH DESCRIPTION
+The \fBsecchan\fR program implements an OpenFlow switch using a
+flow-based datapath. \fBsecchan\fR connects to an OpenFlow controller
+over TCP or SSL.
+
+The mandatory \fIdatapath\fR argument specifies the local datapath
+to relay. It takes one of the following forms:
+
+.so lib/dpif.man
+
+.PP
+The optional \fIcontroller\fR argument specifies how to connect to
+the OpenFlow controller. It takes one of the following forms:
+
+.RS
+.TP
+\fBssl:\fIhost\fR[\fB:\fIport\fR]
+The specified SSL \fIport\fR (default: 6633) on the given remote
+\fIhost\fR. The \fB--private-key\fR, \fB--certificate\fR, and
+\fB--ca-cert\fR options are mandatory when this form is used.
+
+.TP
+\fBtcp:\fIhost\fR[\fB:\fIport\fR]
+The specified TCP \fIport\fR (default: 6633) on the given remote
+\fIhost\fR.
+
+.TP
+\fBunix:\fIfile\fR
+The Unix domain server socket named \fIfile\fR.
+.RE
+
+.PP
+If \fIcontroller\fR is omitted, \fBsecchan\fR attempts to discover the
+location of the controller automatically (see below).
+
+.SS "Contacting the Controller"
+The OpenFlow switch must be able to contact the OpenFlow controller
+over the network. It can do so in one of two ways:
+
+.IP out-of-band
+In this configuration, OpenFlow traffic uses a network separate from
+the data traffic that it controls, that is, the switch does not use
+any of the network devices added to the datapath with \fBovs\-dpctl
+add\-if\fR in its communication with the controller.
+
+To use \fBsecchan\fR in a network with out-of-band control, specify
+\fB--out-of-band\fR on the \fBsecchan\fR command line. The control
+network must be configured separately, before or after \fBsecchan\fR
+is started.
+
+.IP in-band
+In this configuration, a single network is used for OpenFlow traffic
+and other data traffic, that is, the switch contacts the controller
+over one of the network devices added to the datapath with \fBovs\-dpctl
+add\-if\fR. This configuration is often more convenient than
+out-of-band control, because it is not necessary to maintain two
+independent networks.
+
+In-band control is the default for \fBsecchan\fR, so no special
+command-line option is required.
+
+With in-band control, the location of the controller can be configured
+manually or discovered automatically:
+
+.RS
+.IP "controller discovery"
+To make \fBsecchan\fR discover the location of the controller
+automatically, do not specify the location of the controller on the
+\fBsecchan\fR command line.
+
+In this mode, \fBsecchan\fR will broadcast a DHCP request with vendor
+class identifier \fBOpenFlow\fR across the network devices added to
+the datapath with \fBovs\-dpctl add\-if\fR. It will accept any valid DHCP
+reply that has the same vendor class identifier and includes a
+vendor-specific option with code 1 whose contents are a string
+specifying the location of the controller in the same format used on
+the \fBsecchan\fR command line (e.g. \fBssl:192.168.0.1\fR).
+
+The DHCP reply may also, optionally, include a vendor-specific option
+with code 2 whose contents are a string specifying the URI to the base
+of the OpenFlow PKI (e.g. \fBhttp://192.168.0.1/openflow/pki\fR).
+This URI is used only for bootstrapping the OpenFlow PKI at initial
+switch setup; \fBsecchan\fR does not use it at all.
+
+The following ISC DHCP server configuration file assigns the IP
+address range 192.168.0.20 through 192.168.0.30 to OpenFlow switches
+that follow the switch protocol and addresses 192.168.0.1 through
+192.168.0.10 to all other DHCP clients:
+
+default-lease-time 600;
+.br
+max-lease-time 7200;
+.br
+option space openflow;
+.br
+option openflow.controller-vconn code 1 = text;
+.br
+option openflow.pki-uri code 2 = text;
+.br
+class "OpenFlow" {
+.br
+ match if option vendor-class-identifier = "OpenFlow";
+.br
+ vendor-option-space openflow;
+.br
+ option openflow.controller-vconn "tcp:192.168.0.10";
+.br
+ option openflow.pki-uri "http://192.168.0.10/openflow/pki";
+.br
+ option vendor-class-identifier "OpenFlow";
+.br
+}
+.br
+subnet 192.168.0.0 netmask 255.255.255.0 {
+.br
+ pool {
+.br
+ allow members of "OpenFlow";
+.br
+ range 192.168.0.20 192.168.0.30;
+.br
+ }
+.br
+ pool {
+.br
+ deny members of "OpenFlow";
+.br
+ range 192.168.0.1 192.168.0.10;
+.br
+ }
+.br
+}
+.br
+
+.IP "manual configuration"
+To configure in-band control manually, specify the location of the
+controller on the \fBsecchan\fR command line as the \fIcontroller\fR
+argument. You must also configure the network device for the OpenFlow
+``local port'' to allow \fBsecchan\fR to connect to that controller.
+The OpenFlow local port is a virtual network port that \fBsecchan\fR
+bridges to the physical switch ports. The name of the local port for
+a given \fIdatapath\fR may be seen by running \fBovs\-dpctl show
+\fIdatapath\fR; the local port is listed as port 0 in \fBshow\fR's
+output.
+
+.IP
+Before \fBsecchan\fR starts, the local port network device is not
+bridged to any physical network, so the next step depends on whether
+connectivity is required to configure the device's IP address. If the
+switch has a static IP address, you may configure its IP address now
+with a command such as
+.B ifconfig of0 192.168.1.1
+and then invoke \fBsecchan\fR.
+
+On the other hand, if the switch does not have a static IP address,
+e.g. it obtains its IP address dynamically via DHCP, the DHCP client
+will not be able to contact the DHCP server until the secure channel
+has started up. Thus, start \fBsecchan\fR without configuring
+the local port network device, and start the DHCP client afterward.
+.RE
+
+.SH OPTIONS
+.SS "Controller Discovery Options"
+.TP
+\fB--accept-vconn=\fIregex\fR
+When \fBsecchan\fR performs controller discovery (see \fBContacting
+the Controller\fR, above, for more information about controller
+discovery), it validates the controller location obtained via DHCP
+with a POSIX extended regular expression. Only controllers whose
+names match the regular expression will be accepted.
+
+The default regular expression is \fBssl:.*\fR (meaning that only SSL
+controller connections will be accepted) when any of the SSL
+configuration options \fB--private-key\fR, \fB--certificate\fR, or
+\fB--ca-cert\fR is specified. The default is \fB.*\fR otherwise
+(meaning that any controller will be accepted).
+
+The \fIregex\fR is implicitly anchored at the beginning of the
+controller location string, as if it begins with \fB^\fR.
+
+When controller discovery is not performed, this option has no effect.
+
+.TP
+\fB--no-resolv-conf\fR
+When \fBsecchan\fR performs controller discovery (see \fBContacting
+the Controller\fR, above, for more information about controller
+discovery), by default it overwrites the system's
+\fB/etc/resolv.conf\fR with domain information and DNS servers
+obtained via DHCP. If the location of the controller is specified
+using a hostname, rather than an IP address, and the network's DNS
+servers ever change, this behavior is essential. But because it also
+interferes with any administrator or process that manages
+\fB/etc/resolv.conf\fR, when this option is specified, \fBsecchan\fR
+will not modify \fB/etc/resolv.conf\fR.
+
+\fBsecchan\fR will only modify \fBresolv.conf\fR if the DHCP response
+that it receives specifies one or more DNS servers.
+
+When controller discovery is not performed, this option has no effect.
+
+.SS "Networking Options"
+.TP
+\fB--datapath-id=\fIdpid\fR
+Sets \fIdpid\fR, which must consist of exactly 12 hexadecimal digits,
+as the datapath ID that the switch will use to identify itself to the
+OpenFlow controller.
+
+If this option is omitted, the default datapath ID is taken from the
+Ethernet address of the datapath's local port (which is typically
+randomly generated).
+
+.TP
+\fB--mgmt-id=\fImgmtid\fR
+Sets \fImgmtid\fR, which must consist of exactly 12 hexadecimal
+digits, as the switch's management ID.
+
+If this option is omitted, the management ID defaults to 0, signaling
+to the controller that management is supported but not configured.
+
+.TP
+\fB--fail=\fR[\fBopen\fR|\fBclosed\fR]
+The controller is, ordinarily, responsible for setting up all flows on
+the OpenFlow switch. Thus, if the connection to the controller fails,
+no new network connections can be set up. If the connection to the
+controller stays down long enough, no packets can pass through the
+switch at all.
+
+If this option is set to \fBopen\fR (the default), \fBsecchan\fR will
+take over responsibility for setting up flows in the local datapath
+when no message has been received from the controller for three times
+the inactivity probe interval (see below), or 45 seconds by default.
+In this ``fail open'' mode, \fBsecchan\fR causes the datapath to act
+like an ordinary MAC-learning switch. \fBsecchan\fR will continue to
+retry connection to the controller in the background and, when the
+connection succeeds, it discontinues its fail-open behavior.
+
+If this option is set to \fBclosed\fR, then \fBsecchan\fR will not
+set up flows on its own when the controller connection fails.
+
+.TP
+\fB--inactivity-probe=\fIsecs\fR
+When the secure channel is connected to the controller, the secure
+channel waits for a message to be received from the controller for
+\fIsecs\fR seconds before it sends an inactivity probe to the
+controller. After sending the inactivity probe, if no response is
+received for an additional \fIsecs\fR seconds, the secure channel
+assumes that the connection has been broken and attempts to reconnect.
+The default is 15 seconds, and the minimum value is 5 seconds.
+
+When fail-open mode is configured, changing the inactivity probe
+interval also changes the interval before entering fail-open mode (see
+above).
+
+.TP
+\fB--max-idle=\fIsecs\fR|\fBpermanent\fR
+Sets \fIsecs\fR as the number of seconds that a flow set up by the
+secure channel will remain in the switch's flow table without any
+matching packets being seen. If \fBpermanent\fR is specified, which
+is not recommended, flows set up by the secure channel will never
+expire. The default is 15 seconds.
+
+Most flows are set up by the OpenFlow controller, not by the secure
+channel. This option affects only the following flows, which the
+secure channel sets up itself:
+
+.RS
+.IP \(bu
+When \fB--fail=open\fR is specified, flows set up when the secure
+channel has not been able to contact the controller for the configured
+fail-open delay.
+
+.IP \(bu
+When in-band control is in use, flows set up to bootstrap contacting
+the controller (see \fBContacting the Controller\fR, above, for
+more information about in-band control).
+.RE
+
+.IP
+As a result, when both \fB--fail=closed\fR and \fB--out-of-band\fR are
+specified, this option has no effect.
+
+.TP
+\fB--max-backoff=\fIsecs\fR
+Sets the maximum time between attempts to connect to the controller to
+\fIsecs\fR, which must be at least 1. The actual interval between
+connection attempts starts at 1 second and doubles on each failing
+attempt until it reaches the maximum. The default maximum backoff
+time is 15 seconds.
+
+.TP
+\fB-l\fR, \fB--listen=\fImethod\fR
+Configures the switch to additionally listen for incoming OpenFlow
+connections for switch management with \fBovs\-ofctl\fR. The \fImethod\fR
+must be given as one of the passive OpenFlow connection methods listed
+below. This option may be specified multiple times to listen to
+multiple connection methods.
+
+.RS
+.TP
+\fBpssl:\fR[\fIport\fR]
+Listens for SSL connections on \fIport\fR (default: 6633). The
+\fB--private-key\fR, \fB--certificate\fR, and \fB--ca-cert\fR options
+are mandatory when this form is used.
+
+.TP
+\fBptcp:\fR[\fIport\fR]
+Listens for TCP connections on \fIport\fR (default: 6633).
+
+.TP
+\fBpunix:\fIfile\fR
+Listens for connections on Unix domain server socket named \fIfile\fR.
+.RE
+
+.TP
+\fB--snoop=\fImethod\fR
+Configures the switch to additionally listen for incoming OpenFlow
+connections for controller connection snooping. The \fImethod\fR must
+be given as one of the passive OpenFlow connection methods listed
+under the \fB--listen\fR option above. This option may be specified
+multiple times to listen to multiple connection methods.
+
+If \fBovs\-ofctl monitor\fR is used to connect to \fImethod\fR specified on
+\fB--snoop\fR, it will display all the OpenFlow messages traveling
+between the switch and its controller on the primary OpenFlow
+connection. This can be useful for debugging switch and controller
+problems.
+
+.TP
+\fB--in-band\fR, \fB--out-of-band\fR
+Configures \fBsecchan\fR to operate in in-band or out-of-band control
+mode (see \fBContacting the Controller\fR above). When neither option
+is given, the default is in-band control.
+
+.TP
+\fB--netflow=\fIhost\fB:\fIport\fR
+Configures the given UDP \fIport\fR on the specified IP \fIhost\fR as
+a recipient of NetFlow messages for expired flows.
+
+This option may be specified multiple times to configure additional
+NetFlow collectors.
+
+.SS "Rate-Limiting Options"
+
+These options configure how the switch applies a ``token bucket'' to
+limit the rate at which packets in unknown flows are forwarded to an
+OpenFlow controller for flow-setup processing. This feature prevents
+a single OpenFlow switch from overwhelming a controller.
+
+.TP
+\fB--rate-limit\fR[\fB=\fIrate\fR]
+.
+Limits the maximum rate at which packets will be forwarded to the
+OpenFlow controller to \fIrate\fR packets per second. If \fIrate\fR
+is not specified then the default of 1,000 packets per second is used.
+
+If \fB--rate-limit\fR is not used, then the switch does not limit the
+rate at which packets are forwarded to the controller.
+
+.TP
+\fB--burst-limit=\fIburst\fR
+.
+Sets the maximum number of unused packet credits that the switch will
+allow to accumulate during time in which no packets are being
+forwarded to the OpenFlow controller to \fIburst\fR (measured in
+packets). The default \fIburst\fR is one-quarter of the \fIrate\fR
+specified on \fB--rate-limit\fR.
+
+This option takes effect only when \fB--rate-limit\fR is also specified.
+
+.SS "Remote Command Execution Options"
+
+.TP
+\fB--command-acl=\fR[\fB!\fR]\fIglob\fR[\fB,\fR[\fB!\fR]\fIglob\fR...]
+Configures the commands that remote OpenFlow connections are allowed
+to invoke using (e.g.) \fBovs\-ofctl execute\fR. The argument is a
+comma-separated sequence of shell glob patterns. A glob pattern
+specified without a leading \fB!\fR is a ``whitelist'' that specifies
+a set of commands that may be invoked, whereas a pattern that
+does begin with \fB!\fR is a ``blacklist'' that specifies commands
+that may not be invoked. To be permitted, a command name must be
+whitelisted and must not be blacklisted;
+e.g. \fB--command-acl=up*,!upgrade\fR would allow any command whose name
+begins with \fBup\fR except for the command named \fBupgrade\fR.
+Command names that include characters other than upper- and lower-case
+English letters, digits, and the underscore and hyphen characters are
+unconditionally disallowed.
+
+When the whitelist and blacklist permit a command name, \fBsecchan\fR
+looks for a program with the same name as the command in the commands
+directory (see below). Other directories are not searched.
+
+.TP
+\fB--command-dir=\fIdirectory\fR
+Sets the directory searched for remote command execution to
+\fIdirectory\fR. The default directory is
+\fB@pkgdatadir@/commands\fR.
+
+.SS "Daemon Options"
+.so lib/daemon.man
+
+.SS "Public Key Infrastructure Options"
+
+.TP
+\fB-p\fR, \fB--private-key=\fIprivkey.pem\fR
+Specifies a PEM file containing the private key used as the switch's
+identity for SSL connections to the controller.
+
+.TP
+\fB-c\fR, \fB--certificate=\fIcert.pem\fR
+Specifies a PEM file containing a certificate, signed by the
+controller's certificate authority (CA), that certifies the switch's
+private key to identify a trustworthy switch.
+
+.TP
+\fB-C\fR, \fB--ca-cert=\fIcacert.pem\fR
+Specifies a PEM file containing the CA certificate used to verify that
+the switch is connected to a trustworthy controller.
+
+.TP
+\fB--bootstrap-ca-cert=\fIcacert.pem\fR
+When \fIcacert.pem\fR exists, this option has the same effect as
+\fB-C\fR or \fB--ca-cert\fR. If it does not exist, then \fBsecchan\fR
+will attempt to obtain the CA certificate from the controller on its
+first SSL connection and save it to the named PEM file. If it is
+successful, it will immediately drop the connection and reconnect, and
+from then on all SSL connections must be authenticated by a
+certificate signed by the CA certificate thus obtained.
+
+\fBThis option exposes the SSL connection to a man-in-the-middle
+attack when obtaining the initial CA certificate\fR, but it may be useful
+for bootstrapping.
+
+This option is only useful if the controller sends its CA certificate
+as part of the SSL certificate chain. The SSL protocol does not
+require the controller to send the CA certificate, but
+\fBcontroller\fR(8) can be configured to do so with the
+\fB--peer-ca-cert\fR option.
+
+.SS "Logging Options"
+.so lib/vlog.man
+.SS "Other Options"
+.so lib/common.man
+.so lib/leak-checker.man
+
+.SH "SEE ALSO"
+
+.BR ovs\-appctl (8),
+.BR ovs\-controller (8),
+.BR ovs\-discover (8),
+.BR ovs\-dpctl (8),
+.BR ovs\-ofctl (8),
+.BR ovs\-pki (8),
+.BR ovs\-vswitchd.conf (5)
diff --git a/secchan/status.c b/secchan/status.c
new file mode 100644
index 000000000..cf67d51c4
--- /dev/null
+++ b/secchan/status.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "status.h"
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "dynamic-string.h"
+#include "list.h"
+#include "ofpbuf.h"
+#include "ofproto.h"
+#include "openflow/nicira-ext.h"
+#include "rconn.h"
+#include "svec.h"
+#include "timeval.h"
+#include "vconn.h"
+
+#define THIS_MODULE VLM_status
+#include "vlog.h"
+
+/* One registered status category: a named callback that contributes lines
+ * to status replies. */
+struct status_category {
+ struct list node; /* Link in the owning switch_status's 'categories' list. */
+ char *name; /* Category name; private copy made at registration. */
+ void (*cb)(struct status_reply *, void *aux); /* Emits this category's status. */
+ void *aux; /* Opaque pointer handed to 'cb'. */
+};
+
+/* Status-reporting state for one switch. */
+struct switch_status {
+ time_t booted; /* time_now() at the moment switch_status_create() ran. */
+ struct status_category *config_cat; /* Built-in "config" category. */
+ struct status_category *switch_cat; /* Built-in "switch" category. */
+ struct list categories; /* List of registered status_category nodes. */
+};
+
+/* Working state used while answering a single status request. */
+struct status_reply {
+ struct status_category *category; /* Category whose callback is running. */
+ struct ds request; /* Request text; points into the request message. */
+ struct ds output; /* Reply text accumulated so far. */
+};
+
+/* Answers the status request in 'request'.  Runs the callback of every
+ * category in 'ss' whose name agrees with the request text over the length
+ * of the shorter of the two, then sends the collected output over 'rconn' as
+ * an NXT_STATUS_REPLY that reuses the request's transaction ID.
+ *
+ * A send failure other than EAGAIN is logged.  Always returns 0. */
+int
+switch_status_handle_request(struct switch_status *ss, struct rconn *rconn,
+                             struct nicira_header *request)
+{
+    struct status_category *cat;
+    struct nicira_header *reply;
+    struct status_reply sr;
+    struct ofpbuf *buf;
+    int error;
+
+    /* The request text is whatever follows the Nicira header. */
+    sr.request.string = (void *) (request + 1);
+    sr.request.length = ntohs(request->header.length) - sizeof *request;
+    ds_init(&sr.output);
+
+    LIST_FOR_EACH (cat, struct status_category, node, &ss->categories) {
+        /* Compare only the bytes that both strings have, so that either one
+         * may be a prefix of the other. */
+        size_t n = MIN(strlen(cat->name), sr.request.length);
+
+        if (memcmp(cat->name, sr.request.string, n) == 0) {
+            sr.category = cat;
+            cat->cb(&sr, cat->aux);
+        }
+    }
+
+    reply = make_openflow_xid(sizeof *reply + sr.output.length,
+                              OFPT_VENDOR, request->header.xid, &buf);
+    reply->vendor = htonl(NX_VENDOR_ID);
+    reply->subtype = htonl(NXT_STATUS_REPLY);
+    memcpy(reply + 1, sr.output.string, sr.output.length);
+
+    error = rconn_send(rconn, buf, NULL);
+    if (error != 0 && error != EAGAIN) {
+        VLOG_WARN("send failed (%s)", strerror(error));
+    }
+
+    ds_destroy(&sr.output);
+    return 0;
+}
+
+/* Status callback that reports on the rconn passed as 'rconn_': its name,
+ * state, backoff, connectivity, message and connection counters, and several
+ * elapsed-time values.  Each item is emitted with its own status_reply_put()
+ * call. */
+void
+rconn_status_cb(struct status_reply *sr, void *rconn_)
+{
+ struct rconn *rconn = rconn_;
+ time_t now = time_now();
+
+ status_reply_put(sr, "name=%s", rconn_get_name(rconn));
+ status_reply_put(sr, "state=%s", rconn_get_state(rconn));
+ status_reply_put(sr, "backoff=%d", rconn_get_backoff(rconn));
+ status_reply_put(sr, "is-connected=%s",
+ rconn_is_connected(rconn) ? "true" : "false");
+ status_reply_put(sr, "sent-msgs=%u", rconn_packets_sent(rconn));
+ status_reply_put(sr, "received-msgs=%u", rconn_packets_received(rconn));
+ status_reply_put(sr, "attempted-connections=%u",
+ rconn_get_attempted_connections(rconn));
+ status_reply_put(sr, "successful-connections=%u",
+ rconn_get_successful_connections(rconn));
+ /* Reported as 'now' minus the value from rconn_get_last_connection(). */
+ status_reply_put(sr, "last-connection=%ld",
+ (long int) (now - rconn_get_last_connection(rconn)));
+ status_reply_put(sr, "time-connected=%lu",
+ rconn_get_total_time_connected(rconn));
+ status_reply_put(sr, "state-elapsed=%u", rconn_get_state_elapsed(rconn));
+}
+
+/* Status callback for the built-in "config" category.  'ofproto_' is the
+ * ofproto to report on.  Emits one "management<i>" item per listener, then
+ * "probe-interval" and "max-backoff", each only when its value is nonzero. */
+static void
+config_status_cb(struct status_reply *sr, void *ofproto_)
+{
+    const struct ofproto *ofproto = ofproto_;
+    struct svec mgmt;
+    int interval;
+    int backoff;
+    size_t idx;
+
+    svec_init(&mgmt);
+    ofproto_get_listeners(ofproto, &mgmt);
+    idx = 0;
+    while (idx < mgmt.n) {
+        status_reply_put(sr, "management%zu=%s", idx, mgmt.names[idx]);
+        idx++;
+    }
+    svec_destroy(&mgmt);
+
+    interval = ofproto_get_probe_interval(ofproto);
+    if (interval != 0) {
+        status_reply_put(sr, "probe-interval=%d", interval);
+    }
+
+    backoff = ofproto_get_max_backoff(ofproto);
+    if (backoff != 0) {
+        status_reply_put(sr, "max-backoff=%d", backoff);
+    }
+}
+
+/* Status callback for the built-in "switch" category.  Reports the current
+ * time, the time elapsed since the switch_status passed as 'ss_' was created,
+ * and this process's ID. */
+static void
+switch_status_cb(struct status_reply *sr, void *ss_)
+{
+    struct switch_status *status = ss_;
+    time_t current = time_now();
+    long int uptime = (long int) (current - status->booted);
+    long int pid = (long int) getpid();
+
+    status_reply_put(sr, "now=%ld", (long int) current);
+    status_reply_put(sr, "uptime=%ld", uptime);
+    status_reply_put(sr, "pid=%ld", pid);
+}
+
+/* Allocates a switch_status, records the creation time, and registers the
+ * built-in "config" and "switch" categories.  'ofproto' is handed to the
+ * "config" callback as its auxiliary data.  Returns the new object. */
+struct switch_status *
+switch_status_create(const struct ofproto *ofproto)
+{
+    struct switch_status *status;
+
+    status = xcalloc(1, sizeof *status);
+    list_init(&status->categories);
+    status->booted = time_now();
+
+    /* The cast drops 'const' because callbacks receive a plain 'void *'. */
+    status->config_cat = switch_status_register(status, "config",
+                                                config_status_cb,
+                                                (void *) ofproto);
+    status->switch_cat = switch_status_register(status, "switch",
+                                                switch_status_cb, status);
+    return status;
+}
+
+/* Frees 'ss' along with its two built-in categories.  Does nothing if 'ss'
+ * is null.  Categories registered by other code are not freed here. */
+void
+switch_status_destroy(struct switch_status *ss)
+{
+    struct status_category *cat, *next_cat;
+
+    if (!ss) {
+        return;
+    }
+
+    /* Turn every remaining category into a self-contained list node, so
+     * that a later switch_status_unregister() on it does not touch the
+     * memory being freed below. */
+    LIST_FOR_EACH_SAFE (cat, next_cat,
+                        struct status_category, node, &ss->categories) {
+        list_init(&cat->node);
+    }
+
+    switch_status_unregister(ss->config_cat);
+    switch_status_unregister(ss->switch_cat);
+    free(ss);
+}
+
+/* Adds a new status category named 'category' to 'ss'.  'cb' will be invoked
+ * with 'aux' when a request selects the category.  The name is copied.
+ * Returns the new category, to be released with switch_status_unregister(). */
+struct status_category *
+switch_status_register(struct switch_status *ss,
+                       const char *category,
+                       status_cb_func *cb, void *aux)
+{
+    struct status_category *entry;
+
+    entry = xmalloc(sizeof *entry);
+    entry->name = xstrdup(category);
+    entry->aux = aux;
+    entry->cb = cb;
+    list_push_back(&ss->categories, &entry->node);
+    return entry;
+}
+
+/* Removes category 'c' from whatever list it is still linked into and frees
+ * it together with its name.  Does nothing if 'c' is null. */
+void
+switch_status_unregister(struct status_category *c)
+{
+    if (!c) {
+        return;
+    }
+
+    /* A node that was orphaned with list_init() reads as an empty list and
+     * must not be unlinked again. */
+    if (!list_is_empty(&c->node)) {
+        list_remove(&c->node);
+    }
+    free(c->name);
+    free(c);
+}
+
+/* Formats 'content' printf-style, prefixes it with the current category name
+ * and a period, and appends it to the reply in 'sr' as one line (a newline
+ * is added if the text does not already end in one).  The line is kept only
+ * if it begins with the request text; otherwise it is discarded again. */
+void
+status_reply_put(struct status_reply *sr, const char *content, ...)
+{
+    struct ds *out = &sr->output;
+    const size_t start = out->length;
+    va_list ap;
+    int wanted;
+
+    /* Build "<category>.<formatted content>\n" at the end of the output. */
+    ds_put_format(out, "%s.", sr->category->name);
+    va_start(ap, content);
+    ds_put_format_valist(out, content, ap);
+    va_end(ap);
+    if (ds_last(out) != '\n') {
+        ds_put_char(out, '\n');
+    }
+
+    /* Keep the new line only when the request is a prefix of it. */
+    wanted = (out->length - start >= sr->request.length
+              && !memcmp(&out->string[start],
+                         sr->request.string, sr->request.length));
+    if (!wanted) {
+        ds_truncate(out, start);
+    }
+}
diff --git a/secchan/status.h b/secchan/status.h
new file mode 100644
index 000000000..e7b730681
--- /dev/null
+++ b/secchan/status.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef STATUS_H
+#define STATUS_H 1
+
+#include "compiler.h"
+
+struct nicira_header;
+struct rconn;
+struct secchan;
+struct ofproto;
+struct status_reply;
+
+/* Creation and destruction of the status-reporting state for a switch. */
+struct switch_status *switch_status_create(const struct ofproto *);
+void switch_status_destroy(struct switch_status *);
+
+/* Answers a status request by sending a reply on the given rconn. */
+int switch_status_handle_request(struct switch_status *, struct rconn *,
+ struct nicira_header *);
+
+/* A status category is a named callback.  The callback reports its items by
+ * calling status_reply_put() on the status_reply it is given. */
+typedef void status_cb_func(struct status_reply *, void *aux);
+struct status_category *switch_status_register(struct switch_status *,
+ const char *category,
+ status_cb_func *, void *aux);
+void switch_status_unregister(struct status_category *);
+
+/* Appends one printf-formatted item to a status reply. */
+void status_reply_put(struct status_reply *, const char *, ...)
+ PRINTF_FORMAT(2, 3);
+
+/* Ready-made status_cb_func that reports on the rconn passed as 'aux'. */
+void rconn_status_cb(struct status_reply *, void *rconn_);
+
+#endif /* status.h */
diff --git a/soexpand.pl b/soexpand.pl
new file mode 100755
index 000000000..4e1300561
--- /dev/null
+++ b/soexpand.pl
@@ -0,0 +1,26 @@
+# Copies standard input to standard output, replacing every roff
+# ".so NAME" line with the contents of the first readable file NAME found in
+# the directories given with -I/--include (default ".") and then in ".".
+# A ".so" line whose file cannot be found is reported on stderr, passed
+# through unchanged, and makes the script exit with status 1.
+use strict;
+use warnings;
+use Getopt::Long;
+
+my ($exit_code) = 0;
+my (@include_dirs);
+Getopt::Long::Configure ("bundling");
+GetOptions("I|include=s" => \@include_dirs) or exit(1);
+@include_dirs = ('.') if !@include_dirs;
+OUTER: while (<STDIN>) {
+    if (my ($name) = /^\.so (\S+)$/) {
+        foreach my $dir (@include_dirs, '.') {
+            # Three-argument open with a lexical handle: the path is always
+            # treated as a plain file name, never as a mode or pipe
+            # specification, and the handle cannot clash with a global.
+            if (open(my $inner, '<', "$dir/$name")) {
+                while (my $line = <$inner>) {
+                    print $line;
+                }
+                close($inner);
+                next OUTER;
+            }
+        }
+        print STDERR "$name not found in: ", join(' ', @include_dirs), "\n";
+        $exit_code = 1;
+    }
+    print $_;
+}
+exit $exit_code;
diff --git a/tests/.gitignore b/tests/.gitignore
new file mode 100644
index 000000000..111250351
--- /dev/null
+++ b/tests/.gitignore
@@ -0,0 +1,10 @@
+/Makefile
+/Makefile.in
+/test-classifier
+/test-dhcp-client
+/test-flows
+/test-hash
+/test-hmap
+/test-list
+/test-stp
+/test-type-props
diff --git a/tests/automake.mk b/tests/automake.mk
new file mode 100644
index 000000000..0508144f0
--- /dev/null
+++ b/tests/automake.mk
@@ -0,0 +1,56 @@
+TESTS += tests/test-classifier
+noinst_PROGRAMS += tests/test-classifier
+tests_test_classifier_SOURCES = tests/test-classifier.c
+tests_test_classifier_LDADD = lib/libopenvswitch.a
+
+TESTS += tests/test-flows.sh
+noinst_PROGRAMS += tests/test-flows
+tests_test_flows_SOURCES = tests/test-flows.c
+tests_test_flows_LDADD = lib/libopenvswitch.a
+dist_check_SCRIPTS = tests/test-flows.sh tests/flowgen.pl
+
+TESTS += tests/test-hash
+noinst_PROGRAMS += tests/test-hash
+tests_test_hash_SOURCES = tests/test-hash.c
+tests_test_hash_LDADD = lib/libopenvswitch.a
+
+TESTS += tests/test-hmap
+noinst_PROGRAMS += tests/test-hmap
+tests_test_hmap_SOURCES = tests/test-hmap.c
+tests_test_hmap_LDADD = lib/libopenvswitch.a
+
+TESTS += tests/test-list
+noinst_PROGRAMS += tests/test-list
+tests_test_list_SOURCES = tests/test-list.c
+tests_test_list_LDADD = lib/libopenvswitch.a
+
+TESTS += tests/test-type-props
+noinst_PROGRAMS += tests/test-type-props
+tests_test_type_props_SOURCES = tests/test-type-props.c
+
+noinst_PROGRAMS += tests/test-dhcp-client
+tests_test_dhcp_client_SOURCES = tests/test-dhcp-client.c
+tests_test_dhcp_client_LDADD = lib/libopenvswitch.a $(FAULT_LIBS)
+
+TESTS += tests/test-stp.sh
+EXTRA_DIST += tests/test-stp.sh
+noinst_PROGRAMS += tests/test-stp
+
+tests_test_stp_SOURCES = tests/test-stp.c
+tests_test_stp_LDADD = lib/libopenvswitch.a
+stp_files = \
+ tests/test-stp-ieee802.1d-1998 \
+ tests/test-stp-ieee802.1d-2004-fig17.4 \
+ tests/test-stp-ieee802.1d-2004-fig17.6 \
+ tests/test-stp-ieee802.1d-2004-fig17.7 \
+ tests/test-stp-iol-op-1.1 \
+ tests/test-stp-iol-op-1.4 \
+ tests/test-stp-iol-op-3.1 \
+ tests/test-stp-iol-op-3.3 \
+ tests/test-stp-iol-io-1.1 \
+ tests/test-stp-iol-io-1.2 \
+ tests/test-stp-iol-io-1.4 \
+ tests/test-stp-iol-io-1.5
+TESTS_ENVIRONMENT += stp_files='$(stp_files)'
+
+EXTRA_DIST += $(stp_files)
diff --git a/tests/flowgen.pl b/tests/flowgen.pl
new file mode 100755
index 000000000..6325f1fea
--- /dev/null
+++ b/tests/flowgen.pl
@@ -0,0 +1,224 @@
+#! /usr/bin/perl
+
+use strict;
+use warnings;
+
+# Flow records go to file descriptor 3 and the pcap capture to file
+# descriptor 4.  NOTE(review): both descriptors are presumably opened by the
+# invoking test script; the "or die" checks were left disabled by the author.
+open(FLOWS, ">&=3");# or die "failed to open fd 3 for writing: $!\n";
+open(PACKETS, ">&=4");# or die "failed to open fd 4 for writing: $!\n";
+
+# Print pcap file header.
+print PACKETS pack('NnnNNNN',
+                   0xa1b2c3d4,  # magic number
+                   2,           # major version
+                   4,           # minor version
+                   0,           # time zone offset
+                   0,           # time stamp accuracy
+                   1518,        # snaplen
+                   1);          # Ethernet
+
+# The plain 802.2 case takes no further attributes.
+output(DL_HEADER => '802.2');
+
+# Enumerate every combination of the remaining attributes.  The qw() lists
+# are wrapped in parentheses because "for my $x qw(...)" without them is a
+# syntax error on Perl 5.18 and later.
+for my $dl_header (qw(802.2+SNAP Ethernet)) {
+    my %a = (DL_HEADER => $dl_header);
+    for my $dl_vlan (qw(none zero nonzero)) {
+        my %b = (%a, DL_VLAN => $dl_vlan);
+
+        # Non-IP case.
+        output(%b, DL_TYPE => 'non-ip');
+
+        for my $ip_options (qw(no yes)) {
+            my %c = (%b, DL_TYPE => 'ip', IP_OPTIONS => $ip_options);
+            for my $ip_fragment (qw(no first middle last)) {
+                my %d = (%c, IP_FRAGMENT => $ip_fragment);
+                for my $tp_proto (qw(TCP TCP+options UDP ICMP other)) {
+                    output(%d, TP_PROTO => $tp_proto);
+                }
+            }
+        }
+    }
+}
+
+# Emits one test case described by %attrs (DL_HEADER, and optionally DL_VLAN,
+# DL_TYPE, IP_OPTIONS, IP_FRAGMENT and TP_PROTO).  Builds the expected flow
+# fields and a matching packet, prints a readable summary to stdout, writes
+# the flow as a packed binary record to FLOWS, and writes the packet as a
+# pcap record to PACKETS.  Dies on an unrecognized DL_TYPE or TP_PROTO.
+sub output {
+ my (%attrs) = @_;
+
+ # Compose flow.
+ my (%flow);
+ $flow{DL_SRC} = "00:02:e3:0f:80:a4";
+ $flow{DL_DST} = "00:1a:92:40:ac:05";
+ $flow{NW_PROTO} = 0;
+ $flow{NW_SRC} = '0.0.0.0';
+ $flow{NW_DST} = '0.0.0.0';
+ $flow{TP_SRC} = 0;
+ $flow{TP_DST} = 0;
+ if (defined($attrs{DL_VLAN})) {
+ my (%vlan_map) = ('none' => 0xffff,
+ 'zero' => 0,
+ 'nonzero' => 0x0123);
+ $flow{DL_VLAN} = $vlan_map{$attrs{DL_VLAN}};
+ } else {
+ $flow{DL_VLAN} = 0xffff; # OFP_VLAN_NONE
+ }
+ if ($attrs{DL_HEADER} eq '802.2') {
+ $flow{DL_TYPE} = 0x5ff; # OFP_DL_TYPE_NOT_ETH_TYPE
+ } elsif ($attrs{DL_TYPE} eq 'ip') {
+ $flow{DL_TYPE} = 0x0800; # ETH_TYPE_IP
+ $flow{NW_SRC} = '10.0.2.15';
+ $flow{NW_DST} = '192.168.1.20';
+ if ($attrs{TP_PROTO} eq 'other') {
+ $flow{NW_PROTO} = 42;
+ } elsif ($attrs{TP_PROTO} eq 'TCP' ||
+ $attrs{TP_PROTO} eq 'TCP+options') {
+ $flow{NW_PROTO} = 6; # IP_TYPE_TCP
+ $flow{TP_SRC} = 6667;
+ $flow{TP_DST} = 9998;
+ } elsif ($attrs{TP_PROTO} eq 'UDP') {
+ $flow{NW_PROTO} = 17; # IP_TYPE_UDP
+ $flow{TP_SRC} = 1112;
+ $flow{TP_DST} = 2223;
+ } elsif ($attrs{TP_PROTO} eq 'ICMP') {
+ $flow{NW_PROTO} = 1; # IP_TYPE_ICMP
+ $flow{TP_SRC} = 8; # echo request
+ $flow{TP_DST} = 0; # code
+ } else {
+ die;
+ }
+ # Every fragment case zeroes the expected transport ports.
+ if ($attrs{IP_FRAGMENT} ne 'no') {
+ $flow{TP_SRC} = $flow{TP_DST} = 0;
+ }
+ } elsif ($attrs{DL_TYPE} eq 'non-ip') {
+ $flow{DL_TYPE} = 0x5678;
+ } else {
+ die;
+ }
+
+ # Compose packet.
+ my $packet = '';
+ $packet .= pack_ethaddr($flow{DL_DST});
+ $packet .= pack_ethaddr($flow{DL_SRC});
+ # 802.2 frames carry a length field here; reserve it now and fill it in
+ # once the whole packet has been built.
+ $packet .= pack('n', 0) if $attrs{DL_HEADER} =~ /^802.2/;
+ if ($attrs{DL_HEADER} eq '802.2') {
+ $packet .= pack('CCC', 0x42, 0x42, 0x03); # LLC for 802.1D STP.
+ } else {
+ if ($attrs{DL_HEADER} eq '802.2+SNAP') {
+ $packet .= pack('CCC', 0xaa, 0xaa, 0x03); # LLC for SNAP.
+ $packet .= pack('CCC', 0, 0, 0); # SNAP OUI.
+ }
+ if ($attrs{DL_VLAN} ne 'none') {
+ $packet .= pack('nn', 0x8100, $flow{DL_VLAN});
+ }
+ $packet .= pack('n', $flow{DL_TYPE});
+ if ($attrs{DL_TYPE} eq 'ip') {
+ my $ip = pack('CCnnnCCnNN',
+ (4 << 4) | 5, # version, hdrlen
+ 0, # type of service
+ 0, # total length (filled in later)
+ 65432, # id
+ 0, # frag offset
+ 64, # ttl
+ $flow{NW_PROTO}, # protocol
+ 0, # checksum
+ 0x0a00020f, # source
+ 0xc0a80114); # dest
+ if ($attrs{IP_OPTIONS} eq 'yes') {
+ # Header length becomes 8 words to cover the 12 bytes appended
+ # below (11 option bytes plus one pad byte).
+ substr($ip, 0, 1) = pack('C', (4 << 4) | 8);
+ $ip .= pack('CCnnnCCCx',
+ 130, # type
+ 11, # length
+ 0x6bc5, # top secret
+ 0xabcd,
+ 0x1234,
+ 1,
+ 2,
+ 3);
+ }
+ if ($attrs{IP_FRAGMENT} ne 'no') {
+ my (%frag_map) = ('first' => 0x2000, # more frags, ofs 0
+ 'middle' => 0x2111, # more frags, ofs 0x888
+ 'last' => 0x0222); # last frag, ofs 0x1110
+ substr($ip, 6, 2)
+ = pack('n', $frag_map{$attrs{IP_FRAGMENT}});
+ }
+
+ if ($attrs{TP_PROTO} =~ '^TCP') {
+ my $tcp = pack('nnNNnnnn',
+ $flow{TP_SRC}, # source port
+ $flow{TP_DST}, # dest port
+ 87123455, # seqno
+ 712378912, # ackno
+ (5 << 12) | 0x02 | 0x10, # hdrlen, SYN, ACK
+ 5823, # window size
+ 18923, # checksum
+ 12893); # urgent pointer
+ if ($attrs{TP_PROTO} eq 'TCP+options') {
+ substr($tcp, 12, 2) = pack('n', (6 << 12) | 0x02 | 0x10);
+ $tcp .= pack('CCn', 2, 4, 1975); # MSS option
+ }
+ $tcp .= 'payload';
+ $ip .= $tcp;
+ } elsif ($attrs{TP_PROTO} eq 'UDP') {
+ my $len = 15;
+ my $udp = pack('nnnn', $flow{TP_SRC}, $flow{TP_DST}, $len, 0);
+ $udp .= chr($len) while length($udp) < $len;
+ $ip .= $udp;
+ } elsif ($attrs{TP_PROTO} eq 'ICMP') {
+ $ip .= pack('CCnnn',
+ 8, # echo request
+ 0, # code
+ 0, # checksum
+ 736, # identifier
+ 931); # sequence number
+ } elsif ($attrs{TP_PROTO} eq 'other') {
+ $ip .= 'other header';
+ } else {
+ die;
+ }
+
+ substr($ip, 2, 2) = pack('n', length($ip));
+ $packet .= $ip;
+ }
+ }
+ substr($packet, 12, 2) = pack('n', length($packet))
+ if $attrs{DL_HEADER} =~ /^802.2/;
+
+ # Readable summary on stdout; keys() returns the fields in no particular
+ # order.
+ print join(' ', map("$_=$attrs{$_}", keys(%attrs))), "\n";
+ print join(' ', map("$_=$flow{$_}", keys(%flow))), "\n";
+ print "\n";
+
+ # Binary flow record.
+ print FLOWS pack('Nn',
+ 0, # wildcards
+ 1); # in_port
+ print FLOWS pack_ethaddr($flow{DL_SRC});
+ print FLOWS pack_ethaddr($flow{DL_DST});
+ print FLOWS pack('nnCxNNnn',
+ $flow{DL_VLAN},
+ $flow{DL_TYPE},
+ $flow{NW_PROTO},
+ inet_aton($flow{NW_SRC}),
+ inet_aton($flow{NW_DST}),
+ $flow{TP_SRC},
+ $flow{TP_DST});
+
+ # pcap per-packet header followed by the packet itself.
+ print PACKETS pack('NNNN',
+ 0, # timestamp seconds
+ 0, # timestamp microseconds
+ length($packet), # bytes saved
+ length($packet)), # total length
+ $packet;
+}
+
+# Converts an Ethernet address written as six colon-separated pairs of hex
+# digits into its 6-byte binary form.  Dies with the offending string if no
+# such address can be found in the argument.
+sub pack_ethaddr {
+    my ($addr) = @_;
+    my $byte = '([0-9a-fA-F][0-9a-fA-F])';
+    my @octets = $addr =~ /$byte:$byte:$byte:$byte:$byte:$byte/;
+    die $addr if @octets != 6;
+    return join('', map { pack('C', hex($_)) } @octets);
+}
+
+# Converts a dotted-quad IPv4 address string into a 32-bit integer with the
+# first number in the most significant byte.  Dies with the offending string
+# if the argument is not four dot-separated decimal numbers.
+sub inet_aton {
+    my ($dotted) = @_;
+    my @quad = $dotted =~ /^(\d+)\.(\d+)\.(\d+)\.(\d+)$/;
+    die $dotted unless @quad == 4;
+    return ($quad[0] << 24) | ($quad[1] << 16) | ($quad[2] << 8) | $quad[3];
+}
diff --git a/tests/test-classifier.c b/tests/test-classifier.c
new file mode 100644
index 000000000..309c4dd60
--- /dev/null
+++ b/tests/test-classifier.c
@@ -0,0 +1,977 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* "White box" tests for classifier.
+ *
+ * With very few exceptions, these tests obtain complete coverage of every
+ * basic block and every branch in the classifier implementation, e.g. a clean
+ * report from "gcov -b". (Covering the exceptions would require finding
+ * collisions in the hash function used for flow data, etc.)
+ *
+ * This test should receive a clean report from "valgrind --leak-check=full":
+ * it frees every heap block that it allocates.
+ */
+
+#include <config.h>
+#include <limits.h>
+#include "classifier.h"
+#include <errno.h>
+#include <limits.h>
+#include "flow.h"
+#include <limits.h>
+#include "packets.h"
+
+#undef NDEBUG
+#include <assert.h>
+
+/* A classifier rule plus a piece of test data.  The enclosing test_rule is
+ * recovered from a cls_rule pointer by test_rule_from_cls_rule(). */
+struct test_rule {
+ int aux; /* Auxiliary data. */
+ struct cls_rule cls_rule; /* Classifier rule data. */
+};
+
+/* Maps 'rule' back to the test_rule that embeds it.  Returns a null pointer
+ * if 'rule' is null. */
+static struct test_rule *
+test_rule_from_cls_rule(const struct cls_rule *rule)
+{
+    if (!rule) {
+        return NULL;
+    }
+    return CONTAINER_OF(rule, struct test_rule, cls_rule);
+}
+
+/* Trivial (linear) classifier: a growable array of heap-allocated rules
+ * that is searched front to back. */
+struct tcls {
+ size_t n_rules; /* Number of entries in use in 'rules'. */
+ size_t allocated_rules; /* Number of entries allocated in 'rules'. */
+ struct test_rule **rules; /* The rules, each separately allocated. */
+};
+
+/* Initializes 'tcls' as an empty classifier with no storage allocated. */
+static void
+tcls_init(struct tcls *tcls)
+{
+    tcls->rules = NULL;
+    tcls->allocated_rules = 0;
+    tcls->n_rules = 0;
+}
+
+/* Frees every rule held by 'tcls' and the rule array itself, but not 'tcls'.
+ * Does nothing if 'tcls' is null. */
+static void
+tcls_destroy(struct tcls *tcls)
+{
+    size_t idx;
+
+    if (!tcls) {
+        return;
+    }
+
+    idx = 0;
+    while (idx < tcls->n_rules) {
+        free(tcls->rules[idx]);
+        idx++;
+    }
+    free(tcls->rules);
+}
+
+/* Returns the number of rules in 'tcls' that have no wildcard bits set. */
+static int
+tcls_count_exact(const struct tcls *tcls)
+{
+    int count = 0;
+    size_t idx;
+
+    for (idx = 0; idx < tcls->n_rules; idx++) {
+        if (!tcls->rules[idx]->cls_rule.wc.wildcards) {
+            count++;
+        }
+    }
+    return count;
+}
+
+/* Returns true if 'tcls' holds no rules. */
+static bool
+tcls_is_empty(const struct tcls *tcls)
+{
+    return !tcls->n_rules;
+}
+
+/* Inserts a copy of 'rule' into 'tcls' and returns the copy.
+ *
+ * The scan stops at the first existing rule whose priority is less than or
+ * equal to the new rule's, and the copy is inserted at that position.  If,
+ * before the scan stops, it finds a rule with the same priority, wildcards
+ * and flow, that rule is freed and replaced in place instead.
+ *
+ * A rule without wildcards must have priority UINT_MAX. */
+static struct test_rule *
+tcls_insert(struct tcls *tcls, const struct test_rule *rule)
+{
+ size_t i;
+
+ assert(rule->cls_rule.wc.wildcards || rule->cls_rule.priority == UINT_MAX);
+ for (i = 0; i < tcls->n_rules; i++) {
+ const struct cls_rule *pos = &tcls->rules[i]->cls_rule;
+ if (pos->priority == rule->cls_rule.priority
+ && pos->wc.wildcards == rule->cls_rule.wc.wildcards
+ && flow_equal(&pos->flow, &rule->cls_rule.flow)) {
+ /* Exact match.
+ * XXX flow_equal should ignore wildcarded fields */
+ free(tcls->rules[i]);
+ tcls->rules[i] = xmemdup(rule, sizeof *rule);
+ return tcls->rules[i];
+ } else if (pos->priority <= rule->cls_rule.priority) {
+ break;
+ }
+ }
+
+ /* Grow the array if it is full, then open a gap at index 'i'. */
+ if (tcls->n_rules >= tcls->allocated_rules) {
+ tcls->rules = x2nrealloc(tcls->rules, &tcls->allocated_rules,
+ sizeof *tcls->rules);
+ }
+ if (i != tcls->n_rules) {
+ memmove(&tcls->rules[i + 1], &tcls->rules[i],
+ sizeof *tcls->rules * (tcls->n_rules - i));
+ }
+ tcls->rules[i] = xmemdup(rule, sizeof *rule);
+ tcls->n_rules++;
+ return tcls->rules[i];
+}
+
+/* Removes 'rule', which must be one of the rule objects stored in 'cls'
+ * (it is located by pointer identity), frees it, and closes the gap in the
+ * array.  Reaching the end without finding it is treated as impossible. */
+static void
+tcls_remove(struct tcls *cls, const struct test_rule *rule)
+{
+    size_t idx;
+
+    for (idx = 0; idx < cls->n_rules; idx++) {
+        if (cls->rules[idx] != rule) {
+            continue;
+        }
+
+        free(cls->rules[idx]);
+        memmove(&cls->rules[idx], &cls->rules[idx + 1],
+                sizeof *cls->rules * (cls->n_rules - idx - 1));
+        cls->n_rules--;
+        return;
+    }
+    NOT_REACHED();
+}
+
+/* Returns the 32-bit value stored at 'p', which need not be aligned.  The
+ * bytes are copied rather than read through a cast pointer. */
+static uint32_t
+read_uint32(const void *p)
+{
+    uint32_t value;
+
+    memcpy(&value, p, sizeof value);
+    return value;
+}
+
+/* Returns true if flow 'fixed' matches rule 'wild' on every classifier
+ * field, taking the rule's wildcards into account. */
+static bool
+match(const struct cls_rule *wild, const flow_t *fixed)
+{
+ int f_idx;
+
+ for (f_idx = 0; f_idx < CLS_N_FIELDS; f_idx++) {
+ const struct cls_field *f = &cls_fields[f_idx];
+ void *wild_field = (char *) &wild->flow + f->ofs;
+ void *fixed_field = (char *) fixed + f->ofs;
+
+ /* A field matches if all of its wildcard bits are set or if its bytes
+ * are identical in the two flows. */
+ if ((wild->wc.wildcards & f->wildcards) == f->wildcards ||
+ !memcmp(wild_field, fixed_field, f->len)) {
+ /* Definite match. */
+ continue;
+ }
+
+ /* Some but not all of the field's wildcard bits are set.  The field
+ * is then handled as a masked 32-bit address.  NOTE(review): this
+ * presumes only NW_SRC and NW_DST can be partially wildcarded; any
+ * field other than NW_SRC gets the NW_DST shift. */
+ if (wild->wc.wildcards & f->wildcards) {
+ uint32_t test = read_uint32(wild_field);
+ uint32_t ip = read_uint32(fixed_field);
+ int shift = (f_idx == CLS_F_IDX_NW_SRC
+ ? OFPFW_NW_SRC_SHIFT : OFPFW_NW_DST_SHIFT);
+ uint32_t mask = flow_nw_bits_to_mask(wild->wc.wildcards, shift);
+ if (!((test ^ ip) & mask)) {
+ continue;
+ }
+ }
+
+ return false;
+ }
+ return true;
+}
+
+/* Returns the first rule in 'cls' that matches 'flow' and belongs to a kind
+ * selected by 'include' (CLS_INC_WILD for rules with wildcards,
+ * CLS_INC_EXACT for rules without), or a null pointer if there is none. */
+static struct cls_rule *
+tcls_lookup(const struct tcls *cls, const flow_t *flow, int include)
+{
+    size_t idx;
+
+    for (idx = 0; idx < cls->n_rules; idx++) {
+        struct cls_rule *candidate = &cls->rules[idx]->cls_rule;
+        int kind = candidate->wc.wildcards ? CLS_INC_WILD : CLS_INC_EXACT;
+
+        if ((include & kind) && match(candidate, flow)) {
+            return candidate;
+        }
+    }
+    return NULL;
+}
+
+/* Removes from 'cls' every rule of a kind selected by 'include' whose flow
+ * is matched by 'target'. */
+static void
+tcls_delete_matches(struct tcls *cls,
+                    const struct cls_rule *target,
+                    int include)
+{
+    size_t idx = 0;
+
+    while (idx < cls->n_rules) {
+        struct test_rule *candidate = cls->rules[idx];
+        int kind = (candidate->cls_rule.wc.wildcards
+                    ? CLS_INC_WILD : CLS_INC_EXACT);
+
+        if (!(include & kind) || !match(target, &candidate->cls_rule.flow)) {
+            idx++;
+            continue;
+        }
+
+        /* Removal shifts the next rule into slot 'idx', so do not advance. */
+        tcls_remove(cls, candidate);
+    }
+}
+
+/* Host-to-network byte-order conversions written as constant expressions,
+ * so that they can be used in the static array initializers below. */
+#ifdef WORDS_BIGENDIAN
+#define HTONL(VALUE) ((uint32_t) (VALUE))
+/* Cast to 16 bits so that an oversized argument is truncated the same way
+ * as in the little-endian definition, which masks to 16 bits. */
+#define HTONS(VALUE) ((uint16_t) (VALUE))
+#else
+#define HTONL(VALUE) (((((uint32_t) (VALUE)) & 0x000000ff) << 24) | \
+                      ((((uint32_t) (VALUE)) & 0x0000ff00) <<  8) | \
+                      ((((uint32_t) (VALUE)) & 0x00ff0000) >>  8) | \
+                      ((((uint32_t) (VALUE)) & 0xff000000) >> 24))
+#define HTONS(VALUE) (((((uint16_t) (VALUE)) & 0xff00) >> 8) |  \
+                      ((((uint16_t) (VALUE)) & 0x00ff) << 8))
+#endif
+
+/* Two sample values for each flow field.  Multi-byte values are stored
+ * already converted with HTONL()/HTONS(). */
+static uint32_t nw_src_values[] = { HTONL(0xc0a80001),
+ HTONL(0xc0a04455) };
+static uint32_t nw_dst_values[] = { HTONL(0xc0a80002),
+ HTONL(0xc0a04455) };
+static uint16_t in_port_values[] = { HTONS(1), HTONS(OFPP_LOCAL) };
+static uint16_t dl_vlan_values[] = { HTONS(101), HTONS(0) };
+static uint16_t dl_type_values[] = { HTONS(ETH_TYPE_IP), HTONS(ETH_TYPE_ARP) };
+static uint16_t tp_src_values[] = { HTONS(49362), HTONS(80) };
+static uint16_t tp_dst_values[] = { HTONS(6667), HTONS(22) };
+static uint8_t dl_src_values[][6] = { { 0x00, 0x02, 0xe3, 0x0f, 0x80, 0xa4 },
+ { 0x5e, 0x33, 0x7f, 0x5f, 0x1e, 0x99 } };
+static uint8_t dl_dst_values[][6] = { { 0x4a, 0x27, 0x71, 0xae, 0x64, 0xc1 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } };
+static uint8_t nw_proto_values[] = { IP_TYPE_TCP, IP_TYPE_ICMP };
+
+/* values[field index][0 or 1] points to the corresponding sample value
+ * above.  Filled in by init_values(). */
+static void *values[CLS_N_FIELDS][2];
+
+/* Fills in 'values' so that values[F][k] points to the k-th sample value of
+ * the field whose classifier index is F. */
+static void
+init_values(void)
+{
+ values[CLS_F_IDX_IN_PORT][0] = &in_port_values[0];
+ values[CLS_F_IDX_IN_PORT][1] = &in_port_values[1];
+
+ values[CLS_F_IDX_DL_VLAN][0] = &dl_vlan_values[0];
+ values[CLS_F_IDX_DL_VLAN][1] = &dl_vlan_values[1];
+
+ values[CLS_F_IDX_DL_SRC][0] = dl_src_values[0];
+ values[CLS_F_IDX_DL_SRC][1] = dl_src_values[1];
+
+ values[CLS_F_IDX_DL_DST][0] = dl_dst_values[0];
+ values[CLS_F_IDX_DL_DST][1] = dl_dst_values[1];
+
+ values[CLS_F_IDX_DL_TYPE][0] = &dl_type_values[0];
+ values[CLS_F_IDX_DL_TYPE][1] = &dl_type_values[1];
+
+ values[CLS_F_IDX_NW_SRC][0] = &nw_src_values[0];
+ values[CLS_F_IDX_NW_SRC][1] = &nw_src_values[1];
+
+ values[CLS_F_IDX_NW_DST][0] = &nw_dst_values[0];
+ values[CLS_F_IDX_NW_DST][1] = &nw_dst_values[1];
+
+ values[CLS_F_IDX_NW_PROTO][0] = &nw_proto_values[0];
+ values[CLS_F_IDX_NW_PROTO][1] = &nw_proto_values[1];
+
+ values[CLS_F_IDX_TP_SRC][0] = &tp_src_values[0];
+ values[CLS_F_IDX_TP_SRC][1] = &tp_src_values[1];
+
+ values[CLS_F_IDX_TP_DST][0] = &tp_dst_values[0];
+ values[CLS_F_IDX_TP_DST][1] = &tp_dst_values[1];
+}
+
+/* Number of sample values available for each field. */
+#define N_NW_SRC_VALUES ARRAY_SIZE(nw_src_values)
+#define N_NW_DST_VALUES ARRAY_SIZE(nw_dst_values)
+#define N_IN_PORT_VALUES ARRAY_SIZE(in_port_values)
+#define N_DL_VLAN_VALUES ARRAY_SIZE(dl_vlan_values)
+#define N_DL_TYPE_VALUES ARRAY_SIZE(dl_type_values)
+#define N_TP_SRC_VALUES ARRAY_SIZE(tp_src_values)
+#define N_TP_DST_VALUES ARRAY_SIZE(tp_dst_values)
+#define N_DL_SRC_VALUES ARRAY_SIZE(dl_src_values)
+#define N_DL_DST_VALUES ARRAY_SIZE(dl_dst_values)
+#define N_NW_PROTO_VALUES ARRAY_SIZE(nw_proto_values)
+
+/* Total number of distinct flows that can be formed by picking one sample
+ * value for every field. */
+#define N_FLOW_VALUES (N_NW_SRC_VALUES * \
+ N_NW_DST_VALUES * \
+ N_IN_PORT_VALUES * \
+ N_DL_VLAN_VALUES * \
+ N_DL_TYPE_VALUES * \
+ N_TP_SRC_VALUES * \
+ N_TP_DST_VALUES * \
+ N_DL_SRC_VALUES * \
+ N_DL_DST_VALUES * \
+ N_NW_PROTO_VALUES)
+
+/* Extracts the next base-'n_values' digit from '*x': returns '*x' modulo
+ * 'n_values' and replaces '*x' by the quotient. */
+static unsigned int
+get_value(unsigned int *x, unsigned n_values)
+{
+    unsigned int digit = *x % n_values;
+
+    *x = *x / n_values;
+    return digit;
+}
+
+/* Looks up 'flow' in 'cls' with the classifier lookup function that
+ * corresponds to 'include': wildcarded rules only, exact rules only, or
+ * both.  Aborts on any other value of 'include'. */
+static struct cls_rule *
+lookup_with_include_bits(const struct classifier *cls,
+                         const flow_t *flow, int include)
+{
+    if (include == CLS_INC_WILD) {
+        return classifier_lookup_wild(cls, flow);
+    }
+    if (include == CLS_INC_EXACT) {
+        return classifier_lookup_exact(cls, flow);
+    }
+    if (include == (CLS_INC_WILD | CLS_INC_EXACT)) {
+        return classifier_lookup(cls, flow);
+    }
+    abort();
+}
+
+/* Asserts that 'cls' and the reference classifier 'tcls' agree: they hold
+ * the same number of rules and of exact rules, and for every flow that can
+ * be built from the sample values, lookups in both return equivalent
+ * results. */
+static void
+compare_classifiers(struct classifier *cls, struct tcls *tcls)
+{
+ unsigned int i;
+
+ assert(classifier_count(cls) == tcls->n_rules);
+ assert(classifier_count_exact(cls) == tcls_count_exact(tcls));
+ for (i = 0; i < N_FLOW_VALUES; i++) {
+ struct cls_rule *cr0, *cr1;
+ flow_t flow;
+ unsigned int x;
+ int include;
+
+ /* Treat 'i' as a mixed-radix number with one digit per field, each
+ * digit selecting one of that field's sample values. */
+ x = i;
+ flow.nw_src = nw_src_values[get_value(&x, N_NW_SRC_VALUES)];
+ flow.nw_dst = nw_dst_values[get_value(&x, N_NW_DST_VALUES)];
+ flow.in_port = in_port_values[get_value(&x, N_IN_PORT_VALUES)];
+ flow.dl_vlan = dl_vlan_values[get_value(&x, N_DL_VLAN_VALUES)];
+ flow.dl_type = dl_type_values[get_value(&x, N_DL_TYPE_VALUES)];
+ flow.tp_src = tp_src_values[get_value(&x, N_TP_SRC_VALUES)];
+ flow.tp_dst = tp_dst_values[get_value(&x, N_TP_DST_VALUES)];
+ memcpy(flow.dl_src, dl_src_values[get_value(&x, N_DL_SRC_VALUES)],
+ ETH_ADDR_LEN);
+ memcpy(flow.dl_dst, dl_dst_values[get_value(&x, N_DL_DST_VALUES)],
+ ETH_ADDR_LEN);
+ flow.nw_proto = nw_proto_values[get_value(&x, N_NW_PROTO_VALUES)];
+ flow.reserved = 0;
+
+ /* NOTE(review): 1 through 3 presumably cover CLS_INC_WILD,
+ * CLS_INC_EXACT and their union; the constants' values are not
+ * visible here. */
+ for (include = 1; include <= 3; include++) {
+ cr0 = lookup_with_include_bits(cls, &flow, include);
+ cr1 = tcls_lookup(tcls, &flow, include);
+ assert((cr0 == NULL) == (cr1 == NULL));
+ if (cr0 != NULL) {
+ const struct test_rule *tr0 = test_rule_from_cls_rule(cr0);
+ const struct test_rule *tr1 = test_rule_from_cls_rule(cr1);
+
+ assert(flow_equal(&cr0->flow, &cr1->flow));
+ assert(cr0->wc.wildcards == cr1->wc.wildcards);
+ assert(cr0->priority == cr1->priority);
+ /* Skip nw_src_mask and nw_dst_mask, because they are derived
+ * members whose values are used only for optimization. */
+ assert(tr0->aux == tr1->aux);
+ }
+ }
+ }
+}
+
+/* Callback for classifier_for_each(): removes 'cls_rule' from the
+ * classifier passed as 'cls' and frees the test_rule that contains it. */
+static void
+free_rule(struct cls_rule *cls_rule, void *cls)
+{
+    struct test_rule *owner = test_rule_from_cls_rule(cls_rule);
+
+    classifier_remove(cls, cls_rule);
+    free(owner);
+}
+
+/* Removes and frees every rule in 'cls' (via free_rule()), then destroys
+ * the classifier itself. */
+static void
+destroy_classifier(struct classifier *cls)
+{
+ classifier_for_each(cls, CLS_INC_ALL, free_rule, cls);
+ classifier_destroy(cls);
+}
+
+/* Counts the non-empty tables, the buckets, and the rules in 'cls' and
+ * asserts that each count equals the corresponding argument.  An argument
+ * of -1 disables that particular check.  A non-empty exact-match table is
+ * counted as one table holding one bucket. */
+static void
+check_tables(const struct classifier *cls,
+             int n_tables, int n_buckets, int n_rules)
+{
+    int tables_seen = 0;
+    int buckets_seen = 0;
+    int rules_seen = 0;
+    int t;
+
+    BUILD_ASSERT(CLS_N_FIELDS == ARRAY_SIZE(cls->tables));
+    for (t = 0; t < CLS_N_FIELDS; t++) {
+        const struct cls_bucket *bucket;
+
+        if (!hmap_is_empty(&cls->tables[t])) {
+            tables_seen++;
+        }
+        HMAP_FOR_EACH (bucket, struct cls_bucket, hmap_node, &cls->tables[t]) {
+            assert(!list_is_empty(&bucket->rules));
+            rules_seen += list_size(&bucket->rules);
+            buckets_seen++;
+        }
+    }
+
+    if (!hmap_is_empty(&cls->exact_table)) {
+        rules_seen += hmap_count(&cls->exact_table);
+        buckets_seen++;
+        tables_seen++;
+    }
+
+    assert(n_tables == -1 || tables_seen == n_tables);
+    assert(n_rules == -1 || rules_seen == n_rules);
+    assert(n_buckets == -1 || buckets_seen == n_buckets);
+}
+
+/* Allocates and returns a zeroed test_rule whose classifier rule is built as
+ * follows.  Bit F of 'wc_fields' wildcards the field with index F.  For
+ * every other field, bit F of 'value_pat' chooses between that field's two
+ * sample values.  The rule gets priority 'priority', except that a rule
+ * with no wildcards always gets UINT_MAX. */
+static struct test_rule *
+make_rule(int wc_fields, unsigned int priority, int value_pat)
+{
+    struct test_rule *rule;
+    uint32_t wildcards = 0;
+    flow_t flow;
+    int f_idx;
+
+    memset(&flow, 0, sizeof flow);
+    for (f_idx = 0; f_idx < CLS_N_FIELDS; f_idx++) {
+        const struct cls_field *f = &cls_fields[f_idx];
+        unsigned int bit = 1u << f_idx;
+
+        if (wc_fields & bit) {
+            wildcards |= f->wildcards;
+        } else {
+            int which = (value_pat & bit) != 0;
+
+            memcpy((char *) &flow + f->ofs, values[f_idx][which], f->len);
+        }
+    }
+
+    rule = xcalloc(1, sizeof *rule);
+    cls_rule_from_flow(&rule->cls_rule, &flow, wildcards,
+                       wildcards ? priority : UINT_MAX);
+    return rule;
+}
+
+/* Randomly permutes the 'n' elements of 'p' in place using rand(): each
+ * position in turn is swapped with a randomly chosen element at or after
+ * it. */
+static void
+shuffle(unsigned int *p, size_t n)
+{
+    while (n > 1) {
+        size_t pick = rand() % n;
+        unsigned int held = p[0];
+
+        p[0] = p[pick];
+        p[pick] = held;
+
+        p++;
+        n--;
+    }
+}
+
+/* Tests an empty classifier: a freshly initialized classifier and reference
+ * classifier must both report empty and must agree on every lookup. */
+static void
+test_empty(void)
+{
+ struct classifier cls;
+ struct tcls tcls;
+
+ classifier_init(&cls);
+ tcls_init(&tcls);
+ assert(classifier_is_empty(&cls));
+ assert(tcls_is_empty(&tcls));
+ compare_classifiers(&cls, &tcls);
+ classifier_destroy(&cls);
+ tcls_destroy(&tcls);
+}
+
+/* Destroys a null classifier, checking only that the call does not
+ * crash. */
+static void
+test_destroy_null(void)
+{
+ classifier_destroy(NULL);
+}
+
+/* Tests classification with one rule at a time.
+ *
+ * For every possible set of wildcarded fields, inserts a single rule into
+ * both the real and the reference classifier, checks that they agree,
+ * removes the rule again, and checks that both are empty. */
+static void
+test_single_rule(void)
+{
+ unsigned int wc_fields; /* Hilarious. */
+
+ for (wc_fields = 0; wc_fields < (1u << CLS_N_FIELDS); wc_fields++) {
+ struct classifier cls;
+ struct test_rule *rule, *tcls_rule;
+ struct tcls tcls;
+
+ /* The priority is derived from a hash of the wildcard set. */
+ rule = make_rule(wc_fields,
+ hash_bytes(&wc_fields, sizeof wc_fields, 0), 0);
+
+ classifier_init(&cls);
+ tcls_init(&tcls);
+
+ tcls_rule = tcls_insert(&tcls, rule);
+ /* A rule with no wildcarded fields goes in through the exact-match
+ * insertion function. */
+ if (wc_fields) {
+ assert(!classifier_insert(&cls, &rule->cls_rule));
+ } else {
+ classifier_insert_exact(&cls, &rule->cls_rule);
+ }
+ check_tables(&cls, 1, 1, 1);
+ compare_classifiers(&cls, &tcls);
+
+ classifier_remove(&cls, &rule->cls_rule);
+ tcls_remove(&tcls, tcls_rule);
+ assert(classifier_is_empty(&cls));
+ assert(tcls_is_empty(&tcls));
+ compare_classifiers(&cls, &tcls);
+
+ free(rule);
+ classifier_destroy(&cls);
+ tcls_destroy(&tcls);
+ }
+}
+
+/* Tests replacing one rule by another.
+ *
+ * For every possible set of wildcarded fields, inserts a rule and then a
+ * second rule built from the same arguments but with different 'aux' data,
+ * and checks that the second insertion returns the first rule as the one
+ * displaced and leaves exactly one rule in the classifier. */
+static void
+test_rule_replacement(void)
+{
+ unsigned int wc_fields;
+
+ for (wc_fields = 0; wc_fields < (1u << CLS_N_FIELDS); wc_fields++) {
+ struct classifier cls;
+ struct test_rule *rule1, *tcls_rule1;
+ struct test_rule *rule2, *tcls_rule2;
+ struct tcls tcls;
+
+ rule1 = make_rule(wc_fields, OFP_DEFAULT_PRIORITY, UINT_MAX);
+ rule2 = make_rule(wc_fields, OFP_DEFAULT_PRIORITY, UINT_MAX);
+ /* NOTE(review): the increment appears twice, leaving 'aux' at 10.
+ * It looks like only a difference from rule1's 'aux' is needed --
+ * confirm whether the repetition is intentional. */
+ rule2->aux += 5;
+ rule2->aux += 5;
+
+ classifier_init(&cls);
+ tcls_init(&tcls);
+ tcls_rule1 = tcls_insert(&tcls, rule1);
+ assert(!classifier_insert(&cls, &rule1->cls_rule));
+ check_tables(&cls, 1, 1, 1);
+ compare_classifiers(&cls, &tcls);
+ tcls_destroy(&tcls);
+
+ /* Start the reference classifier over with only rule2, which is what
+ * the real classifier should contain after the replacement. */
+ tcls_init(&tcls);
+ tcls_rule2 = tcls_insert(&tcls, rule2);
+ assert(test_rule_from_cls_rule(
+ classifier_insert(&cls, &rule2->cls_rule)) == rule1);
+ free(rule1);
+ check_tables(&cls, 1, 1, 1);
+ compare_classifiers(&cls, &tcls);
+ tcls_destroy(&tcls);
+ destroy_classifier(&cls);
+ }
+}
+
+/* Returns a mask of the field bits at positions 'table' and above, limited
+ * to the low CLS_N_FIELDS bits. */
+static int
+table_mask(int table)
+{
+    unsigned int all_fields = (1u << CLS_N_FIELDS) - 1;
+    unsigned int below_table = (1u << table) - 1;
+
+    return all_fields & ~below_table;
+}
+
+/* Returns a set of wildcarded-field bits derived from 'seed': bit 'table'
+ * plus bits taken from hash_int(seed, 0), with everything outside
+ * table_mask(table) cleared. */
+static int
+random_wcf_in_table(int table, int seed)
+{
+    int candidates = (1u << table) | hash_int(seed, 0);
+
+    return candidates & table_mask(table);
+}
+
+/* Tests classification with two rules at a time that fall into the same
+ * bucket.
+ *
+ * Loop variables:
+ * - 'table': table under test, including CLS_F_IDX_EXACT.
+ * - 'rel_pri': priority of rule2 relative to rule1 (-1, 0, or +1).
+ * - 'wcf_pat': bit 0 randomizes rule1's wildcards, bit 1 rule2's.
+ * - 'value_pat': if nonzero, rule2 gets a different value bit for the last
+ * field (index CLS_N_FIELDS - 1) and that field is un-wildcarded in both
+ * rules. */
+static void
+test_two_rules_in_one_bucket(void)
+{
+ int table, rel_pri, wcf_pat, value_pat;
+
+ for (table = 0; table <= CLS_N_FIELDS; table++) {
+ for (rel_pri = -1; rel_pri <= +1; rel_pri++) {
+ for (wcf_pat = 0; wcf_pat < 4; wcf_pat++) {
+ /* NOTE(review): value_pat 1 is skipped for table CLS_N_FIELDS - 1,
+ * presumably because that table's own field bit is the one
+ * value_pat 1 would have to clear -- confirm. */
+ int n_value_pats = table == CLS_N_FIELDS - 1 ? 1 : 2;
+ for (value_pat = 0; value_pat < n_value_pats; value_pat++) {
+ struct test_rule *rule1, *tcls_rule1;
+ struct test_rule *rule2, *tcls_rule2;
+ struct test_rule *displaced_rule;
+ struct classifier cls;
+ struct tcls tcls;
+ unsigned int pri1, pri2;
+ int wcf1, wcf2;
+
+ if (table != CLS_F_IDX_EXACT) {
+ /* We can use identical priorities in this test because
+ * the classifier always chooses the rule added later
+ * for equal-priority rules that fall into the same
+ * bucket. */
+ pri1 = table * 257 + 50;
+ pri2 = pri1 + rel_pri;
+
+ wcf1 = (wcf_pat & 1
+ ? random_wcf_in_table(table, pri1)
+ : 1u << table);
+ wcf2 = (wcf_pat & 2
+ ? random_wcf_in_table(table, pri2)
+ : 1u << table);
+ if (value_pat) {
+ wcf1 &= ~(1u << (CLS_N_FIELDS - 1));
+ wcf2 &= ~(1u << (CLS_N_FIELDS - 1));
+ }
+ } else {
+ /* This classifier always puts exact-match rules at
+ * maximum priority. */
+ pri1 = pri2 = UINT_MAX;
+
+ /* No wildcard fields. */
+ wcf1 = wcf2 = 0;
+ }
+
+ rule1 = make_rule(wcf1, pri1, 0);
+ rule2 = make_rule(wcf2, pri2,
+ value_pat << (CLS_N_FIELDS - 1));
+
+ classifier_init(&cls);
+ tcls_init(&tcls);
+
+ tcls_rule1 = tcls_insert(&tcls, rule1);
+ tcls_rule2 = tcls_insert(&tcls, rule2);
+ assert(!classifier_insert(&cls, &rule1->cls_rule));
+ displaced_rule = test_rule_from_cls_rule(
+ classifier_insert(&cls, &rule2->cls_rule));
+ if (wcf1 != wcf2 || pri1 != pri2 || value_pat) {
+ /* The rules differ in some respect, so both must coexist. */
+ assert(!displaced_rule);
+
+ check_tables(&cls, 1, 1, 2);
+ compare_classifiers(&cls, &tcls);
+
+ classifier_remove(&cls, &rule1->cls_rule);
+ tcls_remove(&tcls, tcls_rule1);
+ check_tables(&cls, 1, 1, 1);
+ compare_classifiers(&cls, &tcls);
+ } else {
+ /* Same wildcards, priority, and values: rule2 replaces
+ * rule1. */
+ assert(displaced_rule == rule1);
+ check_tables(&cls, 1, 1, 1);
+ compare_classifiers(&cls, &tcls);
+ }
+ free(rule1);
+
+ classifier_remove(&cls, &rule2->cls_rule);
+ tcls_remove(&tcls, tcls_rule2);
+ compare_classifiers(&cls, &tcls);
+ free(rule2);
+
+ destroy_classifier(&cls);
+ tcls_destroy(&tcls);
+ }
+ }
+ }
+ }
+}
+
+/* Tests classification with two rules at a time that fall into the same
+ * table but different buckets.
+ *
+ * 'rel_pri' is rule2's priority relative to rule1 (-1, 0, or +1).  In
+ * 'wcf_pat', bit 0 randomizes rule1's wildcards and bit 1 rule2's, while
+ * bit 2 gives both rules the same random wildcards. */
+static void
+test_two_rules_in_one_table(void)
+{
+ int table, rel_pri, wcf_pat;
+
+ /* Skip tables 0 and CLS_F_IDX_EXACT because they have one bucket. */
+ for (table = 1; table < CLS_N_FIELDS; table++) {
+ for (rel_pri = -1; rel_pri <= +1; rel_pri++) {
+ for (wcf_pat = 0; wcf_pat < 5; wcf_pat++) {
+ struct test_rule *rule1, *tcls_rule1;
+ struct test_rule *rule2, *tcls_rule2;
+ struct classifier cls;
+ struct tcls tcls;
+ unsigned int pri1, pri2;
+ int wcf1, wcf2;
+ int value_mask, value_pat1, value_pat2;
+ int i;
+
+ /* We can use identical priorities in this test because the
+ * classifier always chooses the rule added later for
+ * equal-priority rules that fall into the same table. */
+ pri1 = table * 257 + 50;
+ pri2 = pri1 + rel_pri;
+
+ if (wcf_pat & 4) {
+ wcf1 = wcf2 = random_wcf_in_table(table, pri1);
+ } else {
+ wcf1 = (wcf_pat & 1
+ ? random_wcf_in_table(table, pri1)
+ : 1u << table);
+ wcf2 = (wcf_pat & 2
+ ? random_wcf_in_table(table, pri2)
+ : 1u << table);
+ }
+
+ /* Generate value patterns that will put the two rules into
+ * different buckets. */
+ value_mask = ((1u << table) - 1);
+ value_pat1 = hash_int(pri1, 1) & value_mask;
+ i = 0;
+ /* Retry with successive hash bases until the masked patterns
+ * differ; 'value_mask' is nonzero because 'table' >= 1. */
+ do {
+ value_pat2 = (hash_int(pri2, i++) & value_mask);
+ } while (value_pat1 == value_pat2);
+ rule1 = make_rule(wcf1, pri1, value_pat1);
+ rule2 = make_rule(wcf2, pri2, value_pat2);
+
+ classifier_init(&cls);
+ tcls_init(&tcls);
+
+ tcls_rule1 = tcls_insert(&tcls, rule1);
+ tcls_rule2 = tcls_insert(&tcls, rule2);
+ assert(!classifier_insert(&cls, &rule1->cls_rule));
+ assert(!classifier_insert(&cls, &rule2->cls_rule));
+ check_tables(&cls, 1, 2, 2);
+ compare_classifiers(&cls, &tcls);
+
+ classifier_remove(&cls, &rule1->cls_rule);
+ tcls_remove(&tcls, tcls_rule1);
+ check_tables(&cls, 1, 1, 1);
+ compare_classifiers(&cls, &tcls);
+ free(rule1);
+
+ classifier_remove(&cls, &rule2->cls_rule);
+ tcls_remove(&tcls, tcls_rule2);
+ compare_classifiers(&cls, &tcls);
+ free(rule2);
+
+ classifier_destroy(&cls);
+ tcls_destroy(&tcls);
+ }
+ }
+ }
+}
+
+/* Tests classification with two rules at a time that fall into different
+ * tables.
+ *
+ * Every pair table1 < table2 is tried, with rule2's priority one above
+ * ('rel_pri' == 0) or one below ('rel_pri' != 0) rule1's.  In 'wcf_pat',
+ * bit 0 randomizes rule1's wildcards and bit 1 rule2's. */
+static void
+test_two_rules_in_different_tables(void)
+{
+ int table1, table2, rel_pri, wcf_pat;
+
+ for (table1 = 0; table1 < CLS_N_FIELDS; table1++) {
+ for (table2 = table1 + 1; table2 <= CLS_N_FIELDS; table2++) {
+ for (rel_pri = 0; rel_pri < 2; rel_pri++) {
+ for (wcf_pat = 0; wcf_pat < 4; wcf_pat++) {
+ struct test_rule *rule1, *tcls_rule1;
+ struct test_rule *rule2, *tcls_rule2;
+ struct classifier cls;
+ struct tcls tcls;
+ unsigned int pri1, pri2;
+ int wcf1, wcf2;
+
+ /* We must use unique priorities in this test because the
+ * classifier makes the rule choice undefined for rules of
+ * equal priority that fall into different tables. (In
+ * practice, lower-numbered tables win.) */
+ pri1 = table1 * 257 + 50;
+ pri2 = rel_pri ? pri1 - 1 : pri1 + 1;
+
+ wcf1 = (wcf_pat & 1
+ ? random_wcf_in_table(table1, pri1)
+ : 1u << table1);
+ wcf2 = (wcf_pat & 2
+ ? random_wcf_in_table(table2, pri2)
+ : 1u << table2);
+
+ if (table2 == CLS_F_IDX_EXACT) {
+ /* NOTE(review): other tests in this file use UINT_MAX for
+ * exact-match rules.  make_rule() appears to substitute
+ * UINT_MAX itself when there are no wildcards, so the
+ * value here is probably immaterial -- confirm. */
+ pri2 = UINT16_MAX;
+ wcf2 = 0;
+ }
+
+ rule1 = make_rule(wcf1, pri1, 0);
+ rule2 = make_rule(wcf2, pri2, 0);
+
+ classifier_init(&cls);
+ tcls_init(&tcls);
+
+ tcls_rule1 = tcls_insert(&tcls, rule1);
+ tcls_rule2 = tcls_insert(&tcls, rule2);
+ assert(!classifier_insert(&cls, &rule1->cls_rule));
+ assert(!classifier_insert(&cls, &rule2->cls_rule));
+ check_tables(&cls, 2, 2, 2);
+ compare_classifiers(&cls, &tcls);
+
+ classifier_remove(&cls, &rule1->cls_rule);
+ tcls_remove(&tcls, tcls_rule1);
+ check_tables(&cls, 1, 1, 1);
+ compare_classifiers(&cls, &tcls);
+ free(rule1);
+
+ classifier_remove(&cls, &rule2->cls_rule);
+ tcls_remove(&tcls, tcls_rule2);
+ compare_classifiers(&cls, &tcls);
+ free(rule2);
+
+ classifier_destroy(&cls);
+ tcls_destroy(&tcls);
+ }
+ }
+ }
+ }
+}
+
+/* Tests classification with many rules at a time that fall into the same
+ * bucket but have unique priorities (and various wildcards).
+ *
+ * For each of three iterations and each table, MAX_RULES rules with
+ * distinct, shuffled priorities are inserted one by one; after every
+ * insertion the classifier is checked against the 'tcls'. */
+static void
+test_many_rules_in_one_bucket(void)
+{
+    enum { MAX_RULES = 50 };
+    int iteration;
+
+    for (iteration = 0; iteration < 3; iteration++) {
+        int table;
+
+        for (table = 0; table <= CLS_N_FIELDS; table++) {
+            unsigned int priorities[MAX_RULES];
+            struct classifier cls;
+            struct tcls tcls;
+            int n_inserted;
+            int i;
+
+            /* Seed per (table, iteration) so that every run shuffles the
+             * priorities the same way. */
+            srand(hash_int(table, iteration));
+            for (i = 0; i < MAX_RULES; i++) {
+                priorities[i] = i * 129;
+            }
+            shuffle(priorities, ARRAY_SIZE(priorities));
+
+            classifier_init(&cls);
+            tcls_init(&tcls);
+
+            for (n_inserted = 0; n_inserted < MAX_RULES; n_inserted++) {
+                unsigned int priority = priorities[n_inserted];
+                int wcf = random_wcf_in_table(table, priority);
+                /* In the exact-match table each rule gets its own value
+                 * pattern; elsewhere all rules share one. */
+                int value_pat = (table == CLS_F_IDX_EXACT
+                                 ? n_inserted : 1234);
+                struct test_rule *rule = make_rule(wcf, priority, value_pat);
+
+                tcls_insert(&tcls, rule);
+                assert(!classifier_insert(&cls, &rule->cls_rule));
+                check_tables(&cls, 1, 1, n_inserted + 1);
+                compare_classifiers(&cls, &tcls);
+            }
+
+            destroy_classifier(&cls);
+            tcls_destroy(&tcls);
+        }
+    }
+}
+
+/* Tests classification with many rules at a time that fall into the same
+ * table but random buckets.
+ *
+ * Like the one-bucket variant, except that each rule's value pattern is a
+ * hash of its priority and the bucket count is not checked (-1). */
+static void
+test_many_rules_in_one_table(void)
+{
+    enum { MAX_RULES = 50 };
+    int iteration;
+
+    for (iteration = 0; iteration < 3; iteration++) {
+        int table;
+
+        for (table = 0; table < CLS_N_FIELDS; table++) {
+            unsigned int priorities[MAX_RULES];
+            struct classifier cls;
+            struct tcls tcls;
+            int n_inserted;
+            int i;
+
+            /* Seed per (table, iteration) so that every run shuffles the
+             * priorities the same way. */
+            srand(hash_int(table, iteration));
+            for (i = 0; i < MAX_RULES; i++) {
+                priorities[i] = i * 129;
+            }
+            shuffle(priorities, ARRAY_SIZE(priorities));
+
+            classifier_init(&cls);
+            tcls_init(&tcls);
+
+            for (n_inserted = 0; n_inserted < MAX_RULES; n_inserted++) {
+                unsigned int priority = priorities[n_inserted];
+                int wcf = random_wcf_in_table(table, priority);
+                struct test_rule *rule = make_rule(wcf, priority,
+                                                   hash_int(priority, 1));
+
+                tcls_insert(&tcls, rule);
+                assert(!classifier_insert(&cls, &rule->cls_rule));
+                check_tables(&cls, 1, -1, n_inserted + 1);
+                compare_classifiers(&cls, &tcls);
+            }
+
+            destroy_classifier(&cls);
+            tcls_destroy(&tcls);
+        }
+    }
+}
+
+/* Tests classification with many rules at a time that fall into random buckets
+ * in random tables.
+ *
+ * Each of 30 iterations inserts MAX_RULES rules with shuffled, distinct
+ * priorities and random tables, wildcards, and values, then empties the
+ * classifier by repeatedly deleting everything that matches a randomly
+ * chosen remaining rule.  A progress dot is printed per iteration. */
+static void
+test_many_rules_in_different_tables(void)
+{
+ enum { MAX_RULES = 50 };
+ int iteration;
+
+ for (iteration = 0; iteration < 30; iteration++) {
+ unsigned int priorities[MAX_RULES];
+ struct classifier cls;
+ struct tcls tcls;
+ int i;
+
+ /* Seed with the iteration number so that each run is repeatable. */
+ srand(iteration);
+ for (i = 0; i < MAX_RULES; i++) {
+ priorities[i] = i * 129;
+ }
+ shuffle(priorities, ARRAY_SIZE(priorities));
+
+ classifier_init(&cls);
+ tcls_init(&tcls);
+
+ for (i = 0; i < MAX_RULES; i++) {
+ struct test_rule *rule;
+ unsigned int priority = priorities[i];
+ int table = rand() % (CLS_N_FIELDS + 1);
+ int wcf = random_wcf_in_table(table, rand());
+ int value_pat = rand() & ((1u << CLS_N_FIELDS) - 1);
+ rule = make_rule(wcf, priority, value_pat);
+ tcls_insert(&tcls, rule);
+ assert(!classifier_insert(&cls, &rule->cls_rule));
+ check_tables(&cls, -1, -1, i + 1);
+ compare_classifiers(&cls, &tcls);
+ }
+
+ while (!classifier_is_empty(&cls)) {
+ /* Work on a private copy of a randomly chosen remaining rule. */
+ struct test_rule *rule = xmemdup(tcls.rules[rand() % tcls.n_rules],
+ sizeof(struct test_rule));
+ int include = rand() % 2 ? CLS_INC_WILD : CLS_INC_EXACT;
+ /* Always include the class (wildcarded or exact) of the chosen rule
+ * itself; presumably this guarantees that at least that rule is
+ * deleted, so the loop makes progress. */
+ include |= (rule->cls_rule.wc.wildcards
+ ? CLS_INC_WILD : CLS_INC_EXACT);
+ classifier_for_each_match(&cls, &rule->cls_rule, include,
+ free_rule, &cls);
+ tcls_delete_matches(&tcls, &rule->cls_rule, include);
+ compare_classifiers(&cls, &tcls);
+ free(rule);
+ }
+ putchar('.');
+ fflush(stdout);
+
+ destroy_classifier(&cls);
+ tcls_destroy(&tcls);
+ }
+}
+
+/* Calls the test 'function', then writes a progress dot to stdout and
+ * flushes it. */
+static void
+run_test(void (*function)(void))
+{
+    (*function)();
+    fputc('.', stdout);
+    fflush(stdout);
+}
+
+/* Runs every classifier test in order, printing a dot after each, and
+ * finishes the line of dots with a newline. */
+int
+main(void)
+{
+    static void (*const tests[])(void) = {
+        test_empty,
+        test_destroy_null,
+        test_single_rule,
+        test_rule_replacement,
+        test_two_rules_in_one_bucket,
+        test_two_rules_in_one_table,
+        test_two_rules_in_different_tables,
+        test_many_rules_in_one_bucket,
+        test_many_rules_in_one_table,
+        test_many_rules_in_different_tables,
+    };
+    size_t i;
+
+    init_values();
+    for (i = 0; i < ARRAY_SIZE(tests); i++) {
+        run_test(tests[i]);
+    }
+    putchar('\n');
+    return 0;
+}
diff --git a/tests/test-dhcp-client.c b/tests/test-dhcp-client.c
new file mode 100644
index 000000000..2fee3fc11
--- /dev/null
+++ b/tests/test-dhcp-client.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include "dhcp-client.h"
+#include <arpa/inet.h>
+#include <getopt.h>
+#include <stdlib.h>
+#include <limits.h>
+#include "command-line.h"
+#include "dhcp.h"
+#include "fatal-signal.h"
+#include "fault.h"
+#include "poll-loop.h"
+#include "util.h"
+#include "vlog.h"
+
+/* --request-ip: IP address to request from server. If zero, then do not
+ * request a specific IP address. */
+static struct in_addr request_ip;
+
+/* --vendor-class: Vendor class string to include in request. If null, no
+ * vendor class string is included. */
+static const char *vendor_class;
+
+/* --no-resolv-conf: Update /etc/resolv.conf to match DHCP reply?  True by
+ * default; the option clears it. */
+static bool update_resolv_conf = true;
+
+/* Command-line handling. */
+static void parse_options(int argc, char *argv[]);
+static void usage(void);
+/* Callbacks: fatal-signal hook and DHCP request modifier, respectively. */
+static void release(void *cli_);
+static void modify_dhcp_request(struct dhcp_msg *, void *aux);
+
+/* Parses the command line, creates a DHCP client on the network device named
+ * by the single non-option argument, and then runs the client forever.  The
+ * loop has no exit; release() is registered as a fatal-signal hook. */
+int
+main(int argc, char *argv[])
+{
+ struct dhclient *cli;
+ int error;
+
+ set_program_name(argv[0]);
+ register_fault_handlers();
+ vlog_init();
+ parse_options(argc, argv);
+
+ /* Skip past the options that getopt_long() consumed. */
+ argc -= optind;
+ argv += optind;
+ if (argc != 1) {
+ ovs_fatal(0, "exactly one non-option argument required; "
+ "use --help for help");
+ }
+
+ error = dhclient_create(argv[0], modify_dhcp_request, NULL, NULL, &cli);
+ if (error) {
+ ovs_fatal(error, "dhclient_create failed");
+ }
+ dhclient_init(cli, request_ip.s_addr);
+ fatal_signal_add_hook(release, cli, true);
+
+ for (;;) {
+ /* Fatal signals are blocked while the client state is being updated
+ * and unblocked again before sleeping in poll_block(). */
+ fatal_signal_block();
+ dhclient_run(cli);
+ if (dhclient_changed(cli)) {
+ dhclient_configure_netdev(cli);
+ if (update_resolv_conf) {
+ dhclient_update_resolv_conf(cli);
+ }
+ }
+ dhclient_wait(cli);
+ fatal_signal_unblock();
+ poll_block();
+ }
+}
+
+/* Fatal-signal hook registered by main().  'cli_' is the struct dhclient;
+ * calls dhclient_release() on it and, if the client reports a change,
+ * reconfigures the network device accordingly. */
+static void
+release(void *cli_)
+{
+    struct dhclient *client = cli_;
+
+    dhclient_release(client);
+    if (!dhclient_changed(client)) {
+        return;
+    }
+    dhclient_configure_netdev(client);
+}
+
+/* Request-modifier callback handed to dhclient_create().  Adds the
+ * --vendor-class string, if one was given, to 'msg' as a
+ * DHCP_CODE_VENDOR_CLASS option; otherwise leaves 'msg' untouched. */
+static void
+modify_dhcp_request(struct dhcp_msg *msg, void *aux UNUSED)
+{
+    if (!vendor_class) {
+        return;
+    }
+    dhcp_msg_put_string(msg, DHCP_CODE_VENDOR_CLASS, vendor_class);
+}
+
+/* Parses the command-line options in 'argv', storing the results in the
+ * file-level option variables.  Exits on --help, --version, or an invalid
+ * option; aborts on an option code it does not recognize. */
+static void
+parse_options(int argc, char *argv[])
+{
+ /* Codes for long-only options start above the range of any char so that
+ * they cannot collide with a short option letter. */
+ enum {
+ OPT_REQUEST_IP = UCHAR_MAX + 1,
+ OPT_VENDOR_CLASS,
+ OPT_NO_RESOLV_CONF
+ };
+ static struct option long_options[] = {
+ {"request-ip", required_argument, 0, OPT_REQUEST_IP },
+ {"vendor-class", required_argument, 0, OPT_VENDOR_CLASS },
+ {"no-resolv-conf", no_argument, 0, OPT_NO_RESOLV_CONF},
+ {"verbose", optional_argument, 0, 'v'},
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ {0, 0, 0, 0},
+ };
+ char *short_options = long_options_to_short_options(long_options);
+
+ for (;;) {
+ int c;
+
+ c = getopt_long(argc, argv, short_options, long_options, NULL);
+ if (c == -1) {
+ break;
+ }
+
+ switch (c) {
+ case OPT_REQUEST_IP:
+ if (!inet_aton(optarg, &request_ip)) {
+ ovs_fatal(0,
+ "--request-ip argument is not a valid IP address");
+ }
+ break;
+
+ case OPT_VENDOR_CLASS:
+ vendor_class = optarg;
+ break;
+
+ case OPT_NO_RESOLV_CONF:
+ update_resolv_conf = false;
+ break;
+
+ case 'h':
+ /* usage() exits, so control never falls through to 'V'. */
+ usage();
+
+ case 'V':
+ printf("%s %s compiled "__DATE__" "__TIME__"\n",
+ program_name, VERSION BUILDNR);
+ exit(EXIT_SUCCESS);
+
+ case 'v':
+ vlog_set_verbosity(optarg);
+ break;
+
+ case '?':
+ /* Unknown option or missing argument, as reported by getopt_long(). */
+ exit(EXIT_FAILURE);
+
+ default:
+ abort();
+ }
+ }
+ free(short_options);
+}
+
+/* Prints a help message to stdout, including the logging options printed by
+ * vlog_usage(), and exits successfully.  Does not return. */
+static void
+usage(void)
+{
+ printf("%s: standalone program for testing Open vSwitch DHCP client.\n"
+ "usage: %s [OPTIONS] NETDEV\n"
+ "where NETDEV is a network device (e.g. eth0).\n"
+ "\nDHCP options:\n"
+ " --request-ip=IP request specified IP address (default:\n"
+ " do not request a specific IP)\n"
+ " --vendor-class=STRING use STRING as vendor class (default:\n"
+ " none); use OpenFlow to imitate secchan\n"
+ " --no-resolv-conf do not update /etc/resolv.conf\n",
+ program_name, program_name);
+ vlog_usage();
+ printf("\nOther options:\n"
+ " -h, --help display this help message\n"
+ " -V, --version display version information\n");
+ exit(EXIT_SUCCESS);
+}
+
diff --git a/tests/test-flows.c b/tests/test-flows.c
new file mode 100644
index 000000000..663c5a6d1
--- /dev/null
+++ b/tests/test-flows.c
@@ -0,0 +1,76 @@
+#include <config.h>
+#include "flow.h"
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include "openflow/openflow.h"
+#include "timeval.h"
+#include "ofpbuf.h"
+#include "ofp-print.h"
+#include "pcap.h"
+#include "util.h"
+#include "vlog.h"
+
+#undef NDEBUG
+#include <assert.h>
+
+/* Reads expected 'struct ofp_match' records from stdin and packets from a
+ * pcap stream on file descriptor 3, pairing them up in order.  For each pair
+ * it extracts a flow from the packet, converts it to a match, and reports any
+ * difference from the expected record.  Prints a summary line and returns 0
+ * if there were no mismatches, 1 otherwise. */
+int
+main(int argc UNUSED, char *argv[])
+{
+ struct ofp_match expected_match;
+ FILE *flows, *pcap;
+ int retval;
+ int n = 0, errors = 0;
+
+ set_program_name(argv[0]);
+ time_init();
+ vlog_init();
+
+ flows = stdin;
+ pcap = fdopen(3, "rb");
+ if (!pcap) {
+ ovs_fatal(errno, "failed to open fd 3 for reading");
+ }
+
+ retval = pcap_read_header(pcap);
+ if (retval) {
+ ovs_fatal(retval > 0 ? retval : 0, "reading pcap header failed");
+ }
+
+ /* The loop ends at the first read that does not yield a complete record,
+ * whether because of end of file or a read error. */
+ while (fread(&expected_match, sizeof expected_match, 1, flows)) {
+ struct ofpbuf *packet;
+ struct ofp_match extracted_match;
+ flow_t flow;
+
+ n++;
+
+ /* Every expected record must have a corresponding packet. */
+ retval = pcap_read(pcap, &packet);
+ if (retval == EOF) {
+ ovs_fatal(0, "unexpected end of file reading pcap file");
+ } else if (retval) {
+ ovs_fatal(retval, "error reading pcap file");
+ }
+
+ flow_extract(packet, 1, &flow);
+ flow_to_match(&flow, 0, &extracted_match);
+
+ /* The two matches are compared byte for byte. */
+ if (memcmp(&expected_match, &extracted_match, sizeof expected_match)) {
+ char *exp_s = ofp_match_to_string(&expected_match, 2);
+ char *got_s = ofp_match_to_string(&extracted_match, 2);
+ errors++;
+ printf("mismatch on packet #%d (1-based).\n", n);
+ printf("Packet:\n");
+ ofp_print_packet(stdout, packet->data, packet->size, packet->size);
+ printf("Expected flow:\n%s\n", exp_s);
+ printf("Actually extracted flow:\n%s\n", got_s);
+ printf("\n");
+ free(exp_s);
+ free(got_s);
+ }
+
+ ofpbuf_delete(packet);
+ }
+ printf("checked %d packets, %d errors\n", n, errors);
+ return errors != 0;
+}
+
diff --git a/tests/test-flows.sh b/tests/test-flows.sh
new file mode 100755
index 000000000..0d38ad784
--- /dev/null
+++ b/tests/test-flows.sh
@@ -0,0 +1,9 @@
+#! /bin/sh -e
+# Runs flowgen.pl to produce a flows file and a pcap file, feeds both to
+# test-flows, and checks that test-flows reports the expected summary.
+#
+# Requires $srcdir to name the top source directory (the one containing
+# tests/flowgen.pl) and must be started from the directory containing tests/.
+
+# Repeat -e here so that it also applies when run as "sh test-flows.sh".
+set -e
+
+# Make $srcdir absolute before changing directory; fail if it is unset.
+srcdir=$(cd "${srcdir:?srcdir must be set}" && pwd)
+trap 'rm -f flows$$ pcap$$ out$$' 0 1 2 13 15
+cd tests
+"$srcdir"/tests/flowgen.pl >/dev/null 3>flows$$ 4>pcap$$
+# test-flows reads the flows on stdin and the pcap file on fd 3.  Its exit
+# status is deliberately ignored: the diff below decides pass or fail.
+./test-flows <flows$$ 3<pcap$$ >out$$ || true
+diff -u - out$$ <<EOF
+checked 247 packets, 0 errors
+EOF
diff --git a/tests/test-hash.c b/tests/test-hash.c
new file mode 100644
index 000000000..55a544fc0
--- /dev/null
+++ b/tests/test-hash.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "hash.h"
+
+#undef NDEBUG
+#include <assert.h>
+
+/* Zeroes the three words of 'array' and then, if 'bit' is in 0...95, sets
+ * that single bit, counting from the least significant bit of array[0].
+ * A 'bit' of 96 leaves all three words zero.  Any other value fails an
+ * assertion. */
+static void
+set_bit(uint32_t array[3], int bit)
+{
+    int word;
+
+    assert(bit >= 0 && bit <= 96);
+    for (word = 0; word < 3; word++) {
+        array[word] = 0;
+    }
+    if (bit == 96) {
+        return;
+    }
+    array[bit >> 5] = UINT32_C(1) << (bit & 31);
+}
+
+/* Adapter for check_word_hash(): hashes the single word 'input' with
+ * hash_words() and a basis of 0. */
+static uint32_t
+hash_words_cb(uint32_t input)
+{
+ return hash_words(&input, 1, 0);
+}
+
+/* Adapter for check_word_hash(): hashes 'input' with hash_int() and a basis
+ * of 0. */
+static uint32_t
+hash_int_cb(uint32_t input)
+{
+ return hash_int(input, 0);
+}
+
+/* Checks 'hash' for partial collisions among the 33 inputs that have at most
+ * one bit set (1 << 0 through 1 << 31, plus 0).  For every pair of distinct
+ * inputs and every window of 'min_unique' consecutive output bits that is
+ * examined, the two outputs must differ within that window.  On the first
+ * collision, prints a report that names the function as 'name' and exits with
+ * status 1.
+ *
+ * NOTE(review): 'ofs' stops at 31 - min_unique, so the topmost window (the
+ * one that includes output bit 31) is never examined. */
+static void
+check_word_hash(uint32_t (*hash)(uint32_t), const char *name,
+ int min_unique)
+{
+ int i, j;
+
+ for (i = 0; i <= 32; i++) {
+ /* An index of 32 stands for the all-zeros input. */
+ uint32_t in1 = i < 32 ? UINT32_C(1) << i : 0;
+ for (j = i + 1; j <= 32; j++) {
+ uint32_t in2 = j < 32 ? UINT32_C(1) << j : 0;
+ uint32_t out1 = hash(in1);
+ uint32_t out2 = hash(in2);
+ const uint32_t unique_mask = (UINT32_C(1) << min_unique) - 1;
+ int ofs;
+ for (ofs = 0; ofs < 32 - min_unique; ofs++) {
+ uint32_t bits1 = (out1 >> ofs) & unique_mask;
+ uint32_t bits2 = (out2 >> ofs) & unique_mask;
+ if (bits1 == bits2) {
+ printf("Partial collision for '%s':\n", name);
+ printf("%s(%08"PRIx32") = %08"PRIx32"\n", name, in1, out1);
+ printf("%s(%08"PRIx32") = %08"PRIx32"\n", name, in2, out2);
+ printf("%d bits of output starting at bit %d "
+ "are both 0x%"PRIx32"\n", min_unique, ofs, bits1);
+ exit(1);
+ }
+ }
+ }
+ }
+}
+
+/* Runs three partial-collision checks on the hash functions and returns 0 if
+ * all of them pass.  A failing check prints a report and exits with status
+ * 1. */
+int
+main(void)
+{
+ int i, j;
+
+ /* Check that all hashes computed with hash_words with one 1-bit (or no
+ * 1-bits) set within a single 32-bit word have different values in all
+ * 11-bit consecutive runs.
+ *
+ * Given a random distribution, the probability of at least one collision
+ * in any set of 11 bits is approximately
+ *
+ * 1 - ((2**11 - 1)/2**11)**C(33,2)
+ * == 1 - (2047/2048)**528
+ * =~ 0.22
+ *
+ * check_word_hash() examines 21 runs of 11 consecutive bits (those
+ * starting at bits 0 through 20), so if we assumed independence then the
+ * chance of having no collisions in any of those 11-bit runs would be
+ * (1-0.22)**21 =~ .0044. Obviously independence must be a bad
+ * assumption :-)
+ */
+ check_word_hash(hash_words_cb, "hash_words", 11);
+
+ /* Check that all hashes computed with hash_words with one 1-bit (or no
+ * 1-bits) set within three 32-bit words have different values in their
+ * lowest 12 bits.
+ *
+ * Given a random distribution, the probability of at least one collision
+ * in 12 bits is approximately
+ *
+ * 1 - ((2**12 - 1)/2**12)**C(97,2)
+ * == 1 - (4095/4096)**4656
+ * =~ 0.68
+ *
+ * so we are doing pretty well to not have any collisions in 12 bits.
+ */
+ for (i = 0; i <= 96; i++) {
+ for (j = i + 1; j <= 96; j++) {
+ uint32_t in1[3], in2[3];
+ uint32_t out1, out2;
+ const int min_unique = 12;
+ const uint32_t unique_mask = (UINT32_C(1) << min_unique) - 1;
+
+ /* An index of 96 produces the all-zeros input. */
+ set_bit(in1, i);
+ set_bit(in2, j);
+ out1 = hash_words(in1, 3, 0);
+ out2 = hash_words(in2, 3, 0);
+ if ((out1 & unique_mask) == (out2 & unique_mask)) {
+ printf("Partial collision:\n");
+ printf("hash(1 << %d) == %08"PRIx32"\n", i, out1);
+ printf("hash(1 << %d) == %08"PRIx32"\n", j, out2);
+ printf("The low-order %d bits of output are both "
+ "0x%"PRIx32"\n", min_unique, out1 & unique_mask);
+ exit(1);
+ }
+ }
+ }
+
+ /* Check that all hashes computed with hash_int with one 1-bit (or no
+ * 1-bits) set within a single 32-bit word have different values in all
+ * 14-bit consecutive runs.
+ *
+ * Given a random distribution, the probability of at least one collision
+ * in any set of 14 bits is approximately
+ *
+ * 1 - ((2**14 - 1)/2**14)**C(33,2)
+ * == 1 - (16,383/16,384)**528
+ * =~ 0.031
+ *
+ * check_word_hash() examines 18 runs of 14 consecutive bits (those
+ * starting at bits 0 through 17), so if we assumed independence then the
+ * chance of having no collisions in any of those 14-bit runs would be
+ * (1-0.03)**18 =~ 0.56. This seems reasonable.
+ */
+ check_word_hash(hash_int_cb, "hash_int", 14);
+
+ return 0;
+}
diff --git a/tests/test-hmap.c b/tests/test-hmap.c
new file mode 100644
index 000000000..684a5bae2
--- /dev/null
+++ b/tests/test-hmap.c
@@ -0,0 +1,281 @@
+/* A non-exhaustive test for some of the functions and macros declared in
+ * hmap.h. */
+
+#include <config.h>
+#include "hmap.h"
+#include <string.h>
+#include "hash.h"
+#include "util.h"
+
+#undef NDEBUG
+#include <assert.h>
+
+/* Sample hmap element. */
+struct element {
+ int value; /* Payload that the tests look for. */
+ struct hmap_node node; /* Links the element into an hmap. */
+};
+
+/* A function that maps an element's value to the hash it is stored under. */
+typedef size_t hash_func(int value);
+
+/* qsort() comparison function for ints.  Returns -1, 0, or 1 according to
+ * whether the int at 'a_' is less than, equal to, or greater than the int at
+ * 'b_'. */
+static int
+compare_ints(const void *a_, const void *b_)
+{
+    int lhs = *(const int *) a_;
+    int rhs = *(const int *) b_;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+/* Verifies that 'hmap' contains exactly the 'n' values in 'values'.
+ *
+ * 'hash' is used to look each value up, so it must be the function the
+ * elements were inserted with.  Three things are checked: iteration yields
+ * the same multiset of values, a by-hash lookup finds each value exactly
+ * once, and the emptiness and count queries agree with 'n'. */
+static void
+check_hmap(struct hmap *hmap, const int values[], size_t n,
+           hash_func *hash)
+{
+    int *sort_values, *hmap_values;
+    struct element *e;
+    size_t i;
+
+    /* Check that all the values are there in iteration. */
+    sort_values = xmalloc(sizeof *sort_values * n);
+    hmap_values = xmalloc(sizeof *hmap_values * n);
+
+    i = 0;
+    HMAP_FOR_EACH (e, struct element, node, hmap) {
+        assert(i < n);
+        hmap_values[i++] = e->value;
+    }
+    assert(i == n);
+
+    /* Iteration order is unspecified here, so compare sorted copies. */
+    memcpy(sort_values, values, sizeof *sort_values * n);
+    qsort(sort_values, n, sizeof *sort_values, compare_ints);
+    qsort(hmap_values, n, sizeof *hmap_values, compare_ints);
+
+    for (i = 0; i < n; i++) {
+        assert(sort_values[i] == hmap_values[i]);
+    }
+
+    free(hmap_values);
+    free(sort_values);
+
+    /* Check that all the values are there in lookup. */
+    for (i = 0; i < n; i++) {
+        size_t count = 0;
+
+        HMAP_FOR_EACH_WITH_HASH (e, struct element, node,
+                                 hash(values[i]), hmap) {
+            count += e->value == values[i];
+        }
+        assert(count == 1);
+    }
+
+    /* Check counters. */
+    assert(hmap_is_empty(hmap) == !n);
+    assert(hmap_count(hmap) == n);
+}
+
+/* Initializes 'hmap' and inserts the first 'n' members of 'elements' into
+ * it, giving them the values 0 through n - 1 and hashing each value with
+ * 'hash'.  The same values are stored into the first 'n' members of
+ * 'values'; whatever the two arrays held before is overwritten. */
+static void
+make_hmap(struct hmap *hmap, struct element elements[],
+ int values[], size_t n, hash_func *hash)
+{
+ size_t i;
+
+ hmap_init(hmap);
+ for (i = 0; i < n; i++) {
+ elements[i].value = i;
+ hmap_insert(hmap, &elements[i].node, hash(elements[i].value));
+ values[i] = i;
+ }
+}
+
+/* Randomly permutes the 'n' ints at 'p' in place, drawing from rand().  At
+ * each position, the element there is swapped with a randomly chosen element
+ * at or after that position. */
+static void
+shuffle(int *p, size_t n)
+{
+    size_t left = n;
+
+    while (left > 1) {
+        size_t pick = rand() % left;
+        int saved = p[pick];
+
+        p[pick] = p[0];
+        p[0] = saved;
+        p++;
+        left--;
+    }
+}
+
+#if 0
+/* Debugging helpers, compiled out.  Change the "#if 0" to use them. */
+
+/* Prints the values in 'hmap', plus 'name' as a title.  Each value is
+ * followed in parentheses by its hash masked with the hmap's mask. */
+static void
+print_hmap(const char *name, struct hmap *hmap)
+{
+ struct element *e;
+
+ printf("%s:", name);
+ HMAP_FOR_EACH (e, struct element, node, hmap) {
+ printf(" %d(%zu)", e->value, e->node.hash & hmap->mask);
+ }
+ printf("\n");
+}
+
+/* Prints the 'n' values in 'values', plus 'name' as a title. */
+static void
+print_ints(const char *name, const int *values, size_t n)
+{
+ size_t i;
+
+ printf("%s:", name);
+ for (i = 0; i < n; i++) {
+ printf(" %d", values[i]);
+ }
+ printf("\n");
+}
+#endif
+
+/* Hash function that returns 'value' itself, converted to size_t. */
+static size_t
+identity_hash(int value)
+{
+ return value;
+}
+
+/* Hash function that runs 'value' through hash_int() with a fixed basis. */
+static size_t
+good_hash(int value)
+{
+ return hash_int(value, 0x1234abcd);
+}
+
+/* Hash function that ignores 'value' and always returns 123, so that every
+ * element gets the same hash. */
+static size_t
+constant_hash(int value UNUSED)
+{
+ return 123;
+}
+
+/* Tests basic hmap insertion and deletion.
+ *
+ * Inserts N_ELEMS elements one at a time and then removes them in a shuffled
+ * order, checking the hmap's full contents after every single step.  'hash'
+ * is the hash function under which the elements are stored. */
+static void
+test_hmap_insert_delete(hash_func *hash)
+{
+    enum { N_ELEMS = 100 };
+
+    struct element elements[N_ELEMS];
+    int values[N_ELEMS];
+    struct hmap hmap;
+    size_t n_inserted, n_removed;
+
+    hmap_init(&hmap);
+    for (n_inserted = 0; n_inserted < N_ELEMS; ) {
+        struct element *e = &elements[n_inserted];
+
+        e->value = n_inserted;
+        values[n_inserted] = n_inserted;
+        hmap_insert(&hmap, &e->node, hash(n_inserted));
+        n_inserted++;
+        check_hmap(&hmap, values, n_inserted, hash);
+    }
+
+    /* After the shuffle, values[] gives the removal order; the values not
+     * yet removed are always the tail of the array. */
+    shuffle(values, N_ELEMS);
+    for (n_removed = 0; n_removed < N_ELEMS; ) {
+        hmap_remove(&hmap, &elements[values[n_removed]].node);
+        n_removed++;
+        check_hmap(&hmap, &values[n_removed], N_ELEMS - n_removed, hash);
+    }
+    hmap_destroy(&hmap);
+}
+
+/* Tests basic hmap_reserve() and hmap_shrink().
+ *
+ * For each reservation size 'i' from 0 to N_ELEMS - 1, reserves room for 'i'
+ * elements in an empty hmap, inserts N_ELEMS elements, and then removes them
+ * in a shuffled order, shrinking the hmap after each removal.  The contents
+ * are checked after every insertion and removal. */
+static void
+test_hmap_reserve_shrink(hash_func *hash)
+{
+ enum { N_ELEMS = 32 };
+
+ size_t i;
+
+ for (i = 0; i < N_ELEMS; i++) {
+ struct element elements[N_ELEMS];
+ int values[N_ELEMS];
+ struct hmap hmap;
+ size_t j;
+
+ hmap_init(&hmap);
+ hmap_reserve(&hmap, i);
+ for (j = 0; j < N_ELEMS; j++) {
+ elements[j].value = j;
+ hmap_insert(&hmap, &elements[j].node, hash(j));
+ values[j] = j;
+ check_hmap(&hmap, values, j + 1, hash);
+ }
+ /* values[] now gives the removal order; the values still present are
+ * always the tail of the array. */
+ shuffle(values, N_ELEMS);
+ for (j = 0; j < N_ELEMS; j++) {
+ hmap_remove(&hmap, &elements[values[j]].node);
+ hmap_shrink(&hmap);
+ check_hmap(&hmap, values + (j + 1), N_ELEMS - (j + 1), hash);
+ }
+ hmap_destroy(&hmap);
+ }
+}
+
+/* Tests that HMAP_FOR_EACH_SAFE properly allows for deletion of the current
+ * element of a hmap.
+ *
+ * For every size 'n' up to MAX_ELEMS and every 'n'-bit 'pattern', builds an
+ * hmap of 'n' elements and iterates over it, removing each element whose
+ * value has its bit set in 'pattern'.  The contents are checked after every
+ * iteration step. */
+static void
+test_hmap_for_each_safe(hash_func *hash)
+{
+ enum { MAX_ELEMS = 10 };
+ size_t n;
+ unsigned long int pattern;
+
+ for (n = 0; n <= MAX_ELEMS; n++) {
+ for (pattern = 0; pattern < 1ul << n; pattern++) {
+ struct element elements[MAX_ELEMS];
+ int values[MAX_ELEMS];
+ struct hmap hmap;
+ struct element *e, *next;
+ size_t n_remaining;
+ int i;
+
+ make_hmap(&hmap, elements, values, n, hash);
+
+ i = 0;
+ n_remaining = n;
+ HMAP_FOR_EACH_SAFE (e, next, struct element, node, &hmap) {
+ assert(i < n);
+ if (pattern & (1ul << e->value)) {
+ size_t j;
+ hmap_remove(&hmap, &e->node);
+ /* Drop the removed value from values[] by overwriting it with
+ * the last remaining value; order does not matter to
+ * check_hmap(). */
+ for (j = 0; ; j++) {
+ assert(j < n_remaining);
+ if (values[j] == e->value) {
+ values[j] = values[--n_remaining];
+ break;
+ }
+ }
+ }
+ check_hmap(&hmap, values, n_remaining, hash);
+ i++;
+ }
+ /* Every element must have been visited exactly once. */
+ assert(i == n);
+
+ /* Adding back one per bit set in 'pattern' must restore 'n'. */
+ for (i = 0; i < n; i++) {
+ if (pattern & (1ul << i)) {
+ n_remaining++;
+ }
+ }
+ assert(n == n_remaining);
+
+ hmap_destroy(&hmap);
+ }
+ }
+}
+
+/* Runs the test 'function' once with each of the three hash functions
+ * (identity, good, constant), printing and flushing a progress dot after
+ * each run. */
+static void
+run_test(void (*function)(hash_func *))
+{
+    hash_func *hash_funcs[] = { identity_hash, good_hash, constant_hash };
+    hash_func **hash;
+
+    for (hash = hash_funcs; hash < &hash_funcs[ARRAY_SIZE(hash_funcs)];
+         hash++) {
+        (*function)(*hash);
+        putchar('.');
+        fflush(stdout);
+    }
+}
+
+/* Runs each hmap test with every hash function, then ends the line of
+ * progress dots. */
+int
+main(void)
+{
+ run_test(test_hmap_insert_delete);
+ run_test(test_hmap_for_each_safe);
+ run_test(test_hmap_reserve_shrink);
+ printf("\n");
+ return 0;
+}
+
diff --git a/tests/test-list.c b/tests/test-list.c
new file mode 100644
index 000000000..62857be94
--- /dev/null
+++ b/tests/test-list.c
@@ -0,0 +1,159 @@
+/* A non-exhaustive test for some of the functions and macros declared in
+ * list.h. */
+
+#include <config.h>
+#include "list.h"
+#include <string.h>
+
+#undef NDEBUG
+#include <assert.h>
+
+/* Sample list element. */
+struct element {
+ int value; /* Payload that the tests compare. */
+ struct list node; /* Links the element into a list. */
+};
+
+/* Initializes 'list' and appends the first 'n' members of 'elements' to it
+ * in order, giving them the values 0 through n - 1.  The same values are
+ * stored into the first 'n' members of 'values'; whatever the two arrays
+ * held before is overwritten. */
+static void
+make_list(struct list *list, struct element elements[],
+ int values[], size_t n)
+{
+ size_t i;
+
+ list_init(list);
+ for (i = 0; i < n; i++) {
+ elements[i].value = i;
+ list_push_back(list, &elements[i].node);
+ values[i] = i;
+ }
+}
+
+/* Verifies that 'list' contains exactly the 'n' values in 'values', in the
+ * specified order.  The list is walked forward and backward, and the
+ * emptiness and size queries are checked against 'n'. */
+static void
+check_list(struct list *list, const int values[], size_t n)
+{
+ struct element *e;
+ size_t i;
+
+ i = 0;
+ LIST_FOR_EACH (e, struct element, node, list) {
+ assert(i < n);
+ assert(e->value == values[i]);
+ i++;
+ }
+ /* After a complete traversal the iterator must be back at the list
+ * head. */
+ assert(&e->node == list);
+ assert(i == n);
+
+ i = 0;
+ LIST_FOR_EACH_REVERSE (e, struct element, node, list) {
+ assert(i < n);
+ assert(e->value == values[n - i - 1]);
+ i++;
+ }
+ assert(&e->node == list);
+ assert(i == n);
+
+ assert(list_is_empty(list) == !n);
+ assert(list_size(list) == n);
+}
+
+#if 0
+/* Debugging helper, compiled out.  Change the "#if 0" to use it. */
+
+/* Prints the values in 'list', plus 'name' as a title. */
+static void
+print_list(const char *name, struct list *list)
+{
+ struct element *e;
+
+ printf("%s:", name);
+ LIST_FOR_EACH (e, struct element, node, list) {
+ printf(" %d", e->value);
+ }
+ printf("\n");
+}
+#endif
+
+/* Tests basic list construction: for every length from 0 through MAX_ELEMS,
+ * builds a list of that length and checks its contents. */
+static void
+test_list_construction(void)
+{
+    enum { MAX_ELEMS = 100 };
+    struct element elements[MAX_ELEMS];
+    int values[MAX_ELEMS];
+    size_t n = 0;
+
+    while (n <= MAX_ELEMS) {
+        struct list list;
+
+        make_list(&list, elements, values, n);
+        check_list(&list, values, n);
+        n++;
+    }
+}
+
+/* Tests that LIST_FOR_EACH_SAFE properly allows for deletion of the current
+ * element of a list.
+ *
+ * For every length 'n' up to MAX_ELEMS and every 'n'-bit 'pattern', builds a
+ * list of 'n' elements and iterates over it, removing the element at
+ * position 'i' whenever bit 'i' of 'pattern' is set.  The contents are
+ * checked after every iteration step. */
+static void
+test_list_for_each_safe(void)
+{
+ enum { MAX_ELEMS = 10 };
+ size_t n;
+ unsigned long int pattern;
+
+ for (n = 0; n <= MAX_ELEMS; n++) {
+ for (pattern = 0; pattern < 1ul << n; pattern++) {
+ struct element elements[MAX_ELEMS];
+ int values[MAX_ELEMS];
+ struct list list;
+ struct element *e, *next;
+ size_t values_idx, n_remaining;
+ int i;
+
+ make_list(&list, elements, values, n);
+
+ i = 0;
+ values_idx = 0;
+ n_remaining = n;
+ LIST_FOR_EACH_SAFE (e, next, struct element, node, &list) {
+ assert(i < n);
+ if (pattern & (1ul << i)) {
+ list_remove(&e->node);
+ n_remaining--;
+ /* Close the gap in values[] so that it keeps mirroring the
+ * list's order. */
+ memmove(&values[values_idx], &values[values_idx + 1],
+ sizeof *values * (n_remaining - values_idx));
+ } else {
+ values_idx++;
+ }
+ check_list(&list, values, n_remaining);
+ i++;
+ }
+ /* Every element must have been visited, ending at the list head. */
+ assert(i == n);
+ assert(&e->node == &list);
+
+ /* Adding back one per bit set in 'pattern' must restore 'n'. */
+ for (i = 0; i < n; i++) {
+ if (pattern & (1ul << i)) {
+ n_remaining++;
+ }
+ }
+ assert(n == n_remaining);
+ }
+ }
+}
+
+/* Runs the test 'function' and then prints a progress dot.
+ *
+ * The dot is flushed right away so that, if a later test fails an assertion
+ * and aborts, the dots for the tests that already passed are not left
+ * unwritten in the stdio buffer. */
+static void
+run_test(void (*function)(void))
+{
+    function();
+    printf(".");
+    fflush(stdout);
+}
+
+/* Runs the list tests in order, then ends the line of progress dots. */
+int
+main(void)
+{
+ run_test(test_list_construction);
+ run_test(test_list_for_each_safe);
+ printf("\n");
+ return 0;
+}
+
diff --git a/tests/test-stp-ieee802.1d-1998 b/tests/test-stp-ieee802.1d-1998
new file mode 100644
index 000000000..f1982a03e
--- /dev/null
+++ b/tests/test-stp-ieee802.1d-1998
@@ -0,0 +1,12 @@
+# This is the STP example from IEEE 802.1D-1998.
+bridge 0 0x42 = a b
+bridge 1 0x97 = c:5 a d:5
+bridge 2 0x45 = b e
+bridge 3 0x57 = b:5 e:5
+bridge 4 0x83 = a:5 e:5
+run 1000
+check 0 = root
+check 1 = F F:10 F
+check 2 = F:10 B
+check 3 = F:5 F
+check 4 = F:5 B
diff --git a/tests/test-stp-ieee802.1d-2004-fig17.4 b/tests/test-stp-ieee802.1d-2004-fig17.4
new file mode 100644
index 000000000..1f708630a
--- /dev/null
+++ b/tests/test-stp-ieee802.1d-2004-fig17.4
@@ -0,0 +1,31 @@
+# This is the STP example from IEEE 802.1D-2004 figures 17.4 and 17.5.
+bridge 0 0x111 = a b e c
+bridge 1 0x222 = a b d f
+bridge 2 0x333 = c d l j h g
+bridge 3 0x444 = e f n m k i
+bridge 4 0x555 = g i 0 0
+bridge 5 0x666 = h k 0 0
+bridge 6 0x777 = j m 0 0
+bridge 7 0x888 = l n 0 0
+run 1000
+check 0 = root
+check 1 = F:10 B F F
+check 2 = F:10 B F F F F
+check 3 = F:10 B F F F F
+check 4 = F:20 B F F
+check 5 = F:20 B F F
+check 6 = F:20 B F F
+check 7 = F:20 B F F
+
+# Now connect two ports of bridge 7 to the same LAN.
+bridge 7 = l n o o
+# Same results except for bridge 7:
+run 1000
+check 0 = root
+check 1 = F:10 B F F
+check 2 = F:10 B F F F F
+check 3 = F:10 B F F F F
+check 4 = F:20 B F F
+check 5 = F:20 B F F
+check 6 = F:20 B F F
+check 7 = F:20 B F B
diff --git a/tests/test-stp-ieee802.1d-2004-fig17.6 b/tests/test-stp-ieee802.1d-2004-fig17.6
new file mode 100644
index 000000000..6ed59177e
--- /dev/null
+++ b/tests/test-stp-ieee802.1d-2004-fig17.6
@@ -0,0 +1,14 @@
+# This is the STP example from IEEE 802.1D-2004 figure 17.6.
+bridge 0 0x111 = a b l
+bridge 1 0x222 = b c d
+bridge 2 0x333 = d e f
+bridge 3 0x444 = f g h
+bridge 4 0x555 = j h i
+bridge 5 0x666 = l j k
+run 1000
+check 0 = root
+check 1 = F:10 F F
+check 2 = F:20 F F
+check 3 = F:30 F B
+check 4 = F:20 F F
+check 5 = F:10 F F
diff --git a/tests/test-stp-ieee802.1d-2004-fig17.7 b/tests/test-stp-ieee802.1d-2004-fig17.7
new file mode 100644
index 000000000..daa0cdf2f
--- /dev/null
+++ b/tests/test-stp-ieee802.1d-2004-fig17.7
@@ -0,0 +1,17 @@
+# This is the STP example from IEEE 802.1D-2004 figure 17.7.
+bridge 0 0xaa = b
+bridge 1 0x111 = a b d f h g e c
+bridge 2 0x222 = g h j l n m k i
+run 1000
+check 0 = root
+check 1 = F F:10 F F F F F F
+check 2 = B F:20 F F F F F F
+
+# This is not the port priority change described in that figure,
+# but I don't understand what port priority change would cause
+# that change.
+bridge 2 = g X j l n m k i
+run 1000
+check 0 = root
+check 1 = F F:10 F F F F F F
+check 2 = F:20 D F F F F F F
diff --git a/tests/test-stp-iol-io-1.1 b/tests/test-stp-iol-io-1.1
new file mode 100644
index 000000000..186d6c4cf
--- /dev/null
+++ b/tests/test-stp-iol-io-1.1
@@ -0,0 +1,25 @@
+# This test file approximates the following test from "Bridge
+# Functions Consortium Spanning Tree Interoperability Test Suite
+# Version 1.5":
+# STP.io.1.1: Link Failure
+bridge 0 0x111 = a b c
+bridge 1 0x222 = a b c
+run 1000
+check 0 = root
+check 1 = F:10 B B
+bridge 1 = 0 _ _
+run 1000
+check 0 = root
+check 1 = F F:10 B
+bridge 1 = X _ _
+run 1000
+check 0 = root
+check 1 = D F:10 B
+bridge 1 = _ 0 _
+run 1000
+check 0 = root
+check 1 = D F F:10
+bridge 1 = _ X _
+run 1000
+check 0 = root
+check 1 = D D F:10
diff --git a/tests/test-stp-iol-io-1.2 b/tests/test-stp-iol-io-1.2
new file mode 100644
index 000000000..285bbd886
--- /dev/null
+++ b/tests/test-stp-iol-io-1.2
@@ -0,0 +1,14 @@
+# This test file approximates the following test from "Bridge
+# Functions Consortium Spanning Tree Interoperability Test Suite
+# Version 1.5":
+# STP.io.1.2: Repeated Network
+bridge 0 0x111 = a a
+bridge 1 0x222 = a a
+run 1000
+check 0 = rootid:0x111 F B
+check 1 = rootid:0x111 F:10 B
+bridge 1 = a^0x90 _
+run 1000
+check 0 = rootid:0x111 F B
+check 1 = rootid:0x111 B F:10
+
diff --git a/tests/test-stp-iol-io-1.4 b/tests/test-stp-iol-io-1.4
new file mode 100644
index 000000000..0065aaf5c
--- /dev/null
+++ b/tests/test-stp-iol-io-1.4
@@ -0,0 +1,13 @@
+# This test file approximates the following test from "Bridge
+# Functions Consortium Spanning Tree Interoperability Test Suite
+# Version 1.5":
+# STP.io.1.4: Network Initialization
+bridge 0 0x111 = a b c
+bridge 1 0x222 = b d e
+bridge 2 0x333 = a d f
+bridge 3 0x444 = c e f
+run 1000
+check 0 = root
+check 1 = F:10 F F
+check 2 = F:10 B F
+check 3 = F:10 B B
diff --git a/tests/test-stp-iol-io-1.5 b/tests/test-stp-iol-io-1.5
new file mode 100644
index 000000000..285d29dea
--- /dev/null
+++ b/tests/test-stp-iol-io-1.5
@@ -0,0 +1,40 @@
+# This test file approximates the following test from "Bridge
+# Functions Consortium Spanning Tree Interoperability Test Suite
+# Version 1.5":
+# STP.io.1.5: Topology Change
+bridge 0 0x111 = a b d c
+bridge 1 0x222 = a b f e
+bridge 2 0x333 = c d g h
+bridge 3 0x444 = e f g h
+run 1000
+check 0 = root
+check 1 = F:10 B F F
+check 2 = B F:10 F F
+check 3 = B F:20 B B
+bridge 1^0x7000
+run 1000
+check 0 = F:10 B F F
+check 1 = root
+check 2 = B F:20 B B
+check 3 = B F:10 F F
+bridge 2^0x6000
+run 1000
+check 0 = F F B F:10
+check 1 = F:20 B B B
+check 2 = root
+check 3 = F F F:10 B
+bridge 3^0x5000
+run 1000
+check 0 = B B B F:20
+check 1 = F F B F:10
+check 2 = F F F:10 B
+check 3 = root
+bridge 0^0x4000
+bridge 1^0x4001
+bridge 2^0x4002
+bridge 3^0x4003
+run 1000
+check 0 = root
+check 1 = F:10 B F F
+check 2 = B F:10 F F
+check 3 = B F:20 B B
diff --git a/tests/test-stp-iol-op-1.1 b/tests/test-stp-iol-op-1.1
new file mode 100644
index 000000000..8432bf36e
--- /dev/null
+++ b/tests/test-stp-iol-op-1.1
@@ -0,0 +1,7 @@
+# This test file approximates the following tests from "Bridge
+# Functions Consortium Spanning Tree Protocol Operations Test Suite
+# Version 2.3":
+# Test STP.op.1.1 - Root ID Initialized to Bridge ID
+# Test STP.op.1.2 - Root Path Cost Initialized to Zero
+bridge 0 0x123 =
+check 0 = root
diff --git a/tests/test-stp-iol-op-1.4 b/tests/test-stp-iol-op-1.4
new file mode 100644
index 000000000..6a1211647
--- /dev/null
+++ b/tests/test-stp-iol-op-1.4
@@ -0,0 +1,8 @@
+# This test file approximates the following test from "Bridge
+# Functions Consortium Spanning Tree Protocol Operations Test Suite
+# Version 2.3":
+# Test STP.op.1.4 - All Ports Initialized to Designated Ports
+bridge 0 0x123 = a b c d e f
+check 0 = Li Li Li Li Li Li
+run 1000
+check 0 = F F F F F F
diff --git a/tests/test-stp-iol-op-3.1 b/tests/test-stp-iol-op-3.1
new file mode 100644
index 000000000..3e1099cbb
--- /dev/null
+++ b/tests/test-stp-iol-op-3.1
@@ -0,0 +1,11 @@
+# This test file approximates the following test from "Bridge
+# Functions Consortium Spanning Tree Protocol Operations Test Suite
+# Version 2.3":
+# Test STP.op.3.1 - Root Bridge Selection: Root ID Values
+bridge 0 0x111 = a
+bridge 1 0x222 = a
+check 0 = rootid:0x111 Li
+check 1 = rootid:0x222 Li
+run 1000
+check 0 = rootid:0x111 root
+check 1 = rootid:0x111 F:10
diff --git a/tests/test-stp-iol-op-3.3 b/tests/test-stp-iol-op-3.3
new file mode 100644
index 000000000..2bcd45e1e
--- /dev/null
+++ b/tests/test-stp-iol-op-3.3
@@ -0,0 +1,11 @@
+# This test file approximates the following test from "Bridge
+# Functions Consortium Spanning Tree Protocol Operations Test Suite
+# Version 2.3":
+# Test STP.op.3.3 - Root Bridge Selection: Bridge ID Values
+bridge 0 0x333^0x6000 = a
+bridge 1 0x222^0x7000 = b
+bridge 2 0x111 = a b
+run 1000
+check 0 = rootid:0x333^0x6000 root
+check 1 = rootid:0x333^0x6000 F:20
+check 2 = rootid:0x333^0x6000 F:10 F
diff --git a/tests/test-stp-iol-op-3.4 b/tests/test-stp-iol-op-3.4
new file mode 100644
index 000000000..2bcd45e1e
--- /dev/null
+++ b/tests/test-stp-iol-op-3.4
@@ -0,0 +1,11 @@
+# This test file approximates the following test from "Bridge
+# Functions Consortium Spanning Tree Protocol Operations Test Suite
+# Version 2.3":
+# Test STP.op.3.3 - Root Bridge Selection: Bridge ID Values
+bridge 0 0x333^0x6000 = a
+bridge 1 0x222^0x7000 = b
+bridge 2 0x111 = a b
+run 1000
+check 0 = rootid:0x333^0x6000 root
+check 1 = rootid:0x333^0x6000 F:20
+check 2 = rootid:0x333^0x6000 F:10 F
diff --git a/tests/test-stp.c b/tests/test-stp.c
new file mode 100644
index 000000000..073368154
--- /dev/null
+++ b/tests/test-stp.c
@@ -0,0 +1,648 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "stp.h"
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include "ofpbuf.h"
+#include "packets.h"
+
+/* One BPDU waiting in a bridge's receive queue. */
+struct bpdu {
+ /* Port of the receiving bridge on which the BPDU arrives. */
+ int port_no;
+ /* Copy of the BPDU bytes, made with xmemdup() by send_bpdu(). */
+ void *data;
+ /* Number of bytes in 'data'. */
+ size_t size;
+};
+
+/* A simulated bridge: one STP instance plus its ports and receive queue. */
+struct bridge {
+ /* Test case that this bridge was added to. */
+ struct test_case *tc;
+ /* Identifier that was passed to stp_create(). */
+ int id;
+ /* Visited mark used while printing the "tree" output. */
+ bool reached;
+
+ /* STP instance returned by stp_create(). */
+ struct stp *stp;
+
+ /* LAN attached to each port; NULL for a port with no LAN. */
+ struct lan *ports[STP_MAX_PORTS];
+ /* Number of ports created so far. */
+ int n_ports;
+
+ /* Queue of received BPDUs.  send_bpdu() stores at 'rxq_head' and
+  * simulate() drains from 'rxq_tail'; both counters only increase and are
+  * reduced modulo RXQ_SIZE to index 'rxq'. */
+#define RXQ_SIZE 16
+ struct bpdu rxq[RXQ_SIZE];
+ int rxq_head, rxq_tail;
+};
+
+/* One attachment point on a LAN: a particular port of a particular bridge. */
+struct lan_conn {
+ struct bridge *bridge;
+ int port_no;
+};
+
+/* A simulated LAN segment that bridge ports can be connected to. */
+struct lan {
+ /* Test case that this LAN was added to. */
+ struct test_case *tc;
+ /* Name of the LAN (a copy made with xstrdup()). */
+ const char *name;
+ /* Visited mark used while printing the "tree" output. */
+ bool reached;
+ /* Bridge ports currently attached; the first 'n_conns' entries are valid. */
+ struct lan_conn conns[16];
+ int n_conns;
+};
+
+/* The whole simulated network: every bridge and LAN created so far. */
+struct test_case {
+ /* The first 'n_bridges' entries are valid. */
+ struct bridge *bridges[16];
+ int n_bridges;
+ /* The first 'n_lans' entries are valid; main() creates 26, named "a"
+  * through "z". */
+ struct lan *lans[26];
+ int n_lans;
+};
+
+/* Script parser state. */
+/* Name of the input file, taken from argv[1]. */
+static const char *file_name;
+/* Number of the line being parsed, counting from 1. */
+static int line_number;
+/* Text of the line being parsed. */
+static char line[128];
+/* 'pos' is the scan position within 'line'; 'token' is the current token,
+ * or NULL once the end of the line has been reached. */
+static char *pos, *token;
+/* Number of times warn() has been called. */
+static int n_warnings;
+
+/* Allocates and returns a test case that has no bridges and no LANs yet. */
+static struct test_case *
+new_test_case(void)
+{
+    struct test_case *tc;
+
+    tc = xmalloc(sizeof *tc);
+    tc->n_lans = 0;
+    tc->n_bridges = 0;
+    return tc;
+}
+
+/* Transmit callback registered with stp_create().  'b_' is the sending
+ * struct bridge and 'port_no' the port that the BPDU leaves from.  A copy of
+ * the BPDU (the bytes from pkt->l3 to the tail of 'pkt') is appended to the
+ * receive queue of every other connection on the LAN attached to that port.
+ * 'pkt' is always destroyed. */
+static void
+send_bpdu(struct ofpbuf *pkt, int port_no, void *b_)
+{
+    struct bridge *sender = b_;
+    struct lan *lan;
+
+    assert(port_no < sender->n_ports);
+    lan = sender->ports[port_no];
+    if (lan) {
+        const void *payload = pkt->l3;
+        size_t payload_len = (char *) ofpbuf_tail(pkt) - (char *) payload;
+        struct lan_conn *conn;
+
+        for (conn = lan->conns; conn < &lan->conns[lan->n_conns]; conn++) {
+            struct bridge *dst;
+            struct bpdu *slot;
+
+            if (conn->bridge == sender && conn->port_no == port_no) {
+                /* Don't deliver the BPDU back to the port that sent it. */
+                continue;
+            }
+
+            dst = conn->bridge;
+            slot = &dst->rxq[dst->rxq_head++ % RXQ_SIZE];
+            assert(dst->rxq_head - dst->rxq_tail <= RXQ_SIZE);
+            slot->data = xmemdup(payload, payload_len);
+            slot->size = payload_len;
+            slot->port_no = conn->port_no;
+        }
+    }
+    ofpbuf_delete(pkt);
+}
+
+/* Creates a bridge whose STP instance is named "stp<id in hex>" and uses
+ * 'id' as its identifier, appends it to 'tc', and returns it.  The bridge
+ * starts with no ports and an empty receive queue. */
+static struct bridge *
+new_bridge(struct test_case *tc, int id)
+{
+    char stp_name[16];
+    struct bridge *bridge = xmalloc(sizeof *bridge);
+
+    bridge->tc = tc;
+    bridge->id = id;
+    snprintf(stp_name, sizeof stp_name, "stp%x", id);
+    bridge->stp = stp_create(stp_name, id, send_bpdu, bridge);
+    assert(tc->n_bridges < ARRAY_SIZE(tc->bridges));
+    bridge->rxq_head = bridge->rxq_tail = 0;
+    bridge->n_ports = 0;
+    tc->bridges[tc->n_bridges] = bridge;
+    tc->n_bridges++;
+    return bridge;
+}
+
+/* Creates a LAN with a copy of 'name' and no connections, appends it to
+ * 'tc', and returns it. */
+static struct lan *
+new_lan(struct test_case *tc, const char *name)
+{
+    struct lan *lan;
+
+    lan = xmalloc(sizeof *lan);
+    lan->n_conns = 0;
+    lan->name = xstrdup(name);
+    lan->tc = tc;
+    assert(tc->n_lans < ARRAY_SIZE(tc->lans));
+    tc->lans[tc->n_lans] = lan;
+    tc->n_lans++;
+    return lan;
+}
+
+/* Moves port 'port_no' of bridge 'b' from whatever LAN it is attached to
+ * (if any) onto 'new_lan'.  A null 'new_lan' leaves the port disconnected.
+ * Does nothing if the port is already on 'new_lan'. */
+static void
+reconnect_port(struct bridge *b, int port_no, struct lan *new_lan)
+{
+    struct lan *prev_lan;
+
+    assert(port_no < b->n_ports);
+    prev_lan = b->ports[port_no];
+    if (prev_lan == new_lan) {
+        return;
+    }
+
+    /* Remove this port's entry from the previous LAN's connection list. */
+    if (prev_lan) {
+        int idx;
+
+        for (idx = 0; idx < prev_lan->n_conns; idx++) {
+            struct lan_conn *conn = &prev_lan->conns[idx];
+
+            if (conn->bridge != b || conn->port_no != port_no) {
+                continue;
+            }
+            prev_lan->n_conns--;
+            memmove(conn, conn + 1,
+                    sizeof *conn * (prev_lan->n_conns - idx));
+            break;
+        }
+    }
+
+    /* Append an entry for this port to the new LAN's connection list. */
+    b->ports[port_no] = new_lan;
+    if (new_lan) {
+        int slot = new_lan->n_conns;
+
+        new_lan->n_conns = slot + 1;
+        assert(slot < ARRAY_SIZE(new_lan->conns));
+        new_lan->conns[slot].port_no = port_no;
+        new_lan->conns[slot].bridge = b;
+    }
+}
+
+/* Adds the next consecutive port to bridge 'b', gives it 'path_cost',
+ * enables it, and attaches it to 'lan' (which may be null for a port with
+ * no LAN). */
+static void
+new_port(struct bridge *b, struct lan *lan, int path_cost)
+{
+    int port_no;
+    struct stp_port *port;
+
+    port_no = b->n_ports;
+    b->n_ports++;
+    port = stp_get_port(b->stp, port_no);
+    assert(port_no < ARRAY_SIZE(b->ports));
+    b->ports[port_no] = NULL;
+    stp_port_set_path_cost(port, path_cost);
+    stp_port_enable(port);
+    reconnect_port(b, port_no, lan);
+}
+
+/* Prints, for every bridge in 'tc', its name (marking the root bridge) and
+ * then one line per port giving the attached LAN, the port's STP state and,
+ * for the root port, the root path cost. */
+static void
+dump(struct test_case *tc)
+{
+    int bridge_idx;
+
+    for (bridge_idx = 0; bridge_idx < tc->n_bridges; bridge_idx++) {
+        struct bridge *b = tc->bridges[bridge_idx];
+        struct stp *stp = b->stp;
+        int port_no;
+
+        printf("%s:", stp_get_name(stp));
+        if (stp_is_root_bridge(stp)) {
+            printf(" root");
+        }
+        putchar('\n');
+
+        for (port_no = 0; port_no < b->n_ports; port_no++) {
+            struct stp_port *port = stp_get_port(stp, port_no);
+            enum stp_state state = stp_port_get_state(port);
+            struct lan *lan = b->ports[port_no];
+
+            printf("\tport %d", port_no);
+            if (!lan) {
+                printf(" (disconnected)");
+            } else {
+                printf(" (lan %s)", lan->name);
+            }
+            printf(": %s", stp_state_name(state));
+            if (port == stp_get_root_port(stp)) {
+                printf(" (root port, root_path_cost=%u)",
+                       stp_get_root_path_cost(stp));
+            }
+            putchar('\n');
+        }
+    }
+}
+
+static void dump_lan_tree(struct test_case *, struct lan *, int level);
+
+/* Prints bridge 'b' indented by 'level' tabs and then, one level deeper,
+ * the subtree under each LAN reached through one of its forwarding ports.
+ * A bridge that was already visited is skipped. */
+static void
+dump_bridge_tree(struct test_case *tc, struct bridge *b, int level)
+{
+    int indent, port_no;
+
+    if (b->reached) {
+        return;
+    }
+    b->reached = true;
+
+    for (indent = level; indent > 0; indent--) {
+        putchar('\t');
+    }
+    printf("%s\n", stp_get_name(b->stp));
+
+    for (port_no = 0; port_no < b->n_ports; port_no++) {
+        struct lan *lan = b->ports[port_no];
+        enum stp_state state
+            = stp_port_get_state(stp_get_port(b->stp, port_no));
+
+        if (state != STP_FORWARDING || !lan) {
+            continue;
+        }
+        dump_lan_tree(tc, lan, level + 1);
+    }
+}
+
+/* Prints LAN 'lan' indented by 'level' tabs and then, one level deeper, the
+ * subtree under each bridge connected to it.  A LAN that was already
+ * visited is skipped. */
+static void
+dump_lan_tree(struct test_case *tc, struct lan *lan, int level)
+{
+    int indent, conn_idx;
+
+    if (lan->reached) {
+        return;
+    }
+    lan->reached = true;
+
+    for (indent = level; indent > 0; indent--) {
+        putchar('\t');
+    }
+    printf("%s\n", lan->name);
+
+    for (conn_idx = 0; conn_idx < lan->n_conns; conn_idx++) {
+        dump_bridge_tree(tc, lan->conns[conn_idx].bridge, level + 1);
+    }
+}
+
+/* Prints the forwarding topology of 'tc' as an indented tree, starting from
+ * each bridge that reports itself as the root bridge. */
+static void
+tree(struct test_case *tc)
+{
+    int idx;
+
+    /* Clear the visited marks left over from any earlier dump. */
+    for (idx = 0; idx < tc->n_bridges; idx++) {
+        tc->bridges[idx]->reached = false;
+    }
+    for (idx = 0; idx < tc->n_lans; idx++) {
+        tc->lans[idx]->reached = false;
+    }
+
+    for (idx = 0; idx < tc->n_bridges; idx++) {
+        struct bridge *candidate = tc->bridges[idx];
+
+        if (stp_is_root_bridge(candidate->stp)) {
+            dump_bridge_tree(tc, candidate, 0);
+        }
+    }
+}
+
+/* Runs the simulated network in 'tc' forward in steps of 'granularity' until
+ * 1000 * 180 time units have elapsed (presumably milliseconds, i.e. 180
+ * seconds; confirm against stp_tick()).
+ *
+ * Each step first ticks every bridge's STP instance and then delivers queued
+ * BPDUs.  Delivery is repeated, up to 'granularity' rounds, for as long as a
+ * round delivered anything, because processing a BPDU can queue further
+ * BPDUs on other bridges.
+ *
+ * 'granularity' must be positive. */
+static void
+simulate(struct test_case *tc, int granularity)
+{
+    int time;
+
+    /* A step of zero or less would never advance 'time'. */
+    assert(granularity > 0);
+
+    for (time = 0; time < 1000 * 180; time += granularity) {
+        int round_trips;
+        int i;
+
+        for (i = 0; i < tc->n_bridges; i++) {
+            stp_tick(tc->bridges[i]->stp, granularity);
+        }
+        for (round_trips = 0; round_trips < granularity; round_trips++) {
+            bool any = false;
+            for (i = 0; i < tc->n_bridges; i++) {
+                struct bridge *b = tc->bridges[i];
+                for (; b->rxq_tail != b->rxq_head; b->rxq_tail++) {
+                    struct bpdu *bpdu = &b->rxq[b->rxq_tail % RXQ_SIZE];
+                    stp_received_bpdu(stp_get_port(b->stp, bpdu->port_no),
+                                      bpdu->data, bpdu->size);
+                    /* The queue entry owns the copy that send_bpdu() made
+                     * with xmemdup(); release it once it is delivered. */
+                    free(bpdu->data);
+                    bpdu->data = NULL;
+                    any = true;
+                }
+            }
+            if (!any) {
+                break;
+            }
+        }
+    }
+}
+
+static void
+err(const char *message, ...)
+ PRINTF_FORMAT(1, 2)
+ NO_RETURN;
+
+/* Reports a fatal script error on stderr, prefixed with the file name, line
+ * number and column of the current scan position, then exits with
+ * EXIT_FAILURE.  'message' is a printf-style format string. */
+static void
+err(const char *message, ...)
+{
+    va_list ap;
+
+    va_start(ap, message);
+    fprintf(stderr, "%s:%d:%td: ", file_name, line_number, pos - line);
+    vfprintf(stderr, message, ap);
+    fputc('\n', stderr);
+    va_end(ap);
+
+    exit(EXIT_FAILURE);
+}
+
+static void
+warn(const char *message, ...)
+ PRINTF_FORMAT(1, 2);
+
+/* Reports a failed check on stderr, prefixed with the file name and line
+ * number, and counts it in 'n_warnings'.  'message' is a printf-style format
+ * string.  Unlike err(), this returns to the caller. */
+static void
+warn(const char *message, ...)
+{
+    va_list ap;
+
+    va_start(ap, message);
+    fprintf(stderr, "%s:%d: ", file_name, line_number);
+    vfprintf(stderr, message, ap);
+    fputc('\n', stderr);
+    va_end(ap);
+
+    n_warnings++;
+}
+
+/* Scans the next token starting at 'pos', stores a copy of it in 'token',
+ * and advances 'pos' past it.  Leading white space is skipped.  A token is
+ * one of:
+ *
+ *   - a run of letters,
+ *   - a run of decimal digits,
+ *   - "0x" or "0X" followed by any number of hexadecimal digits, or
+ *   - any other single character.
+ *
+ * Returns true if a token was found.  At the end of the line, sets 'token'
+ * to NULL and returns false.
+ *
+ * 'token' is heap-allocated and owned by this function: the previous token
+ * is freed on every call, so callers must not keep a pointer to it across
+ * calls. */
+static bool
+get_token(void)
+{
+    char *start;
+
+    while (isspace((unsigned char) *pos)) {
+        pos++;
+    }
+    if (*pos == '\0') {
+        /* Release the last token of the line instead of leaking it. */
+        free(token);
+        token = NULL;
+        return false;
+    }
+
+    start = pos;
+    if (isalpha((unsigned char) *pos)) {
+        while (isalpha((unsigned char) *++pos)) {
+            continue;
+        }
+    } else if (isdigit((unsigned char) *pos)) {
+        if (*pos == '0' && (pos[1] == 'x' || pos[1] == 'X')) {
+            pos += 2;
+            while (isxdigit((unsigned char) *pos)) {
+                pos++;
+            }
+        } else {
+            while (isdigit((unsigned char) *++pos)) {
+                continue;
+            }
+        }
+    } else {
+        pos++;
+    }
+
+    free(token);
+    token = xmemdup0(start, pos - start);
+    return true;
+}
+
+/* If the current token starts with a digit, converts it with strtol() (base
+ * 0, so "0x" selects hexadecimal), stores the result in '*intp', moves on to
+ * the next token, and returns true.  Otherwise returns false and leaves the
+ * token and scan position alone. */
+static bool
+get_int(int *intp)
+{
+    if (!token || !isdigit((unsigned char) *token)) {
+        return false;
+    }
+
+    *intp = strtol(token, NULL, 0);
+    get_token();
+    return true;
+}
+
+/* If the current token is exactly 'want', moves on to the next token and
+ * returns true.  Otherwise returns false without consuming anything. */
+static bool
+match(const char *want)
+{
+    if (!token || strcmp(want, token) != 0) {
+        return false;
+    }
+
+    get_token();
+    return true;
+}
+
+/* Like get_int(), but a missing integer is a fatal script error.  Returns
+ * the integer. */
+static int
+must_get_int(void)
+{
+    int value;
+    bool found = get_int(&value);
+
+    if (!found) {
+        err("expected integer");
+    }
+    return value;
+}
+
+/* Like match(), but a token other than 'want' is a fatal script error. */
+static void
+must_match(const char *want)
+{
+    bool found = match(want);
+
+    if (!found) {
+        err("expected \"%s\"", want);
+    }
+}
+
+/* Interprets the STP test script named by argv[1], one line at a time.
+ * Everything from '#' to the end of a line is a comment.  Commands:
+ *
+ *   bridge N [ID] [^PRIORITY] [= PORT...]
+ *       Refers to bridge N, creating it with identifier ID if N is the next
+ *       unused bridge number.  ^PRIORITY sets the bridge priority.  After
+ *       '=', each PORT configures the next port in order:
+ *           X        disable the port (ports with no token are disabled too)
+ *           _        leave the port as it is
+ *           0        enable the port with no LAN attached
+ *           a...z    enable the port and attach it to that LAN
+ *       A LAN or 0 may be followed by :COST (default 10) and ^PRIORITY.
+ *
+ *   run N    Simulates the network with a time step of N.
+ *   dump     Prints every bridge and port with its state.
+ *   tree     Prints the forwarding topology as a tree.
+ *
+ *   check N = [rootid:ID[^PRIORITY]] (root | STATE...)
+ *       Verifies bridge N.  "root" expects N to be the root bridge.
+ *       Otherwise each STATE is the expected state of the next port: D, B,
+ *       Li, Le, F, or _ to skip the port; ports with no token are expected
+ *       to be disabled.  F:COST additionally expects that port to be the
+ *       root port with root path cost COST.  Any mismatch is reported and
+ *       makes the program exit with failure after the line's checks.
+ *
+ * Returns 0 if the whole script was processed without a failed check. */
+int
+main(int argc, char *argv[])
+{
+    struct test_case *tc;
+    FILE *input_file;
+    int i;
+
+    if (argc != 2) {
+        ovs_fatal(0, "usage: test-stp INPUT.STP\n");
+    }
+    file_name = argv[1];
+
+    input_file = fopen(file_name, "r");
+    if (!input_file) {
+        ovs_fatal(errno, "error opening \"%s\"", file_name);
+    }
+
+    /* Pre-create LANs "a" through "z" so that scripts can refer to any of
+     * them by letter. */
+    tc = new_test_case();
+    for (i = 0; i < 26; i++) {
+        char name[2];
+        name[0] = 'a' + i;
+        name[1] = '\0';
+        new_lan(tc, name);
+    }
+
+    for (line_number = 1; fgets(line, sizeof line, input_file);
+         line_number++)
+    {
+        char *newline, *hash;
+
+        /* Strip the newline and any comment. */
+        newline = strchr(line, '\n');
+        if (newline) {
+            *newline = '\0';
+        }
+        hash = strchr(line, '#');
+        if (hash) {
+            *hash = '\0';
+        }
+
+        pos = line;
+        if (!get_token()) {
+            /* Blank or comment-only line. */
+            continue;
+        }
+        if (match("bridge")) {
+            struct bridge *bridge;
+            int bridge_no, port_no;
+
+            bridge_no = must_get_int();
+            if (bridge_no < tc->n_bridges) {
+                bridge = tc->bridges[bridge_no];
+            } else if (bridge_no == tc->n_bridges) {
+                bridge = new_bridge(tc, must_get_int());
+            } else {
+                err("bridges must be numbered consecutively from 0");
+            }
+            if (match("^")) {
+                stp_set_bridge_priority(bridge->stp, must_get_int());
+            }
+
+            if (match("=")) {
+                for (port_no = 0; port_no < STP_MAX_PORTS; port_no++) {
+                    struct stp_port *p = stp_get_port(bridge->stp, port_no);
+                    if (!token || match("X")) {
+                        stp_port_disable(p);
+                    } else if (match("_")) {
+                        /* Nothing to do. */
+                    } else {
+                        struct lan *lan;
+                        int path_cost;
+
+                        if (!strcmp(token, "0")) {
+                            lan = NULL;
+                        } else if (strlen(token) == 1
+                                   && *token >= 'a' && *token <= 'z') {
+                            /* The explicit range check keeps the index
+                             * within the 26 LANs created above. */
+                            lan = tc->lans[*token - 'a'];
+                        } else {
+                            err("%s is not a valid LAN name "
+                                "(0 or a lowercase letter)", token);
+                        }
+                        get_token();
+
+                        path_cost = match(":") ? must_get_int() : 10;
+                        if (port_no < bridge->n_ports) {
+                            stp_port_set_path_cost(p, path_cost);
+                            stp_port_enable(p);
+                            reconnect_port(bridge, port_no, lan);
+                        } else if (port_no == bridge->n_ports) {
+                            new_port(bridge, lan, path_cost);
+                        } else {
+                            err("ports must be numbered consecutively");
+                        }
+                        if (match("^")) {
+                            stp_port_set_priority(p, must_get_int());
+                        }
+                    }
+                }
+            }
+        } else if (match("run")) {
+            simulate(tc, must_get_int());
+        } else if (match("dump")) {
+            dump(tc);
+        } else if (match("tree")) {
+            tree(tc);
+        } else if (match("check")) {
+            struct bridge *b;
+            struct stp *stp;
+            int bridge_no, port_no;
+
+            bridge_no = must_get_int();
+            if (bridge_no >= tc->n_bridges) {
+                err("no bridge numbered %d", bridge_no);
+            }
+            b = tc->bridges[bridge_no];
+            stp = b->stp;
+
+            must_match("=");
+
+            if (match("rootid")) {
+                uint64_t rootid;
+                must_match(":");
+                rootid = must_get_int();
+                /* The priority occupies the bits from 48 up; 0x8000 is used
+                 * when the script gives none. */
+                if (match("^")) {
+                    rootid |= (uint64_t) must_get_int() << 48;
+                } else {
+                    rootid |= UINT64_C(0x8000) << 48;
+                }
+                if (stp_get_designated_root(stp) != rootid) {
+                    warn("%s: root %"PRIx64", not %"PRIx64,
+                         stp_get_name(stp), stp_get_designated_root(stp),
+                         rootid);
+                }
+            }
+
+            if (match("root")) {
+                if (stp_get_root_path_cost(stp)) {
+                    warn("%s: root path cost of root is %u but should be 0",
+                         stp_get_name(stp), stp_get_root_path_cost(stp));
+                }
+                if (!stp_is_root_bridge(stp)) {
+                    warn("%s: root is %"PRIx64", not %"PRIx64,
+                         stp_get_name(stp),
+                         stp_get_designated_root(stp), stp_get_bridge_id(stp));
+                }
+                for (port_no = 0; port_no < b->n_ports; port_no++) {
+                    struct stp_port *p = stp_get_port(stp, port_no);
+                    enum stp_state state = stp_port_get_state(p);
+                    if (!(state & (STP_DISABLED | STP_FORWARDING))) {
+                        warn("%s: root port %d in state %s",
+                             stp_get_name(b->stp), port_no,
+                             stp_state_name(state));
+                    }
+                }
+            } else {
+                for (port_no = 0; port_no < STP_MAX_PORTS; port_no++) {
+                    struct stp_port *p = stp_get_port(stp, port_no);
+                    enum stp_state state;
+                    if (token == NULL || match("D")) {
+                        state = STP_DISABLED;
+                    } else if (match("B")) {
+                        state = STP_BLOCKING;
+                    } else if (match("Li")) {
+                        state = STP_LISTENING;
+                    } else if (match("Le")) {
+                        state = STP_LEARNING;
+                    } else if (match("F")) {
+                        state = STP_FORWARDING;
+                    } else if (match("_")) {
+                        continue;
+                    } else {
+                        err("unknown port state %s", token);
+                    }
+                    if (stp_port_get_state(p) != state) {
+                        warn("%s port %d: state is %s but should be %s",
+                             stp_get_name(stp), port_no,
+                             stp_state_name(stp_port_get_state(p)),
+                             stp_state_name(state));
+                    }
+                    if (state == STP_FORWARDING) {
+                        struct stp_port *root_port = stp_get_root_port(stp);
+                        if (match(":")) {
+                            int root_path_cost = must_get_int();
+                            if (p != root_port) {
+                                warn("%s: port %d is not the root port",
+                                     stp_get_name(stp), port_no);
+                                if (!root_port) {
+                                    warn("%s: (there is no root port)",
+                                         stp_get_name(stp));
+                                } else {
+                                    warn("%s: (port %d is the root port)",
+                                         stp_get_name(stp),
+                                         stp_port_no(root_port));
+                                }
+                            } else if (root_path_cost
+                                       != stp_get_root_path_cost(stp)) {
+                                warn("%s: root path cost is %u, should be %d",
+                                     stp_get_name(stp),
+                                     stp_get_root_path_cost(stp),
+                                     root_path_cost);
+                            }
+                        } else if (p == root_port) {
+                            warn("%s: port %d is the root port but "
+                                 "not expected to be",
+                                 stp_get_name(stp), port_no);
+                        }
+                    }
+                }
+            }
+            if (n_warnings) {
+                exit(EXIT_FAILURE);
+            }
+        }
+        if (get_token()) {
+            err("trailing garbage on line");
+        }
+    }
+    fclose(input_file);
+
+    return 0;
+}
diff --git a/tests/test-stp.sh b/tests/test-stp.sh
new file mode 100755
index 000000000..fd6acf544
--- /dev/null
+++ b/tests/test-stp.sh
@@ -0,0 +1,7 @@
+#! /bin/sh
+set -e
+# stp_files (space-separated list), srcdir, SUPERVISOR: set by the caller.
+for d in ${stp_files}; do
+    echo "Testing $d..."
+    $SUPERVISOR ./tests/test-stp "${srcdir}/$d"
+done
diff --git a/tests/test-type-props.c b/tests/test-type-props.c
new file mode 100644
index 000000000..67dabae86
--- /dev/null
+++ b/tests/test-type-props.c
@@ -0,0 +1,41 @@
+#include <config.h>
+#include "type-props.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Evaluates EXPRESSION.  If it is false, reports the failing expression
+ * with its file and line on stderr and exits with EXIT_FAILURE.
+ *
+ * Wrapped in do { } while (0) so that "MUST_SUCCEED(x);" is exactly one
+ * statement, safe even as the unbraced body of an if/else. */
+#define MUST_SUCCEED(EXPRESSION)                        \
+    do {                                                \
+        if (!(EXPRESSION)) {                            \
+            fprintf(stderr, "%s:%d: %s failed\n",       \
+                    __FILE__, __LINE__, #EXPRESSION);   \
+            exit(EXIT_FAILURE);                         \
+        }                                               \
+    } while (0)
+
+/* Checks that the type-props.h macros report 'type' as an integer type with
+ * the given signedness ('is_signed'), 'maximum', and 'minimum'. */
+#define TEST_TYPE(type, minimum, maximum, is_signed) \
+ MUST_SUCCEED(TYPE_IS_INTEGER(type)); \
+ MUST_SUCCEED(TYPE_IS_SIGNED(type) == is_signed); \
+ MUST_SUCCEED(TYPE_MAXIMUM(type) == maximum); \
+ MUST_SUCCEED(TYPE_MINIMUM(type) == minimum);
+
+/* Checks the type-props.h macros against the limit constants for each
+ * standard integer type, and checks that the floating-point types are not
+ * reported as integers.  Exits with failure at the first mismatch. */
+int
+main (void)
+{
+ /* Whether plain char is signed is decided by the sign of CHAR_MIN. */
+ TEST_TYPE(char, CHAR_MIN, CHAR_MAX, (CHAR_MIN < 0));
+
+ TEST_TYPE(signed char, SCHAR_MIN, SCHAR_MAX, 1);
+ TEST_TYPE(short int, SHRT_MIN, SHRT_MAX, 1);
+ TEST_TYPE(int, INT_MIN, INT_MAX, 1);
+ TEST_TYPE(long int, LONG_MIN, LONG_MAX, 1);
+ TEST_TYPE(long long int, LLONG_MIN, LLONG_MAX, 1);
+
+ TEST_TYPE(unsigned char, 0, UCHAR_MAX, 0);
+ TEST_TYPE(unsigned short int, 0, USHRT_MAX, 0);
+ TEST_TYPE(unsigned int, 0, UINT_MAX, 0);
+ TEST_TYPE(unsigned long int, 0, ULONG_MAX, 0);
+ TEST_TYPE(unsigned long long int, 0, ULLONG_MAX, 0);
+
+ MUST_SUCCEED(!(TYPE_IS_INTEGER(float)));
+ MUST_SUCCEED(!(TYPE_IS_INTEGER(double)));
+ MUST_SUCCEED(!(TYPE_IS_INTEGER(long double)));
+
+ return 0;
+}
diff --git a/third-party/.gitignore b/third-party/.gitignore
new file mode 100644
index 000000000..b336cc7ce
--- /dev/null
+++ b/third-party/.gitignore
@@ -0,0 +1,2 @@
+/Makefile
+/Makefile.in
diff --git a/third-party/README b/third-party/README
new file mode 100644
index 000000000..15f4d647e
--- /dev/null
+++ b/third-party/README
@@ -0,0 +1,35 @@
+This directory contains third-party software that may be useful for
+debugging.
+
+tcpdump
+-------
+The "ofp-tcpdump.patch" patch adds the ability to parse OpenFlow
+messages to tcpdump. These instructions assume tcpdump 3.9.8, but the
+patch should also apply to other versions that are not substantially
+different. To begin, download tcpdump and apply the patch:
+
+ wget http://www.tcpdump.org/release/tcpdump-3.9.8.tar.gz
+ tar xzf tcpdump-3.9.8.tar.gz
+ ln -s tcpdump-3.9.8 tcpdump
+ patch -p0 < ofp-tcpdump.patch
+
+Then build the new version of tcpdump:
+
+ cd tcpdump
+ ./configure
+ make
+
+Clearly, tcpdump can only parse unencrypted packets, so you will need to
+connect the controller and datapath using plain TCP. To look at the
+traffic, tcpdump will be started in a manner similar to the following:
+
+ sudo ./tcpdump -s0 -i eth0 port 6633
+
+The "-s0" flag indicates that tcpdump should capture the entire packet.
+If the OpenFlow message is not received in its entirety, "[|openflow]" will
+be printed instead of the OpenFlow message contents.
+
+The verbosity of the output may be increased by adding additional "-v"
+flags. If "-vvv" is used, the raw OpenFlow data is also printed in
+hex and ASCII.
diff --git a/third-party/automake.mk b/third-party/automake.mk
new file mode 100644
index 000000000..02636bb53
--- /dev/null
+++ b/third-party/automake.mk
@@ -0,0 +1,3 @@
+EXTRA_DIST += \
+ third-party/README \
+ third-party/ofp-tcpdump.patch
diff --git a/third-party/ofp-tcpdump.patch b/third-party/ofp-tcpdump.patch
new file mode 100644
index 000000000..d9a23cb1e
--- /dev/null
+++ b/third-party/ofp-tcpdump.patch
@@ -0,0 +1,109 @@
+--- tcpdump/interface.h 2007-06-13 18:03:20.000000000 -0700
++++ tcpdump/interface.h 2008-04-15 18:28:55.000000000 -0700
+@@ -148,7 +148,8 @@
+
+ extern const char *dnaddr_string(u_short);
+
+-extern void error(const char *, ...)
++#define error(fmt, args...) tcpdump_error(fmt, ## args)
++extern void tcpdump_error(const char *, ...)
+ __attribute__((noreturn, format (printf, 1, 2)));
+ extern void warning(const char *, ...) __attribute__ ((format (printf, 1, 2)));
+
+@@ -176,6 +177,7 @@
+ extern void hex_print_with_offset(const char *, const u_char *, u_int, u_int);
+ extern void hex_print(const char *, const u_char *, u_int);
+ extern void telnet_print(const u_char *, u_int);
++extern void openflow_print(const u_char *, u_int);
+ extern int ether_encap_print(u_short, const u_char *, u_int, u_int, u_short *);
+ extern int llc_print(const u_char *, u_int, u_int, const u_char *,
+ const u_char *, u_short *);
+--- tcpdump/Makefile.in 2007-09-25 18:59:52.000000000 -0700
++++ tcpdump/Makefile.in 2009-05-11 15:59:28.000000000 -0700
+@@ -49,10 +49,10 @@
+ CFLAGS = $(CCOPT) $(DEFS) $(INCLS)
+
+ # Standard LDFLAGS
+-LDFLAGS = @LDFLAGS@
++LDFLAGS = @LDFLAGS@ -L../../lib
+
+ # Standard LIBS
+-LIBS = @LIBS@
++LIBS = @LIBS@ -lopenvswitch
+
+ INSTALL = @INSTALL@
+ INSTALL_PROGRAM = @INSTALL_PROGRAM@
+@@ -87,7 +87,8 @@
+ print-slow.c print-snmp.c print-stp.c print-sunatm.c print-sunrpc.c \
+ print-symantec.c print-syslog.c print-tcp.c print-telnet.c print-tftp.c \
+ print-timed.c print-token.c print-udp.c print-vjc.c print-vrrp.c \
+- print-wb.c print-zephyr.c setsignal.c tcpdump.c util.c
++ print-wb.c print-zephyr.c setsignal.c tcpdump.c util.c \
++ print-openflow.c
+
+ LOCALSRC = @LOCALSRC@
+ GENSRC = version.c
+--- tcpdump/print-openflow.c 1969-12-31 16:00:00.000000000 -0800
++++ tcpdump/print-openflow.c 2009-05-11 15:38:41.000000000 -0700
+@@ -0,0 +1,40 @@
++/* Copyright (C) 2007, 2008, 2009 Nicira Networks.
++ *
++ * Permission to use, copy, modify, and/or distribute this software for any
++ * purpose with or without fee is hereby granted, provided that the above
++ * copyright notice and this permission notice appear in all copies.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
++ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
++ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
++ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
++ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
++ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
++ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
++ */
++
++#ifdef HAVE_CONFIG_H
++#include "config.h"
++#endif
++
++#include <stdlib.h>
++
++#include "interface.h"
++#include "../../include/openflow/openflow.h"
++#include "../../lib/ofp-print.h"
++
++/* Prints the OpenFlow message at 'sp', or "[|openflow]" if it is truncated. */
++void
++openflow_print(const u_char *sp, u_int length)
++{
++	const struct ofp_header *ofp = (struct ofp_header *)sp;
++
++	if (!TTEST2(*sp, ntohs(ofp->length)))
++		goto trunc;
++
++	ofp_print(stdout, sp, length, vflag);
++	return;
++
++trunc:
++	printf("[|openflow]");
++}
+--- tcpdump/print-tcp.c 2006-09-19 12:07:57.000000000 -0700
++++ tcpdump/print-tcp.c 2009-05-11 15:38:25.000000000 -0700
+@@ -52,6 +52,8 @@
+
+ #include "nameser.h"
+
++#include "../../include/openflow/openflow.h"
++
+ #ifdef HAVE_LIBCRYPTO
+ #include <openssl/md5.h>
+
+@@ -680,7 +682,8 @@
+ }
+ else if (length > 0 && (sport == LDP_PORT || dport == LDP_PORT)) {
+ ldp_print(bp, length);
+- }
++ } else if (sport == OFP_TCP_PORT || dport == OFP_TCP_PORT)
++ openflow_print(bp, length);
+ }
+ return;
+ bad:
diff --git a/utilities/.gitignore b/utilities/.gitignore
new file mode 100644
index 000000000..32a7f2eb7
--- /dev/null
+++ b/utilities/.gitignore
@@ -0,0 +1,22 @@
+/Makefile
+/Makefile.in
+/nlmon
+/ovs-appctl
+/ovs-appctl.8
+/ovs-cfg-mod
+/ovs-cfg-mod.8
+/ovs-controller
+/ovs-controller.8
+/ovs-discover
+/ovs-discover.8
+/ovs-dpctl
+/ovs-dpctl.8
+/ovs-kill
+/ovs-kill.8
+/ovs-ofctl
+/ovs-ofctl.8
+/ovs-parse-leaks
+/ovs-pki
+/ovs-pki-cgi
+/ovs-pki.8
+/ovs-wdt
diff --git a/utilities/automake.mk b/utilities/automake.mk
new file mode 100644
index 000000000..97b827acc
--- /dev/null
+++ b/utilities/automake.mk
@@ -0,0 +1,74 @@
+bin_PROGRAMS += \
+ utilities/ovs-appctl \
+ utilities/ovs-cfg-mod \
+ utilities/ovs-controller \
+ utilities/ovs-discover \
+ utilities/ovs-dpctl \
+ utilities/ovs-kill \
+ utilities/ovs-ofctl \
+ utilities/ovs-wdt
+noinst_PROGRAMS += utilities/nlmon
+bin_SCRIPTS += utilities/ovs-pki
+noinst_SCRIPTS += utilities/ovs-pki-cgi utilities/ovs-parse-leaks
+dist_sbin_SCRIPTS += utilities/ovs-monitor
+
+EXTRA_DIST += \
+ utilities/ovs-appctl.8.in \
+ utilities/ovs-cfg-mod.8.in \
+ utilities/ovs-controller.8.in \
+ utilities/ovs-discover.8.in \
+ utilities/ovs-dpctl.8.in \
+ utilities/ovs-kill.8.in \
+ utilities/ovs-ofctl.8.in \
+ utilities/ovs-parse-leaks.in \
+ utilities/ovs-pki-cgi.in \
+ utilities/ovs-pki.8.in \
+ utilities/ovs-pki.in
+DISTCLEANFILES += \
+ utilities/ovs-appctl.8 \
+ utilities/ovs-cfg-mod.8 \
+ utilities/ovs-controller.8 \
+ utilities/ovs-discover.8 \
+ utilities/ovs-dpctl.8 \
+ utilities/ovs-kill.8 \
+ utilities/ovs-ofctl.8 \
+ utilities/ovs-parse-leaks \
+ utilities/ovs-pki \
+ utilities/ovs-pki.8 \
+ utilities/ovs-pki-cgi
+
+man_MANS += \
+ utilities/ovs-appctl.8 \
+ utilities/ovs-cfg-mod.8 \
+ utilities/ovs-controller.8 \
+ utilities/ovs-discover.8 \
+ utilities/ovs-dpctl.8 \
+ utilities/ovs-kill.8 \
+ utilities/ovs-ofctl.8 \
+ utilities/ovs-pki.8
+
+utilities_ovs_appctl_SOURCES = utilities/ovs-appctl.c
+utilities_ovs_appctl_LDADD = lib/libopenvswitch.a
+
+utilities_ovs_cfg_mod_SOURCES = utilities/ovs-cfg-mod.c
+utilities_ovs_cfg_mod_LDADD = lib/libopenvswitch.a
+
+utilities_ovs_controller_SOURCES = utilities/ovs-controller.c
+utilities_ovs_controller_LDADD = lib/libopenvswitch.a $(FAULT_LIBS) $(SSL_LIBS)
+
+utilities_ovs_discover_SOURCES = utilities/ovs-discover.c
+utilities_ovs_discover_LDADD = lib/libopenvswitch.a
+
+utilities_ovs_dpctl_SOURCES = utilities/ovs-dpctl.c
+utilities_ovs_dpctl_LDADD = lib/libopenvswitch.a $(FAULT_LIBS)
+
+utilities_ovs_kill_SOURCES = utilities/ovs-kill.c
+utilities_ovs_kill_LDADD = lib/libopenvswitch.a
+
+utilities_ovs_ofctl_SOURCES = utilities/ovs-ofctl.c
+utilities_ovs_ofctl_LDADD = lib/libopenvswitch.a $(FAULT_LIBS) $(SSL_LIBS)
+
+utilities_ovs_wdt_SOURCES = utilities/ovs-wdt.c
+
+utilities_nlmon_SOURCES = utilities/nlmon.c
+utilities_nlmon_LDADD = lib/libopenvswitch.a
diff --git a/utilities/nlmon.c b/utilities/nlmon.c
new file mode 100644
index 000000000..eb1be60a7
--- /dev/null
+++ b/utilities/nlmon.c
@@ -0,0 +1,90 @@
+#include <config.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <net/if.h>
+#include <poll.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <stddef.h>
+#include <linux/rtnetlink.h>
+#include "netlink.h"
+#include "ofpbuf.h"
+#include "poll-loop.h"
+#include "timeval.h"
+#include "util.h"
+#include "vlog.h"
+
+/* Attribute policy handed to nl_policy_parse() for each message received on
+ * the RTNLGRP_LINK socket: IFLA_IFNAME is a mandatory string, IFLA_MASTER is
+ * an optional 32-bit value.  The array is indexed by attribute type. */
+static const struct nl_policy rtnlgrp_link_policy[] = {
+ [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
+ [IFLA_MASTER] = { .type = NL_A_U32, .optional = true },
+};
+
+/* Subscribes to rtnetlink RTNLGRP_LINK notifications and, for every message
+ * received, prints the name of the affected network device, the message type
+ * and, when the IFLA_MASTER attribute is present, its value together with the
+ * interface name that index resolves to.  Loops forever; exits only through
+ * ovs_fatal() on a socket error. */
+int
+main(int argc UNUSED, char *argv[])
+{
+    struct nl_sock *rtnl_sock;
+    int retval;
+
+    set_program_name(argv[0]);
+    time_init();
+    vlog_init();
+    vlog_set_levels(VLM_ANY_MODULE, VLF_ANY_FACILITY, VLL_DBG);
+
+    retval = nl_sock_create(NETLINK_ROUTE, RTNLGRP_LINK, 0, 0, &rtnl_sock);
+    if (retval) {
+        ovs_fatal(retval, "could not create rtnetlink socket");
+    }
+
+    for (;;) {
+        struct ofpbuf *msg;
+
+        retval = nl_sock_recv(rtnl_sock, &msg, false);
+        if (!retval) {
+            struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
+            struct nlmsghdr *nlh;
+            const char *type_name;
+
+            /* The ifinfomsg check below also guarantees that the buffer is
+             * long enough to hold the netlink header read through 'nlh'. */
+            nlh = ofpbuf_at(msg, 0, NLMSG_HDRLEN);
+            if (!ofpbuf_at(msg, NLMSG_HDRLEN, sizeof(struct ifinfomsg))) {
+                ovs_error(0, "received bad rtnl message (no ifinfomsg)");
+                ofpbuf_delete(msg);
+                continue;
+            }
+
+            if (!nl_policy_parse(msg, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
+                                 rtnlgrp_link_policy,
+                                 attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
+                ovs_error(0, "received bad rtnl message (policy)");
+                ofpbuf_delete(msg);
+                continue;
+            }
+
+            switch (nlh->nlmsg_type) {
+            case RTM_NEWLINK:
+                type_name = "RTM_NEWLINK";
+                break;
+            case RTM_DELLINK:
+                type_name = "RTM_DELLINK";
+                break;
+            case RTM_GETLINK:
+                type_name = "RTM_GETLINK";
+                break;
+            case RTM_SETLINK:
+                type_name = "RTM_SETLINK";
+                break;
+            default:
+                type_name = "other";
+                break;
+            }
+            printf("netdev %s changed (%s):\n",
+                   nl_attr_get_string(attrs[IFLA_IFNAME]), type_name);
+
+            if (attrs[IFLA_MASTER]) {
+                uint32_t master = nl_attr_get_u32(attrs[IFLA_MASTER]);
+                char master_name[IFNAMSIZ];
+
+                if (!if_indextoname(master, master_name)) {
+                    strcpy(master_name, "unknown");
+                }
+                printf("\tmaster=%"PRIu32" (%s)\n", master, master_name);
+            }
+            ofpbuf_delete(msg);
+        } else if (retval == ENOBUFS) {
+            ovs_error(0, "network monitor socket overflowed");
+        } else if (retval != EAGAIN) {
+            ovs_fatal(retval, "error on network monitor socket");
+        }
+        /* EAGAIN simply means that there is nothing to read yet. */
+
+        nl_sock_wait(rtnl_sock, POLLIN);
+        poll_block();
+    }
+}
+
diff --git a/utilities/ovs-appctl.8.in b/utilities/ovs-appctl.8.in
new file mode 100644
index 000000000..9bf97fd2f
--- /dev/null
+++ b/utilities/ovs-appctl.8.in
@@ -0,0 +1,166 @@
+.\" -*- nroff -*-
+.de IQ
+. br
+. ns
+. IP "\\$1"
+..
+.TH ovs\-appctl 8 "April 2009" "Open vSwitch" "Open vSwitch Manual"
+.ds PN ovs\-appctl
+
+.SH NAME
+ovs\-appctl \- utility for configuring running Open vSwitch daemons
+
+.SH SYNOPSIS
+\fBovs\-appctl\fR [\fB-h\fR | \fB--help\fR] [\fItarget\fR...] [\fIaction\fR...]
+.sp 1
+The available \fItarget\fR options are:
+.br
+[\fB-t\fR \fIsocket\fR | \fB--target=\fIsocket\fR]
+.sp 1
+The available \fIaction\fR options are:
+.br
+[\fB-l\fR | \fB--list\fR] [\fB-s\fR
+\fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]] |
+\fB--set=\fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]]]
+[\fB-r\fR | \fB--reopen\fR]
+[\fB-e\fR | \fB--execute=\fIcommand\fR]
+
+.SH DESCRIPTION
+The \fBovs\-appctl\fR program connects to one or more running
+Open vSwitch daemons (such as \fBovs\-vswitchd\fR(8)), as specified by the
+user, and sends them commands to query or modify their behavior.
+Its primary purpose is currently to adjust daemons' logging levels.
+
+\fBovs\-appctl\fR applies one or more actions to each of one or more
+target processes. Targets may be specified using:
+
+.IP "\fB-t \fIsocket\fR"
+.IQ "\fB--target=\fIsocket\fR"
+The specified \fIsocket\fR must be the name of a Unix domain socket
+for a \fBovs\-appctl\fR-controllable process. If \fIsocket\fR does not
+begin with \fB/\fR, it is treated as relative to \fB@RUNDIR@\fR.
+
+Each Open vSwitch daemon by default creates a socket named
+\fB@RUNDIR@/\fIprogram\fB.\fIpid\fB.ctl\fR, where \fIprogram\fR is
+the program's name (such as \fBovs\-vswitchd\fR) and \fIpid\fR is the
+daemon's PID.
+
+.PP
+The available actions are:
+
+.IP "\fB-l\fR"
+.IQ "\fB--list\fR"
+Print the list of known logging modules and their current levels to
+stdout.
+
+.IP "\fB-s\fR \fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]]"
+.IQ "\fB--set=\fImodule\fR[\fB:\fIfacility\fR[\fB:\fIlevel\fR]]"
+
+Sets the logging level for \fImodule\fR in \fIfacility\fR to
+\fIlevel\fR. The \fImodule\fR may be any valid module name (as
+displayed by the \fB--list\fR option) or the special name \fBANY\fR to
+set the logging levels for all modules. The \fIfacility\fR may be
+\fBsyslog\fR or \fBconsole\fR to set the levels for logging to the
+system log or to the console, respectively, or \fBANY\fR to set the
+logging levels for both facilities. If it is omitted,
+\fIfacility\fR defaults to \fBANY\fR. The \fIlevel\fR must be one of
+\fBemer\fR, \fBerr\fR, \fBwarn\fR, \fBinfo\fR, or \fBdbg\fR, designating the
+minimum severity of a message for it to be logged. If it is omitted,
+\fIlevel\fR defaults to \fBdbg\fR.
+
+.IP "\fB-s PATTERN:\fIfacility\fB:\fIpattern\fR"
+.IQ "\fB--set=PATTERN:\fIfacility\fB:\fIpattern\fR"
+
+Sets the log pattern for \fIfacility\fR to \fIpattern\fR. Each time a
+message is logged to \fIfacility\fR, \fIpattern\fR determines the
+message's formatting. Most characters in \fIpattern\fR are copied
+literally to the log, but special escapes beginning with \fB%\fR are
+expanded as follows:
+
+.RS
+.IP \fB%A\fR
+The name of the application logging the message, e.g. \fBsecchan\fR.
+
+.IP \fB%c\fR
+The name of the module (as shown by \fBovs\-appctl --list\fR) logging
+the message.
+
+.IP \fB%d\fR
+The current date and time in ISO 8601 format (YYYY-MM-DD HH:MM:SS).
+
+.IP \fB%d{\fIformat\fB}\fR
+The current date and time in the specified \fIformat\fR, which takes
+the same format as the \fItemplate\fR argument to \fBstrftime\fR(3).
+
+.IP \fB%m\fR
+The message being logged.
+
+.IP \fB%N\fR
+A serial number for this message within this run of the program, as a
+decimal number. The first message a program logs has serial number 1,
+the second one has serial number 2, and so on.
+
+.IP \fB%n\fR
+A new-line.
+
+.IP \fB%p\fR
+The level at which the message is logged, e.g. \fBDBG\fR.
+
+.IP \fB%P\fR
+The program's process ID (pid), as a decimal number.
+
+.IP \fB%r\fR
+The number of milliseconds elapsed from the start of the application
+to the time the message was logged.
+
+.IP \fB%%\fR
+A literal \fB%\fR.
+.RE
+
+.IP
+A few options may appear between the \fB%\fR and the format specifier
+character, in this order:
+
+.RS
+.IP \fB-\fR
+Left justify the escape's expansion within its field width. Right
+justification is the default.
+
+.IP \fB0\fR
+Pad the field to the field width with \fB0\fRs. Padding with spaces
+is the default.
+
+.IP \fIwidth\fR
+A number specifies the minimum field width. If the escape expands to
+fewer characters than \fIwidth\fR then it is padded to fill the field
+width. (A field wider than \fIwidth\fR is not truncated to fit.)
+.RE
+
+.IP
+The default pattern for console output is \fB%d{%b %d
+%H:%M:%S}|%05N|%c|%p|%m\fR; for syslog output, \fB%05N|%c|%p|%m\fR.
+
+.IP \fB-r\fR
+.IQ \fB--reopen\fR
+Causes the target application to close and reopen its log file. (This
+is useful after rotating log files, to cause a new log file to be
+used.)
+
+This has no effect if the target application was not invoked with the
+\fB--log-file\fR option.
+
+.IP "\fB-e \fIcommand\fR"
+.IQ "\fB--execute=\fIcommand\fR"
+Passes the specified \fIcommand\fR literally to the target application
+and prints its response to stdout, if successful, or to stderr if an
+error occurs. Use \fB-e help\fR to print a list of available commands.
+
+.SH OPTIONS
+
+.so lib/common.man
+
+.SH "SEE ALSO"
+
+.BR ovs\-controller (8),
+.BR ovs\-dpctl (8),
+.BR secchan (8)
diff --git a/utilities/ovs-appctl.c b/utilities/ovs-appctl.c
new file mode 100644
index 000000000..eb544452c
--- /dev/null
+++ b/utilities/ovs-appctl.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#include <config.h>
+#include "vlog.h"
+
+#include <dirent.h>
+#include <errno.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "command-line.h"
+#include "compiler.h"
+#include "timeval.h"
+#include "unixctl.h"
+#include "util.h"
+
+/* Prints the command-line help for the program named 'prog_name' on stdout,
+ * then terminates the process with status 'exit_code'.  Does not return. */
+static void
+usage(char *prog_name, int exit_code)
+{
+    /* Only the first line depends on the program name. */
+    printf("Usage: %s [TARGET] [ACTION...]\n", prog_name);
+    fputs("Targets:\n"
+          " -t, --target=TARGET Path to Unix domain socket\n"
+          "Actions:\n"
+          " -l, --list List current settings\n"
+          " -s, --set=MODULE[:FACILITY[:LEVEL]]\n"
+          " Set MODULE and FACILITY log level to LEVEL\n"
+          " MODULE may be any valid module name or 'ANY'\n"
+          " FACILITY may be 'syslog', 'console', 'file', or 'ANY' (default)\n"
+          " LEVEL may be 'emer', 'err', 'warn', 'info', or 'dbg' (default)\n"
+          " -r, --reopen Make the program reopen its log file\n"
+          " -e, --execute=COMMAND Execute control COMMAND and print its output\n"
+          "Other options:\n"
+          " -h, --help Print this helpful information\n"
+          " -V, --version Display version information\n",
+          stdout);
+    exit(exit_code);
+}
+
+/* Sends 'request' to 'client' and returns the text of the reply, which the
+ * caller must free().
+ *
+ * If the transaction itself fails, reports the error on stderr, sets '*ok' to
+ * false and returns an empty string.  If the server answers with a reply code
+ * outside the 2xx range, reports the code on stderr and sets '*ok' to false
+ * too (so that the process exit status reflects the failure, as it does for
+ * execute_command()), but still returns the server's reply.  '*ok' is never
+ * set to true. */
+static char *
+transact(struct unixctl_client *client, const char *request, bool *ok)
+{
+    int code;
+    char *reply;
+    int error = unixctl_client_transact(client, request, &code, &reply);
+
+    if (error) {
+        fprintf(stderr, "%s: transaction error: %s\n",
+                unixctl_client_target(client), strerror(error));
+        *ok = false;
+        return xstrdup("");
+    }
+
+    if (code / 100 != 2) {
+        fprintf(stderr, "%s: server returned reply code %03d\n",
+                unixctl_client_target(client), code);
+        *ok = false;
+    }
+    return reply;
+}
+
+/* Like transact(), for requests whose reply text is of no interest: the reply
+ * is discarded, and errors are reported only through stderr and '*ok'. */
+static void
+transact_ack(struct unixctl_client *client, const char *request, bool *ok)
+{
+    char *discarded = transact(client, request, ok);
+
+    free(discarded);
+}
+
+/* Sends 'request' to 'client' and prints the server's reply: on stdout if the
+ * reply code is in the 2xx range, otherwise on stderr following a diagnostic.
+ * Sets '*ok' to false if the transaction fails or the reply code is not 2xx;
+ * never sets it to true. */
+static void
+execute_command(struct unixctl_client *client, const char *request, bool *ok)
+{
+    int code;
+    char *reply;
+    int error = unixctl_client_transact(client, request, &code, &reply);
+
+    if (error) {
+        fprintf(stderr, "%s: transaction error: %s\n",
+                unixctl_client_target(client), strerror(error));
+        *ok = false;
+        return;
+    }
+
+    if (code / 100 != 2) {
+        fprintf(stderr, "%s: server returned reply code %03d\n",
+                unixctl_client_target(client), code);
+        fputs(reply, stderr);
+        *ok = false;
+    } else {
+        fputs(reply, stdout);
+    }
+
+    /* After a successful transaction the reply is ours to release, just as
+     * the callers of transact() release theirs. */
+    free(reply);
+}
+
+/* Connects to the control socket at 'path' and, on success, appends the new
+ * client to the '*n_clients'-element array '*clients', growing the array by
+ * one element.  On failure, reports the error on stderr, sets '*ok' to false
+ * and leaves the array untouched. */
+static void
+add_target(struct unixctl_client ***clients, size_t *n_clients,
+           const char *path, bool *ok)
+{
+    struct unixctl_client *client;
+    int error = unixctl_client_create(path, &client);
+
+    if (error) {
+        fprintf(stderr, "Error connecting to \"%s\": %s\n",
+                path, strerror(error));
+        *ok = false;
+        return;
+    }
+
+    /* Each element is a 'struct unixctl_client *', i.e. '**clients'. */
+    *clients = xrealloc(*clients, sizeof **clients * (*n_clients + 1));
+    (*clients)[*n_clients] = client;
+    ++*n_clients;
+}
+
+/* Entry point.  Options are processed strictly in command-line order: each
+ * -t/--target adds a connection, and each action option is applied at once to
+ * every target added so far.  Exits with status 0 if everything succeeded and
+ * 1 if any connection or transaction failed. */
+int main(int argc, char *argv[])
+{
+    static const struct option long_options[] = {
+        /* Target options must come first. */
+        {"target", required_argument, NULL, 't'},
+        {"help", no_argument, NULL, 'h'},
+        {"version", no_argument, NULL, 'V'},
+
+        /* Action options come afterward. */
+        {"list", no_argument, NULL, 'l'},
+        {"set", required_argument, NULL, 's'},
+        {"reopen", no_argument, NULL, 'r'},
+        {"execute", required_argument, NULL, 'e'},
+        {0, 0, 0, 0},
+    };
+    char *short_options;
+
+    /* Determine targets. */
+    bool ok = true;
+    int n_actions = 0;
+    struct unixctl_client **clients = NULL;
+    size_t n_clients = 0;
+
+    set_program_name(argv[0]);
+    time_init();
+
+    short_options = long_options_to_short_options(long_options);
+    for (;;) {
+        int option;
+        size_t i;
+
+        option = getopt_long(argc, argv, short_options, long_options, NULL);
+        if (option == -1) {
+            break;
+        }
+
+        /* Everything except -t, -h, -V and getopt's '?' error indicator is an
+         * action.  Actions need at least one target, and only actions count
+         * toward 'n_actions', so that giving nothing but targets still
+         * triggers the "no actions" warning below. */
+        if (!strchr("thV?", option)) {
+            if (n_clients == 0) {
+                ovs_fatal(0, "no targets specified (use --help for help)");
+            }
+            ++n_actions;
+        }
+
+        switch (option) {
+        case 't':
+            add_target(&clients, &n_clients, optarg, &ok);
+            break;
+
+        case 'l':
+            for (i = 0; i < n_clients; i++) {
+                struct unixctl_client *client = clients[i];
+                char *reply;
+
+                printf("%s:\n", unixctl_client_target(client));
+                reply = transact(client, "vlog/list", &ok);
+                fputs(reply, stdout);
+                free(reply);
+            }
+            break;
+
+        case 's':
+            for (i = 0; i < n_clients; i++) {
+                struct unixctl_client *client = clients[i];
+                char *request = xasprintf("vlog/set %s", optarg);
+                transact_ack(client, request, &ok);
+                free(request);
+            }
+            break;
+
+        case 'r':
+            for (i = 0; i < n_clients; i++) {
+                transact_ack(clients[i], "vlog/reopen", &ok);
+            }
+            break;
+
+        case 'e':
+            for (i = 0; i < n_clients; i++) {
+                execute_command(clients[i], optarg, &ok);
+            }
+            break;
+
+        case 'h':
+            usage(argv[0], EXIT_SUCCESS);
+            break;
+
+        case 'V':
+            OVS_PRINT_VERSION(0, 0);
+            exit(EXIT_SUCCESS);
+
+        case '?':
+            /* getopt_long() has already printed a diagnostic. */
+            exit(EXIT_FAILURE);
+
+        default:
+            NOT_REACHED();
+        }
+    }
+    free(short_options);
+
+    if (!n_actions) {
+        fprintf(stderr,
+                "warning: no actions specified (use --help for help)\n");
+    }
+    exit(ok ? 0 : 1);
+}
diff --git a/utilities/ovs-cfg-mod.8.in b/utilities/ovs-cfg-mod.8.in
new file mode 100644
index 000000000..5b96f288e
--- /dev/null
+++ b/utilities/ovs-cfg-mod.8.in
@@ -0,0 +1,101 @@
+.\" -*- nroff -*-
+.de IQ
+. br
+. ns
+. IP "\\$1"
+..
+.TH ovs\-cfg\-mod 8 "June 2009" "Open vSwitch" "Open vSwitch Manual"
+.ds PN ovs\-cfg\-mod
+.
+.SH NAME
+ovs\-cfg\-mod \- Safely manage an ovs\-vswitchd.conf\-style configuration file
+.
+.SH SYNOPSIS
+\fB ovs\-cfg\-mod \fR[\fB\-T \fItimeout\fR] \fB\-F \fIfile\fR
+[\fIaction\fR] [\fIaction\fR...\fR]
+.
+.SH DESCRIPTION
+A program for managing an \fBovs\-vswitchd.conf\fR(5)\-style configuration
+file. \fBovs\-cfg\-mod\fR uses the same locking mechanisms as
+\fBovs\-vswitchd\fR and its related utilities. This allows it to be
+run safely on ``live'' configurations.
+.
+.SH OPTIONS
+.SS "Specifying the Configuration File"
+.
+.IP "\fB\-T\fR \fItimeout\fR"
+.IQ "\fB\-\-timeout=\fItimeout\fR"
+By default, \fBovs\-cfg\-mod\fR will wait forever to lock the
+configuration file specified on \fB\-F\fR or \fB\-\-config\-file\fR. This
+option makes \fBovs\-cfg\-mod\fR wait no more than \fItimeout\fR
+milliseconds to obtain the lock, after which it exits unsuccessfully.
+.
+If it is present, this option must be specified before \fB\-F\fR or
+\fB\-\-config\-file\fR.
+.
+.IP "\fB\-F\fR \fIfile\fR"
+.IQ "\fB\-\-config\-file=\fIfile\fR"
+Use \fIfile\fR as the configuration file to query or modify.
+.
+This option is required. It must be specified before any action
+options.
+.
+.SS "Specifying Actions"
+A series of one or more action options may follow the configuration
+file options. These are executed in the order provided and under a
+single lock instance, so they appear atomic to external viewers of
+\fIfile\fR.
+.
+As discussed in \fBovs\-vswitchd.conf\fR(5), each line in the
+configuration file consists of a key\-value pair. Actions generally
+take either a \fIkey\fR or \fIentry\fR argument. A \fIkey\fR is a
+dot\-separated description of a configuration option. An \fIentry\fR is
+a key\-value pair, separated by the \fB=\fR sign.
+.
+The following actions are supported:
+.
+.IP "\fB\-a\fR \fIentry\fR"
+.IQ "\fB\-\-add=\fIentry\fR"
+Add \fIentry\fR to \fIfile\fR. Please note that duplicates are
+allowed, so if a unique key is required, a delete must be done first.
+.
+.IP "\fB\-d\fR \fIentry\fR"
+.IQ "\fB\-\-del\-entry=\fIentry\fR"
+Delete \fIentry\fR from \fIfile\fR. Deletes only the first entry
+that matches \fIentry\fR.
+.
+.IP "\fB\-D\fR \fIkey\fR"
+.IQ "\fB\-\-del\-section=\fIkey\fR"
+Delete section \fIkey\fR from \fIfile\fR.
+.
+.IP "\fB\-\-del\-match=\fIpattern\fR"
+Deletes every entry that matches the given shell glob \fIpattern\fR.
+For example, \fB\-\-del\-match=bridge.*.port=*\fR deletes all the ports
+from every bridge, and \fB\-\-del\-match=bonding.bond0.*\fR is equivalent
+to \fB\-\-del\-section=bonding.bond0\fR.
+.
+.IP "\fB\-q\fR \fIkey\fR"
+.IQ "\fB\-\-query=\fIkey\fR"
+Queries \fIfile\fR for entries that match \fIkey\fR. Each matching
+value is printed on a separate line. Duplicates will be printed
+multiple times.
+.
+.IP "\fB\-c\fR"
+.IQ "\fB\-\-changes\fR"
+Logs all of the changes made to the configuration file in a ``unified
+diff''\-like format. Only actual changes are logged, so that if, for
+example, a \fB\-\-del\-match\fR action did not match any key\-value pairs,
+then nothing will be logged due to that action. Furthermore, only the
+net effects of changes are logged: if a key\-value pair was deleted and
+then an identical key\-value pair was added back, then nothing would be
+logged due to those changes.
+.
+This action logs changes that have taken effect at the point where it
+is inserted. Thus, if it is given before any other action, it will
+not log any changes. If \fB\-\-changes\fR is given more than once,
+instances after the first log only the changes since the previous
+instance.
+.
+.SH "SEE ALSO"
+.BR ovs\-vswitchd (8),
+.BR ovs\-vswitchd.conf (5)
diff --git a/utilities/ovs-cfg-mod.c b/utilities/ovs-cfg-mod.c
new file mode 100644
index 000000000..53ebd00a9
--- /dev/null
+++ b/utilities/ovs-cfg-mod.c
@@ -0,0 +1,239 @@
+/* Copyright (c) 2008, 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+#include <config.h>
+
+#include <dirent.h>
+#include <errno.h>
+#include <getopt.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "cfg.h"
+#include "command-line.h"
+#include "svec.h"
+#include "timeval.h"
+#include "util.h"
+
+#define THIS_MODULE VLM_cfg_mod
+#include "vlog.h"
+
+/* Configuration when we first read the configuration file. */
+static struct svec orig_cfg = SVEC_EMPTY_INITIALIZER;
+
+/* Prints the command-line help for the program named 'prog_name' on stdout,
+ * then terminates the process with status 'exit_code'.  Does not return.
+ *
+ * The option names listed here must match the 'long_options' table in main();
+ * in particular the change-logging action is spelled --changes. */
+static void
+usage(char *prog_name, int exit_code)
+{
+    printf("Usage: %s --config-file=FILE ACTIONS\n"
+           "\nConfig:\n"
+           " -T, --timeout=MS wait at most MS milliseconds for lock\n"
+           " -F, --config-file=FILE use configuration FILE\n"
+           "\nActions:\n"
+           " -a, --add=ENTRY add ENTRY\n"
+           " -d, --del-entry=ENTRY delete ENTRY\n"
+           " -D, --del-section=KEY delete section matching KEY\n"
+           " --del-match=PATTERN delete entries matching shell PATTERN\n"
+           " -q, --query=KEY return all entries matching KEY\n"
+           " -c, --changes log changes up to this point\n"
+           "\nOther options:\n"
+           " -h, --help display this help message\n"
+           " -V, --version display version information\n",
+           prog_name);
+    exit(exit_code);
+}
+
+/* Selects 'config_file' as the configuration file, locks it, passing 'timeout'
+ * to cfg_lock() (main() documents the value as milliseconds), and snapshots
+ * its current contents into 'orig_cfg' so that log_diffs() can later report
+ * what changed.  Terminates the program through ovs_fatal() if the file cannot
+ * be set or locked. */
+static void
+open_config(char *config_file, int timeout)
+{
+    int error;
+
+    error = cfg_set_file(config_file);
+    if (error) {
+        ovs_fatal(error, "failed to add configuration file \"%s\"",
+                  config_file);
+    }
+
+    error = cfg_lock(NULL, timeout);
+    if (error) {
+        /* No trailing newline, matching every other ovs_fatal() call in this
+         * file. */
+        ovs_fatal(error, "could not lock configuration file");
+    }
+
+    cfg_get_all(&orig_cfg);
+}
+
+/* Prints on stdout, one per line, every string that cfg_get_all_strings()
+ * returns for 'key'. */
+static void
+print_vals(char *key)
+{
+    struct svec vals;
+    size_t i;
+
+    svec_init(&vals);
+    cfg_get_all_strings(&vals, "%s", key);
+
+    for (i = 0; i < vals.n; i++) {
+        printf("%s\n", vals.names[i]);
+    }
+
+    /* Release the result vector; -q may be given any number of times. */
+    svec_destroy(&vals);
+}
+
+/* Logs, at INFO level, the entries removed from and added to the
+ * configuration since 'orig_cfg' was last captured, then makes the current
+ * configuration the new baseline so that a later call reports only newer
+ * changes. */
+static void
+log_diffs(void)
+{
+    struct svec current, gone, fresh;
+    size_t idx;
+
+    svec_init(&current);
+    cfg_get_all(&current);
+    svec_diff(&orig_cfg, &current, &gone, NULL, &fresh);
+
+    if (!gone.n && !fresh.n) {
+        VLOG_INFO("configuration unchanged");
+    } else {
+        VLOG_INFO("configuration changes:");
+        for (idx = 0; idx < gone.n; idx++) {
+            VLOG_INFO("-%s", gone.names[idx]);
+        }
+        for (idx = 0; idx < fresh.n; idx++) {
+            VLOG_INFO("+%s", fresh.names[idx]);
+        }
+    }
+    svec_destroy(&fresh);
+    svec_destroy(&gone);
+
+    /* After the swap 'current' holds the old baseline, which is what gets
+     * destroyed. */
+    svec_swap(&current, &orig_cfg);
+    svec_destroy(&current);
+}
+
+/* Entry point.  Options are processed strictly in command-line order:
+ * -T/--timeout (if any) must come before -F/--config-file, which opens and
+ * locks the configuration file, and every action option must come after it.
+ * Once all options have been handled, the configuration is written back if
+ * cfg_is_dirty() reports changes, and the lock is released. */
+int main(int argc, char *argv[])
+{
+    enum {
+        OPT_DEL_MATCH = UCHAR_MAX + 1,
+    };
+    static const struct option long_options[] = {
+        {"config-file", required_argument, 0, 'F'},
+        {"timeout", required_argument, 0, 'T'},
+        {"add", required_argument, 0, 'a'},
+        {"del-entry", required_argument, 0, 'd'},
+        {"del-section", required_argument, 0, 'D'},
+        {"del-match", required_argument, 0, OPT_DEL_MATCH},
+        {"query", required_argument, 0, 'q'},
+        {"changes", no_argument, 0, 'c'},
+        {"verbose", optional_argument, 0, 'v'},
+        {"help", no_argument, 0, 'h'},
+        {"version", no_argument, 0, 'V'},
+        {0, 0, 0, 0},
+    };
+    char *short_options;
+    bool config_set = false;
+    int timeout = INT_MAX;
+
+    set_program_name(argv[0]);
+    time_init();
+    vlog_init();
+
+    short_options = long_options_to_short_options(long_options);
+    for (;;) {
+        int option;
+
+        option = getopt_long(argc, argv, short_options, long_options, NULL);
+        if (option == -1) {
+            break;
+        }
+
+        /* Only these options make sense before a configuration file has been
+         * named.  'T' has to be among them: the timeout is consumed when -F
+         * opens the file, so it is only ever valid *before* -F. */
+        if ((option > UCHAR_MAX || !strchr("FThVv?", option))
+            && config_set == false) {
+            ovs_fatal(0, "no config file specified (use --help for help)");
+        }
+
+        switch (option) {
+        case 'T':
+            if (config_set) {
+                ovs_fatal(0, "--timeout or -T must be specified "
+                          "before --config-file or -F");
+            }
+            timeout = atoi(optarg);
+            break;
+
+        case 'F':
+            open_config(optarg, timeout);
+            config_set = true;
+            break;
+
+        case 'a':
+            cfg_add_entry("%s", optarg);
+            break;
+
+        case 'd':
+            cfg_del_entry("%s", optarg);
+            break;
+
+        case 'D':
+            cfg_del_section("%s", optarg);
+            break;
+
+        case OPT_DEL_MATCH:
+            cfg_del_match("%s", optarg);
+            break;
+
+        case 'q':
+            print_vals(optarg);
+            break;
+
+        case 'c':
+            log_diffs();
+            break;
+
+        case 'h':
+            usage(argv[0], EXIT_SUCCESS);
+            break;
+
+        case 'V':
+            OVS_PRINT_VERSION(0, 0);
+            exit(EXIT_SUCCESS);
+
+        case 'v':
+            vlog_set_verbosity(optarg);
+            break;
+
+        case '?':
+            /* getopt_long() has already printed a diagnostic. */
+            exit(EXIT_FAILURE);
+
+        default:
+            NOT_REACHED();
+        }
+    }
+    free(short_options);
+
+    if (optind != argc) {
+        ovs_fatal(0, "non-option arguments not accepted "
+                  "(use --help for help)");
+    }
+
+    if (cfg_is_dirty()) {
+        cfg_write();
+    }
+    cfg_unlock();
+
+    exit(0);
+}
diff --git a/utilities/ovs-controller.8.in b/utilities/ovs-controller.8.in
new file mode 100644
index 000000000..31c7a865c
--- /dev/null
+++ b/utilities/ovs-controller.8.in
@@ -0,0 +1,132 @@
+.TH ovs\-controller 8 "March 2009" "Open vSwitch" "Open vSwitch Manual"
+.ds PN ovs\-controller
+
+.SH NAME
+ovs\-controller \- simple OpenFlow controller reference implementation
+
+.SH SYNOPSIS
+.B ovs\-controller
+[\fIoptions\fR] \fImethod\fR \fB[\fImethod\fR]\&...
+
+.SH DESCRIPTION
+\fBovs\-controller\fR manages any number of remote switches over the OpenFlow
+protocol, causing them to function as L2 MAC-learning switches or hubs.
+
+\fBovs\-controller\fR controls one or more OpenFlow switches, specified as
+one or more of the following OpenFlow connection methods:
+
+.TP
+\fBpssl:\fR[\fIport\fR]
+Listens for SSL connections from remote OpenFlow switches on
+\fIport\fR (default: 6633). The \fB--private-key\fR,
+\fB--certificate\fR, and \fB--ca-cert\fR options are mandatory when
+this form is used.
+
+.TP
+\fBptcp:\fR[\fIport\fR]
+Listens for TCP connections from remote OpenFlow switches on
+\fIport\fR (default: 6633).
+
+.TP
+\fBpunix:\fIfile\fR
+Listens for connections from OpenFlow switches on the Unix domain
+server socket named \fIfile\fR.
+
+.TP
+\fBssl:\fIhost\fR[\fB:\fIport\fR]
+The specified SSL \fIport\fR (default: 6633) on the given remote
+\fIhost\fR. The \fB--private-key\fR, \fB--certificate\fR, and
+\fB--ca-cert\fR options are mandatory when this form is used.
+
+.TP
+\fBtcp:\fIhost\fR[\fB:\fIport\fR]
+The specified TCP \fIport\fR (default: 6633) on the given remote
+\fIhost\fR.
+
+.TP
+\fBunix:\fIfile\fR
+The Unix domain server socket named \fIfile\fR.
+
+.SH OPTIONS
+.TP
+\fB-p\fR, \fB--private-key=\fIprivkey.pem\fR
+Specifies a PEM file containing the private key used as the controller's
+identity for SSL connections to switches.
+
+.TP
+\fB-c\fR, \fB--certificate=\fIcert.pem\fR
+Specifies a PEM file containing a certificate, signed by the
+controller's certificate authority (CA), that certifies the controller's
+private key to identify a trustworthy controller.
+
+.TP
+\fB-C\fR, \fB--ca-cert=\fIswitch-cacert.pem\fR
+Specifies a PEM file containing the CA certificate used to verify that
+the controller is connected to trustworthy switches.
+
+.TP
+\fB--peer-ca-cert=\fIcontroller-cacert.pem\fR
+Specifies a PEM file that contains one or more additional certificates
+to send to switches. \fIcontroller-cacert.pem\fR should be the CA
+certificate used to sign the controller's own certificate (the
+certificate specified on \fB-c\fR or \fB--certificate\fR).
+
+This option is not useful in normal operation, because the switch must
+already have the controller CA certificate for it to have any
+confidence in the controller's identity. However, this option allows
+a newly installed switch to obtain the controller CA certificate on
+first boot using, e.g., the \fB--bootstrap-ca-cert\fR option to
+\fBsecchan\fR(8).
+
+.IP "\fB-n\fR, \fB--noflow\fR"
+By default, \fBovs\-controller\fR sets up a flow in each OpenFlow switch
+whenever it receives a packet whose destination is known through
+MAC learning. This option disables flow setup, so that every packet
+in the network passes through the controller.
+
+This option is most useful for debugging. It reduces switching
+performance, so it should not be used in production.
+
+.TP
+\fB--max-idle=\fIsecs\fR|\fBpermanent\fR
+Sets \fIsecs\fR as the number of seconds that a flow set up by the
+controller will remain in the switch's flow table without any matching
+packets being seen. If \fBpermanent\fR is specified, which is not
+recommended, flows will never expire. The default is 60 seconds.
+
+This option affects only flows set up by the OpenFlow controller. In
+some configurations, the switch can set up some flows
+on its own. To set the idle time for those flows, pass
+\fB--max-idle\fR to \fBsecchan\fR (on the switch).
+
+This option has no effect when \fB-n\fR (or \fB--noflow\fR) is in use
+(because the controller does not set up flows in that case).
+
+.IP "\fB-H\fR, \fB--hub\fR"
+By default, the controller acts as an L2 MAC-learning switch. This
+option changes its behavior to that of a hub that floods packets on
+all but the incoming port.
+
+If \fB-H\fR (or \fB--hub\fR) and \fB-n\fR (or \fB--noflow\fR) are used
+together, then the cumulative effect is that every packet passes
+through the controller and every packet is flooded.
+
+This option is most useful for debugging. It reduces switching
+performance, so it should not be used in production.
+
+.so lib/daemon.man
+.so lib/vlog.man
+.so lib/common.man
+
+.SH EXAMPLES
+
+.TP
+To bind locally to port 6633 (the default) and wait for incoming connections from OpenFlow switches:
+
+.B % ovs\-controller ptcp:
+
+.SH "SEE ALSO"
+
+.BR secchan (8),
+.BR ovs\-appctl (8),
+.BR ovs\-dpctl (8)
diff --git a/utilities/ovs-controller.c b/utilities/ovs-controller.c
new file mode 100644
index 000000000..423ce1955
--- /dev/null
+++ b/utilities/ovs-controller.c
@@ -0,0 +1,323 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "command-line.h"
+#include "compiler.h"
+#include "daemon.h"
+#include "fault.h"
+#include "learning-switch.h"
+#include "ofpbuf.h"
+#include "openflow/openflow.h"
+#include "poll-loop.h"
+#include "rconn.h"
+#include "timeval.h"
+#include "unixctl.h"
+#include "util.h"
+#include "vconn-ssl.h"
+#include "vconn.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_controller
+
+#define MAX_SWITCHES 16
+#define MAX_LISTENERS 16
+
+/* State for one connected OpenFlow switch.  Both members are filled in by
+ * new_switch() and destroyed together in main() when do_switching() reports
+ * that the connection is no longer usable. */
+struct switch_ {
+ struct lswitch *lswitch; /* From lswitch_create(). */
+ struct rconn *rconn; /* From rconn_new_from_vconn(). */
+};
+
+/* Learn the ports on which MAC addresses appear? */
+static bool learn_macs = true;
+
+/* Set up flows? (If not, every packet is processed at the controller.) */
+static bool setup_flows = true;
+
+/* --max-idle: Maximum idle time, in seconds, before flows expire. */
+static int max_idle = 60;
+
+static int do_switching(struct switch_ *);
+static void new_switch(struct switch_ *, struct vconn *, const char *name);
+static void parse_options(int argc, char *argv[]);
+static void usage(void) NO_RETURN;
+
+/* Program entry point.
+ *
+ * Opens each connection method named on the command line, first as an
+ * active connection to a switch and, if vconn_open() reports EAFNOSUPPORT,
+ * as a passive listener.  It then daemonizes, creates a unixctl server, and
+ * loops: accepting new switch connections, doing switching work for every
+ * connected switch, and blocking in the poll loop.  The loop (and the
+ * program) ends once no switches and no listeners remain. */
+int
+main(int argc, char *argv[])
+{
+ struct unixctl_server *unixctl;
+ struct switch_ switches[MAX_SWITCHES];
+ struct pvconn *listeners[MAX_LISTENERS];
+ int n_switches, n_listeners;
+ int retval;
+ int i;
+
+ set_program_name(argv[0]);
+ register_fault_handlers();
+ time_init();
+ vlog_init();
+ parse_options(argc, argv);
+ signal(SIGPIPE, SIG_IGN);
+
+ if (argc - optind < 1) {
+ ovs_fatal(0, "at least one vconn argument required; "
+ "use --help for usage");
+ }
+
+ /* Open every non-option argument.  A failure to open one argument is
+ * logged but not fatal, so the remaining arguments are still tried. */
+ n_switches = n_listeners = 0;
+ for (i = optind; i < argc; i++) {
+ const char *name = argv[i];
+ struct vconn *vconn;
+ int retval;
+
+ retval = vconn_open(name, OFP_VERSION, &vconn);
+ if (!retval) {
+ if (n_switches >= MAX_SWITCHES) {
+ ovs_fatal(0, "max %d switch connections", n_switches);
+ }
+ new_switch(&switches[n_switches++], vconn, name);
+ continue;
+ } else if (retval == EAFNOSUPPORT) {
+ /* Not usable as an active connection; retry as a listener. */
+ struct pvconn *pvconn;
+ retval = pvconn_open(name, &pvconn);
+ if (!retval) {
+ if (n_listeners >= MAX_LISTENERS) {
+ ovs_fatal(0, "max %d passive connections", n_listeners);
+ }
+ listeners[n_listeners++] = pvconn;
+ }
+ }
+ if (retval) {
+ VLOG_ERR("%s: connect: %s", name, strerror(retval));
+ }
+ }
+ if (n_switches == 0 && n_listeners == 0) {
+ ovs_fatal(0, "no active or passive switch connections");
+ }
+
+ die_if_already_running();
+ daemonize();
+
+ retval = unixctl_server_create(NULL, &unixctl);
+ if (retval) {
+ ovs_fatal(retval, "Could not listen for unixctl connections");
+ }
+
+ while (n_switches > 0 || n_listeners > 0) {
+ int iteration;
+ int i;
+
+ /* Accept connections on listening vconns. */
+ for (i = 0; i < n_listeners && n_switches < MAX_SWITCHES; ) {
+ struct vconn *new_vconn;
+ int retval;
+
+ retval = pvconn_accept(listeners[i], OFP_VERSION, &new_vconn);
+ if (!retval || retval == EAGAIN) {
+ if (!retval) {
+ new_switch(&switches[n_switches++], new_vconn, "tcp");
+ }
+ i++;
+ } else {
+ /* Any other error: drop this listener and move the last
+ * listener into its slot.  'i' is deliberately not advanced
+ * so that the moved entry is examined next. */
+ pvconn_close(listeners[i]);
+ listeners[i] = listeners[--n_listeners];
+ }
+ }
+
+ /* Do some switching work. Limit the number of iterations so that
+ * callbacks registered with the poll loop don't starve. */
+ for (iteration = 0; iteration < 50; iteration++) {
+ bool progress = false;
+ for (i = 0; i < n_switches; ) {
+ struct switch_ *this = &switches[i];
+ int retval = do_switching(this);
+ if (!retval || retval == EAGAIN) {
+ if (!retval) {
+ progress = true;
+ }
+ i++;
+ } else {
+ /* Connection is dead: destroy it and fill the hole with
+ * the last switch, again without advancing 'i'. */
+ rconn_destroy(this->rconn);
+ lswitch_destroy(this->lswitch);
+ switches[i] = switches[--n_switches];
+ }
+ }
+ if (!progress) {
+ break;
+ }
+ }
+ for (i = 0; i < n_switches; i++) {
+ struct switch_ *this = &switches[i];
+ lswitch_run(this->lswitch, this->rconn);
+ }
+
+ unixctl_server_run(unixctl);
+
+ /* Wait for something to happen. */
+ /* Listeners are only waited on when there is room for another switch,
+ * mirroring the condition of the accept loop above. */
+ if (n_switches < MAX_SWITCHES) {
+ for (i = 0; i < n_listeners; i++) {
+ pvconn_wait(listeners[i]);
+ }
+ }
+ for (i = 0; i < n_switches; i++) {
+ struct switch_ *sw = &switches[i];
+ rconn_run_wait(sw->rconn);
+ rconn_recv_wait(sw->rconn);
+ lswitch_wait(sw->lswitch);
+ }
+ unixctl_server_wait(unixctl);
+ poll_block();
+ }
+
+ return 0;
+}
+
+/* Fills in 'sw' for a switch reached over 'vconn'.  The connection is wrapped
+ * in an rconn labeled 'name', and learning-switch state is created on top of
+ * it.  When flow setup is disabled (--noflow), -1 is passed to
+ * lswitch_create() in place of the idle timeout. */
+static void
+new_switch(struct switch_ *sw, struct vconn *vconn, const char *name)
+{
+    const int idle_time = setup_flows ? max_idle : -1;
+
+    sw->rconn = rconn_new_from_vconn(name, vconn);
+    sw->lswitch = lswitch_create(sw->rconn, learn_macs, idle_time);
+}
+
+/* Does one unit of work for 'sw': handles at most one message received on its
+ * connection, then runs the connection.
+ *
+ * Returns EOF if the connection is no longer alive, 0 if the connection's
+ * sent-packet count changed during this call (i.e. progress was made), and
+ * EAGAIN otherwise. */
+static int
+do_switching(struct switch_ *sw)
+{
+    const unsigned int sent_before = rconn_packets_sent(sw->rconn);
+    struct ofpbuf *received = rconn_recv(sw->rconn);
+
+    if (received != NULL) {
+        lswitch_process_packet(sw->lswitch, sw->rconn, received);
+        ofpbuf_delete(received);
+    }
+    rconn_run(sw->rconn);
+
+    if (!rconn_is_alive(sw->rconn)) {
+        return EOF;
+    }
+    if (rconn_packets_sent(sw->rconn) != sent_before) {
+        return 0;
+    }
+    return EAGAIN;
+}
+
+/* Parses the command line in 'argc' and 'argv', storing the results in the
+ * file-level settings learn_macs, setup_flows and max_idle.
+ *
+ * Exits the process for --help and --version (successfully), for an option
+ * that getopt_long() rejects, and for an out-of-range --max-idle argument.
+ * On return, 'optind' indexes the first non-option argument. */
+static void
+parse_options(int argc, char *argv[])
+{
+ /* Codes for options that have no short form.  They start above UCHAR_MAX
+ * so that they cannot equal any single-character option code. */
+ enum {
+ OPT_MAX_IDLE = UCHAR_MAX + 1,
+ OPT_PEER_CA_CERT,
+ VLOG_OPTION_ENUMS
+ };
+ static struct option long_options[] = {
+ {"hub", no_argument, 0, 'H'},
+ {"noflow", no_argument, 0, 'n'},
+ {"max-idle", required_argument, 0, OPT_MAX_IDLE},
+ {"help", no_argument, 0, 'h'},
+ {"version", no_argument, 0, 'V'},
+ DAEMON_LONG_OPTIONS,
+ VLOG_LONG_OPTIONS,
+#ifdef HAVE_OPENSSL
+ VCONN_SSL_LONG_OPTIONS
+ {"peer-ca-cert", required_argument, 0, OPT_PEER_CA_CERT},
+#endif
+ {0, 0, 0, 0},
+ };
+ char *short_options = long_options_to_short_options(long_options);
+
+ for (;;) {
+ int indexptr;
+ int c;
+
+ c = getopt_long(argc, argv, short_options, long_options, &indexptr);
+ if (c == -1) {
+ break;
+ }
+
+ switch (c) {
+ case 'H':
+ /* Hub mode: stop learning MAC addresses. */
+ learn_macs = false;
+ break;
+
+ case 'n':
+ setup_flows = false;
+ break;
+
+ case OPT_MAX_IDLE:
+ if (!strcmp(optarg, "permanent")) {
+ max_idle = OFP_FLOW_PERMANENT;
+ } else {
+ /* atoi() yields 0 for non-numeric text, which the range check
+ * below then rejects. */
+ max_idle = atoi(optarg);
+ if (max_idle < 1 || max_idle > 65535) {
+ ovs_fatal(0, "--max-idle argument must be between 1 and "
+ "65535 or the word 'permanent'");
+ }
+ }
+ break;
+
+ case 'h':
+ /* No break needed: usage() is declared NO_RETURN. */
+ usage();
+
+ case 'V':
+ OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION);
+ exit(EXIT_SUCCESS);
+
+ VLOG_OPTION_HANDLERS
+ DAEMON_OPTION_HANDLERS
+
+#ifdef HAVE_OPENSSL
+ VCONN_SSL_OPTION_HANDLERS
+
+ case OPT_PEER_CA_CERT:
+ vconn_ssl_set_peer_ca_cert_file(optarg);
+ break;
+#endif
+
+ case '?':
+ exit(EXIT_FAILURE);
+
+ default:
+ abort();
+ }
+ }
+ free(short_options);
+}
+
+/* Prints a help message to stdout, including the usage text supplied by the
+ * vconn, daemon and vlog modules, then exits with EXIT_SUCCESS.  Never
+ * returns. */
+static void
+usage(void)
+{
+ printf("%s: OpenFlow controller\n"
+ "usage: %s [OPTIONS] METHOD\n"
+ "where METHOD is any OpenFlow connection method.\n",
+ program_name, program_name);
+ vconn_usage(true, true, false);
+ daemon_usage();
+ vlog_usage();
+ printf("\nOther options:\n"
+ " -H, --hub act as hub instead of learning switch\n"
+ " -n, --noflow pass traffic, but don't add flows\n"
+ " --max-idle=SECS max idle time for new flows\n"
+ " -h, --help display this help message\n"
+ " -V, --version display version information\n");
+ exit(EXIT_SUCCESS);
+}
diff --git a/utilities/ovs-discover.8.in b/utilities/ovs-discover.8.in
new file mode 100644
index 000000000..d38ce9ee7
--- /dev/null
+++ b/utilities/ovs-discover.8.in
@@ -0,0 +1,118 @@
+.TH ovs\-discover 8 "May 2008" "Open vSwitch" "Open vSwitch Manual"
+.ds PN ovs\-discover
+
+.SH NAME
+ovs\-discover \- controller discovery utility
+
+.SH SYNOPSIS
+.B ovs\-discover
+[\fIoptions\fR] \fInetdev\fR [\fInetdev\fR...]
+
+.SH DESCRIPTION
+The \fBovs\-discover\fR program attempts to discover the location of
+an OpenFlow controller on one of the network devices listed on the
+command line. It repeatedly broadcasts a DHCP request with vendor
+class identifier \fBOpenFlow\fR on each network device until it
+receives an acceptable DHCP response. It will accept any valid DHCP
+reply that has the same vendor class identifier and includes a
+vendor-specific option with code 1 whose contents are a string
+specifying the location of the controller in the same format used on
+the \fBsecchan\fR command line (e.g. \fBssl:192.168.0.1\fR).
+
+When \fBovs\-discover\fR receives an acceptable response, it prints
+the details of the response on \fBstdout\fR. Then, by default, it
+configures the network device on which the response was received with
+the received IP address, netmask, and default gateway, and detaches
+itself to the background.
+
+.SH OPTIONS
+.TP
+\fB--accept-vconn=\fIregex\fR
+By default, \fBovs\-discover\fR accepts any controller location
+advertised over DHCP. With this option, only controllers whose names
+match POSIX extended regular expression \fIregex\fR will be accepted.
+Specifying \fBssl:.*\fR for \fIregex\fR, for example, would cause only
+SSL controller connections to be accepted.
+
+The \fIregex\fR is implicitly anchored at the beginning of the
+controller location string, as if it begins with \fB^\fR.
+
+.TP
+\fB--exit-without-bind\fR
+By default, \fBovs\-discover\fR binds the network device that receives
+the first acceptable response to the IP address received over DHCP.
+With this option, the configuration of the network device is not
+changed at all, except to bring it up if it is initially down, and
+\fBovs\-discover\fR will exit immediately after it receives an
+acceptable DHCP response.
+
+This option is mutually exclusive with \fB--exit-after-bind\fR and
+\fB--no-detach\fR.
+
+.TP
+\fB--exit-after-bind\fR
+By default, after it receives an acceptable DHCP response,
+\fBovs\-discover\fR detaches itself from the foreground session and
+runs in the background maintaining the DHCP lease as necessary. With
+this option, \fBovs\-discover\fR will exit immediately after it
+receives an acceptable DHCP response and configures the network device
+with the received IP address. The address obtained via DHCP could
+therefore be used past the expiration of its lease.
+
+This option is mutually exclusive with \fB--exit-without-bind\fR and
+\fB--no-detach\fR.
+
+.TP
+\fB--no-detach\fR
+By default, \fBovs\-discover\fR runs in the foreground until it obtains
+an acceptable DHCP response, then it detaches itself from the
+foreground session and runs as a background process. This option
+prevents \fBovs\-discover\fR from detaching, causing it to run in the
+foreground even after it obtains a DHCP response.
+
+This option is mutually exclusive with \fB--exit-without-bind\fR and
+\fB--exit-after-bind\fR.
+
+.TP
+\fB-P\fR[\fIpidfile\fR], \fB--pidfile\fR[\fB=\fIpidfile\fR]
+Causes a file (by default, \fBovs\-discover.pid\fR) to be created indicating
+the PID of the running process. If \fIpidfile\fR is not specified, or
+if it does not begin with \fB/\fR, then it is created in
+\fB@RUNDIR@\fR.
+
+The \fIpidfile\fR is created when \fBovs\-discover\fR detaches, so
+this option has no effect when one of \fB--exit-without-bind\fR,
+\fB--exit-after-bind\fR, or \fB--no-detach\fR is also given.
+
+.TP
+\fB-f\fR, \fB--force\fR
+By default, when \fB-P\fR or \fB--pidfile\fR is specified and the
+specified pidfile already exists and is locked by a running process,
+\fBovs\-discover\fR refuses to start. Specify \fB-f\fR or \fB--force\fR
+to cause it to instead overwrite the pidfile.
+
+When \fB-P\fR or \fB--pidfile\fR is not specified, this option has no
+effect.
+
+.so lib/vlog.man
+.so lib/common.man
+
+.SH BUGS
+
+If the network devices specified on the command line have been added
+to an Open vSwitch datapath with \fBovs\-dpctl add\-if\fR, then controller
+discovery will fail because \fBovs\-discover\fR will not be able to
+see DHCP responses, even though tools such as \fBtcpdump\fR(8) and
+\fBwireshark\fR(1) can see them on the wire. This is because of the
+structure of the Linux kernel networking stack, which hands packets
+first to programs that listen for all arriving packets, then to
+Open vSwitch, then to programs that listen for a specific kind of packet.
+Open vSwitch consumes all the packets handed to it, so tools like
+\fBtcpdump\fR that look at all packets will see packets arriving on
+Open vSwitch interfaces, but \fBovs\-discover\fR, which listens only for
+arriving IP packets, will not.
+
+.SH "SEE ALSO"
+
+.BR secchan (8),
+.BR ovs\-pki (8)
diff --git a/utilities/ovs-discover.c b/utilities/ovs-discover.c
new file mode 100644
index 000000000..b664321ff
--- /dev/null
+++ b/utilities/ovs-discover.c
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include <getopt.h>
+#include <limits.h>
+#include <regex.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include "command-line.h"
+#include "daemon.h"
+#include "dhcp-client.h"
+#include "dhcp.h"
+#include "dirs.h"
+#include "dynamic-string.h"
+#include "fatal-signal.h"
+#include "netdev.h"
+#include "poll-loop.h"
+#include "timeval.h"
+#include "unixctl.h"
+#include "util.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_ovs_discover
+
+/* One network device on which controller discovery is performed. */
+struct iface {
+ const char *name; /* Device name; points at the command-line argument. */
+ struct dhclient *dhcp; /* From dhclient_create(); NULL until then. */
+};
+
+/* The interfaces that we serve. */
+static struct iface *ifaces;
+static int n_ifaces;
+
+/* --accept-vconn: Regular expression specifying the class of controller vconns
+ * that we will accept during autodiscovery. */
+static const char *accept_controller_re = ".*";
+static regex_t accept_controller_regex;
+
+/* --exit-without-bind: Exit after discovering the controller, without binding
+ * the network device to an IP address? */
+static bool exit_without_bind;
+
+/* --exit-after-bind: Exit after discovering the controller, after binding the
+ * network device to an IP address? */
+static bool exit_after_bind;
+
+static bool iface_init(struct iface *, const char *netdev_name);
+static void release_ifaces(void *aux UNUSED);
+
+static void parse_options(int argc, char *argv[]);
+static void usage(void) NO_RETURN;
+
+static void modify_dhcp_request(struct dhcp_msg *, void *aux);
+static bool validate_dhcp_offer(const struct dhcp_msg *, void *aux);
+
+/* Program entry point.
+ *
+ * Sets up a DHCP client on each network device named on the command line
+ * (devices that fail to initialize are skipped), compiles the --accept-vconn
+ * regular expression, and then loops forever running the DHCP clients.  When
+ * a client becomes bound, the discovered configuration is printed to stdout
+ * and, depending on the options, the program exits or detaches into the
+ * background.  The only exits are exit() calls and fatal errors; the final
+ * "return 0" is not reached. */
+int
+main(int argc, char *argv[])
+{
+ struct unixctl_server *unixctl;
+ int retval;
+ int i;
+
+ set_program_name(argv[0]);
+ time_init();
+ vlog_init();
+ parse_options(argc, argv);
+
+ /* From here on, argv[0..argc-1] are just the network device names. */
+ argc -= optind;
+ argv += optind;
+ if (argc < 1) {
+ ovs_fatal(0, "need at least one non-option argument; "
+ "use --help for usage");
+ }
+
+ /* 'ifaces' is sized for every argument, but only the entries that
+ * initialize successfully are counted in 'n_ifaces'. */
+ ifaces = xmalloc(argc * sizeof *ifaces);
+ n_ifaces = 0;
+ for (i = 0; i < argc; i++) {
+ if (iface_init(&ifaces[n_ifaces], argv[i])) {
+ n_ifaces++;
+ }
+ }
+ if (!n_ifaces) {
+ ovs_fatal(0, "failed to initialize any DHCP clients");
+ }
+
+ for (i = 0; i < n_ifaces; i++) {
+ struct iface *iface = &ifaces[i];
+ dhclient_init(iface->dhcp, 0);
+ }
+ /* Have release_ifaces() run as a fatal-signal hook. */
+ fatal_signal_add_hook(release_ifaces, NULL, true);
+
+ retval = regcomp(&accept_controller_regex, accept_controller_re,
+ REG_NOSUB | REG_EXTENDED);
+ if (retval) {
+ /* First regerror() call only measures the message; the second one
+ * fills the buffer.  ovs_fatal() exits, so 'buffer' is not freed. */
+ size_t length = regerror(retval, &accept_controller_regex, NULL, 0);
+ char *buffer = xmalloc(length);
+ regerror(retval, &accept_controller_regex, buffer, length);
+ ovs_fatal(0, "%s: %s", accept_controller_re, buffer);
+ }
+
+ retval = unixctl_server_create(NULL, &unixctl);
+ if (retval) {
+ ovs_fatal(retval, "Could not listen for unixctl connections");
+ }
+
+ die_if_already_running();
+
+ signal(SIGPIPE, SIG_IGN);
+ for (;;) {
+ /* Fatal signals stay blocked while DHCP state is being processed and
+ * are unblocked again just before poll_block(). */
+ fatal_signal_block();
+ for (i = 0; i < n_ifaces; i++) {
+ struct iface *iface = &ifaces[i];
+ dhclient_run(iface->dhcp);
+ if (dhclient_changed(iface->dhcp)) {
+ bool is_bound = dhclient_is_bound(iface->dhcp);
+ int j;
+
+ /* Configure network device. */
+ if (!exit_without_bind) {
+ dhclient_configure_netdev(iface->dhcp);
+ dhclient_update_resolv_conf(iface->dhcp);
+ }
+
+ if (is_bound) {
+ /* Static so that the process detaches at most once, even
+ * if an interface becomes bound again later. */
+ static bool detached = false;
+ struct ds ds;
+
+ /* Disable timeout, since discovery was successful. */
+ time_alarm(0);
+
+ /* Print discovered parameters. */
+ ds_init(&ds);
+ dhcp_msg_to_string(dhclient_get_config(iface->dhcp),
+ true, &ds);
+ fputs(ds_cstr(&ds), stdout);
+ putchar('\n');
+ fflush(stdout);
+ ds_destroy(&ds);
+
+ /* Exit if the user requested it. */
+ if (exit_without_bind) {
+ VLOG_DBG("exiting because of successful binding on %s "
+ "and --exit-without-bind specified",
+ iface->name);
+ exit(0);
+ }
+ if (exit_after_bind) {
+ VLOG_DBG("exiting because of successful binding on %s "
+ "and --exit-after-bind specified",
+ iface->name);
+ exit(0);
+ }
+
+ /* Detach into background, if we haven't already. */
+ if (!detached) {
+ detached = true;
+ daemonize();
+ }
+ }
+
+ /* We only want an address on a single one of our interfaces.
+ * So: if we have an address on this interface, stop looking
+ * for one on the others; if we don't have an address on this
+ * interface, start looking everywhere. */
+ for (j = 0; j < n_ifaces; j++) {
+ struct iface *if2 = &ifaces[j];
+ if (iface != if2) {
+ if (is_bound) {
+ dhclient_release(if2->dhcp);
+ } else {
+ dhclient_init(if2->dhcp, 0);
+ }
+ }
+ }
+ }
+ }
+ unixctl_server_run(unixctl);
+ for (i = 0; i < n_ifaces; i++) {
+ struct iface *iface = &ifaces[i];
+ dhclient_wait(iface->dhcp);
+ }
+ unixctl_server_wait(unixctl);
+ fatal_signal_unblock();
+ poll_block();
+ }
+
+ return 0;
+}
+
+/* Initializes 'iface' for controller discovery on the network device named
+ * 'netdev_name'.  The name is not copied, so it must outlive 'iface'.
+ *
+ * With --exit-after-bind, the device is first brought up permanently so that
+ * the address bound to it survives after this program exits.  A DHCP client
+ * is then created for the device, with modify_dhcp_request() and
+ * validate_dhcp_offer() as its callbacks.
+ *
+ * Returns true if successful.  On failure, logs an error and returns false;
+ * in that case 'iface->dhcp' must not be used. */
+static bool
+iface_init(struct iface *iface, const char *netdev_name)
+{
+    int retval;
+
+    iface->name = netdev_name;
+    iface->dhcp = NULL;
+
+    if (exit_after_bind) {
+        /* Bring this interface up permanently, so that the bound address
+         * persists past program termination. */
+        struct netdev *netdev;
+
+        retval = netdev_open(iface->name, NETDEV_ETH_TYPE_NONE, &netdev);
+        if (retval) {
+            ovs_error(retval, "Could not open %s device", iface->name);
+            return false;
+        }
+        retval = netdev_turn_flags_on(netdev, NETDEV_UP, true);
+        if (retval) {
+            ovs_error(retval, "Could not bring %s device up", iface->name);
+            /* Release the device handle on the failure path too; it was
+             * previously leaked here. */
+            netdev_close(netdev);
+            return false;
+        }
+        netdev_close(netdev);
+    }
+
+    retval = dhclient_create(iface->name, modify_dhcp_request,
+                             validate_dhcp_offer, NULL, &iface->dhcp);
+    if (retval) {
+        ovs_error(retval, "%s: failed to initialize DHCP client", iface->name);
+        return false;
+    }
+
+    return true;
+}
+
+/* Fatal-signal hook: releases the DHCP lease on every interface in 'ifaces'
+ * and, for each client that reports a change as a result, reconfigures its
+ * network device to match.  'aux' is unused. */
+static void
+release_ifaces(void *aux UNUSED)
+{
+    int idx = 0;
+
+    while (idx < n_ifaces) {
+        struct iface *iface = &ifaces[idx++];
+
+        dhclient_release(iface->dhcp);
+        if (!dhclient_changed(iface->dhcp)) {
+            continue;
+        }
+        dhclient_configure_netdev(iface->dhcp);
+    }
+}
+
+/* DHCP client callback: tags the outgoing request 'msg' with the vendor class
+ * identifier "OpenFlow".  'aux' is unused. */
+static void
+modify_dhcp_request(struct dhcp_msg *msg, void *aux UNUSED)
+{
+    static const char vendor_class[] = "OpenFlow";
+
+    dhcp_msg_put_string(msg, DHCP_CODE_VENDOR_CLASS, vendor_class);
+}
+
+/* DHCP client callback: decides whether the offer in 'msg' is acceptable.
+ *
+ * An offer is accepted only if it carries a controller vconn string and that
+ * string matches the --accept-vconn regular expression.  An offer with no
+ * controller vconn is rejected with a rate-limited warning.  'aux' is
+ * unused. */
+static bool
+validate_dhcp_offer(const struct dhcp_msg *msg, void *aux UNUSED)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60);
+    char *controller = dhcp_msg_get_string(msg,
+                                           DHCP_CODE_OFP_CONTROLLER_VCONN);
+    int no_match;
+
+    if (controller == NULL) {
+        VLOG_WARN_RL(&rl, "rejecting DHCP offer missing controller vconn");
+        return false;
+    }
+
+    /* regexec() returns 0 on a match. */
+    no_match = regexec(&accept_controller_regex, controller, 0, NULL, 0);
+    free(controller);
+    return no_match == 0;
+}
+
+/* Parses the command line in 'argc' and 'argv'.
+ *
+ * Sets accept_controller_re, exit_without_bind and exit_after_bind, arms the
+ * --timeout alarm, configures the pidfile options, and enables detaching
+ * unless --no-detach was given.  Exits the process for --help and --version
+ * (successfully), for an option that getopt_long() rejects, for a timeout
+ * that is not at least 1, and when more than one of --exit-without-bind,
+ * --exit-after-bind and --no-detach is given.  On return, 'optind' indexes
+ * the first non-option argument. */
+static void
+parse_options(int argc, char *argv[])
+{
+    /* Codes for options that have no short form.  They start above UCHAR_MAX
+     * so that they cannot equal any single-character option code. */
+    enum {
+        OPT_ACCEPT_VCONN = UCHAR_MAX + 1,
+        OPT_EXIT_WITHOUT_BIND,
+        OPT_EXIT_AFTER_BIND,
+        OPT_NO_DETACH,
+    };
+    static struct option long_options[] = {
+        {"accept-vconn", required_argument, 0, OPT_ACCEPT_VCONN},
+        {"exit-without-bind", no_argument, 0, OPT_EXIT_WITHOUT_BIND},
+        {"exit-after-bind", no_argument, 0, OPT_EXIT_AFTER_BIND},
+        {"no-detach", no_argument, 0, OPT_NO_DETACH},
+        {"timeout", required_argument, 0, 't'},
+        {"pidfile", optional_argument, 0, 'P'},
+        {"force", no_argument, 0, 'f'},
+        {"verbose", optional_argument, 0, 'v'},
+        {"help", no_argument, 0, 'h'},
+        {"version", no_argument, 0, 'V'},
+        {0, 0, 0, 0},
+    };
+    char *short_options = long_options_to_short_options(long_options);
+    bool detach_after_bind = true;
+
+    for (;;) {
+        long int timeout;
+        int c;
+
+        c = getopt_long(argc, argv, short_options, long_options, NULL);
+        if (c == -1) {
+            break;
+        }
+
+        switch (c) {
+        case OPT_ACCEPT_VCONN:
+            /* The regex is implicitly anchored at the start of the
+             * controller location; add "^" unless the user already did. */
+            accept_controller_re = (optarg[0] == '^'
+                                    ? optarg
+                                    : xasprintf("^%s", optarg));
+            break;
+
+        case OPT_EXIT_WITHOUT_BIND:
+            exit_without_bind = true;
+            break;
+
+        case OPT_EXIT_AFTER_BIND:
+            exit_after_bind = true;
+            break;
+
+        case OPT_NO_DETACH:
+            detach_after_bind = false;
+            break;
+
+        case 'P':
+            set_pidfile(optarg);
+            break;
+
+        case 'f':
+            ignore_existing_pidfile();
+            break;
+
+        case 't':
+            /* Parse as a signed value.  With strtoul() and an unsigned
+             * variable, a negative argument such as "-5" wrapped around to a
+             * huge positive timeout and slipped past the check below. */
+            timeout = strtol(optarg, NULL, 10);
+            if (timeout <= 0) {
+                ovs_fatal(0, "value %s on -t or --timeout is not at least 1",
+                          optarg);
+            } else {
+                time_alarm(timeout);
+            }
+            /* Default SIGALRM disposition, so that expiry of the alarm
+             * terminates the process. */
+            signal(SIGALRM, SIG_DFL);
+            break;
+
+        case 'h':
+            /* No break needed: usage() is declared NO_RETURN. */
+            usage();
+
+        case 'V':
+            OVS_PRINT_VERSION(0, 0);
+            exit(EXIT_SUCCESS);
+
+        case 'v':
+            vlog_set_verbosity(optarg);
+            break;
+
+        case '?':
+            exit(EXIT_FAILURE);
+
+        default:
+            abort();
+        }
+    }
+    free(short_options);
+
+    /* Each operand is a bool (0 or 1), so the sum counts how many of the
+     * three mutually exclusive options were given. */
+    if ((exit_without_bind + exit_after_bind + !detach_after_bind) > 1) {
+        ovs_fatal(0, "--exit-without-bind, --exit-after-bind, and --no-detach "
+                  "are mutually exclusive");
+    }
+    if (detach_after_bind) {
+        set_detach();
+    }
+}
+
+/* Prints a help message to stdout, including the usage text supplied by the
+ * vlog module, then exits with EXIT_SUCCESS.  Never returns.  The default
+ * pidfile location shown is built from 'ovs_rundir' and 'program_name'. */
+static void
+usage(void)
+{
+ printf("%s: a tool for discovering OpenFlow controllers.\n"
+ "usage: %s [OPTIONS] NETDEV [NETDEV...]\n"
+ "where each NETDEV is a network device on which to perform\n"
+ "controller discovery.\n"
+ "\nOrdinarily, ovs-discover runs in the foreground until it\n"
+ "obtains an IP address and discovers an OpenFlow controller via\n"
+ "DHCP, then it prints information about the controller to stdout\n"
+ "and detaches to the background to maintain the IP address lease.\n"
+ "\nNetworking options:\n"
+ " --accept-vconn=REGEX accept matching discovered controllers\n"
+ " --exit-without-bind exit after discovery, without binding\n"
+ " --exit-after-bind exit after discovery, after binding\n"
+ " --no-detach do not detach after discovery\n",
+ program_name, program_name);
+ vlog_usage();
+ printf("\nOther options:\n"
+ " -t, --timeout=SECS give up discovery after SECS seconds\n"
+ " -P, --pidfile[=FILE] create pidfile (default: %s/%s.pid)\n"
+ " -f, --force with -P, start even if already running\n"
+ " -h, --help display this help message\n"
+ " -V, --version display version information\n",
+ ovs_rundir, program_name);
+ exit(EXIT_SUCCESS);
+}
diff --git a/utilities/ovs-dpctl.8.in b/utilities/ovs-dpctl.8.in
new file mode 100644
index 000000000..652ebb13f
--- /dev/null
+++ b/utilities/ovs-dpctl.8.in
@@ -0,0 +1,166 @@
+.TH ovs\-dpctl 8 "March 2009" "Open vSwitch" "Open vSwitch Manual"
+.ds PN ovs\-dpctl
+
+.SH NAME
+ovs\-dpctl \- administer Open vSwitch datapaths
+
+.SH SYNOPSIS
+.B ovs\-dpctl
+[\fIoptions\fR] \fIcommand \fR[\fIswitch\fR] [\fIargs\fR\&...]
+
+.SH DESCRIPTION
+
+The \fBovs\-dpctl\fR program can create, modify, and delete Open vSwitch
+datapaths. A single machine may host up to 256 datapaths (numbered 0
+to 255).
+
+A newly created datapath is associated with only one network device, a
+virtual network device sometimes called the datapath's ``local port''.
+A newly created datapath is not, however, associated with any of the
+host's other network devices. To intercept and process traffic on a
+given network device, use the \fBadd\-if\fR command to explicitly add
+that network device to the datapath.
+
+Do not use \fBovs\-dpctl\fR commands to modify datapaths if
+\fBovs\-vswitchd\fR(8) is in use. Instead, modify the
+\fBovs\-vswitchd\fR configuration file and send \fBSIGHUP\fR to the
+\fBovs\-vswitchd\fR process.
+
+.PP
+Most \fBovs\-dpctl\fR commands that work with datapaths take an argument
+that specifies the name of the datapath, in one of the following
+forms:
+
+.so lib/dpif.man
+
+.PP
+The following commands manage datapaths.
+
+.TP
+\fBadd\-dp \fIdp\fR [\fInetdev\fR...]
+
+Creates datapath \fIdp\fR. The name of the new datapath's local port
+depends on how \fIdp\fR is specified: if it takes the form
+\fBdp\fIN\fR, the local port will be named \fBdp\fIN\fR; if \fIdp\fR
+is \fBnl:\fIN\fR, the local port will be named \fBof\fIN\fR; otherwise,
+the local port's name will be \fIdp\fR.
+
+This will fail if the host already has 256 datapaths, if a network
+device with the same name as the new datapath's local port already
+exists, or if \fIdp\fR is given in the form \fBdp\fIN\fR or
+\fBnl:\fIN\fR and a datapath numbered \fIN\fR already exists.
+
+If \fInetdev\fRs are specified, \fBovs\-dpctl\fR adds them to the datapath.
+
+.TP
+\fBdel\-dp \fIdp\fR
+Deletes datapath \fIdp\fR. If \fIdp\fR is associated with any network
+devices, they are automatically removed.
+
+.TP
+\fBadd\-if \fIdp netdev\fR[\fIoption\fR...]...
+Adds each \fInetdev\fR to the set of network devices datapath
+\fIdp\fR monitors, where \fIdp\fR is the name of an existing
+datapath, and \fInetdev\fR is the name of one of the host's
+network devices, e.g. \fBeth0\fR. Once a network device has been added
+to a datapath, the datapath has complete ownership of the network device's
+traffic and the network device appears silent to the rest of the
+system.
+
+A \fInetdev\fR may be followed by a comma-separated list of options.
+The following options are currently supported:
+
+.RS
+.IP "\fBport=\fIportno\fR"
+Specifies \fIportno\fR (a number between 1 and 255) as the port number
+at which \fInetdev\fR will be attached. By default, \fBadd\-if\fR
+automatically selects the lowest available port number.
+
+.IP "\fBinternal\fR"
+Instead of attaching an existing \fInetdev\fR, creates an internal
+port (analogous to the local port) with that name.
+.RE
+
+.TP
+\fBdel\-if \fIdp netdev\fR...
+Removes each \fInetdev\fR from the list of network devices datapath
+\fIdp\fR monitors.
+
+.TP
+\fBshow \fR[\fIdp\fR...]
+Prints a summary of configured datapaths, including their datapath
+numbers and a list of ports connected to each datapath. (The local
+port is identified as port 0.)
+
+If one or more datapaths are specified, information on only those
+datapaths is displayed. Otherwise, \fBovs\-dpctl\fR displays information
+about all configured datapaths.
+
+.IP "\fBdump-flows \fIdp\fR"
+Prints to the console all flow entries in datapath \fIdp\fR's
+flow table.
+
+This command is primarily useful for debugging Open vSwitch. The flow
+table entries that it displays are not
+OpenFlow flow entries. Instead, they are different and considerably
+simpler flows maintained by the Open vSwitch kernel module.
+
+.IP "\fBdel-flows \fIdp\fR"
+Deletes all flow entries from datapath \fIdp\fR's flow table.
+
+This command is primarily useful for debugging Open vSwitch. As
+discussed in \fBdump-flows\fR, these entries are
+not OpenFlow flow entries. By deleting them, the process that set them
+up may be confused about their disappearance.
+
+.IP "\fBdump-groups \fIdp\fR"
+Prints to the console the sets of port groups maintained by datapath
+\fIdp\fR. Ordinarily there are at least 2 port groups in a datapath
+that \fBsecchan\fR or \fBvswitch\fR is controlling: group 0 contains
+all ports except those disabled by STP, and group 1 contains all
+ports. Additional groups might be used in the future.
+
+This command is primarily useful for debugging Open vSwitch. OpenFlow
+does not have a concept of port groups.
+
+.SH OPTIONS
+.TP
+\fB-t\fR, \fB--timeout=\fIsecs\fR
+Limits \fBovs\-dpctl\fR runtime to approximately \fIsecs\fR seconds. If
+the timeout expires, \fBovs\-dpctl\fR will exit with a \fBSIGALRM\fR
+signal.
+
+.so lib/vlog.man
+.so lib/common.man
+
+.SH EXAMPLES
+
+A typical \fBovs\-dpctl\fR command sequence for controlling an
+Open vSwitch kernel module:
+
+.TP
+\fBovs\-dpctl add\-dp dp0\fR
+Creates datapath number 0.
+
+.TP
+\fBovs\-dpctl add\-if dp0 eth0 eth1\fR
+Adds two network devices to the new datapath.
+
+.PP
+At this point one would ordinarily start \fBsecchan\fR(8) on
+\fBdp0\fR, transforming \fBdp0\fR into an OpenFlow switch. Then, when
+the switch and the datapath are no longer needed:
+
+.TP
+\fBovs\-dpctl del\-if dp0 eth0 eth1\fR
+Removes network devices from the datapath.
+
+.TP
+\fBovs\-dpctl del\-dp dp0\fR
+Deletes the datapath.
+
+.SH "SEE ALSO"
+
+.BR secchan (8),
+.BR ovs\-appctl (8),
+.BR ovs\-vswitchd (8)
diff --git a/utilities/ovs-dpctl.c b/utilities/ovs-dpctl.c
new file mode 100644
index 000000000..ccddd2f29
--- /dev/null
+++ b/utilities/ovs-dpctl.c
@@ -0,0 +1,552 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include <arpa/inet.h>
+#include <errno.h>
+#include <getopt.h>
+#include <inttypes.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+
+#include "command-line.h"
+#include "compiler.h"
+#include "dirs.h"
+#include "dpif.h"
+#include "dynamic-string.h"
+#include "netdev.h"
+#include "odp-util.h"
+#include "timeval.h"
+#include "util.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_dpctl
+
+/* One ovs-dpctl subcommand; the table of these is all_commands[]. */
+struct command {
+ /* Command name, compared against the first non-option argument. */
+ const char *name;
+ /* Fewest arguments accepted after the command name. */
+ int min_args;
+ /* Most arguments accepted after the command name. */
+ int max_args;
+ /* Implementation; main() passes it argc/argv with argv[0] being the
+  * command name. */
+ void (*handler)(int argc, char *argv[]);
+};
+
+static struct command all_commands[];
+
+static void usage(void) NO_RETURN;
+static void parse_options(int argc, char *argv[]);
+
+/* Program entry point.  Parses the global options, then looks up the command
+ * named by the first remaining argument in all_commands[], checks its
+ * argument count and runs its handler. */
+int main(int argc, char *argv[])
+{
+    const char *cmd_name;
+    struct command *cmd;
+    int n_params;
+
+    set_program_name(argv[0]);
+    time_init();
+    vlog_init();
+    parse_options(argc, argv);
+    signal(SIGPIPE, SIG_IGN);
+
+    /* Step over the options that parse_options() consumed. */
+    argv += optind;
+    argc -= optind;
+    if (argc < 1) {
+        ovs_fatal(0, "missing command name; use --help for help");
+    }
+    cmd_name = argv[0];
+
+    /* The table ends with an entry whose name is NULL. */
+    cmd = all_commands;
+    while (cmd->name != NULL && strcmp(cmd->name, cmd_name) != 0) {
+        cmd++;
+    }
+    if (cmd->name == NULL) {
+        ovs_fatal(0, "unknown command '%s'; use --help for help", cmd_name);
+    }
+
+    /* The command name itself does not count as an argument. */
+    n_params = argc - 1;
+    if (n_params < cmd->min_args) {
+        ovs_fatal(0, "'%s' command requires at least %d arguments",
+                  cmd->name, cmd->min_args);
+    } else if (n_params > cmd->max_args) {
+        ovs_fatal(0, "'%s' command takes at most %d arguments",
+                  cmd->name, cmd->max_args);
+    }
+
+    cmd->handler(argc, argv);
+    if (ferror(stdout)) {
+        ovs_fatal(0, "write to stdout failed");
+    }
+    if (ferror(stderr)) {
+        ovs_fatal(0, "write to stderr failed");
+    }
+    return 0;
+}
+
+/* Parses the global command-line options in 'argv' and applies them:
+ *
+ *   -t, --timeout=SECS   arms an alarm via time_alarm(SECS)
+ *   -v, --verbose[=SPEC] passed to vlog_set_verbosity()
+ *   -h, --help           prints usage and exits
+ *   -V, --version        prints the version and exits
+ *
+ * Exits with EXIT_FAILURE on an unrecognized option.  On return, 'optind'
+ * indexes the first non-option argument. */
+static void
+parse_options(int argc, char *argv[])
+{
+    static struct option long_options[] = {
+        {"timeout", required_argument, 0, 't'},
+        {"verbose", optional_argument, 0, 'v'},
+        {"help", no_argument, 0, 'h'},
+        {"version", no_argument, 0, 'V'},
+        {0, 0, 0, 0},
+    };
+    char *short_options = long_options_to_short_options(long_options);
+
+    for (;;) {
+        unsigned long int timeout;
+        int c;
+
+        c = getopt_long(argc, argv, short_options, long_options, NULL);
+        if (c == -1) {
+            break;
+        }
+
+        switch (c) {
+        case 't':
+            /* strtoul() on its own accepts a leading '-' (wrapping to a huge
+             * value) and ignores trailing garbage, so insist on a string of
+             * decimal digits and check for overflow explicitly. */
+            errno = 0;
+            timeout = strtoul(optarg, NULL, 10);
+            if (optarg[0] == '\0'
+                || optarg[strspn(optarg, "0123456789")] != '\0'
+                || errno == ERANGE
+                || timeout == 0) {
+                ovs_fatal(0, "value %s on -t or --timeout is not at least 1",
+                          optarg);
+            } else {
+                time_alarm(timeout);
+            }
+            break;
+
+        case 'h':
+            usage();
+
+        case 'V':
+            OVS_PRINT_VERSION(0, 0);
+            exit(EXIT_SUCCESS);
+
+        case 'v':
+            vlog_set_verbosity(optarg);
+            break;
+
+        case '?':
+            exit(EXIT_FAILURE);
+
+        default:
+            abort();
+        }
+    }
+    free(short_options);
+}
+
+/* Prints the help text to stdout (with the vlog options supplied by
+ * vlog_usage()) and exits successfully. */
+static void
+usage(void)
+{
+    static const char command_help[] =
+        " add-dp DP [IFACE...] add new datapath DP (with IFACEs)\n"
+        " del-dp DP delete local datapath DP\n"
+        " add-if DP IFACE... add each IFACE as a port on DP\n"
+        " del-if DP IFACE... delete each IFACE from DP\n"
+        " show show basic info on all datapaths\n"
+        " show DP... show basic info on each DP\n"
+        " dump-flows DP display flows in DP\n"
+        " del-flows DP delete all flows from DP\n"
+        " dump-groups DP display port groups in DP\n";
+    static const char option_help[] =
+        "\nOther options:\n"
+        " -t, --timeout=SECS give up after SECS seconds\n"
+        " -h, --help display this help message\n"
+        " -V, --version display version information\n";
+
+    printf("%s: Open vSwitch datapath management utility\n"
+           "usage: %s [OPTIONS] COMMAND [ARG...]\n",
+           program_name, program_name);
+    fputs(command_help, stdout);
+    vlog_usage();
+    fputs(option_help, stdout);
+    exit(EXIT_SUCCESS);
+}
+
+static void run(int retval, const char *message, ...)
+    PRINTF_FORMAT(2, 3);
+
+/* Does nothing if 'retval' is 0.  Otherwise prints the program name, the
+ * printf-style 'message' and a description of the failure to stderr, then
+ * exits with EXIT_FAILURE.  A 'retval' of EOF is reported as an unexpected
+ * end of file; any other value is described with strerror(). */
+static void run(int retval, const char *message, ...)
+{
+    va_list ap;
+
+    if (!retval) {
+        return;
+    }
+
+    fprintf(stderr, "%s: ", program_name);
+    va_start(ap, message);
+    vfprintf(stderr, message, ap);
+    va_end(ap);
+
+    if (retval != EOF) {
+        fprintf(stderr, ": %s\n", strerror(retval));
+    } else {
+        fputs(": unexpected end of file\n", stderr);
+    }
+    exit(EXIT_FAILURE);
+}
+
+static void do_add_if(int argc, char *argv[]);
+
+/* Opens the network device named 'netdev_name', turns on its NETDEV_UP flag
+ * and closes it again.  Returns 0 on success, otherwise the nonzero value
+ * returned by whichever netdev call failed. */
+static int if_up(const char *netdev_name)
+{
+    struct netdev *dev;
+    int error;
+
+    error = netdev_open(netdev_name, NETDEV_ETH_TYPE_NONE, &dev);
+    if (error) {
+        return error;
+    }
+
+    error = netdev_turn_flags_on(dev, NETDEV_UP, true);
+    netdev_close(dev);
+    return error;
+}
+
+/* "add-dp DP [IFACE...]": creates the datapath named by argv[1] and, when
+ * further arguments are present, hands them to do_add_if(). */
+static void
+do_add_dp(int argc UNUSED, char *argv[])
+{
+    struct dpif new_dp;
+    int error;
+
+    error = dpif_create(argv[1], &new_dp);
+    run(error, "add_dp");
+    dpif_close(&new_dp);
+
+    if (argc <= 2) {
+        return;
+    }
+    do_add_if(argc, argv);
+}
+
+/* "del-dp DP": opens the datapath named by argv[1] and deletes it.  Either
+ * step failing terminates the program through run(). */
+static void
+do_del_dp(int argc UNUSED, char *argv[])
+{
+    struct dpif doomed;
+    int error;
+
+    error = dpif_open(argv[1], &doomed);
+    run(error, "opening datapath");
+
+    error = dpif_delete(&doomed);
+    run(error, "del_dp");
+
+    dpif_close(&doomed);
+}
+
+/* qsort() comparison function that orders 'struct odp_port's by ascending
+ * 'port' member.  Returns -1, 0 or 1. */
+static int
+compare_ports(const void *a_, const void *b_)
+{
+    const struct odp_port *lhs = a_;
+    const struct odp_port *rhs = b_;
+
+    if (lhs->port < rhs->port) {
+        return -1;
+    }
+    return lhs->port > rhs->port ? 1 : 0;
+}
+
+/* Stores the port list of 'dpif' in '*ports' and its length in '*n_ports',
+ * sorted by port number.  Exits through run() if the list cannot be
+ * obtained.  Callers in this file free() '*ports' when done. */
+static void
+query_ports(struct dpif *dpif, struct odp_port **ports, size_t *n_ports)
+{
+    int error = dpif_port_list(dpif, ports, n_ports);
+
+    run(error, "listing ports");
+    qsort(*ports, *n_ports, sizeof(struct odp_port), compare_ports);
+}
+
+/* Returns the lowest port number in 0...UINT16_MAX that does not appear in
+ * the port list of 'dpif'.  Terminates the program if every number is in
+ * use. */
+static uint16_t
+get_free_port(struct dpif *dpif)
+{
+    struct odp_port *in_use;
+    size_t n_in_use;
+    int candidate;
+
+    query_ports(dpif, &in_use, &n_in_use);
+    for (candidate = 0; candidate <= UINT16_MAX; candidate++) {
+        bool taken = false;
+        size_t i;
+
+        for (i = 0; i < n_in_use && !taken; i++) {
+            taken = in_use[i].port == candidate;
+        }
+        if (!taken) {
+            free(in_use);
+            return candidate;
+        }
+    }
+    ovs_fatal(0, "no free datapath ports");
+}
+
+/* "add-if DP IFACE...": adds each network device named in argv[2...] as a
+ * port on the datapath named by argv[1], then brings the device up.
+ *
+ * Each IFACE argument has the form NAME[,SUBOPTION...], where the suboptions
+ * are:
+ *
+ *   port=NUMBER   use NUMBER as the port number; when this is absent or
+ *                 negative, the lowest free port number is used instead
+ *   internal      add the port with the ODP_PORT_INTERNAL flag
+ *
+ * Processing continues past a device that fails, but the program then exits
+ * with EXIT_FAILURE once every device has been tried. */
+static void
+do_add_if(int argc UNUSED, char *argv[])
+{
+    bool failure = false;
+    struct dpif dpif;
+    int i;
+
+    run(dpif_open(argv[1], &dpif), "opening datapath");
+    for (i = 2; i < argc; i++) {
+        char *save_ptr = NULL;
+        char *devname, *suboptions;
+        bool bad_suboption = false;
+        int port = -1;
+        int flags = 0;
+        int error;
+
+        devname = strtok_r(argv[i], ",,", &save_ptr);
+        if (!devname) {
+            ovs_error(0, "%s is not a valid network device name", argv[i]);
+            /* A reported error must also be reflected in the exit status. */
+            failure = true;
+            continue;
+        }
+
+        /* An empty delimiter set yields everything after the device name. */
+        suboptions = strtok_r(NULL, "", &save_ptr);
+        if (suboptions) {
+            enum {
+                AP_PORT,
+                AP_INTERNAL
+            };
+            /* getsubopt() requires a null-pointer-terminated token vector. */
+            static char *options[] = {
+                "port",
+                "internal",
+                NULL
+            };
+
+            while (*suboptions != '\0') {
+                char *value;
+
+                switch (getsubopt(&suboptions, options, &value)) {
+                case AP_PORT:
+                    if (!value) {
+                        /* Don't pass a null pointer to atoi(). */
+                        ovs_error(0, "'port' suboption requires a value");
+                        bad_suboption = true;
+                    } else {
+                        port = atoi(value);
+                    }
+                    break;
+
+                case AP_INTERNAL:
+                    flags |= ODP_PORT_INTERNAL;
+                    break;
+
+                default:
+                    ovs_error(0, "unknown suboption '%s'", value);
+                    break;
+                }
+            }
+        }
+        if (bad_suboption) {
+            /* Don't guess a port number the user tried to specify. */
+            failure = true;
+            continue;
+        }
+        if (port < 0) {
+            port = get_free_port(&dpif);
+        }
+
+        error = dpif_port_add(&dpif, devname, port, flags);
+        if (error) {
+            ovs_error(error, "adding %s as port %"PRIu16" of %s failed",
+                      devname, port, argv[1]);
+            failure = true;
+        } else if (if_up(devname)) {
+            failure = true;
+        }
+    }
+    dpif_close(&dpif);
+    if (failure) {
+        exit(EXIT_FAILURE);
+    }
+}
+
+/* Looks for a port on 'dpif' whose device name is 'name'.  On success stores
+ * its port number in '*port' and returns true.  Otherwise reports an error,
+ * leaves '*port' untouched and returns false. */
+static bool
+get_port_number(struct dpif *dpif, const char *name, uint16_t *port)
+{
+    struct odp_port *list;
+    bool found = false;
+    size_t count;
+    size_t idx;
+
+    query_ports(dpif, &list, &count);
+    for (idx = 0; idx < count && !found; idx++) {
+        if (strcmp(name, list[idx].devname) == 0) {
+            *port = list[idx].port;
+            found = true;
+        }
+    }
+    free(list);
+
+    if (!found) {
+        ovs_error(0, "no port named %s", name);
+    }
+    return found;
+}
+
+/* "del-if DP IFACE...": removes each port named in argv[2...] from the
+ * datapath named by argv[1].  An argument made up only of decimal digits is
+ * taken as a port number; anything else is looked up as a device name.
+ *
+ * Processing continues past a port that fails, but the program then exits
+ * with EXIT_FAILURE once every port has been tried. */
+static void
+do_del_if(int argc UNUSED, char *argv[])
+{
+    bool failure = false;
+    struct dpif dpif;
+    int i;
+
+    run(dpif_open(argv[1], &dpif), "opening datapath");
+    for (i = 2; i < argc; i++) {
+        const char *name = argv[i];
+        uint16_t port;
+        int error;
+
+        /* An empty argument is not a number: treating it as one would
+         * silently delete port 0. */
+        if (name[0] != '\0' && !name[strspn(name, "0123456789")]) {
+            unsigned long int number;
+
+            /* Range-check before narrowing so that, e.g., 70000 is rejected
+             * rather than truncated to some other port's number. */
+            errno = 0;
+            number = strtoul(name, NULL, 10);
+            if (errno || number > UINT16_MAX) {
+                ovs_error(0, "%s is not a valid port number", name);
+                failure = true;
+                continue;
+            }
+            port = number;
+        } else if (!get_port_number(&dpif, name, &port)) {
+            failure = true;
+            continue;
+        }
+
+        error = dpif_port_del(&dpif, port);
+        if (error) {
+            ovs_error(error, "deleting port %s from %s failed", name, argv[1]);
+            failure = true;
+        }
+    }
+    dpif_close(&dpif);
+    if (failure) {
+        exit(EXIT_FAILURE);
+    }
+}
+
+/* Prints the statistics block for 'dpif', or nothing if they cannot be
+ * retrieved. */
+static void
+show_dp_stats(struct dpif *dpif)
+{
+    struct odp_stats stats;
+
+    if (dpif_get_dp_stats(dpif, &stats)) {
+        return;
+    }
+
+    printf("\tflows: cur:%"PRIu32", soft-max:%"PRIu32", "
+           "hard-max:%"PRIu32"\n",
+           stats.n_flows, stats.cur_capacity, stats.max_capacity);
+    printf("\tports: cur:%"PRIu32", max:%"PRIu32"\n",
+           stats.n_ports, stats.max_ports);
+    printf("\tgroups: max:%"PRIu16"\n", stats.max_groups);
+    printf("\tlookups: frags:%"PRIu64", hit:%"PRIu64", missed:%"PRIu64", "
+           "lost:%"PRIu64"\n",
+           stats.n_frags, stats.n_hit, stats.n_missed, stats.n_lost);
+    printf("\tqueues: max-miss:%"PRIu16", max-action:%"PRIu16"\n",
+           stats.max_miss_queue, stats.max_action_queue);
+}
+
+/* Prints a summary of 'dpif' to stdout: its id, its statistics (when they
+ * can be retrieved) and one line per port in port-number order.  Closes
+ * 'dpif' before returning, so the caller must not use it afterward. */
+static void
+show_dpif(struct dpif *dpif)
+{
+    const struct odp_port *p;
+    struct odp_port *list;
+    size_t count;
+
+    printf("dp%u:\n", dpif_id(dpif));
+    show_dp_stats(dpif);
+
+    query_ports(dpif, &list, &count);
+    for (p = list; p < list + count; p++) {
+        printf("\tport %u: %s", p->port, p->devname);
+        if (p->flags & ODP_PORT_INTERNAL) {
+            printf(" (internal)");
+        }
+        printf("\n");
+    }
+    free(list);
+    dpif_close(dpif);
+}
+
+/* "show [DP...]": prints a summary of each datapath named in argv[1...].
+ * With no names, tries "dp0" up to (but excluding) "dp<ODP_MAX>" and shows
+ * each one that opens, silently skipping those that fail with ENODEV.
+ *
+ * Exits with EXIT_FAILURE after all datapaths have been tried if any of them
+ * could not be opened. */
+static void
+do_show(int argc UNUSED, char *argv[])
+{
+    bool any_failed = false;
+
+    if (argc <= 1) {
+        unsigned int dp_idx;
+
+        for (dp_idx = 0; dp_idx < ODP_MAX; dp_idx++) {
+            char dp_name[128];
+            struct dpif dpif;
+            int error;
+
+            sprintf(dp_name, "dp%u", dp_idx);
+            error = dpif_open(dp_name, &dpif);
+            if (error == 0) {
+                show_dpif(&dpif);
+            } else if (error != ENODEV) {
+                ovs_error(error, "opening datapath %s failed", dp_name);
+                any_failed = true;
+            }
+        }
+    } else {
+        int arg;
+
+        for (arg = 1; arg < argc; arg++) {
+            const char *dp_name = argv[arg];
+            struct dpif dpif;
+            int error = dpif_open(dp_name, &dpif);
+
+            if (error) {
+                ovs_error(error, "opening datapath %s failed", dp_name);
+                any_failed = true;
+            } else {
+                show_dpif(&dpif);
+            }
+        }
+    }
+
+    if (any_failed) {
+        exit(EXIT_FAILURE);
+    }
+}
+
+/* "dump-flows DP": prints every flow in the datapath named by argv[1], one
+ * per line, in the format produced by format_odp_flow(). */
+static void
+do_dump_flows(int argc UNUSED, char *argv[])
+{
+    struct odp_flow *flows;
+    struct dpif dpif;
+    size_t n_flows;
+    struct ds ds;
+    size_t i;
+
+    run(dpif_open(argv[1], &dpif), "opening datapath");
+    run(dpif_flow_list_all(&dpif, &flows, &n_flows), "listing all flows");
+
+    ds_init(&ds);
+    for (i = 0; i < n_flows; i++) {
+        struct odp_flow *f = &flows[i];
+        enum { MAX_ACTIONS = 4096 / sizeof(union odp_action) };
+        union odp_action actions[MAX_ACTIONS];
+
+        /* Give the flow a buffer so that its actions can be fetched too. */
+        f->actions = actions;
+        f->n_actions = MAX_ACTIONS;
+        /* Skip a flow whose lookup fails: 'actions' would still be
+         * uninitialized and must not be formatted.
+         * NOTE(review): assumes dpif_flow_get() returns 0 on success like
+         * the other dpif_*() calls in this file -- confirm in dpif.h. */
+        if (dpif_flow_get(&dpif, f)) {
+            continue;
+        }
+
+        ds_clear(&ds);
+        format_odp_flow(&ds, f);
+        printf("%s\n", ds_cstr(&ds));
+    }
+    ds_destroy(&ds);
+    dpif_close(&dpif);
+}
+
+/* "del-flows DP": flushes the flow table of the datapath named by argv[1].
+ * Either step failing terminates the program through run(). */
+static void
+do_del_flows(int argc UNUSED, char *argv[])
+{
+    struct dpif target;
+    int error;
+
+    error = dpif_open(argv[1], &target);
+    run(error, "opening datapath");
+
+    error = dpif_flow_flush(&target);
+    run(error, "deleting all flows");
+
+    dpif_close(&target);
+}
+
+/* "dump-groups DP": for every group number below the datapath's 'max_groups'
+ * statistic, prints "group N:" followed by the group's port numbers.  Groups
+ * that cannot be read or that contain no ports are skipped silently. */
+static void
+do_dump_groups(int argc UNUSED, char *argv[])
+{
+    struct odp_stats stats;
+    struct dpif dpif;
+    unsigned int group;
+
+    run(dpif_open(argv[1], &dpif), "opening datapath");
+    run(dpif_get_dp_stats(&dpif, &stats), "get datapath stats");
+
+    for (group = 0; group < stats.max_groups; group++) {
+        uint16_t members[UINT16_MAX];
+        size_t n_members;
+        size_t k;
+
+        if (dpif_port_group_get(&dpif, group, members,
+                                ARRAY_SIZE(members), &n_members)
+            || !n_members) {
+            continue;
+        }
+
+        printf("group %u:", group);
+        for (k = 0; k < n_members; k++) {
+            printf(" %"PRIu16, members[k]);
+        }
+        printf("\n");
+    }
+    dpif_close(&dpif);
+}
+
+/* "help": same as the --help option.  Ignores its arguments; usage() exits
+ * the program, so this does not return. */
+static void
+do_help(int argc UNUSED, char *argv[] UNUSED)
+{
+    usage();
+}
+
+/* Table of supported commands, searched linearly by main().  Each entry is
+ * { name, min_args, max_args, handler }, where the argument counts exclude
+ * the command name itself. */
+static struct command all_commands[] = {
+ { "add-dp", 1, INT_MAX, do_add_dp },
+ { "del-dp", 1, 1, do_del_dp },
+ { "add-if", 2, INT_MAX, do_add_if },
+ { "del-if", 2, INT_MAX, do_del_if },
+ { "show", 0, INT_MAX, do_show },
+ { "dump-flows", 1, 1, do_dump_flows },
+ { "del-flows", 1, 1, do_del_flows },
+ { "dump-groups", 1, 1, do_dump_groups },
+ { "help", 0, INT_MAX, do_help },
+ /* Sentinel: main() stops at the first entry whose name is NULL. */
+ { NULL, 0, 0, NULL },
+};
diff --git a/utilities/ovs-kill.8.in b/utilities/ovs-kill.8.in
new file mode 100644
index 000000000..af4ec987f
--- /dev/null
+++ b/utilities/ovs-kill.8.in
@@ -0,0 +1,60 @@
+.TH ovs\-kill 8 "May 2008" "Open vSwitch" "Open vSwitch Manual"
+.ds PN ovs\-kill
+
+.SH NAME
+ovs\-kill \- kills processes given their pidfiles
+
+.SH SYNOPSIS
+.B ovs\-kill
+[\fIoptions\fR] \fIpidfile\fR [\fIpidfile\fR...]
+
+.SH DESCRIPTION
+The \fBovs\-kill\fR program reads each \fIpidfile\fR specified on the
+command line and sends a signal to the program associated with it, if
+any. It reads one line of text from \fIpidfile\fR, which must contain
+the PID of the process to kill as a text string. It then uses
+\fBfcntl\fR(2) to verify that a process with the PID from the file
+owns a lock on \fIpidfile\fR before it sends the signal.
+
+A \fIpidfile\fR whose name begins with \fB/\fR is used literally.
+Otherwise, \fB@RUNDIR@/\fR is prefixed.
+
+This program exists for use by \fBovs\-switch\-setup\fR, which cannot
+easily implement its functionality since Perl has no portable
+interface to \fBfcntl\fR-based file locking.
+
+.SH OPTIONS
+.TP
+\fB-s \fInumber\fR|\fIname\fR, \fB\-\^\-signal=\fInumber\fR|\fIname\fR
+Sets the signal to be sent to each process. Signals may be given by
+number (e.g. \fB1\fR) or by name (e.g. \fBHUP\fR or \fBSIGHUP\fR).
+By default, \fBSIGTERM\fR is sent.
+
+.TP
+\fB-f\fR, \fB\-\^\-force\fR
+Causes \fBovs\-kill\fR to ignore all errors without printing a message
+to \fBstderr\fR, and to exit with return code 0.
+
+.so lib/common.man
+
+.SH "EXIT CODE"
+
+Without \fB-f\fR or \fB\-\^\-force\fR, \fBovs\-kill\fR exits with
+status 0 if at least one \fIpidfile\fR was given and the process
+represented by every \fIpidfile\fR was signaled successfully,
+otherwise with status 1.
+
+With \fB-f\fR or \fB\-\^\-force\fR, \fBovs\-kill\fR always exits with
+status 0.
+
+.SH BUGS
+
+There is a race between verifying the lock on \fIpidfile\fR and
+actually killing the process.
+
+\fBovs\-kill\fR does not wait for the signaled processes to die before
+exiting.
+
+.SH "SEE ALSO"
+
+.BR ovs\-switch\-setup (8)
diff --git a/utilities/ovs-kill.c b/utilities/ovs-kill.c
new file mode 100644
index 000000000..f30bf0b94
--- /dev/null
+++ b/utilities/ovs-kill.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include "command-line.h"
+#include "daemon.h"
+#include "timeval.h"
+#include "util.h"
+#include "vlog.h"
+
+/* -s, --signal: signal to send. */
+static int sig_nr = SIGTERM;
+
+/* -f, --force: ignore errors. */
+static bool force;
+
+static void cond_error(int err_no, const char *, ...) PRINTF_FORMAT(2, 3);
+
+static void parse_options(int argc, char *argv[]);
+static void usage(void);
+
+/* Program entry point.  Sends signal 'sig_nr' to the process recorded in each
+ * pidfile named on the command line.
+ *
+ * Without --force, exits with EXIT_FAILURE if no pidfile was given or if any
+ * pidfile could not be read or its process could not be signaled.  With
+ * --force, errors are neither reported nor reflected in the exit status. */
+int
+main(int argc, char *argv[])
+{
+    bool ok = true;
+    int i;
+
+    set_program_name(argv[0]);
+    time_init();
+    vlog_init();
+    parse_options(argc, argv);
+
+    argc -= optind;
+    argv += optind;
+    if (argc < 1) {
+        if (!force) {
+            ovs_fatal(0, "need at least one non-option argument; "
+                      "use --help for usage");
+        }
+    }
+
+    for (i = 0; i < argc; i++) {
+        char *pidfile;
+        pid_t pid;
+
+        pidfile = make_pidfile_name(argv[i]);
+        /* A negative result is handled as a negated errno value. */
+        pid = read_pidfile(pidfile);
+        if (pid >= 0) {
+            if (kill(pid, sig_nr) < 0) {
+                cond_error(errno, "%s: kill(%ld)", pidfile, (long int) pid);
+                /* Record the failure so that it reaches the exit status. */
+                ok = false;
+            }
+        } else {
+            cond_error(-pid, "could not read %s", pidfile);
+            ok = false;
+        }
+        free(pidfile);
+    }
+
+    return ok || force ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
+/* Parses the command-line options in 'argv':
+ *
+ *   -s, --signal=NUMBER|NAME  sets 'sig_nr', by number or by name with or
+ *                             without the "SIG" prefix
+ *   -f, --force               sets 'force'
+ *   -h, --help                prints usage and exits
+ *   -V, --version             prints the version and exits
+ *
+ * Exits with EXIT_FAILURE on an unrecognized option and terminates through
+ * ovs_fatal() on an unknown signal name. */
+static void
+parse_options(int argc, char *argv[])
+{
+    static struct option long_options[] = {
+        {"signal", required_argument, 0, 's'},
+        {"force", no_argument, 0, 'f'},
+        {"help", no_argument, 0, 'h'},
+        {"version", no_argument, 0, 'V'},
+        {0, 0, 0, 0},
+    };
+    char *short_options = long_options_to_short_options(long_options);
+
+    for (;;) {
+        int opt = getopt_long(argc, argv, short_options, long_options, NULL);
+
+        if (opt == -1) {
+            break;
+        }
+
+        switch (opt) {
+        case 's': {
+            struct signal_name {
+                const char *name;
+                int number;
+            };
+
+            static const struct signal_name signals[] = {
+#define SIGNAL(NAME) { #NAME, NAME }
+                SIGNAL(SIGABRT),
+                SIGNAL(SIGALRM),
+                SIGNAL(SIGBUS),
+                SIGNAL(SIGCHLD),
+                SIGNAL(SIGCONT),
+                SIGNAL(SIGFPE),
+                SIGNAL(SIGHUP),
+                SIGNAL(SIGILL),
+                SIGNAL(SIGINT),
+                SIGNAL(SIGKILL),
+                SIGNAL(SIGPIPE),
+                SIGNAL(SIGQUIT),
+                SIGNAL(SIGSEGV),
+                SIGNAL(SIGSTOP),
+                SIGNAL(SIGTERM),
+                SIGNAL(SIGTSTP),
+                SIGNAL(SIGTTIN),
+                SIGNAL(SIGTTOU),
+                SIGNAL(SIGUSR1),
+                SIGNAL(SIGUSR2),
+#ifdef SIGPOLL
+                SIGNAL(SIGPOLL),
+#endif
+                SIGNAL(SIGPROF),
+                SIGNAL(SIGSYS),
+                SIGNAL(SIGTRAP),
+                SIGNAL(SIGURG),
+                SIGNAL(SIGVTALRM),
+                SIGNAL(SIGXCPU),
+                SIGNAL(SIGXFSZ),
+#undef SIGNAL
+            };
+            int as_number = atoi(optarg);
+
+            /* atoi() yields 0 for non-numeric text, so a literal "0" has to
+             * be recognized separately. */
+            if (as_number != 0 || strcmp(optarg, "0") == 0) {
+                sig_nr = as_number;
+            } else {
+                const struct signal_name *match = NULL;
+                size_t idx;
+
+                for (idx = 0; idx < ARRAY_SIZE(signals); idx++) {
+                    const char *full_name = signals[idx].name;
+
+                    /* Skipping 3 characters drops the "SIG" prefix. */
+                    if (strcmp(optarg, full_name) == 0
+                        || strcmp(optarg, full_name + 3) == 0) {
+                        match = &signals[idx];
+                        break;
+                    }
+                }
+                if (match != NULL) {
+                    sig_nr = match->number;
+                } else {
+                    ovs_fatal(0, "unknown signal \"%s\"", optarg);
+                }
+            }
+            break;
+        }
+
+        case 'f':
+            force = true;
+            break;
+
+        case 'h':
+            usage();
+
+        case 'V':
+            OVS_PRINT_VERSION(0, 0);
+            exit(EXIT_SUCCESS);
+
+        case '?':
+            exit(EXIT_FAILURE);
+
+        default:
+            abort();
+        }
+    }
+    free(short_options);
+}
+
+/* Prints the help text to stdout and exits successfully. */
+static void
+usage(void)
+{
+    static const char option_help[] =
+        "\nOptions:\n"
+        " -s, --signal=NUMBER|NAME signal to send (default: TERM)\n"
+        " -f, --force ignore errors\n"
+        " -h, --help display this help message\n"
+        " -V, --version display version information\n";
+
+    printf("%s: kills a program using a pidfile\n"
+           "usage: %s [OPTIONS] PIDFILE [PIDFILE...]\n"
+           "where PIDFILE is a pidfile created by an Open vSwitch daemon.\n",
+           program_name, program_name);
+    fputs(option_help, stdout);
+    exit(EXIT_SUCCESS);
+}
+
+/* Unless 'force' is set, prints the program name and the printf-style
+ * 'format' message to stderr, followed by " (<strerror(err_no)>)" when
+ * 'err_no' is nonzero, and a newline.  With 'force' set, prints nothing. */
+static void
+cond_error(int err_no, const char *format, ...)
+{
+    va_list ap;
+
+    if (force) {
+        return;
+    }
+
+    fprintf(stderr, "%s: ", program_name);
+    va_start(ap, format);
+    vfprintf(stderr, format, ap);
+    va_end(ap);
+
+    if (err_no) {
+        fprintf(stderr, " (%s)", strerror(err_no));
+    }
+    fputc('\n', stderr);
+}
diff --git a/utilities/ovs-monitor b/utilities/ovs-monitor
new file mode 100755
index 000000000..4e0986123
--- /dev/null
+++ b/utilities/ovs-monitor
@@ -0,0 +1,128 @@
+#!/bin/sh
+
+# Copyright (C) 2008, 2009 Nicira Networks, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
+
+SECCHAN_PID=/var/run/secchan.pid
+SECCHAN_SOCK=/var/run/secchan.mgmt
+LOG_FILE=/var/log/openflow/monitor
+INTERVAL=1
+FAIL_THRESH=3
+
+# Prints a usage summary to stdout, showing the current value of each
+# setting as its default.
+usage() {
+    # Quoted so that an unusual $0 is not word-split or glob-expanded.
+    echo "usage: $0 options"
+    echo
+    echo "OPTIONS:"
+    echo " -h Show this message"
+    echo " -p PID file for secchan (default: $SECCHAN_PID)"
+    echo " -s Unix socket for secchan (default: $SECCHAN_SOCK)"
+    echo " -l File to log messages (default: $LOG_FILE)"
+    echo " -i Interval to send probes in seconds (default: $INTERVAL)"
+    echo " -c Number of failed probes before reboot (default: $FAIL_THRESH)"
+}
+
+# Writes "<timestamp>:<message>" to stdout and appends the same line to
+# $LOG_FILE.
+#   $1 - the message
+log() {
+    # Take the timestamp once so that both copies of the line are identical.
+    log_stamp=$(date +"%b %d %X")
+    # Quoted so that the message is not word-split or glob-expanded.
+    printf '%s:%s\n' "$log_stamp" "$1"
+    printf '%s:%s\n' "$log_stamp" "$1" >> "$LOG_FILE"
+}
+
+
+# Parse the command line; each option overrides one of the defaults above.
+while getopts "hp:s:l:i:c:" OPTION; do
+    case $OPTION in
+        h)
+            usage
+            exit 1
+            ;;
+
+        p)
+            SECCHAN_PID=$OPTARG
+            ;;
+
+        s)
+            SECCHAN_SOCK=$OPTARG
+            ;;
+
+        l)
+            LOG_FILE=$OPTARG
+            ;;
+
+        i)
+            INTERVAL=$OPTARG
+            ;;
+
+        c)
+            FAIL_THRESH=$OPTARG
+            ;;
+
+        *)
+            # getopts has already printed a diagnostic naming the offending
+            # option.  Stop here rather than start a watchdog that can reboot
+            # the machine with settings the user did not intend.
+            usage >&2
+            exit 1
+            ;;
+    esac
+done
+
+
+if [ ! -f $SECCHAN_PID ]; then
+ log "No secchan pid file: ${SECCHAN_PID}"
+ echo "No secchan pid file: ${SECCHAN_PID}"
+fi
+
+if [ ! -S $SECCHAN_SOCK ]; then
+ log "No secchan sock file: ${SECCHAN_SOCK}"
+ echo "No secchan sock file: ${SECCHAN_SOCK}"
+fi
+
+if [ ! -d `dirname $LOG_FILE` ]; then
+ mkdir -p `dirname $LOG_FILE`
+fi
+
+# Consecutive-failure counters.  Plain assignment and $(( )) are used instead
+# of "let", which is not available in every /bin/sh.
+DP_DOWN=0
+SECCHAN_DOWN=0
+log "===== Starting Monitor ===="
+while :; do
+    # Only check for liveness if the secchan's PID file exists. The PID
+    # file is removed when secchan is brought down gracefully.
+    if [ -f "$SECCHAN_PID" ]; then
+        pid=$(cat "$SECCHAN_PID")
+        if [ -d "/proc/$pid" ]; then
+            # Check if the secchan and datapath still can communicate
+            if [ -S "$SECCHAN_SOCK" ]; then
+                if ovs-ofctl probe -t 2 "unix:$SECCHAN_SOCK"; then
+                    DP_DOWN=0
+                else
+                    log "datapath probe failed"
+                    DP_DOWN=$((DP_DOWN + 1))
+                fi
+            fi
+            SECCHAN_DOWN=0
+        else
+            log "secchan probe failed"
+            SECCHAN_DOWN=$((SECCHAN_DOWN + 1))
+        fi
+    fi
+
+    if [ "$SECCHAN_DOWN" -ge "$FAIL_THRESH" ]; then
+        log "Failed to probe secchan after ${SECCHAN_DOWN} tries...rebooting!"
+        reboot
+    fi
+
+    if [ "$DP_DOWN" -ge "$FAIL_THRESH" ]; then
+        log "Failed to probe datapath after ${DP_DOWN} tries...rebooting!"
+        reboot
+    fi
+
+    sleep "$INTERVAL"
+done
diff --git a/utilities/ovs-ofctl.8.in b/utilities/ovs-ofctl.8.in
new file mode 100644
index 000000000..3a9c305f2
--- /dev/null
+++ b/utilities/ovs-ofctl.8.in
@@ -0,0 +1,489 @@
+.TH ovs\-ofctl 8 "March 2009" "Open vSwitch" "Open vSwitch Manual"
+.ds PN ovs\-ofctl
+
+.SH NAME
+ovs\-ofctl \- administer OpenFlow switches
+
+.SH SYNOPSIS
+.B ovs\-ofctl
+[\fIoptions\fR] \fIcommand \fR[\fIswitch\fR] [\fIargs\fR\&...]
+
+.SH DESCRIPTION
+The
+.B ovs\-ofctl
+program is a command line tool for monitoring and administering
+OpenFlow switches. It can also show the current state of an OpenFlow
+switch, including features, configuration, and table entries.
+
+.SS "OpenFlow Switch Management Commands"
+
+These commands allow \fBovs\-ofctl\fR to monitor and administer an OpenFlow
+switch. It is able to show the current state of a switch, including
+features, configuration, and table entries.
+
+Most of these commands take an argument that specifies the method for
+connecting to an OpenFlow switch. The following connection methods
+are supported:
+
+.RS
+.TP
+\fBssl:\fIhost\fR[\fB:\fIport\fR]
+The specified SSL \fIport\fR (default: 6633) on the given remote
+\fIhost\fR. The \fB--private-key\fR, \fB--certificate\fR, and
+\fB--ca-cert\fR options are mandatory when this form is used.
+
+.TP
+\fBtcp:\fIhost\fR[\fB:\fIport\fR]
+The specified TCP \fIport\fR (default: 6633) on the given remote
+\fIhost\fR.
+
+.TP
+\fBunix:\fIfile\fR
+The Unix domain server socket named \fIfile\fR.
+
+.IP "\fIfile\fR"
+This is short for \fBunix:\fIfile\fR, as long as \fIfile\fR does not
+contain a colon.
+
+.IP \fIdp\fR
+This is short for \fBunix:@RUNDIR@/\fIdp\fB.mgmt\fR, as long as
+\fIdp\fR does not contain a colon.
+.RE
+
+.TP
+\fBshow \fIswitch\fR
+Prints to the console information on \fIswitch\fR, including
+information on its flow tables and ports.
+
+.TP
+\fBstatus \fIswitch\fR [\fIkey\fR]
+Prints to the console a series of key-value pairs that report the
+status of \fIswitch\fR. If \fIkey\fR is specified, only the key-value
+pairs whose key names begin with \fIkey\fR are printed. If \fIkey\fR is
+omitted, all key-value pairs are printed.
+
+.TP
+\fBdump-tables \fIswitch\fR
+Prints to the console statistics for each of the flow tables used by
+\fIswitch\fR.
+
+.TP
+\fBdump-ports \fIswitch\fR
+Prints to the console statistics for each of the network devices
+associated with \fIswitch\fR.
+
+.TP
+\fBmod-port \fIswitch\fR \fInetdev\fR \fIaction\fR
+Modify characteristics of an interface monitored by \fIswitch\fR.
+\fInetdev\fR can be referred to by its OpenFlow assigned port number or
+the device name, e.g. \fBeth0\fR. The \fIaction\fR may be any one of the
+following:
+
+.RS
+.IP \fBup\fR
+Enables the interface. This is equivalent to ``ifconfig up'' on a Unix
+system.
+
+.IP \fBdown\fR
+Disables the interface. This is equivalent to ``ifconfig down'' on a Unix
+system.
+
+.IP \fBflood\fR
+When a \fIflood\fR action is specified, traffic will be sent out this
+interface. This is the default posture for monitored ports.
+
+.IP \fBnoflood\fR
+When a \fIflood\fR action is specified, traffic will not be sent out
+this interface. This is primarily useful to prevent loops when a
+spanning tree protocol is not in use.
+
+.RE
+
+.TP
+\fBdump-flows \fIswitch \fR[\fIflows\fR]
+Prints to the console all flow entries in \fIswitch\fR's
+tables that match \fIflows\fR. If \fIflows\fR is omitted, all flows
+in the switch are retrieved. See \fBFlow Syntax\fR, below, for the
+syntax of \fIflows\fR. The output format is described in
+\fBTable Entry Output\fR.
+
+.TP
+\fBdump-aggregate \fIswitch \fR[\fIflows\fR]
+Prints to the console aggregate statistics for flows in
+\fIswitch\fR's tables that match \fIflows\fR. If \fIflows\fR is omitted,
+the statistics are aggregated across all flows in the switch's flow
+tables. See \fBFlow Syntax\fR, below, for the syntax of \fIflows\fR.
+The output format is described in \fBTable Entry Output\fR.
+
+.TP
+\fBadd-flow \fIswitch flow\fR
+Add the flow entry as described by \fIflow\fR to the \fIswitch\fR's
+tables. The flow entry is in the format described in \fBFlow Syntax\fR,
+below.
+
+.TP
+\fBadd-flows \fIswitch file\fR
+Add flow entries as described in \fIfile\fR to \fIswitch\fR's
+tables. Each line in \fIfile\fR is a flow entry in the format
+described in \fBFlow Syntax\fR, below.
+
+.TP
+\fBmod-flows \fIswitch flow\fR
+Modify the actions in entries from the \fIswitch\fR's tables
+that match \fIflow\fR. When invoked with the \fB--strict\fR option,
+wildcards are not treated as active for matching purposes. See
+\fBFlow Syntax\fR, below, for the syntax of \fIflow\fR.
+
+.TP
+\fBdel-flows \fIswitch \fR[\fIflow\fR]
+Deletes entries from the \fIswitch\fR's tables that match
+\fIflow\fR. When invoked with the \fB--strict\fR option, wildcards are
+not treated as active for matching purposes. If \fIflow\fR is
+omitted and the \fB--strict\fR option is not used, all flows in the
+switch's tables are removed. See \fBFlow Syntax\fR, below, for the
+syntax of \fIflow\fR.
+
+.TP
+\fBmonitor \fIswitch\fR [\fImiss-len\fR [\fIsend-exp\fR]]
+Connects to \fIswitch\fR and prints to the console all OpenFlow
+messages received. Usually, \fIswitch\fR should specify a connection
+named on \fBsecchan\fR(8)'s \fB-l\fR or \fB--listen\fR command line
+option.
+
+If \fImiss-len\fR is provided, \fBovs\-ofctl\fR sends an OpenFlow ``set
+configuration'' message at connection setup time that requests
+\fImiss-len\fR bytes of each packet that misses the flow table. The
+OpenFlow reference implementation does not send these messages to the
+\fBovs\-ofctl monitor\fR client connection unless a nonzero value is
+specified on this argument.
+
+If \fIsend-exp\fR is specified as \fB1\fR, \fBovs\-ofctl\fR will also
+request to be sent flow expiration messages. If this argument is
+omitted, or \fB0\fR is specified, then \fBovs\-ofctl\fR will not request
+flow expirations.
+
+This command may be useful for debugging switch or controller
+implementations.
+
+.TP
+\fBexecute \fIswitch command \fR[\fIarg\fR...]
+Sends a request to \fIswitch\fR to execute \fIcommand\fR along with
+each \fIarg\fR, if any, then waits for the command to complete and
+reports its completion status on \fBstderr\fR and its output, if any,
+on \fBstdout\fR. The set of available commands and their arguments is
+switch-dependent. (This command uses a Nicira extension to OpenFlow
+that may not be available on all switches.)
+
+.SS "OpenFlow Switch and Controller Commands"
+
+The following commands, like those in the previous section, may be
+applied to OpenFlow switches, using any of the connection methods
+described in that section. Unlike those commands, these may also be
+applied to OpenFlow controllers.
+
+.TP
+\fBprobe \fItarget\fR
+Sends a single OpenFlow echo-request message to \fItarget\fR and waits
+for the response. With the \fB-t\fR or \fB--timeout\fR option, this
+command can test whether an OpenFlow switch or controller is up and
+running.
+
+.TP
+\fBping \fItarget \fR[\fIn\fR]
+Sends a series of 10 echo request packets to \fItarget\fR and times
+each reply. The echo request packets consist of an OpenFlow header
+plus \fIn\fR bytes (default: 64) of randomly generated payload. This
+measures the latency of individual requests.
+
+.TP
+\fBbenchmark \fItarget n count\fR
+Sends \fIcount\fR echo request packets that each consist of an
+OpenFlow header plus \fIn\fR bytes of payload and waits for each
+response. Reports the total time required. This is a measure of the
+maximum bandwidth to \fItarget\fR for round-trips of \fIn\fR-byte
+messages.
+
+.SS "Flow Syntax"
+
+Some \fBovs\-ofctl\fR commands accept an argument that describes a flow or
+flows. Such flow descriptions comprise a series of
+\fIfield\fB=\fIvalue\fR assignments, separated by commas or white
+space. (Embedding spaces into a flow description normally requires
+quoting to prevent the shell from breaking the description into
+multiple arguments.)
+
+The following field assignments describe how a flow matches a packet.
+If any of these assignments is omitted from the flow syntax, the field
+is treated as a wildcard; thus, if all of them are omitted, the
+resulting flow matches all packets. The string \fB*\fR or \fBANY\fR
+may be specified to explicitly mark any of these fields as a wildcard.
+(\fB*\fR should be quoted to protect it from shell expansion.)
+
+.IP \fBin_port=\fIport_no\fR
+Matches physical port \fIport_no\fR. Switch ports are numbered as
+displayed by \fBovs\-ofctl show\fR.
+
+.IP \fBdl_vlan=\fIvlan\fR
+Matches IEEE 802.1q virtual LAN tag \fIvlan\fR. Specify \fB0xffff\fR
+as \fIvlan\fR to match packets that are not tagged with a virtual LAN;
+otherwise, specify a number between 0 and 4095, inclusive, as the
+12-bit VLAN ID to match.
+
+.IP \fBdl_src=\fImac\fR
+Matches Ethernet source address \fImac\fR, which is specified as 6 pairs
+of hexadecimal digits delimited by colons (e.g. \fB00:0A:E4:25:6B:B0\fR).
+
+.IP \fBdl_dst=\fImac\fR
+Matches Ethernet destination address \fImac\fR.
+
+.IP \fBdl_type=\fIethertype\fR
+Matches Ethernet protocol type \fIethertype\fR, which is specified as an
+integer between 0 and 65535, inclusive, either in decimal or as a
+hexadecimal number prefixed by \fB0x\fR (e.g. \fB0x0806\fR to match ARP
+packets).
+
+.IP \fBnw_src=\fIip\fR[\fB/\fInetmask\fR]
+Matches IPv4 source address \fIip\fR, which may be specified as an
+IP address or host name (e.g. \fB192.168.1.1\fR or
+\fBwww.example.com\fR). The optional \fInetmask\fR allows restricting a
+match to an IPv4 address prefix. The netmask may be specified as a dotted
+quad (e.g. \fB192.168.1.0/255.255.255.0\fR) or as a CIDR block
+(e.g. \fB192.168.1.0/24\fR).
+
+.IP \fBnw_dst=\fIip\fR[\fB/\fInetmask\fR]
+Matches IPv4 destination address \fIip\fR.
+
+.IP \fBnw_proto=\fIproto\fR
+Matches IP protocol type \fIproto\fR, which is specified as a decimal
+number between 0 and 255, inclusive (e.g. 6 to match TCP packets).
+
+.IP \fBtp_src=\fIport\fR
+Matches UDP or TCP source port \fIport\fR, which is specified as a decimal
+number between 0 and 65535, inclusive (e.g. 80 to match packets originating
+from an HTTP server).
+
+.IP \fBtp_dst=\fIport\fR
+Matches UDP or TCP destination port \fIport\fR.
+
+.IP \fBicmp_type=\fItype\fR
+Matches ICMP messages with \fItype\fR, which is specified as a decimal
+number between 0 and 255, inclusive.
+
+.IP \fBicmp_code=\fIcode\fR
+Matches ICMP messages with \fIcode\fR.
+
+.PP
+The following shorthand notations are also available:
+
+.IP \fBip\fR
+Same as \fBdl_type=0x0800\fR.
+
+.IP \fBicmp\fR
+Same as \fBdl_type=0x0800,nw_proto=1\fR.
+
+.IP \fBtcp\fR
+Same as \fBdl_type=0x0800,nw_proto=6\fR.
+
+.IP \fBudp\fR
+Same as \fBdl_type=0x0800,nw_proto=17\fR.
+
+.IP \fBarp\fR
+Same as \fBdl_type=0x0806\fR.
+
+.PP
+The \fBadd-flow\fR and \fBadd-flows\fR commands require an additional field:
+
+.IP \fBactions=\fR[\fItarget\fR][\fB,\fItarget\fR...]\fR
+Specifies a comma-separated list of actions to take on a packet when the
+flow entry matches. If no \fItarget\fR is specified, then packets
+matching the flow are dropped. The \fItarget\fR may be a decimal port
+number designating the physical port on which to output the packet, or one
+of the following keywords:
+
+.RS
+.IP \fBoutput\fR:\fIport\fR
+Outputs the packet on the port specified by \fIport\fR.
+
+.IP \fBnormal\fR
+Subjects the packet to the device's normal L2/L3 processing. (This
+action is not implemented by all OpenFlow switches.)
+
+.IP \fBflood\fR
+Outputs the packet on all switch physical ports other than the port on
+which it was received and any ports on which flooding is disabled
+(typically, these would be ports disabled by the IEEE 802.1D spanning
+tree protocol).
+
+.IP \fBall\fR
+Outputs the packet on all switch physical ports other than the port on
+which it was received.
+
+.IP \fBcontroller\fR:\fImax_len\fR
+Sends the packet to the OpenFlow controller as a ``packet in''
+message. If \fImax_len\fR is a number, then it specifies the maximum
+number of bytes that should be sent. If \fImax_len\fR is \fBALL\fR or
+omitted, then the entire packet is sent.
+
+.IP \fBlocal\fR
+Outputs the packet on the ``local port,'' which corresponds to the
+\fBof\fIn\fR network device (see \fBCONTACTING THE CONTROLLER\fR in
+\fBsecchan\fR(8) for information on the \fBof\fIn\fR network device).
+
+.IP \fBdrop\fR
+Discards the packet, so no further processing or forwarding takes place.
+If a drop action is used, no other actions may be specified.
+
+.IP \fBmod_vlan_vid\fR:\fIvlan_vid\fR
+Modifies the VLAN id on a packet. The VLAN tag is added or modified
+as necessary to match the value specified. If the VLAN tag is added,
+a priority of zero is used (see the \fBmod_vlan_pcp\fR action to set
+this).
+
+.IP \fBmod_vlan_pcp\fR:\fIvlan_pcp\fR
+Modifies the VLAN priority on a packet. The VLAN tag is added or modified
+as necessary to match the value specified. Valid values are between 0
+(lowest) and 7 (highest). If the VLAN tag is added, a vid of zero is used
+(see the \fBmod_vlan_vid\fR action to set this).
+
+.IP \fBstrip_vlan\fR
+Strips the VLAN tag from a packet if it is present.
+
+.IP \fBmod_dl_src\fB:\fImac\fR
+Sets the source Ethernet address to \fImac\fR.
+
+.IP \fBmod_dl_dst\fB:\fImac\fR
+Sets the destination Ethernet address to \fImac\fR.
+.RE
+
+.IP
+(The OpenFlow protocol supports other actions that \fBovs\-ofctl\fR does
+not yet expose to the user.)
+
+.PP
+The \fBadd-flow\fR, \fBadd-flows\fR, and \fBdel-flows\fR commands
+support an additional optional field:
+
+.IP \fBpriority=\fIvalue\fR
+The priority at which a wildcarded entry will match in comparison to
+others. \fIvalue\fR is a number between 0 and 65535, inclusive. A higher
+\fIvalue\fR will match before a lower one. An exact-match entry will always
+have priority over an entry containing wildcards, so it has an implicit
+priority value of 65535. When adding a flow, if the field is not specified,
+the flow's priority will default to 32768.
+
+.PP
+The \fBadd-flow\fR and \fBadd-flows\fR commands support additional
+optional fields:
+
+.TP
+\fBidle_timeout=\fIseconds\fR
+Causes the flow to expire after the given number of seconds of
+inactivity. A value of 0 prevents a flow from expiring due to
+inactivity. The default is 60 seconds.
+
+.IP \fBhard_timeout=\fIseconds\fR
+Causes the flow to expire after the given number of seconds,
+regardless of activity. A value of 0 (the default) gives the flow no
+hard expiration deadline.
+
+.PP
+The \fBdump-flows\fR, \fBdump-aggregate\fR, \fBdel-flow\fR
+and \fBdel-flows\fR commands support one additional optional field:
+
+.TP
+\fBout_port=\fIport\fR
+If set, a matching flow must include an output action to \fIport\fR.
+
+.PP
+The \fBdump-flows\fR and \fBdump-aggregate\fR commands support an
+additional optional field:
+
+.IP \fBtable=\fInumber\fR
+If specified, limits the flows about which statistics are gathered to
+those in the table with the given \fInumber\fR. Tables are numbered
+as shown by the \fBdump-tables\fR command.
+
+If this field is not specified, or if \fInumber\fR is given as
+\fB255\fR, statistics are gathered about flows from all tables.
+
+.SS "Table Entry Output"
+
+The \fBdump-tables\fR and \fBdump-aggregate\fR commands print information
+about the entries in a datapath's tables. Each line of output is a
+unique flow entry, which begins with some common information:
+
+.IP \fBduration\fR
+The number of seconds the entry has been in the table.
+
+.IP \fBtable_id\fR
+The table that contains the flow. When a packet arrives, the switch
+begins searching for an entry at the lowest numbered table. Tables are
+numbered as shown by the \fBdump-tables\fR command.
+
+.IP \fBpriority\fR
+The priority of the entry in relation to other entries within the same
+table. A higher value will match before a lower one.
+
+.IP \fBn_packets\fR
+The number of packets that have matched the entry.
+
+.IP \fBn_bytes\fR
+The total number of bytes from packets that have matched the entry.
+
+.PP
+The rest of the line consists of a description of the flow entry as
+described in \fBFlow Syntax\fR, above.
+
+
+.SH OPTIONS
+.TP
+\fB--strict\fR
+Uses strict matching when running flow modification commands.
+
+.TP
+\fB-t\fR, \fB--timeout=\fIsecs\fR
+Limits \fBovs\-ofctl\fR runtime to approximately \fIsecs\fR seconds. If
+the timeout expires, \fBovs\-ofctl\fR will exit with a \fBSIGALRM\fR
+signal.
+
+.TP
+\fB-p\fR, \fB--private-key=\fIprivkey.pem\fR
+Specifies a PEM file containing the private key used as the
+identity for SSL connections to a switch.
+
+.TP
+\fB-c\fR, \fB--certificate=\fIcert.pem\fR
+Specifies a PEM file containing a certificate, signed by the
+controller's certificate authority (CA), that certifies the
+private key to identify a trustworthy controller.
+
+.TP
+\fB-C\fR, \fB--ca-cert=\fIcacert.pem\fR
+Specifies a PEM file containing the CA certificate used to verify that
+a switch is trustworthy.
+
+.so lib/vlog.man
+.so lib/common.man
+
+.SH EXAMPLES
+
+The following examples assume that an OpenFlow switch on the local
+host has been configured to listen for management connections on a
+Unix domain socket named \fB@RUNDIR@/openflow.sock\fR, e.g. by
+specifying \fB--listen=punix:@RUNDIR@/openflow.sock\fR on the
+\fBsecchan\fR(8) command line.
+
+.TP
+\fBovs\-ofctl dump-tables unix:@RUNDIR@/openflow.sock\fR
+Prints out the switch's table stats. (This is more interesting after
+some traffic has passed through.)
+
+.TP
+\fBovs\-ofctl dump-flows unix:@RUNDIR@/openflow.sock\fR
+Prints the flow entries in the switch.
+
+.SH "SEE ALSO"
+
+.BR ovs\-appctl (8),
+.BR ovs\-controller (8),
+.BR ovs\-vswitchd (8)
diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c
new file mode 100644
index 000000000..8b343956d
--- /dev/null
+++ b/utilities/ovs-ofctl.c
@@ -0,0 +1,1278 @@
+/*
+ * Copyright (c) 2008, 2009 Nicira Networks.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <config.h>
+#include <arpa/inet.h>
+#include <errno.h>
+#include <getopt.h>
+#include <inttypes.h>
+#include <net/if.h>
+#include <netinet/in.h>
+#include <signal.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+
+#include "command-line.h"
+#include "compiler.h"
+#include "dirs.h"
+#include "dpif.h"
+#include "dynamic-string.h"
+#include "netdev.h"
+#include "netlink.h"
+#include "odp-util.h"
+#include "ofp-print.h"
+#include "ofpbuf.h"
+#include "openflow/nicira-ext.h"
+#include "openflow/openflow.h"
+#include "packets.h"
+#include "random.h"
+#include "socket-util.h"
+#include "timeval.h"
+#include "util.h"
+#include "vconn-ssl.h"
+#include "vconn.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_ofctl
+
+#define DEFAULT_IDLE_TIMEOUT 60
+
+#define MOD_PORT_CMD_UP "up"
+#define MOD_PORT_CMD_DOWN "down"
+#define MOD_PORT_CMD_FLOOD "flood"
+#define MOD_PORT_CMD_NOFLOOD "noflood"
+
+
+/* Settings that may be configured by the user.  Filled in by
+ * parse_options() and passed read-only to every command handler. */
+struct settings {
+ bool strict; /* Use strict matching for flow mod commands */
+};
+
+/* One entry in the command table that main() searches by name.  The
+ * argument-count limits do not include the command name itself: main()
+ * compares them against argc - 1. */
+struct command {
+ const char *name;
+ int min_args;
+ int max_args;
+ /* Invoked with argv[0] set to the command name and argv[1...] set to its
+ * arguments. */
+ void (*handler)(const struct settings *, int argc, char *argv[]);
+};
+
+static struct command all_commands[];
+
+static void usage(void) NO_RETURN;
+static void parse_options(int argc, char *argv[], struct settings *);
+
+/* Program entry point.  Parses the options, looks up the command named by
+ * the first non-option argument in all_commands, checks the number of
+ * arguments supplied to it, and invokes its handler.  Exits with status 0
+ * once the handler has returned and both stdout and stderr are free of
+ * write errors. */
+int main(int argc, char *argv[])
+{
+    struct settings settings;
+    const char *cmd_name;
+    struct command *cmd;
+
+    set_program_name(argv[0]);
+    time_init();
+    vlog_init();
+    parse_options(argc, argv, &settings);
+    signal(SIGPIPE, SIG_IGN);
+
+    /* Skip the options consumed by getopt_long(); afterward argv[0] is the
+     * command name. */
+    argv += optind;
+    argc -= optind;
+    if (argc < 1) {
+        ovs_fatal(0, "missing command name; use --help for help");
+    }
+
+    cmd_name = argv[0];
+    for (cmd = all_commands; cmd->name != NULL; cmd++) {
+        int n_arg;
+
+        if (strcmp(cmd->name, cmd_name) != 0) {
+            continue;
+        }
+
+        /* Number of arguments that follow the command name. */
+        n_arg = argc - 1;
+        if (n_arg < cmd->min_args) {
+            ovs_fatal(0, "'%s' command requires at least %d arguments",
+                      cmd->name, cmd->min_args);
+        } else if (n_arg > cmd->max_args) {
+            ovs_fatal(0, "'%s' command takes at most %d arguments",
+                      cmd->name, cmd->max_args);
+        } else {
+            cmd->handler(&settings, argc, argv);
+            if (ferror(stdout)) {
+                ovs_fatal(0, "write to stdout failed");
+            }
+            if (ferror(stderr)) {
+                ovs_fatal(0, "write to stderr failed");
+            }
+            exit(0);
+        }
+    }
+    ovs_fatal(0, "unknown command '%s'; use --help for help", cmd_name);
+
+    return 0;
+}
+
+/* Parses the command-line options in 'argv' (of 'argc' elements), storing
+ * the user-configurable settings into '*s'.  On return, 'optind' indexes the
+ * first non-option argument.
+ *
+ * --help and --version print their output and exit successfully; an
+ * unrecognized option exits with EXIT_FAILURE. */
+static void
+parse_options(int argc, char *argv[], struct settings *s)
+{
+    enum {
+        OPT_STRICT = UCHAR_MAX + 1
+    };
+    static struct option long_options[] = {
+        {"timeout", required_argument, 0, 't'},
+        {"verbose", optional_argument, 0, 'v'},
+        {"strict", no_argument, 0, OPT_STRICT},
+        {"help", no_argument, 0, 'h'},
+        {"version", no_argument, 0, 'V'},
+        VCONN_SSL_LONG_OPTIONS
+        {0, 0, 0, 0},
+    };
+    char *short_options = long_options_to_short_options(long_options);
+
+    /* Set defaults that we can figure out before parsing options. */
+    s->strict = false;
+
+    for (;;) {
+        long int timeout;
+        int c;
+
+        c = getopt_long(argc, argv, short_options, long_options, NULL);
+        if (c == -1) {
+            break;
+        }
+
+        switch (c) {
+        case 't':
+            /* Parse as a signed value: strtoul() would silently turn a
+             * negative argument such as "-5" into a huge positive one that
+             * slips past the "at least 1" check below. */
+            timeout = strtol(optarg, NULL, 10);
+            if (timeout <= 0) {
+                ovs_fatal(0, "value %s on -t or --timeout is not at least 1",
+                          optarg);
+            } else {
+                time_alarm(timeout);
+            }
+            break;
+
+        case 'h':
+            usage();            /* Does not return. */
+
+        case 'V':
+            OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION);
+            exit(EXIT_SUCCESS);
+
+        case 'v':
+            vlog_set_verbosity(optarg);
+            break;
+
+        case OPT_STRICT:
+            s->strict = true;
+            break;
+
+        VCONN_SSL_OPTION_HANDLERS
+
+        case '?':
+            exit(EXIT_FAILURE);
+
+        default:
+            abort();
+        }
+    }
+    free(short_options);
+}
+
+/* Prints a summary of commands and options to stdout, followed by the
+ * connection-method and logging help produced by vconn_usage() and
+ * vlog_usage(), then exits with EXIT_SUCCESS.  Never returns. */
+static void
+usage(void)
+{
+ printf("%s: OpenFlow switch management utility\n"
+ "usage: %s [OPTIONS] COMMAND [ARG...]\n"
+ "\nFor OpenFlow switches:\n"
+ " show SWITCH show OpenFlow information\n"
+ " status SWITCH [KEY] report statistics (about KEY)\n"
+ " dump-desc SWITCH print switch description\n"
+ " dump-tables SWITCH print table stats\n"
+ " mod-port SWITCH IFACE ACT modify port behavior\n"
+ " dump-ports SWITCH print port statistics\n"
+ " dump-flows SWITCH print all flow entries\n"
+ " dump-flows SWITCH FLOW print matching FLOWs\n"
+ " dump-aggregate SWITCH print aggregate flow statistics\n"
+ " dump-aggregate SWITCH FLOW print aggregate stats for FLOWs\n"
+ " add-flow SWITCH FLOW add flow described by FLOW\n"
+ " add-flows SWITCH FILE add flows from FILE\n"
+ " mod-flows SWITCH FLOW modify actions of matching FLOWs\n"
+ " del-flows SWITCH [FLOW] delete matching FLOWs\n"
+ " monitor SWITCH MISSLEN EXP print packets received from SWITCH\n"
+ " execute SWITCH CMD [ARG...] execute CMD with ARGS on SWITCH\n"
+ "\nFor OpenFlow switches and controllers:\n"
+ " probe VCONN probe whether VCONN is up\n"
+ " ping VCONN [N] latency of N-byte echos\n"
+ " benchmark VCONN N COUNT bandwidth of COUNT N-byte echos\n"
+ "where each SWITCH is an active OpenFlow connection method.\n",
+ program_name, program_name);
+ vconn_usage(true, false, false);
+ vlog_usage();
+ printf("\nOther options:\n"
+ " --strict use strict match for flow commands\n"
+ " -t, --timeout=SECS give up after SECS seconds\n"
+ " -h, --help display this help message\n"
+ " -V, --version display version information\n");
+ exit(EXIT_SUCCESS);
+}
+
+static void run(int retval, const char *message, ...)
+    PRINTF_FORMAT(2, 3);
+
+/* Does nothing if 'retval' is 0.  Otherwise prints the program name and the
+ * printf-style 'message' to stderr, followed by a description of 'retval'
+ * (EOF is reported as an unexpected end of file, anything else through
+ * strerror()), and exits with EXIT_FAILURE. */
+static void run(int retval, const char *message, ...)
+{
+    va_list ap;
+
+    if (!retval) {
+        return;
+    }
+
+    fprintf(stderr, "%s: ", program_name);
+    va_start(ap, message);
+    vfprintf(stderr, message, ap);
+    va_end(ap);
+
+    if (retval != EOF) {
+        fprintf(stderr, ": %s\n", strerror(retval));
+    } else {
+        fputs(": unexpected end of file\n", stderr);
+    }
+
+    exit(EXIT_FAILURE);
+}
+
+/* Generic commands. */
+
+/* Opens a blocking OpenFlow connection to 'name' and stores it in
+ * '*vconnp'.  'name' is tried, in order, as:
+ *
+ *   - a connection method, if it contains a colon;
+ *   - the path of an existing Unix domain socket;
+ *   - the name of a datapath, in which case the connection is made to the
+ *     socket "<ovs_rundir>/<datapath name>.mgmt".
+ *
+ * Any failure is fatal: the function reports the error and exits. */
+static void
+open_vconn(const char *name, struct vconn **vconnp)
+{
+    struct dpif dpif;
+    struct stat s;
+
+    if (strstr(name, ":")) {
+        run(vconn_open_block(name, OFP_VERSION, vconnp),
+            "connecting to %s", name);
+    } else if (!stat(name, &s) && S_ISSOCK(s.st_mode)) {
+        char *vconn_name = xasprintf("unix:%s", name);
+        VLOG_INFO("connecting to %s", vconn_name);
+        run(vconn_open_block(vconn_name, OFP_VERSION, vconnp),
+            "connecting to %s", vconn_name);
+        free(vconn_name);
+    } else if (!dpif_open(name, &dpif)) {
+        char dpif_name[IF_NAMESIZE + 1];
+        char *socket_name;
+        char *vconn_name;
+
+        /* Report the user-supplied 'name' on failure: 'dpif_name' has not
+         * been filled in if dpif_get_name() fails. */
+        run(dpif_get_name(&dpif, dpif_name, sizeof dpif_name),
+            "obtaining name of %s", name);
+        dpif_close(&dpif);
+        if (strcmp(dpif_name, name)) {
+            VLOG_INFO("datapath %s is named %s", name, dpif_name);
+        }
+
+        socket_name = xasprintf("%s/%s.mgmt", ovs_rundir, dpif_name);
+        if (stat(socket_name, &s)) {
+            ovs_fatal(errno, "cannot connect to %s: stat failed on %s",
+                      name, socket_name);
+        } else if (!S_ISSOCK(s.st_mode)) {
+            ovs_fatal(0, "cannot connect to %s: %s is not a socket",
+                      name, socket_name);
+        }
+
+        vconn_name = xasprintf("unix:%s", socket_name);
+        VLOG_INFO("connecting to %s", vconn_name);
+        run(vconn_open_block(vconn_name, OFP_VERSION, vconnp),
+            "connecting to %s", vconn_name);
+        free(socket_name);
+        free(vconn_name);
+    } else {
+        ovs_fatal(0, "%s is not a valid connection method", name);
+    }
+}
+
+/* Creates an OFPT_STATS_REQUEST message of statistics type 'type' with room
+ * for 'body_len' bytes of body, and stores the new buffer in '*bufferp'.
+ * The request's flags are set to 0.  Returns a pointer to the body. */
+static void *
+alloc_stats_request(size_t body_len, uint16_t type, struct ofpbuf **bufferp)
+{
+    size_t request_len = offsetof(struct ofp_stats_request, body) + body_len;
+    struct ofp_stats_request *osr;
+
+    osr = make_openflow(request_len, OFPT_STATS_REQUEST, bufferp);
+    osr->flags = htons(0);
+    osr->type = htons(type);
+    return osr->body;
+}
+
+/* Fixes up the OpenFlow length field of 'buffer' and sends it on 'vconn',
+ * blocking until it is sent.  Exits with an error message on failure. */
+static void
+send_openflow_buffer(struct vconn *vconn, struct ofpbuf *buffer)
+{
+    int error;
+
+    update_openflow_length(buffer);
+    error = vconn_send_block(vconn, buffer);
+    run(error, "failed to send packet to switch");
+}
+
+/* Connects to 'vconn_name', sends 'request' (after fixing up its OpenFlow
+ * length field), waits for the reply, and prints the reply to stdout.
+ * Exits with an error message if the connection or the transaction
+ * fails. */
+static void
+dump_transaction(const char *vconn_name, struct ofpbuf *request)
+{
+    struct vconn *vconn;
+    struct ofpbuf *reply;
+
+    update_openflow_length(request);
+    open_vconn(vconn_name, &vconn);
+    run(vconn_transact(vconn, request, &reply), "talking to %s", vconn_name);
+    ofp_print(stdout, reply->data, reply->size, 1);
+    /* The reply buffer belongs to us once the transaction succeeds; release
+     * it, as dump_stats_transaction() does for the replies it receives. */
+    ofpbuf_delete(reply);
+    vconn_close(vconn);
+}
+
+/* Sends a body-less OpenFlow message of type 'request_type' to 'vconn_name'
+ * and prints the reply. */
+static void
+dump_trivial_transaction(const char *vconn_name, uint8_t request_type)
+{
+    struct ofpbuf *msg;
+
+    make_openflow(sizeof(struct ofp_header), request_type, &msg);
+    dump_transaction(vconn_name, msg);
+}
+
+/* Connects to 'vconn_name', sends the statistics request 'request', and
+ * prints every reply whose transaction id matches the request's.  Replies
+ * with a different transaction id are logged at debug level and skipped.
+ * Stops after the first matching reply that is either too short to hold an
+ * ofp_stats_reply or does not have OFPSF_REPLY_MORE set. */
+static void
+dump_stats_transaction(const char *vconn_name, struct ofpbuf *request)
+{
+    uint32_t send_xid = ((struct ofp_header *) request->data)->xid;
+    struct vconn *vconn;
+
+    open_vconn(vconn_name, &vconn);
+    send_openflow_buffer(vconn, request);
+    for (;;) {
+        struct ofpbuf *reply;
+        uint32_t recv_xid;
+        bool last = false;
+
+        run(vconn_recv_block(vconn, &reply), "OpenFlow packet receive failed");
+        recv_xid = ((struct ofp_header *) reply->data)->xid;
+        if (recv_xid != send_xid) {
+            VLOG_DBG("received reply with xid %08"PRIx32" "
+                     "!= expected %08"PRIx32, recv_xid, send_xid);
+        } else {
+            struct ofp_stats_reply *osr;
+
+            ofp_print(stdout, reply->data, reply->size, 1);
+
+            osr = ofpbuf_at(reply, 0, sizeof *osr);
+            last = !osr || !(ntohs(osr->flags) & OFPSF_REPLY_MORE);
+        }
+        ofpbuf_delete(reply);
+
+        if (last) {
+            break;
+        }
+    }
+    vconn_close(vconn);
+}
+
+/* Sends a statistics request of type 'stats_type' with an empty body to
+ * 'vconn_name' and prints the replies. */
+static void
+dump_trivial_stats_transaction(const char *vconn_name, uint8_t stats_type)
+{
+    struct ofpbuf *msg;
+
+    alloc_stats_request(0, stats_type, &msg);
+    dump_stats_transaction(vconn_name, msg);
+}
+
+/* "show" command: prints the replies to a features request and then to a
+ * get-config request sent to the switch named by argv[1].  Each request
+ * uses its own connection. */
+static void
+do_show(const struct settings *s UNUSED, int argc UNUSED, char *argv[])
+{
+    const char *target = argv[1];
+
+    dump_trivial_transaction(target, OFPT_FEATURES_REQUEST);
+    dump_trivial_transaction(target, OFPT_GET_CONFIG_REQUEST);
+}
+
+/* "status" command: sends a Nicira vendor status request to the switch
+ * named by argv[1], optionally carrying the key argv[2] as its body, and
+ * writes the body of the status reply to stdout.  Exits with an error if
+ * the reply is too short or is not a Nicira status reply. */
+static void
+do_status(const struct settings *s UNUSED, int argc, char *argv[])
+{
+    struct nicira_header *request, *reply;
+    struct vconn *vconn;
+    struct ofpbuf *b;
+
+    request = make_openflow(sizeof *request, OFPT_VENDOR, &b);
+    request->vendor = htonl(NX_VENDOR_ID);
+    request->subtype = htonl(NXT_STATUS_REQUEST);
+    if (argc > 2) {
+        /* 'request' is not used after this point, so it does not matter if
+         * appending moves the buffer's data. */
+        ofpbuf_put(b, argv[2], strlen(argv[2]));
+        update_openflow_length(b);
+    }
+    open_vconn(argv[1], &vconn);
+    /* On success 'b' is replaced by the reply buffer. */
+    run(vconn_transact(vconn, b, &b), "talking to %s", argv[1]);
+    vconn_close(vconn);
+
+    if (b->size < sizeof *reply) {
+        ovs_fatal(0, "short reply (%zu bytes)", b->size);
+    }
+    reply = b->data;
+    /* Header fields are in network byte order, so convert the constants
+     * with htonl() before comparing. */
+    if (reply->header.type != OFPT_VENDOR
+        || reply->vendor != htonl(NX_VENDOR_ID)
+        || reply->subtype != htonl(NXT_STATUS_REPLY)) {
+        ofp_print(stderr, b->data, b->size, 2);
+        ovs_fatal(0, "bad reply");
+    }
+
+    fwrite(reply + 1, b->size - sizeof *reply, 1, stdout);
+    ofpbuf_delete(b);
+}
+
+/* "dump-desc" command: prints the OFPST_DESC statistics reply from the
+ * switch named by argv[1]. */
+static void
+do_dump_desc(const struct settings *s UNUSED, int argc UNUSED, char *argv[])
+{
+    const char *target = argv[1];
+
+    dump_trivial_stats_transaction(target, OFPST_DESC);
+}
+
+/* "dump-tables" command: prints the OFPST_TABLE statistics reply from the
+ * switch named by argv[1]. */
+static void
+do_dump_tables(const struct settings *s UNUSED, int argc UNUSED, char *argv[])
+{
+    const char *target = argv[1];
+
+    dump_trivial_stats_transaction(target, OFPST_TABLE);
+}
+
+
+/* Converts 'str' to a 32-bit unsigned integer and returns it.  The base is
+ * auto-detected as by strtoul() with base 0, so a "0x" prefix selects
+ * hexadecimal and a leading "0" selects octal.
+ *
+ * Exits with an error message if 'str' is null or empty, contains no
+ * digits, has trailing garbage, or does not fit in 32 bits. */
+static uint32_t
+str_to_u32(const char *str)
+{
+    unsigned long int value;
+    char *tail;
+
+    /* Callers pass a null pointer when the user left out a value
+     * altogether, e.g. a "mod_vlan_vid" action with no ":" argument. */
+    if (!str || !*str) {
+        ovs_fatal(0, "missing numeric value");
+    }
+
+    errno = 0;
+    value = strtoul(str, &tail, 0);
+    if (errno == EINVAL || errno == ERANGE
+        || tail == str          /* No digits were consumed. */
+        || *tail                /* Trailing garbage. */
+        || value > UINT32_MAX) {/* unsigned long may be wider than 32 bits. */
+        ovs_fatal(0, "invalid numeric format %s", str);
+    }
+    return value;
+}
+
+/* Parses 'str' as six colon-separated hexadecimal octets and stores them in
+ * 'mac'.  Exits with an error message unless all six octets can be
+ * read. */
+static void
+str_to_mac(const char *str, uint8_t mac[6])
+{
+    int n_octets;
+
+    n_octets = sscanf(str,
+                      "%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8":%"SCNx8,
+                      &mac[0], &mac[1], &mac[2], &mac[3], &mac[4], &mac[5]);
+    if (n_octets != 6) {
+        ovs_fatal(0, "invalid mac address %s", str);
+    }
+}
+
+/* Parses 'str_', which has the form "ip" or "ip/netmask", where 'ip' is
+ * anything lookup_ip() accepts and 'netmask' is either a dotted quad or a
+ * prefix length between 1 and 32.
+ *
+ * Stores the address, exactly as lookup_ip() produced it, in '*ip' and
+ * returns the number of low-order address bits that the netmask leaves
+ * wildcarded (0 when no netmask is given).  Exits with an error message if
+ * the address cannot be converted or the netmask is invalid. */
+static uint32_t
+str_to_ip(const char *str_, uint32_t *ip)
+{
+    char *str = xstrdup(str_);
+    char *save_ptr = NULL;
+    const char *name, *netmask;
+    struct in_addr in_addr;
+    int n_wild, retval;
+
+    /* strtok_r() overwrites the '/' in 'str', so error messages below use
+     * the untouched original 'str_'. */
+    name = strtok_r(str, "//", &save_ptr);
+    retval = name ? lookup_ip(name, &in_addr) : EINVAL;
+    if (retval) {
+        ovs_fatal(0, "%s: could not convert to IP address", str_);
+    }
+    *ip = in_addr.s_addr;
+
+    netmask = strtok_r(NULL, "//", &save_ptr);
+    if (netmask) {
+        uint8_t o[4];
+        if (sscanf(netmask, "%"SCNu8".%"SCNu8".%"SCNu8".%"SCNu8,
+                   &o[0], &o[1], &o[2], &o[3]) == 4) {
+            /* Widen before shifting so that a first octet >= 128 is not
+             * shifted into the sign bit of a promoted int. */
+            uint32_t nm = ((uint32_t) o[0] << 24) | ((uint32_t) o[1] << 16)
+                          | ((uint32_t) o[2] << 8) | o[3];
+            int i;
+
+            /* Find first 1-bit. */
+            for (i = 0; i < 32; i++) {
+                if (nm & (1u << i)) {
+                    break;
+                }
+            }
+            n_wild = i;
+
+            /* Verify that the rest of the bits are 1-bits. */
+            for (; i < 32; i++) {
+                if (!(nm & (1u << i))) {
+                    ovs_fatal(0, "%s: %s is not a valid netmask",
+                              str_, netmask);
+                }
+            }
+        } else {
+            int prefix = atoi(netmask);
+            if (prefix <= 0 || prefix > 32) {
+                ovs_fatal(0, "%s: network prefix bits not between 1 and 32",
+                          str_);
+            }
+            n_wild = 32 - prefix;
+        }
+    } else {
+        n_wild = 0;
+    }
+
+    free(str);
+    return n_wild;
+}
+
+/* Appends 'size' zeroed bytes to 'b' and fills in their leading action
+ * header with 'type' and a length of 'size', both in network byte order.
+ * Returns a pointer to the new action. */
+static void *
+put_action(struct ofpbuf *b, size_t size, uint16_t type)
+{
+    struct ofp_action_header *hdr;
+
+    hdr = ofpbuf_put_zeros(b, size);
+    hdr->len = htons(size);
+    hdr->type = htons(type);
+    return hdr;
+}
+
+/* Appends an OFPAT_OUTPUT action to 'b' that outputs to 'port' (given in
+ * host byte order).  The remaining fields are left zeroed.  Returns the new
+ * action. */
+static struct ofp_action_output *
+put_output_action(struct ofpbuf *b, uint16_t port)
+{
+    struct ofp_action_output *output;
+
+    output = put_action(b, sizeof *output, OFPAT_OUTPUT);
+    output->port = htons(port);
+    return output;
+}
+
+/* Appends to 'b' an Ethernet address action of the given 'type' whose
+ * address is parsed from the string 'addr' by str_to_mac(). */
+static void
+put_dl_addr_action(struct ofpbuf *b, uint16_t type, const char *addr)
+{
+    struct ofp_action_dl_addr *action;
+
+    action = put_action(b, sizeof *action, type);
+    str_to_mac(addr, action->dl_addr);
+}
+
+
+/* Looks up 'name', ignoring case, among the symbolic OpenFlow port names
+ * (IN_PORT, TABLE, NORMAL, FLOOD, ALL, CONTROLLER, LOCAL, NONE).  On a
+ * match, stores the corresponding OFPP_* value, in host byte order, in
+ * '*port' and returns true.  Otherwise returns false and leaves '*port'
+ * untouched. */
+static bool
+parse_port_name(const char *name, uint16_t *port)
+{
+    struct pair {
+        const char *name;
+        uint16_t value;
+    };
+    static const struct pair pairs[] = {
+#define DEF_PAIR(NAME) {#NAME, OFPP_##NAME}
+        DEF_PAIR(IN_PORT),
+        DEF_PAIR(TABLE),
+        DEF_PAIR(NORMAL),
+        DEF_PAIR(FLOOD),
+        DEF_PAIR(ALL),
+        DEF_PAIR(CONTROLLER),
+        DEF_PAIR(LOCAL),
+        DEF_PAIR(NONE),
+#undef DEF_PAIR
+    };
+    const struct pair *p;
+
+    for (p = pairs; p < &pairs[ARRAY_SIZE(pairs)]; p++) {
+        if (strcasecmp(name, p->name) == 0) {
+            *port = p->value;
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Parses 'str' as a list of actions separated by commas or white space and
+ * appends the corresponding OpenFlow actions to 'b'.  'str' is modified in
+ * place.
+ *
+ * Each action is either a keyword, optionally followed by ":" and an
+ * argument, or a bare decimal port number.  "drop" adds no action and must
+ * be the only action in the list.  Exits with an error message on an
+ * unknown action, a missing required argument, or a misplaced "drop". */
+static void
+str_to_action(char *str, struct ofpbuf *b)
+{
+    /* Actions that cannot be used without a ":" argument. */
+    static const char *const arg_actions[] = {
+        "mod_vlan_vid", "mod_vlan_pcp", "mod_dl_src", "mod_dl_dst", "output",
+    };
+    char *act, *arg;
+    char *saveptr = NULL;
+    bool drop = false;
+    int n_actions;
+
+    for (act = strtok_r(str, ", \t\r\n", &saveptr), n_actions = 0; act;
+         act = strtok_r(NULL, ", \t\r\n", &saveptr), n_actions++)
+    {
+        uint16_t port;
+        size_t i;
+
+        if (drop) {
+            ovs_fatal(0, "Drop actions must not be followed by other actions");
+        }
+
+        /* Arguments are separated by colons */
+        arg = strchr(act, ':');
+        if (arg) {
+            *arg = '\0';
+            arg++;
+        }
+
+        /* Fail cleanly, rather than handing a null pointer to the value
+         * parsers, when a required argument is missing. */
+        for (i = 0; !arg && i < sizeof arg_actions / sizeof *arg_actions;
+             i++) {
+            if (!strcasecmp(act, arg_actions[i])) {
+                ovs_fatal(0, "%s action requires an argument", act);
+            }
+        }
+
+        if (!strcasecmp(act, "mod_vlan_vid")) {
+            struct ofp_action_vlan_vid *va;
+            va = put_action(b, sizeof *va, OFPAT_SET_VLAN_VID);
+            va->vlan_vid = htons(str_to_u32(arg));
+        } else if (!strcasecmp(act, "mod_vlan_pcp")) {
+            struct ofp_action_vlan_pcp *va;
+            va = put_action(b, sizeof *va, OFPAT_SET_VLAN_PCP);
+            va->vlan_pcp = str_to_u32(arg);
+        } else if (!strcasecmp(act, "strip_vlan")) {
+            /* put_action() fills in the whole header; nothing more to
+             * set. */
+            put_action(b, sizeof(struct ofp_action_header), OFPAT_STRIP_VLAN);
+        } else if (!strcasecmp(act, "mod_dl_src")) {
+            put_dl_addr_action(b, OFPAT_SET_DL_SRC, arg);
+        } else if (!strcasecmp(act, "mod_dl_dst")) {
+            put_dl_addr_action(b, OFPAT_SET_DL_DST, arg);
+        } else if (!strcasecmp(act, "output")) {
+            put_output_action(b, str_to_u32(arg));
+        } else if (!strcasecmp(act, "drop")) {
+            /* A drop action in OpenFlow occurs by just not setting
+             * an action. */
+            drop = true;
+            if (n_actions) {
+                ovs_fatal(0, "Drop actions must not be preceded by other "
+                          "actions");
+            }
+        } else if (!strcasecmp(act, "CONTROLLER")) {
+            struct ofp_action_output *oao;
+            oao = put_output_action(b, OFPP_CONTROLLER);
+
+            /* Unless a numeric argument is specified, we send the whole
+             * packet to the controller.  The test must look at the argument,
+             * not at the keyword, which never consists of digits. */
+            if (arg && *arg && strspn(arg, "0123456789") == strlen(arg)) {
+                oao->max_len = htons(str_to_u32(arg));
+            }
+        } else if (parse_port_name(act, &port)) {
+            put_output_action(b, port);
+        } else if (strspn(act, "0123456789") == strlen(act)) {
+            put_output_action(b, str_to_u32(act));
+        } else {
+            ovs_fatal(0, "Unknown action: %s", act);
+        }
+    }
+}
+
+/* A protocol shorthand accepted in flow descriptions, together with the
+ * Ethernet type and IP protocol number it stands for.  An 'nw_proto' of 0
+ * means that the shorthand does not constrain the IP protocol. */
+struct protocol {
+    const char *name;
+    uint16_t dl_type;
+    uint8_t nw_proto;
+};
+
+/* Looks up 'name' (case-sensitively) among the protocol shorthands "ip",
+ * "arp", "icmp", "tcp", and "udp".  On a match, points '*p_out' at the
+ * matching entry and returns true.  Otherwise sets '*p_out' to NULL and
+ * returns false. */
+static bool
+parse_protocol(const char *name, const struct protocol **p_out)
+{
+    static const struct protocol protocols[] = {
+        { "ip", ETH_TYPE_IP, 0 },
+        { "arp", ETH_TYPE_ARP, 0 },
+        { "icmp", ETH_TYPE_IP, IP_TYPE_ICMP },
+        { "tcp", ETH_TYPE_IP, IP_TYPE_TCP },
+        { "udp", ETH_TYPE_IP, IP_TYPE_UDP },
+    };
+    size_t i;
+
+    for (i = 0; i < ARRAY_SIZE(protocols); i++) {
+        if (strcmp(protocols[i].name, name) == 0) {
+            *p_out = &protocols[i];
+            return true;
+        }
+    }
+
+    *p_out = NULL;
+    return false;
+}
+
+/* Describes one match field that may appear in a flow description: its
+ * keyword, the OFPFW_* wildcard bit(s) that cover it, how its value is
+ * parsed, its byte offset within struct ofp_match, and (used for F_IP
+ * fields) the shift that positions a wildcarded-bit count within the
+ * wildcards word. */
+struct field {
+    const char *name;
+    uint32_t wildcard;
+    enum { F_U8, F_U16, F_MAC, F_IP } type;
+    size_t offset, shift;
+};
+
+/* Looks up 'name' (case-sensitively) among the known match field keywords.
+ * On a match, points '*f_out' at the field's description and returns true.
+ * Otherwise sets '*f_out' to NULL and returns false. */
+static bool
+parse_field(const char *name, const struct field **f_out)
+{
+#define F_OFS(MEMBER) offsetof(struct ofp_match, MEMBER)
+    static const struct field fields[] = {
+        { "in_port", OFPFW_IN_PORT, F_U16, F_OFS(in_port), 0 },
+        { "dl_vlan", OFPFW_DL_VLAN, F_U16, F_OFS(dl_vlan), 0 },
+        { "dl_src", OFPFW_DL_SRC, F_MAC, F_OFS(dl_src), 0 },
+        { "dl_dst", OFPFW_DL_DST, F_MAC, F_OFS(dl_dst), 0 },
+        { "dl_type", OFPFW_DL_TYPE, F_U16, F_OFS(dl_type), 0 },
+        { "nw_src", OFPFW_NW_SRC_MASK, F_IP,
+          F_OFS(nw_src), OFPFW_NW_SRC_SHIFT },
+        { "nw_dst", OFPFW_NW_DST_MASK, F_IP,
+          F_OFS(nw_dst), OFPFW_NW_DST_SHIFT },
+        { "nw_proto", OFPFW_NW_PROTO, F_U8, F_OFS(nw_proto), 0 },
+        { "tp_src", OFPFW_TP_SRC, F_U16, F_OFS(tp_src), 0 },
+        { "tp_dst", OFPFW_TP_DST, F_U16, F_OFS(tp_dst), 0 },
+        { "icmp_type", OFPFW_ICMP_TYPE, F_U16, F_OFS(icmp_type), 0 },
+        { "icmp_code", OFPFW_ICMP_CODE, F_U16, F_OFS(icmp_code), 0 }
+    };
+    size_t i;
+
+    for (i = 0; i < ARRAY_SIZE(fields); i++) {
+        if (strcmp(fields[i].name, name) == 0) {
+            *f_out = &fields[i];
+            return true;
+        }
+    }
+
+    *f_out = NULL;
+    return false;
+}
+
+/* Parses the flow description 'string' (modified in place) into '*match'.
+ *
+ * If 'actions' is nonnull, 'string' must contain an "action...=" clause;
+ * everything after the "=" is parsed by str_to_action() and appended to
+ * 'actions', and everything before the clause is treated as the match.
+ *
+ * 'table_idx', 'out_port', 'priority', 'idle_timeout', and 'hard_timeout'
+ * are optional outputs.  Each one that is nonnull is first set to its
+ * default (0xff, OFPP_NONE, OFP_DEFAULT_PRIORITY, DEFAULT_IDLE_TIMEOUT,
+ * and OFP_FLOW_PERMANENT, respectively) and then overridden if the matching
+ * keyword appears in 'string'.  A keyword whose output pointer is null is
+ * rejected as unknown.
+ *
+ * Fields not mentioned in 'string' are wildcarded.  Exits with an error
+ * message on any syntax error. */
+static void
+str_to_flow(char *string, struct ofp_match *match, struct ofpbuf *actions,
+            uint8_t *table_idx, uint16_t *out_port, uint16_t *priority,
+            uint16_t *idle_timeout, uint16_t *hard_timeout)
+{
+    char *save_ptr = NULL;
+    char *name;
+    uint32_t wildcards;
+
+    if (table_idx) {
+        *table_idx = 0xff;
+    }
+    if (out_port) {
+        *out_port = OFPP_NONE;
+    }
+    if (priority) {
+        *priority = OFP_DEFAULT_PRIORITY;
+    }
+    if (idle_timeout) {
+        *idle_timeout = DEFAULT_IDLE_TIMEOUT;
+    }
+    if (hard_timeout) {
+        *hard_timeout = OFP_FLOW_PERMANENT;
+    }
+    if (actions) {
+        char *act_str = strstr(string, "action");
+        char *equals;
+
+        if (!act_str) {
+            ovs_fatal(0, "must specify an action");
+        }
+
+        equals = strchr(act_str, '=');
+        if (!equals) {
+            ovs_fatal(0, "must specify an action");
+        }
+
+        /* Cut the match part of 'string' off just before the action
+         * clause.  If the clause is the very first thing in 'string', there
+         * is no preceding separator to overwrite, so empty the match part
+         * instead of writing before the start of the buffer. */
+        if (act_str > string) {
+            act_str[-1] = '\0';
+        } else {
+            *act_str = '\0';
+        }
+
+        str_to_action(equals + 1, actions);
+    }
+    memset(match, 0, sizeof *match);
+    wildcards = OFPFW_ALL;
+    for (name = strtok_r(string, "=, \t\r\n", &save_ptr); name;
+         name = strtok_r(NULL, "=, \t\r\n", &save_ptr)) {
+        const struct protocol *p;
+
+        if (parse_protocol(name, &p)) {
+            wildcards &= ~OFPFW_DL_TYPE;
+            match->dl_type = htons(p->dl_type);
+            if (p->nw_proto) {
+                wildcards &= ~OFPFW_NW_PROTO;
+                match->nw_proto = p->nw_proto;
+            }
+        } else {
+            const struct field *f;
+            char *value;
+
+            value = strtok_r(NULL, ", \t\r\n", &save_ptr);
+            if (!value) {
+                ovs_fatal(0, "field %s missing value", name);
+            }
+
+            if (table_idx && !strcmp(name, "table")) {
+                *table_idx = atoi(value);
+            } else if (out_port && !strcmp(name, "out_port")) {
+                *out_port = atoi(value);
+            } else if (priority && !strcmp(name, "priority")) {
+                *priority = atoi(value);
+            } else if (idle_timeout && !strcmp(name, "idle_timeout")) {
+                *idle_timeout = atoi(value);
+            } else if (hard_timeout && !strcmp(name, "hard_timeout")) {
+                *hard_timeout = atoi(value);
+            } else if (parse_field(name, &f)) {
+                void *data = (char *) match + f->offset;
+                if (!strcmp(value, "*") || !strcmp(value, "ANY")) {
+                    wildcards |= f->wildcard;
+                } else {
+                    uint16_t port_no;
+
+                    wildcards &= ~f->wildcard;
+                    if (f->wildcard == OFPFW_IN_PORT
+                        && parse_port_name(value, &port_no)) {
+                        /* parse_port_name() yields a host-order value, but
+                         * match fields are in network byte order, just as
+                         * for numeric ports in the F_U16 case below. */
+                        *(uint16_t *) data = htons(port_no);
+                    } else if (f->type == F_U8) {
+                        *(uint8_t *) data = str_to_u32(value);
+                    } else if (f->type == F_U16) {
+                        *(uint16_t *) data = htons(str_to_u32(value));
+                    } else if (f->type == F_MAC) {
+                        str_to_mac(value, data);
+                    } else if (f->type == F_IP) {
+                        /* str_to_ip() returns the number of wildcarded
+                         * low-order bits, which goes into the wildcards
+                         * word at the field's shift. */
+                        wildcards |= str_to_ip(value, data) << f->shift;
+                    } else {
+                        NOT_REACHED();
+                    }
+                }
+            } else {
+                ovs_fatal(0, "unknown keyword %s", name);
+            }
+        }
+    }
+    match->wildcards = htonl(wildcards);
+}
+
+/* "dump-flows" command: requests and prints statistics for the flows on
+ * the switch named by argv[1] that match the optional flow description
+ * argv[2].  With no description, all flows match. */
+static void
+do_dump_flows(const struct settings *s UNUSED, int argc, char *argv[])
+{
+    char *flow_spec = argc > 2 ? argv[2] : "";
+    struct ofp_flow_stats_request *fsr;
+    struct ofpbuf *msg;
+    uint16_t port;
+
+    fsr = alloc_stats_request(sizeof *fsr, OFPST_FLOW, &msg);
+    str_to_flow(flow_spec, &fsr->match, NULL,
+                &fsr->table_id, &port, NULL, NULL, NULL);
+    memset(&fsr->pad, 0, sizeof fsr->pad);
+    fsr->out_port = htons(port);
+
+    dump_stats_transaction(argv[1], msg);
+}
+
+/* "dump-aggregate" command: requests and prints aggregate statistics for
+ * the flows on the switch named by argv[1] that match the optional flow
+ * description argv[2].  With no description, all flows match. */
+static void
+do_dump_aggregate(const struct settings *s UNUSED, int argc, char *argv[])
+{
+    char *flow_spec = argc > 2 ? argv[2] : "";
+    struct ofp_aggregate_stats_request *asr;
+    struct ofpbuf *msg;
+    uint16_t port;
+
+    asr = alloc_stats_request(sizeof *asr, OFPST_AGGREGATE, &msg);
+    str_to_flow(flow_spec, &asr->match, NULL,
+                &asr->table_id, &port, NULL, NULL, NULL);
+    memset(&asr->pad, 0, sizeof asr->pad);
+    asr->out_port = htons(port);
+
+    dump_stats_transaction(argv[1], msg);
+}
+
+/* "add-flow" command: parses the flow description argv[2] and sends an
+ * OFPFC_ADD flow modification for it to the switch named by argv[1]. */
+static void
+do_add_flow(const struct settings *s UNUSED, int argc UNUSED, char *argv[])
+{
+    uint16_t prio, idle, hard;
+    struct ofp_flow_mod *flow_mod;
+    struct ofp_match flow_match;
+    struct ofpbuf *msg;
+    struct vconn *conn;
+
+    /* str_to_flow() appends the actions to 'msg' and may thereby reallocate
+     * its data, so no pointer into the message may be held across the call.
+     * Parse the match into a local and fetch the header pointer only
+     * afterward. */
+    make_openflow(sizeof *flow_mod, OFPT_FLOW_MOD, &msg);
+    str_to_flow(argv[2], &flow_match, msg,
+                NULL, NULL, &prio, &idle, &hard);
+
+    flow_mod = msg->data;
+    flow_mod->match = flow_match;
+    flow_mod->command = htons(OFPFC_ADD);
+    flow_mod->priority = htons(prio);
+    flow_mod->idle_timeout = htons(idle);
+    flow_mod->hard_timeout = htons(hard);
+    flow_mod->buffer_id = htonl(UINT32_MAX);
+    flow_mod->reserved = htonl(0);
+
+    open_vconn(argv[1], &conn);
+    send_openflow_buffer(conn, msg);
+    vconn_close(conn);
+}
+
+/* "add-flows" command: reads flow descriptions, one per line, from the
+ * file argv[2] and sends an OFPFC_ADD flow modification for each to the
+ * switch named by argv[1].  Text from "#" to the end of a line is a
+ * comment; lines that are blank after comment removal are skipped.  Lines
+ * are read through a 1024-byte buffer. */
+static void
+do_add_flows(const struct settings *s UNUSED, int argc UNUSED, char *argv[])
+{
+    char line[1024];
+    struct vconn *conn;
+    FILE *stream;
+
+    stream = fopen(argv[2], "r");
+    if (stream == NULL) {
+        ovs_fatal(errno, "%s: open", argv[2]);
+    }
+
+    open_vconn(argv[1], &conn);
+    while (fgets(line, sizeof line, stream)) {
+        uint16_t prio, idle, hard;
+        struct ofp_flow_mod *flow_mod;
+        struct ofp_match flow_match;
+        struct ofpbuf *msg;
+        char *hash;
+
+        /* Strip a trailing comment. */
+        hash = strchr(line, '#');
+        if (hash) {
+            *hash = '\0';
+        }
+
+        /* Nothing left but white space?  Then skip the line. */
+        if (strspn(line, " \t\n") == strlen(line)) {
+            continue;
+        }
+
+        /* str_to_flow() appends the actions to 'msg' and may thereby
+         * reallocate its data, so the header pointer is fetched only after
+         * the call. */
+        make_openflow(sizeof *flow_mod, OFPT_FLOW_MOD, &msg);
+        str_to_flow(line, &flow_match, msg,
+                    NULL, NULL, &prio, &idle, &hard);
+
+        flow_mod = msg->data;
+        flow_mod->match = flow_match;
+        flow_mod->command = htons(OFPFC_ADD);
+        flow_mod->priority = htons(prio);
+        flow_mod->idle_timeout = htons(idle);
+        flow_mod->hard_timeout = htons(hard);
+        flow_mod->buffer_id = htonl(UINT32_MAX);
+        flow_mod->reserved = htonl(0);
+
+        send_openflow_buffer(conn, msg);
+    }
+    vconn_close(conn);
+    fclose(stream);
+}
+
+/* Implements the "mod-flows" command: parses the flow description in argv[2]
+ * into an OFPT_FLOW_MOD message and sends it over a connection opened to
+ * argv[1].  The command is OFPFC_MODIFY_STRICT when 's->strict' is set,
+ * otherwise OFPFC_MODIFY. */
+static void
+do_mod_flows(const struct settings *s, int argc UNUSED, char *argv[])
+{
+    uint16_t priority, idle_timeout, hard_timeout;
+    struct vconn *vconn;
+    struct ofpbuf *buffer;
+    struct ofp_flow_mod *ofm;
+    struct ofp_match match;
+
+    /* Parse and send.  str_to_flow() will expand and reallocate the data in
+     * 'buffer', so we can't keep pointers into it across the str_to_flow()
+     * call: parse the match into a local and only then look up the message
+     * in the (possibly moved) buffer. */
+    make_openflow(sizeof *ofm, OFPT_FLOW_MOD, &buffer);
+    str_to_flow(argv[2], &match, buffer,
+                NULL, NULL, &priority, &idle_timeout, &hard_timeout);
+    ofm = buffer->data;
+    ofm->match = match;
+    if (s->strict) {
+        ofm->command = htons(OFPFC_MODIFY_STRICT);
+    } else {
+        ofm->command = htons(OFPFC_MODIFY);
+    }
+    ofm->idle_timeout = htons(idle_timeout);
+    ofm->hard_timeout = htons(hard_timeout);
+    ofm->buffer_id = htonl(UINT32_MAX);
+    ofm->priority = htons(priority);
+    ofm->reserved = htonl(0);
+
+    open_vconn(argv[1], &vconn);
+    send_openflow_buffer(vconn, buffer);
+    vconn_close(vconn);
+}
+
+/* Implements the "del-flows" command: builds an OFPT_FLOW_MOD message whose
+ * match, output port and priority come from the optional flow description in
+ * argv[2] (an empty description is used when it is absent) and sends it over
+ * a connection opened to argv[1].  The command is OFPFC_DELETE_STRICT when
+ * 's->strict' is set, otherwise OFPFC_DELETE. */
+static void
+do_del_flows(const struct settings *s, int argc, char *argv[])
+{
+    char *flow_spec = argc > 2 ? argv[2] : "";
+    struct ofp_flow_mod *flow_mod;
+    struct ofpbuf *msg;
+    struct vconn *conn;
+    uint16_t out_port;
+    uint16_t priority;
+
+    /* No actions buffer is passed to str_to_flow(), so writing the match
+     * directly into the message is the same as in the original code. */
+    flow_mod = make_openflow(sizeof *flow_mod, OFPT_FLOW_MOD, &msg);
+    str_to_flow(flow_spec, &flow_mod->match, NULL, NULL,
+                &out_port, &priority, NULL, NULL);
+
+    flow_mod->command = htons(s->strict ? OFPFC_DELETE_STRICT : OFPFC_DELETE);
+    flow_mod->idle_timeout = htons(0);
+    flow_mod->hard_timeout = htons(0);
+    flow_mod->buffer_id = htonl(UINT32_MAX);
+    flow_mod->out_port = htons(out_port);
+    flow_mod->priority = htons(priority);
+    flow_mod->reserved = htonl(0);
+
+    open_vconn(argv[1], &conn);
+    send_openflow_buffer(conn, msg);
+    vconn_close(conn);
+}
+
+/* Implements the "monitor" command: opens a connection to argv[1],
+ * optionally sends an OFPT_SET_CONFIG message built from argv[2] and
+ * argv[3], then prints every message received on the connection to stderr.
+ * The receive loop has no exit condition of its own. */
+static void
+do_monitor(const struct settings *s UNUSED, int argc UNUSED, char *argv[])
+{
+ struct vconn *vconn;
+
+ open_vconn(argv[1], &vconn);
+ if (argc > 2) {
+ /* argv[2] supplies miss_send_len; argv[3], when present and nonzero,
+ * turns on the OFPC_SEND_FLOW_EXP flag. */
+ int miss_send_len = atoi(argv[2]);
+ int send_flow_exp = argc > 3 ? atoi(argv[3]) : 0;
+ struct ofp_switch_config *osc;
+ struct ofpbuf *buf;
+
+ osc = make_openflow(sizeof *osc, OFPT_SET_CONFIG, &buf);
+ osc->flags = htons(send_flow_exp ? OFPC_SEND_FLOW_EXP : 0);
+ osc->miss_send_len = htons(miss_send_len);
+ send_openflow_buffer(vconn, buf);
+ }
+ for (;;) {
+ struct ofpbuf *b;
+ run(vconn_recv_block(vconn, &b), "vconn_recv");
+ ofp_print(stderr, b->data, b->size, 2);
+ ofpbuf_delete(b);
+ }
+}
+
+/* Implements the "dump-ports" command: runs an OFPST_PORT statistics
+ * transaction against the switch named by argv[1]. */
+static void
+do_dump_ports(const struct settings *s UNUSED, int argc UNUSED, char *argv[])
+{
+ dump_trivial_stats_transaction(argv[1], OFPST_PORT);
+}
+
+/* Implements the "probe" command: sends a bare OFPT_ECHO_REQUEST to the
+ * switch named by argv[1] and exits fatally unless the reply is exactly one
+ * OpenFlow header in size. */
+static void
+do_probe(const struct settings *s UNUSED, int argc UNUSED, char *argv[])
+{
+    struct ofpbuf *echo_request;
+    struct ofpbuf *echo_reply;
+    struct vconn *conn;
+
+    make_openflow(sizeof(struct ofp_header), OFPT_ECHO_REQUEST,
+                  &echo_request);
+    open_vconn(argv[1], &conn);
+    run(vconn_transact(conn, echo_request, &echo_reply),
+        "talking to %s", argv[1]);
+    if (echo_reply->size != sizeof(struct ofp_header)) {
+        ovs_fatal(0, "reply does not match request");
+    }
+    ofpbuf_delete(echo_reply);
+    vconn_close(conn);
+}
+
+/* Implements the "mod-port" command.  argv[1] names the switch, argv[2] is
+ * either a decimal port number or a port name, and argv[3] is one of the
+ * MOD_PORT_CMD_* keywords (compared case-insensitively).
+ *
+ * The port is looked up in the reply to an OFPT_FEATURES_REQUEST, then an
+ * OFPT_PORT_MOD message is sent that sets or clears OFPPC_PORT_DOWN or
+ * OFPPC_NO_FLOOD.  Exits fatally if the reply is too short, the port is not
+ * found, or the keyword is unknown. */
+static void
+do_mod_port(const struct settings *s UNUSED, int argc UNUSED, char *argv[])
+{
+    struct ofpbuf *request, *reply;
+    struct ofp_switch_features *osf;
+    struct ofp_port_mod *opm;
+    struct vconn *vconn;
+    char *endptr;
+    int n_ports;
+    int port_idx;
+    int port_no;
+
+    /* Check if the argument is a port index.  Otherwise, treat it as
+     * the port name. */
+    port_no = strtol(argv[2], &endptr, 10);
+    if (port_no == 0 && endptr == argv[2]) {
+        port_no = -1;
+    }
+
+    /* Send a "Features Request" to get the information we need in order
+     * to modify the port. */
+    make_openflow(sizeof(struct ofp_header), OFPT_FEATURES_REQUEST, &request);
+    open_vconn(argv[1], &vconn);
+    run(vconn_transact(vconn, request, &reply), "talking to %s", argv[1]);
+
+    /* 'reply->size' is unsigned, so a reply shorter than the fixed part of
+     * the features message would make the subtraction below wrap around and
+     * yield a bogus port count.  Reject such replies up front. */
+    if (reply->size < sizeof *osf) {
+        ovs_fatal(0, "features reply is too short (%zu bytes < %zu bytes)",
+                  reply->size, sizeof *osf);
+    }
+    osf = reply->data;
+    n_ports = (reply->size - sizeof *osf) / sizeof *osf->ports;
+
+    for (port_idx = 0; port_idx < n_ports; port_idx++) {
+        if (port_no != -1) {
+            /* Check argument as a port index */
+            if (osf->ports[port_idx].port_no == htons(port_no)) {
+                break;
+            }
+        } else {
+            /* Check argument as an interface name */
+            if (!strncmp((char *)osf->ports[port_idx].name, argv[2],
+                         sizeof osf->ports[0].name)) {
+                break;
+            }
+        }
+    }
+    if (port_idx == n_ports) {
+        ovs_fatal(0, "couldn't find monitored port: %s", argv[2]);
+    }
+
+    opm = make_openflow(sizeof(struct ofp_port_mod), OFPT_PORT_MOD, &request);
+    opm->port_no = osf->ports[port_idx].port_no;
+    memcpy(opm->hw_addr, osf->ports[port_idx].hw_addr, sizeof opm->hw_addr);
+    opm->config = htonl(0);
+    opm->mask = htonl(0);
+    opm->advertise = htonl(0);
+
+    printf("modifying port: %s\n", osf->ports[port_idx].name);
+
+    /* 'mask' selects the config bit to change; 'config' carries its new
+     * value, so "up" and "flood" leave the bit clear in 'config'.  The
+     * sizeof in each comparison includes the terminating null, so the whole
+     * keyword must match. */
+    if (!strncasecmp(argv[3], MOD_PORT_CMD_UP, sizeof MOD_PORT_CMD_UP)) {
+        opm->mask |= htonl(OFPPC_PORT_DOWN);
+    } else if (!strncasecmp(argv[3], MOD_PORT_CMD_DOWN,
+                            sizeof MOD_PORT_CMD_DOWN)) {
+        opm->mask |= htonl(OFPPC_PORT_DOWN);
+        opm->config |= htonl(OFPPC_PORT_DOWN);
+    } else if (!strncasecmp(argv[3], MOD_PORT_CMD_FLOOD,
+                            sizeof MOD_PORT_CMD_FLOOD)) {
+        opm->mask |= htonl(OFPPC_NO_FLOOD);
+    } else if (!strncasecmp(argv[3], MOD_PORT_CMD_NOFLOOD,
+                            sizeof MOD_PORT_CMD_NOFLOOD)) {
+        opm->mask |= htonl(OFPPC_NO_FLOOD);
+        opm->config |= htonl(OFPPC_NO_FLOOD);
+    } else {
+        ovs_fatal(0, "unknown mod-port command '%s'", argv[3]);
+    }
+
+    send_openflow_buffer(vconn, request);
+
+    /* 'osf' points into 'reply', so the reply is only freed once the port
+     * information is no longer needed. */
+    ofpbuf_delete(reply);
+    vconn_close(vconn);
+}
+
+/* Implements the "ping" command: sends ten OFPT_ECHO_REQUEST messages, each
+ * carrying a random payload, to the switch named by argv[1] and reports the
+ * round-trip time of each.  The payload size comes from argv[2] when given
+ * and defaults to 64 bytes; it must not exceed 65535 minus the OpenFlow
+ * header size.  A reply that differs from its request is reported by
+ * printing both messages. */
+static void
+do_ping(const struct settings *s UNUSED, int argc, char *argv[])
+{
+    size_t max_payload = 65535 - sizeof(struct ofp_header);
+    unsigned int payload;
+    struct vconn *vconn;
+    int i;
+
+    /* A negative argument converts to a huge unsigned value and is rejected
+     * by the range check. */
+    payload = argc > 2 ? atoi(argv[2]) : 64;
+    if (payload > max_payload) {
+        ovs_fatal(0, "payload must be between 0 and %zu bytes", max_payload);
+    }
+
+    open_vconn(argv[1], &vconn);
+    for (i = 0; i < 10; i++) {
+        struct timeval start, end;
+        struct ofpbuf *request, *reply;
+        struct ofp_header *rq_hdr, *rpy_hdr;
+
+        rq_hdr = make_openflow(sizeof(struct ofp_header) + payload,
+                               OFPT_ECHO_REQUEST, &request);
+        random_bytes(rq_hdr + 1, payload);
+
+        /* Send a clone so that 'request' stays available for comparison
+         * with the reply. */
+        gettimeofday(&start, NULL);
+        run(vconn_transact(vconn, ofpbuf_clone(request), &reply), "transact");
+        gettimeofday(&end, NULL);
+
+        rpy_hdr = reply->data;
+        if (reply->size != request->size
+            || memcmp(rpy_hdr + 1, rq_hdr + 1, payload)
+            || rpy_hdr->xid != rq_hdr->xid
+            || rpy_hdr->type != OFPT_ECHO_REPLY) {
+            /* ofp_print() takes the message bytes, not the ofpbuf that
+             * holds them. */
+            printf("Reply does not match request. Request:\n");
+            ofp_print(stdout, request->data, request->size, 2);
+            printf("Reply:\n");
+            ofp_print(stdout, reply->data, reply->size, 2);
+        }
+        /* The byte count is a size_t, hence %zu. */
+        printf("%zu bytes from %s: xid=%08"PRIx32" time=%.1f ms\n",
+               reply->size - sizeof *rpy_hdr, argv[1], rpy_hdr->xid,
+               (1000*(double)(end.tv_sec - start.tv_sec))
+               + (.001*(end.tv_usec - start.tv_usec)));
+        ofpbuf_delete(request);
+        ofpbuf_delete(reply);
+    }
+    vconn_close(vconn);
+}
+
+/* Implements the "benchmark" command: sends argv[3] OFPT_ECHO_REQUEST
+ * messages, each with argv[2] bytes of zeroed payload, to the switch named
+ * by argv[1], waiting for each reply before sending the next, then prints
+ * the elapsed time and the resulting packet and byte rates. */
+static void
+do_benchmark(const struct settings *s UNUSED, int argc UNUSED, char *argv[])
+{
+ size_t max_payload = 65535 - sizeof(struct ofp_header);
+ struct timeval start, end;
+ unsigned int payload_size, message_size;
+ struct vconn *vconn;
+ double duration;
+ int count;
+ int i;
+
+ payload_size = atoi(argv[2]);
+ if (payload_size > max_payload) {
+ ovs_fatal(0, "payload must be between 0 and %zu bytes", max_payload);
+ }
+ message_size = sizeof(struct ofp_header) + payload_size;
+
+ count = atoi(argv[3]);
+
+ printf("Sending %d packets * %u bytes (with header) = %u bytes total\n",
+ count, message_size, count * message_size);
+
+ open_vconn(argv[1], &vconn);
+ gettimeofday(&start, NULL);
+ for (i = 0; i < count; i++) {
+ struct ofpbuf *request, *reply;
+ struct ofp_header *rq_hdr;
+
+ rq_hdr = make_openflow(message_size, OFPT_ECHO_REQUEST, &request);
+ memset(rq_hdr + 1, 0, payload_size);
+ run(vconn_transact(vconn, request, &reply), "transact");
+ ofpbuf_delete(reply);
+ }
+ gettimeofday(&end, NULL);
+ vconn_close(vconn);
+
+ /* 'duration' is in milliseconds, so it is divided by 1000 to obtain
+ * per-second rates. */
+ duration = ((1000*(double)(end.tv_sec - start.tv_sec))
+ + (.001*(end.tv_usec - start.tv_usec)));
+ printf("Finished in %.1f ms (%.0f packets/s) (%.0f bytes/s)\n",
+ duration, count / (duration / 1000.0),
+ count * message_size / (duration / 1000.0));
+}
+
+/* Implements the "execute" command: sends an NXT_COMMAND_REQUEST vendor
+ * message to the switch named by argv[1].  Its body is argv[2] followed by
+ * each further argument, separated by single null bytes.  Replies with the
+ * same transaction ID are read until one reports something other than
+ * NXT_STATUS_STARTED; a description of that status goes to stderr and the
+ * data following the reply header goes to stdout.  Exits fatally on a short
+ * or malformed reply. */
+static void
+do_execute(const struct settings *s UNUSED, int argc, char *argv[])
+{
+    struct vconn *vconn;
+    struct ofpbuf *request;
+    struct nicira_header *nicira;
+    struct nx_command_reply *ncr;
+    uint32_t xid;
+    int i;
+
+    /* All fields reached through 'nicira' are set before the first
+     * ofpbuf_put(), since appending may move the buffer's data. */
+    nicira = make_openflow(sizeof *nicira, OFPT_VENDOR, &request);
+    xid = nicira->header.xid;
+    nicira->vendor = htonl(NX_VENDOR_ID);
+    nicira->subtype = htonl(NXT_COMMAND_REQUEST);
+    ofpbuf_put(request, argv[2], strlen(argv[2]));
+    for (i = 3; i < argc; i++) {
+        ofpbuf_put_zeros(request, 1);
+        ofpbuf_put(request, argv[i], strlen(argv[i]));
+    }
+    update_openflow_length(request);
+
+    open_vconn(argv[1], &vconn);
+    run(vconn_send_block(vconn, request), "send");
+
+    for (;;) {
+        struct ofpbuf *reply;
+        uint32_t status;
+
+        run(vconn_recv_xid(vconn, xid, &reply), "recv_xid");
+        if (reply->size < sizeof *ncr) {
+            ovs_fatal(0, "reply is too short (%zu bytes < %zu bytes)",
+                      reply->size, sizeof *ncr);
+        }
+        ncr = reply->data;
+        if (ncr->nxh.header.type != OFPT_VENDOR
+            || ncr->nxh.vendor != htonl(NX_VENDOR_ID)
+            || ncr->nxh.subtype != htonl(NXT_COMMAND_REPLY)) {
+            ovs_fatal(0, "reply is invalid");
+        }
+
+        status = ntohl(ncr->status);
+        if (status & NXT_STATUS_STARTED) {
+            /* Wait for a second reply.  This one is finished with, so free
+             * it rather than leaking it. */
+            ofpbuf_delete(reply);
+            continue;
+        } else if (status & NXT_STATUS_EXITED) {
+            fprintf(stderr, "process terminated normally with exit code %d",
+                    status & NXT_STATUS_EXITSTATUS);
+        } else if (status & NXT_STATUS_SIGNALED) {
+            fprintf(stderr, "process terminated by signal %d",
+                    status & NXT_STATUS_TERMSIG);
+        } else if (status & NXT_STATUS_ERROR) {
+            fprintf(stderr, "error executing command");
+        } else {
+            fprintf(stderr, "process terminated for unknown reason");
+        }
+        if (status & NXT_STATUS_COREDUMP) {
+            fprintf(stderr, " (core dumped)");
+        }
+        putc('\n', stderr);
+
+        /* Everything after the fixed reply header is written to stdout. */
+        fwrite(ncr + 1, reply->size - sizeof *ncr, 1, stdout);
+        ofpbuf_delete(reply);
+        break;
+    }
+    vconn_close(vconn);
+}
+
+/* Implements the "help" command by calling usage(); all arguments are
+ * ignored. */
+static void
+do_help(const struct settings *s UNUSED, int argc UNUSED, char *argv[] UNUSED)
+{
+ usage();
+}
+
+/* Table of commands, terminated by an entry with a null name.  Each entry
+ * gives the command name, two integer bounds and the handler function.
+ * NOTE(review): the bounds look like the minimum and maximum number of
+ * arguments following the command name (every handler that reads argv[1]
+ * has a lower bound of at least 1) -- confirm against the definition of
+ * struct command and its user. */
+static struct command all_commands[] = {
+ { "show", 1, 1, do_show },
+ { "status", 1, 2, do_status },
+ { "monitor", 1, 3, do_monitor },
+ { "dump-desc", 1, 1, do_dump_desc },
+ { "dump-tables", 1, 1, do_dump_tables },
+ { "dump-flows", 1, 2, do_dump_flows },
+ { "dump-aggregate", 1, 2, do_dump_aggregate },
+ { "add-flow", 2, 2, do_add_flow },
+ { "add-flows", 2, 2, do_add_flows },
+ { "mod-flows", 2, 2, do_mod_flows },
+ { "del-flows", 1, 2, do_del_flows },
+ { "dump-ports", 1, 1, do_dump_ports },
+ { "mod-port", 3, 3, do_mod_port },
+ { "probe", 1, 1, do_probe },
+ { "ping", 1, 2, do_ping },
+ { "benchmark", 3, 3, do_benchmark },
+ { "execute", 2, INT_MAX, do_execute },
+ { "help", 0, INT_MAX, do_help },
+ { NULL, 0, 0, NULL },
+};
diff --git a/utilities/ovs-parse-leaks.in b/utilities/ovs-parse-leaks.in
new file mode 100755
index 000000000..4e06847c2
--- /dev/null
+++ b/utilities/ovs-parse-leaks.in
@@ -0,0 +1,285 @@
+#! @PERL@
+
+use strict;
+use warnings;
+
+# Print a usage message and exit if --help appears anywhere on the command
+# line.
+if (grep($_ eq '--help', @ARGV)) {
+    print <<EOF;
+$0, for parsing leak checker logs
+usage: $0 [BINARY] < LOG
+where LOG is a file produced by an Open vSwitch program's --check-leaks option
+ and BINARY is the binary that wrote LOG.
+EOF
+    exit 0;
+}
+
+die "$0: zero or one arguments required; use --help for help\n" if @ARGV > 1;
+die "$0: $ARGV[0] does not exist" if @ARGV > 0 && ! -e $ARGV[0];
+
+# $binary stays undefined unless a binary was named and addr2line is
+# available; lookup_pc() then leaves addresses untranslated.
+our ($binary);
+our ($a2l) = search_path("addr2line");
+my ($no_syms) = "symbols will not be translated";
+if (!@ARGV) {
+    print "no binary specified; $no_syms\n";
+} elsif (! -e $ARGV[0]) {
+    # Each diagnostic ends with a newline so that it does not run into the
+    # output that follows.
+    print "$ARGV[0] does not exist; $no_syms\n";
+} elsif (!defined($a2l)) {
+    print "addr2line not found in PATH; $no_syms\n";
+} else {
+    $binary = $ARGV[0];
+}
+
+# objdump is needed only to translate addresses that addr2line cannot
+# resolve against the binary itself (see lookup_pc_by_segment).
+our ($objdump) = search_path("objdump");
+print "objdump not found; dynamic library symbols will not be translated\n"
+    if !defined($objdump);
+
+# %blocks maps the numeric base address of each block currently believed to
+# be allocated to a record with LINE, BASE, SIZE and CALLERS fields.
+# @segments holds the records built by add_segment().
+our %blocks;
+our @segments;
+
+# Read the leak checker log from stdin, one event per line, and replay it.
+while (<STDIN>) {
+    my $ptr = "((?:0x)?[0-9a-fA-F]+|\\(nil\\))";
+    my $callers = ":((?: $ptr)+)";
+    if (/^malloc\((\d+)\) -> $ptr$callers$/) {
+        allocated($., $2, $1, $3);
+    } elsif (/^claim\($ptr\)$callers$/) {
+        claimed($., $1, $2);
+    } elsif (/realloc\($ptr, (\d+)\) -> $ptr$callers$/) {
+        # A realloc is replayed as a free of the old block followed by an
+        # allocation of the new one.
+        my ($callers) = $4;
+        freed($., $1, $callers);
+        allocated($., $3, $2, $callers);
+    } elsif (/^free\($ptr\)$callers$/) {
+        freed($., $1, $2);
+    } elsif (/^segment: $ptr-$ptr $ptr [-r][-w][-x][sp] (.*)/) {
+        add_segment(hex($1), hex($2), hex($3), $4);
+    } else {
+        print "stdin:$.: syntax error\n";
+    }
+}
+
+# Whatever is left in %blocks was never freed.  Report it grouped by the
+# trimmed call stack, largest group first.
+if (%blocks) {
+    my $n_blocks = scalar(keys(%blocks));
+    my $n_bytes = 0;
+    $n_bytes += $_->{SIZE} foreach values(%blocks);
+    print "$n_bytes bytes in $n_blocks blocks not freed at end of run\n";
+    my %blocks_by_callers;
+    foreach my $block (values(%blocks)) {
+        my ($trimmed_callers) = trim_callers($block->{CALLERS});
+        push (@{$blocks_by_callers{$trimmed_callers}}, $block);
+    }
+    foreach my $callers (sort {@{$b} <=> @{$a}} (values(%blocks_by_callers))) {
+        $n_blocks = scalar(@{$callers});
+        $n_bytes = 0;
+        $n_bytes += $_->{SIZE} foreach @{$callers};
+        print "$n_bytes bytes in these $n_blocks blocks were not freed:\n";
+        my $i = 0;
+        my $max = 5;
+        foreach my $block (sort {$a->{LINE} <=> $b->{LINE}} (@{$callers})) {
+            # List at most $max blocks, so that the "...and N others" count
+            # printed below matches what was actually left out.
+            last if $i++ >= $max;
+            printf "\t%d-byte block at 0x%08x allocated on stdin:%d\n",
+                $block->{SIZE}, $block->{BASE}, $block->{LINE};
+        }
+        print "\t...and ", $n_blocks - $max, " others...\n"
+            if $n_blocks > $max;
+        print "The blocks listed above were allocated by:\n";
+        print_callers("\t", ${$callers}[0]->{CALLERS});
+    }
+}
+# Converts a pointer as written in the log to a number: the literal string
+# '(nil)' becomes 0 and anything else is interpreted as hexadecimal.
+sub interp_pointer {
+    my $text = shift;
+    return 0 if $text eq '(nil)';
+    return hex($text);
+}
+
+# Records an allocation of $size bytes at address $s_base, logged on input
+# line $line with call stack $callers.  Null addresses are ignored.  If the
+# address is already recorded as in use, both the old and the new allocation
+# are printed before the new one replaces the old.
+sub allocated {
+    my ($line, $s_base, $size, $callers) = @_;
+    my $addr = interp_pointer($s_base);
+    return unless $addr;
+
+    my $record = {
+        LINE    => $line,
+        BASE    => $addr,
+        SIZE    => $size,
+        CALLERS => $callers,
+    };
+    if (exists $blocks{$addr}) {
+        print "In-use address returned by allocator:\n";
+        print "\tInitial allocation:\n";
+        print_block("\t\t", $blocks{$addr});
+        print "\tNew allocation:\n";
+        print_block("\t\t", $record);
+    }
+    $blocks{$addr} = $record;
+}
+
+# Handles a "claim" event for the block at $s_base: the recorded line number
+# and call stack of the block are replaced by those of the claim.  Null
+# addresses are ignored; a claim on an address that is not recorded as
+# allocated is reported along with its call stack.
+sub claimed {
+    my ($line, $s_base, $callers) = @_;
+    my $addr = interp_pointer($s_base);
+    return unless $addr;
+
+    unless (exists $blocks{$addr}) {
+        printf "Claim asserted on not-in-use block 0x%08x by:\n", $addr;
+        print_callers('', $callers);
+        return;
+    }
+    my $record = $blocks{$addr};
+    $record->{LINE} = $line;
+    $record->{CALLERS} = $callers;
+}
+
+# Handles a "free" of the block at $s_base by forgetting it.  Null addresses
+# are ignored; freeing an address that is not recorded as allocated is
+# reported along with the input line number and call stack.
+sub freed {
+    my ($line, $s_base, $callers) = @_;
+    my $addr = interp_pointer($s_base);
+    return unless $addr;
+
+    if (exists $blocks{$addr}) {
+        delete $blocks{$addr};
+        return;
+    }
+    printf "Bad free of not-allocated address 0x%08x on stdin:%d by:\n", $addr, $line;
+    print_callers('', $callers);
+}
+
+# Prints a one-line description of the block record $info (its SIZE, BASE and
+# LINE fields) followed by its call stack, each line starting with $prefix.
+sub print_block {
+ my ($prefix, $info) = @_;
+ printf '%s%d-byte block at 0x%08x allocated on stdin:%d by:' . "\n",
+ $prefix, $info->{SIZE}, $info->{BASE}, $info->{LINE};
+ print_callers($prefix, $info->{CALLERS});
+}
+
+# Prints the whitespace-separated addresses in $callers one per line, each
+# translated by lookup_pc() and preceded by $prefix and a tab.
+sub print_callers {
+    my ($prefix, $callers) = @_;
+    print "$prefix\t", lookup_pc($_), "\n" for split(' ', $callers);
+}
+
+# Cache of translated addresses, keyed by numeric address.
+our (%cache);
+
+# Translates the hexadecimal address $s_pc into "ADDRESS: FUNCTION (FILE:LINE)"
+# by running addr2line on $binary, falling back to lookup_pc_by_segment()
+# when addr2line cannot name the function.  Returns $s_pc unchanged when no
+# binary is in use.  Results are cached.
+sub lookup_pc {
+    my ($s_pc) = @_;
+    return "$s_pc" if !defined($binary);
+
+    my ($pc) = hex($s_pc);
+    if (!exists($cache{$pc})) {
+        my ($function, $line);
+        # The list form of open runs addr2line directly, without a shell, so
+        # a binary path containing spaces or shell metacharacters is safe.
+        if (open(my $a2l_out, '-|', $a2l, '-fe', $binary, '--demangle', $s_pc)) {
+            $function = <$a2l_out>;
+            $line = <$a2l_out>;
+            close($a2l_out);
+        }
+        # addr2line may have failed to start or produced too little output.
+        $function = '??' if !defined($function);
+        $line = '' if !defined($line);
+        chomp($function, $line);
+
+        if ($function eq '??') {
+            ($function, $line) = lookup_pc_by_segment($pc);
+            $function = '??' if !defined($function);
+            $line = 0 if !defined($line);
+        }
+        # Strip leading "../" components and keep only the tail of long
+        # paths.
+        $line =~ s/^(\.\.\/)*//;
+        $line = "..." . substr($line, -25) if length($line) > 28;
+        $cache{$pc} = "$s_pc: $function ($line)";
+    }
+    return $cache{$pc};
+}
+
+# Returns the call stack $in with every run of addresses whose translation
+# contains "??" collapsed into a single "..." entry.  Other addresses are
+# kept as they are.
+sub trim_callers {
+    my ($in) = @_;
+    my @kept;
+    for my $pc (split(' ', $in)) {
+        if (lookup_pc($pc) !~ /\?\?/) {
+            push(@kept, $pc);
+        } elsif (!@kept || $kept[-1] ne '...') {
+            push(@kept, "...");
+        }
+    }
+    return join(' ', @kept);
+}
+
+# Searches the directories listed in the PATH environment variable for an
+# executable regular file named $target.  Returns the first match as
+# "DIR/TARGET", or undef if there is none or PATH is not set.
+sub search_path {
+    my ($target) = @_;
+    my $path = defined($ENV{PATH}) ? $ENV{PATH} : '';
+    for my $dir (split (':', $path)) {
+        # An empty PATH element means the current directory, not "/".
+        my ($file) = ($dir eq '' ? '.' : $dir) . "/$target";
+        # Skip directories and files that cannot be executed.
+        return $file if -f $file && -x _;
+    }
+    return undef;
+}
+
+# Adds the mapping of addresses [$vm_start, $vm_end) to $file, where
+# $vm_start corresponds to file offset $vm_pgoff, to @segments.  Any part of
+# an existing segment that the new one overlaps is removed first, so that
+# @segments stays a sorted list of non-overlapping ranges as binary_search()
+# requires.
+sub add_segment {
+    my ($vm_start, $vm_end, $vm_pgoff, $file) = @_;
+    for (my $i = 0; $i <= $#segments; $i++) {
+        my ($s) = $segments[$i];
+        next if $vm_end <= $s->{START} || $vm_start >= $s->{END};
+        if ($vm_start <= $s->{START} && $vm_end >= $s->{END}) {
+            # The old segment is covered completely: drop it.
+            splice(@segments, $i, 1);
+            --$i;
+        } elsif ($vm_start <= $s->{START}) {
+            # The head of the old segment is covered.  PGOFF is the file
+            # offset of START, so it moves by the same amount.
+            $s->{PGOFF} += $vm_end - $s->{START};
+            $s->{START} = $vm_end;
+        } elsif ($vm_end >= $s->{END}) {
+            # The tail of the old segment is covered.
+            $s->{END} = $vm_start;
+        } else {
+            # The new segment lies strictly inside the old one: keep the
+            # part before it and insert a record for the part after it.
+            splice(@segments, $i + 1, 0,
+                   {START => $vm_end,
+                    END => $s->{END},
+                    PGOFF => $s->{PGOFF} + ($vm_end - $s->{START}),
+                    FILE => $s->{FILE}});
+            $s->{END} = $vm_start;
+            ++$i;
+        }
+    }
+    push(@segments, {START => $vm_start,
+                     END => $vm_end,
+                     PGOFF => $vm_pgoff,
+                     FILE => $file});
+    @segments = sort { $a->{START} <=> $b->{START} } @segments;
+}
+
+# Searches $array, a reference to an array of records sorted by START, for
+# the record whose half-open range [START, END) contains $value.  Returns
+# that record, or undef if no range contains $value.
+sub binary_search {
+    my ($array, $value) = @_;
+    my ($lo, $hi) = (0, $#{$array});
+    until ($lo > $hi) {
+        my $mid = int(($lo + $hi) / 2);
+        my $entry = $array->[$mid];
+        return $entry
+            if $value >= $entry->{START} && $value < $entry->{END};
+        if ($value < $entry->{START}) {
+            $hi = $mid - 1;
+        } else {
+            $lo = $mid + 1;
+        }
+    }
+    return undef;
+}
+
+# Runs "objdump -h" on $file and returns a reference to an array of section
+# records sorted by START.  Each record has NAME plus START and END, which
+# are taken from the section's file offset and size columns (so the range is
+# in file offsets, not addresses).  Output lines that do not match the
+# section-row pattern are skipped.
+sub read_sections {
+ my ($file) = @_;
+ my (@sections);
+ open(OBJDUMP, "$objdump -h $file|");
+ while (<OBJDUMP>) {
+ my $ptr = "([0-9a-fA-F]+)";
+ my ($name, $size, $vma, $lma, $file_off)
+ = /^\s*\d+\s+(\S+)\s+$ptr\s+$ptr\s+$ptr\s+$ptr/
+ or next;
+ push(@sections, {START => hex($file_off),
+ END => hex($file_off) + hex($size),
+ NAME => $name});
+ }
+ close(OBJDUMP);
+ return [sort { $a->{START} <=> $b->{START} } @sections ];
+}
+
+# Cache of read_sections() results, keyed by file name.
+our %file_to_sections;
+
+# Returns the section record of $file whose file-offset range contains
+# $file_offset, or undef if there is none.  The section list of each file is
+# read on first use and cached.
+sub segment_to_section {
+    my ($file, $file_offset) = @_;
+    $file_to_sections{$file} = read_sections($file)
+        unless defined($file_to_sections{$file});
+    return binary_search($file_to_sections{$file}, $file_offset);
+}
+
+# Returns the record in @segments whose address range contains $pc, or undef
+# if there is none.
+sub address_to_segment {
+ my ($pc) = @_;
+ return binary_search(\@segments, $pc);
+}
+
+# Translates the numeric address $pc through the segment that contains it:
+# the address is converted to an offset within the segment's file, then to an
+# offset within the section of that file containing it, and addr2line is run
+# on the file with that section and offset.  Returns (FUNCTION, LINE) as
+# printed by addr2line, or ('??', 0) if objdump is unavailable or no named
+# segment or section contains the address.
+sub lookup_pc_by_segment {
+ return ('??', 0) if !defined($objdump);
+
+ my ($pc) = @_;
+ my ($segment) = address_to_segment($pc);
+ return ('??', 0) if !defined($segment) || $segment->{FILE} eq '';
+
+ # PGOFF is the file offset that corresponds to the segment's START.
+ my ($file_offset) = $pc - $segment->{START} + $segment->{PGOFF};
+ my ($section) = segment_to_section($segment->{FILE}, $file_offset);
+ return ('??', 0) if !defined($section);
+
+ my ($section_offset) = $file_offset - $section->{START};
+ open(A2L, sprintf("%s -fe %s --demangle --section=$section->{NAME} 0x%x|",
+ $a2l, $segment->{FILE}, $section_offset));
+ chomp(my $function = <A2L>);
+ chomp(my $line = <A2L>);
+ close(A2L);
+
+ return ($function, $line);
+}
+
+# Local Variables:
+# mode: perl
+# End:
diff --git a/utilities/ovs-pki-cgi.in b/utilities/ovs-pki-cgi.in
new file mode 100755
index 000000000..837b3f92b
--- /dev/null
+++ b/utilities/ovs-pki-cgi.in
@@ -0,0 +1,41 @@
+#! @PERL@
+
+use CGI;
+use Digest::SHA1;
+use Fcntl;
+
+# CGI script that accepts a certificate request by HTTP POST and stores it in
+# the "incoming" directory of the switch or controller CA under $pkidir.
+# Any validation failure ends the script through die.
+
+$CGI::POST_MAX = 65536; # Limit POSTs to 64 kB.
+
+use strict;
+use warnings;
+
+my $pkidir = '@PKIDIR@';
+my $q = new CGI;
+
+die unless $q->request_method() eq 'POST';
+
+# 'type' selects the CA.  Only these two values are accepted, which also
+# keeps the directory name built from it below under control.
+my $type = $q->param('type');
+die unless defined $type;
+die unless $type eq 'switch' or $type eq 'controller';
+
+# 'req' must contain both PEM certificate request delimiter lines.
+my $req = $q->param('req');
+die unless defined $req;
+die unless $req =~ /^-----BEGIN CERTIFICATE REQUEST-----$/m;
+die unless $req =~ /^-----END CERTIFICATE REQUEST-----$/m;
+
+# The file is named after the SHA-1 digest of the request text.
+my $digest = Digest::SHA1::sha1_hex($req);
+my $incoming = "$pkidir/${type}ca/incoming";
+my $dst = "$incoming/$digest-req.pem";
+
+# Write to a temporary name that must not already exist (O_EXCL), then
+# rename it into place so the final name only ever refers to a complete file.
+sysopen(REQUEST, "$dst.tmp", O_RDWR | O_CREAT | O_EXCL, 0600)
+ or die "sysopen $dst.tmp: $!";
+print REQUEST $req;
+close(REQUEST) or die "close $dst.tmp: $!";
+
+rename("$dst.tmp", $dst) or die "rename $dst.tmp to $dst: $!";
+
+print $q->header('text/html', '204 No response');
+
+# Local Variables:
+# mode: perl
+# End:
diff --git a/utilities/ovs-pki.8.in b/utilities/ovs-pki.8.in
new file mode 100644
index 000000000..35bf0ea41
--- /dev/null
+++ b/utilities/ovs-pki.8.in
@@ -0,0 +1,323 @@
+.TH ovs\-pki 8 "May 2008" "Open vSwitch" "Open vSwitch Manual"
+
+.SH NAME
+ovs\-pki \- OpenFlow public key infrastructure management utility
+
+.SH SYNOPSIS
+\fBovs\-pki\fR [\fIOPTIONS\fR] \fICOMMAND\fR [\fIARGS\fR]
+.sp
+Stand\-alone commands with their arguments:
+.br
+\fBovs\-pki\fR \fBinit\fR
+.br
+\fBovs\-pki\fR \fBreq\fR \fINAME\fR
+.br
+\fBovs\-pki\fR \fBsign\fR \fINAME\fR [\fITYPE\fR]
+.br
+\fBovs\-pki\fR \fBreq+sign\fR \fINAME\fR [\fITYPE\fR]
+.br
+\fBovs\-pki\fR \fBverify\fR \fINAME\fR [\fITYPE\fR]
+.br
+\fBovs\-pki\fR \fBfingerprint\fR \fIFILE\fR
+.br
+\fBovs\-pki\fR \fBself\-sign\fR \fINAME\fR
+.sp
+The following additional commands manage an online PKI:
+.br
+\fBovs\-pki\fR \fBls\fR [\fIPREFIX\fR] [\fITYPE\fR]
+.br
+\fBovs\-pki\fR \fBflush\fR [\fITYPE\fR]
+.br
+\fBovs\-pki\fR \fBreject\fR \fIPREFIX\fR [\fITYPE\fR]
+.br
+\fBovs\-pki\fR \fBapprove\fR \fIPREFIX\fR [\fITYPE\fR]
+.br
+\fBovs\-pki\fR \fBprompt\fR [\fITYPE\fR]
+.br
+\fBovs\-pki\fR \fBexpire\fR [\fIAGE\fR]
+.sp
+Each \fITYPE\fR above is a certificate type, either \fBswitch\fR
+(default) or \fBcontroller\fR.
+.sp
+The available options are:
+.br
+[\fB\-k\fR \fItype\fR | \fB\-\^\-key=\fItype\fR]
+[\fB\-B\fR \fInbits\fR | \fB\-\^\-bits=\fInbits\fR]
+[\fB\-D\fR \fIfile\fR | \fB\-\^\-dsaparam=\fIfile\fR]
+[\fB\-b\fR | \fB\-\^\-batch\fR]
+[\fB\-f\fR | \fB\-\^\-force\fR]
+[\fB\-d\fR \fIdir\fR | \fB\-\^\-dir=\fR\fIdir\fR]
+[\fB\-l\fR \fIfile\fR | \fB\-\^\-log=\fIfile\fR]
+[\fB\-h\fR | \fB\-\^\-help\fR]
+.br
+Some options do not apply to every command.
+
+.SH DESCRIPTION
+The \fBovs\-pki\fR program sets up and manages a public key
+infrastructure for use with OpenFlow. It is intended to be a simple
+interface for organizations that do not have an established public key
+infrastructure. Other PKI tools can substitute for or supplement the
+use of \fBovs\-pki\fR.
+
+\fBovs\-pki\fR uses \fBopenssl\fR(1) for certificate management and key
+generation.
+
+.SH "OFFLINE COMMANDS"
+
+The following \fBovs\-pki\fR commands support manual PKI
+administration:
+
+.TP
+\fBinit\fR
+Initializes a new PKI (by default in directory \fB@PKIDIR@\fR) and populates
+it with a pair of certificate authorities for controllers and
+switches.
+
+This command should ideally be run on a high\-security machine separate
+from any OpenFlow controller or switch, called the CA machine. The
+files \fBpki/controllerca/cacert.pem\fR and
+\fBpki/switchca/cacert.pem\fR that it produces will need to be copied
+over to the OpenFlow switches and controllers, respectively. Their
+contents may safely be made public.
+
+By default, \fBovs\-pki\fR generates 2048\-bit RSA keys. The \fB\-B\fR
+or \fB\-\^\-bits\fR option (see below) may be used to override the key
+length. The \fB\-k dsa\fR or \fB\-\^\-key=dsa\fR option may be used to use
+DSA in place of RSA. If DSA is selected, the \fBdsaparam.pem\fR file
+generated in the new PKI hierarchy must be copied to any machine on
+which the \fBreq\fR command (see below) will be executed. Its
+contents may safely be made public.
+
+Other files generated by \fBinit\fR may remain on the CA machine.
+The files \fBpki/controllerca/private/cakey.pem\fR and
+\fBpki/switchca/private/cakey.pem\fR have particularly sensitive
+contents that should not be exposed.
+
+.TP
+\fBreq\fR \fINAME\fR
+Generates a new private key named \fINAME\fR\fB\-privkey.pem\fR and
+corresponding certificate request named \fINAME\fR\fB\-req.pem\fR.
+The private key can be intended for use by a switch or a controller.
+
+This command should ideally be run on the switch or controller that
+will use the private key to identify itself. The file
+\fINAME\fR\fB\-req.pem\fR must be copied to the CA machine for signing
+with the \fBsign\fR command (below).
+
+This command will output a fingerprint to stdout as its final step.
+Write down the fingerprint and take it to the CA machine before
+continuing with the \fBsign\fR step.
+
+When RSA keys are in use (as is the default), \fBreq\fR, unlike the
+rest of \fBovs\-pki\fR's commands, does not need access to a PKI
+hierarchy created by \fBovs\-pki init\fR. The \fB\-B\fR or
+\fB\-\^\-bits\fR option (see below) may be used to specify the number of
+bits in the generated RSA key.
+
+When DSA keys are used (as specified with \fB\-\^\-key=dsa\fR), \fBreq\fR
+needs access to the \fBdsaparam.pem\fR file created as part of the PKI
+hierarchy (but not to other files in that tree). By default,
+\fBovs\-pki\fR looks for this file in \fB@PKIDIR@/dsaparam.pem\fR, but
+the \fB\-D\fR or \fB\-\^\-dsaparam\fR option (see below) may be used to
+specify an alternate location.
+
+\fINAME\fR\fB\-privkey.pem\fR has sensitive contents that should not be
+exposed. \fINAME\fR\fB\-req.pem\fR may be safely made public.
+
+.TP
+\fBsign\fR \fINAME\fR [\fITYPE\fR]
+Signs the certificate request named \fINAME\fR\fB\-req.pem\fR that was
+produced in the previous step, producing a certificate named
+\fINAME\fR\fB\-cert.pem\fR. \fITYPE\fR, either \fBswitch\fR (default) or
+\fBcontroller\fR, indicates the use for which the key is being
+certified.
+
+This command must be run on the CA machine.
+
+The command will output a fingerprint to stdout and request that you
+verify that it is the same fingerprint output by the \fBreq\fR
+command. This ensures that the request being signed is the same one
+produced by \fBreq\fR. (The \fB\-b\fR or \fB\-\^\-batch\fR option
+suppresses the verification step.)
+
+The file \fINAME\fR\fB\-cert.pem\fR will need to be copied back to the
+switch or controller for which it is intended. Its contents may
+safely be made public.
+
+.TP
+\fBreq+sign\fR \fINAME\fR [\fITYPE\fR]
+Combines the \fBreq\fR and \fBsign\fR commands into a single step,
+outputting all the files produced by each. The
+\fINAME\fR\fB\-privkey.pem\fR and \fINAME\fR\fB\-cert.pem\fR files must
+be copied securely to the switch or controller.
+\fINAME\fR\fB\-privkey.pem\fR has sensitive contents and must not be
+exposed in transit. Afterward, it should be deleted from the CA
+machine.
+
+This combined method is, theoretically, less secure than the
+individual steps performed separately on two different machines,
+because there is additional potential for exposure of the private
+key. However, it is also more convenient.
+
+.TP
+\fBverify\fR \fINAME\fR [\fITYPE\fR]
+Verifies that \fINAME\fR\fB\-cert.pem\fR is a valid certificate for the
+given \fITYPE\fR of use, either \fBswitch\fR (default) or
+\fBcontroller\fR. If the certificate is valid for this use, it prints
+the message ``\fINAME\fR\fB\-cert.pem\fR: OK''; otherwise, it prints an
+error message.
+
+.TP
+\fBfingerprint\fR \fIFILE\fR
+Prints the fingerprint for \fIFILE\fR. If \fIFILE\fR is a
+certificate, then this is the SHA\-1 digest of the DER encoded version
+of the certificate; otherwise, it is the SHA\-1 digest of the entire
+file.
+
+.TP
+\fBself-sign\fR \fINAME\fR
+Signs the certificate request named \fINAME\fB\-req.pem\fR using the
+private key \fINAME\fB\-privkey.pem\fR, producing a self\-signed
+certificate named \fINAME\fB\-cert.pem\fR. The input files should have
+been produced with \fBovs\-pki req\fR.
+
+Some controllers accept such self-signed certificates.
+
+.SH "ONLINE COMMANDS"
+
+An OpenFlow PKI can be administered online, in conjunction with
+.BR ovs\-pki\-cgi (8)
+and a web server such as Apache:
+
+.IP \(bu
+The web server exports the contents of the PKI via HTTP. All files in
+a PKI hierarchy may be made public, except for the files
+\fBpki/controllerca/private/cakey.pem\fR and
+\fBpki/switchca/private/cakey.pem\fR, which must not be exposed.
+
+.IP \(bu
+\fBovs\-pki\-cgi\fR allows newly generated certificate requests for
+controllers and switches to be uploaded into the
+\fBpki/controllerca/incoming\fR and \fBpki/switchca/incoming\fR
+directories, respectively. Uploaded certificate requests are stored
+in those directories under names of the form
+\fIFINGERPRINT\fB\-req.pem\fR, where \fIFINGERPRINT\fR is the SHA\-1
+hash of the file.
+
+.IP \(bu
+These \fBovs\-pki\fR commands allow incoming certificate requests to
+be approved or rejected, in a form that is suitable for use by humans
+or other software.
+
+.PP
+The following \fBovs\-pki\fR commands support online administration:
+
+.TP
+\fBovs\-pki\fR \fBls\fR [\fIPREFIX\fR] [\fITYPE\fR]
+Lists all of the incoming certificate requests of the given \fITYPE\fR
+(either \fBswitch\fR, the default, or \fBcontroller\fR). If
+\fIPREFIX\fR, which must be at least 4 characters long, is specified,
+it causes the list to be limited to files whose names begin with
+\fIPREFIX\fR. This is useful, for example, to avoid typing in an
+entire fingerprint when checking that a specific certificate request
+has been received.
+
+.TP
+\fBovs\-pki\fR \fBflush\fR [\fITYPE\fR]
+Deletes all certificate requests of the given \fITYPE\fR.
+
+.TP
+\fBovs\-pki\fR \fBreject\fR \fIPREFIX\fR [\fITYPE\fR]
+Rejects the certificate request whose name begins with \fIPREFIX\fR,
+which must be at least 4 characters long, of the given type (either
+\fBswitch\fR, the default, or \fBcontroller\fR). \fIPREFIX\fR must
+match exactly one certificate request; its purpose is to allow the
+user to type fewer characters, not to match multiple certificate
+requests.
+
+.TP
+\fBovs\-pki\fR \fBapprove\fR \fIPREFIX\fR [\fITYPE\fR]
+Approves the certificate request whose name begins with \fIPREFIX\fR,
+which must be at least 4 characters long, of the given \fITYPE\fR
+(either \fBswitch\fR, the default, or \fBcontroller\fR). \fIPREFIX\fR
+must match exactly one certificate request; its purpose is to allow
+the user to type fewer characters, not to match multiple certificate
+requests.
+
+The command will output a fingerprint to stdout and request that you
+verify that it is correct. (The \fB\-b\fR or \fB\-\^\-batch\fR option
+suppresses the verification step.)
+
+.TP
+\fBovs\-pki\fR \fBprompt\fR [\fITYPE\fR]
+Prompts the user for each incoming certificate request of the given
+\fITYPE\fR (either \fBswitch\fR, the default, or \fBcontroller\fR).
+Based on the certificate request's fingerprint, the user is given the
+option of approving, rejecting, or skipping the certificate request.
+
+.TP
+\fBovs\-pki\fR \fBexpire\fR [\fIAGE\fR]
+
+Rejects all the incoming certificate requests, of either type, that are
+older than \fIAGE\fR, which must be in one of the forms \fIN\fBs\fR,
+\fIN\fBmin\fR, \fIN\fBh\fR, \fIN\fBday\fR. The default is \fB1day\fR.
+
+.SH OPTIONS
+.TP
+\fB\-k\fR \fItype\fR | \fB\-\^\-key=\fItype\fR
+For the \fBinit\fR command, sets the public key algorithm to use for
+the new PKI hierarchy. For the \fBreq\fR and \fBreq+sign\fR commands,
+sets the public key algorithm to use for the key to be generated,
+which must match the value specified on \fBinit\fR. With other
+commands, the value has no effect.
+
+The \fItype\fR may be \fBrsa\fR (the default) or \fBdsa\fR.
+
+.TP
+\fB\-B\fR \fInbits\fR | \fB\-\^\-bits=\fInbits\fR
+Sets the number of bits in the key to be generated. When RSA keys are
+in use, this option affects only the \fBinit\fR, \fBreq\fR, and
+\fBreq+sign\fR commands, and the same value should be given each time.
+When DSA keys are in use, this option affects only the \fBinit\fR
+command.
+
+The value must be at least 1024. The default is 2048.
+
+.TP
+\fB\-D\fR \fIfile\fR | \fB\-\^\-dsaparam=\fIfile\fR
+Specifies an alternate location for the \fBdsaparam.pem\fR file
+required by the \fBreq\fR and \fBreq+sign\fR commands. This option
+affects only these commands, and only when DSA keys are used.
+
+The default is \fBdsaparam.pem\fR under the PKI hierarchy.
+
+.TP
+\fB\-b\fR | \fB\-\^\-batch\fR
+Suppresses the interactive verification of fingerprints that the
+\fBsign\fR and \fBapprove\fR commands by default require.
+
+.TP
+\fB\-d\fR \fIdir\fR | \fB\-\^\-dir=\fR\fIdir\fR
+Specifies the location of the PKI hierarchy to be used or created by
+the command (default: \fB@PKIDIR@\fR). All commands, except \fBreq\fR,
+need access to a PKI hierarchy.
+
+.TP
+\fB\-f\fR | \fB\-\^\-force\fR
+By default, \fBovs\-pki\fR will not overwrite existing files or
+directories. This option overrides this behavior.
+
+.TP
+\fB\-l\fR \fIfile\fR | \fB\-\^\-log=\fIfile\fR
+Sets the log file to \fIfile\fR. Default:
+\fB@LOGDIR@/ovs\-pki.log\fR.
+
+.TP
+\fB\-h\fR | \fB\-\^\-help\fR
+Prints a help usage message and exits.
+
+.SH "SEE ALSO"
+
+.BR controller (8),
+.BR ovs\-pki\-cgi (8),
+.BR secchan (8)
diff --git a/utilities/ovs-pki.in b/utilities/ovs-pki.in
new file mode 100755
index 000000000..15ac17b92
--- /dev/null
+++ b/utilities/ovs-pki.in
@@ -0,0 +1,582 @@
+#! /bin/sh
+
+set -e
+
+# Defaults; each may be overridden by a command-line option below.
+pkidir='@PKIDIR@'
+command=
+prev=
+force=no
+batch=no
+log='@LOGDIR@/ovs-pki.log'
+keytype=rsa
+bits=2048
+# Start from a clean slate so that same-named environment variables cannot
+# leak into option parsing.  arg1 and arg2 must be *unset*, not merely empty,
+# because they are tested with ${arg1+set} and ${arg1-default} expansions.
+dashdash=
+dsaparam=
+unset arg1 arg2
+for option; do
+    # This option-parsing mechanism borrowed from an Autoconf-generated
+    # configure script under the following license:
+
+    # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
+    # 2002, 2003, 2004, 2005, 2006, 2009 Free Software Foundation, Inc.
+    # This configure script is free software; the Free Software Foundation
+    # gives unlimited permission to copy, distribute and modify it.
+
+    # If the previous option needs an argument, assign it.
+    if test -n "$prev"; then
+        eval $prev=\$option
+        prev=
+        continue
+    fi
+    case $option in
+        *=*) optarg=`expr "X$option" : '[^=]*=\(.*\)'` ;;
+        *) optarg=yes ;;
+    esac
+
+    # Once a bare "--" has been seen, $dashdash is "yes", so the selector
+    # below no longer starts with "-" and every later word falls through to
+    # the final "*)" arm as a command or argument.
+    case $dashdash$option in
+        --)
+            dashdash=yes ;;
+        -h|--help)
+            cat <<EOF
+ovs-pki, for managing a simple OpenFlow public key infrastructure
+usage: $0 [OPTION...] COMMAND [ARG...]
+
+The valid stand-alone commands and their arguments are:
+ init Initialize the PKI
+ req NAME Create new private key and certificate request
+ named NAME-privkey.pem and NAME-req.pem, resp.
+ sign NAME [TYPE] Sign switch certificate request NAME-req.pem,
+ producing certificate NAME-cert.pem
+ req+sign NAME [TYPE] Combine the above two steps, producing all three files.
+ verify NAME [TYPE] Checks that NAME-cert.pem is a valid TYPE certificate
+ fingerprint FILE Prints the fingerprint for FILE
+ self-sign NAME Sign NAME-req.pem with NAME-privkey.pem,
+ producing self-signed certificate NAME-cert.pem
+
+The following additional commands manage an online PKI:
+ ls [PREFIX] [TYPE] Lists incoming requests of the given TYPE, optionally
+ limited to those whose fingerprint begins with PREFIX
+ flush [TYPE] Rejects all incoming requests of the given TYPE
+ reject PREFIX [TYPE] Rejects the incoming request(s) whose fingerprint begins
+ with PREFIX and has the given TYPE
+ approve PREFIX [TYPE] Approves the incoming request whose fingerprint begins
+ with PREFIX and has the given TYPE
+ expire [AGE] Rejects all incoming requests older than AGE, in
+ one of the forms Ns, Nmin, Nh, Nday (default: 1day)
+ prompt [TYPE] Interactively prompts to accept or reject each incoming
+ request of the given TYPE
+
+Each TYPE above is a certificate type: 'switch' (default) or 'controller'.
+
+Options for 'init', 'req', and 'req+sign' only:
+ -k, --key=rsa|dsa Type of keys to use (default: rsa)
+ -B, --bits=NBITS Number of bits in keys (default: 2048). For DSA keys,
+ this has an effect only on 'init'.
+ -D, --dsaparam=FILE File with DSA parameters (DSA only)
+ (default: dsaparam.pem within PKI directory)
+Options for use with the 'sign' and 'approve' commands:
+ -b, --batch Skip fingerprint verification
+Options that apply to any command:
+ -d, --dir=DIR Directory where the PKI is located
+ (default: $pkidir)
+ -f, --force Continue even if file or directory already exists
+ -l, --log=FILE Log openssl output to FILE (default: ovs-pki.log)
+ -h, --help Print this usage message.
+EOF
+            exit 0
+            ;;
+        --di*=*)
+            pkidir=$optarg
+            ;;
+        --di*|-d)
+            prev=pkidir
+            ;;
+        --k*=*)
+            keytype=$optarg
+            ;;
+        --k*|-k)
+            prev=keytype
+            ;;
+        --bi*=*)
+            bits=$optarg
+            ;;
+        --bi*|-B)
+            prev=bits
+            ;;
+        --ds*=*)
+            dsaparam=$optarg
+            ;;
+        --ds*|-D)
+            prev=dsaparam
+            ;;
+        --l*=*)
+            log=$optarg
+            ;;
+        --l*|-l)
+            prev=log
+            ;;
+        --force|-f)
+            force=yes
+            ;;
+        --ba*|-b)
+            batch=yes
+            ;;
+        -*)
+            echo "unrecognized option $option" >&2
+            exit 1
+            ;;
+        *)
+            # Non-option words: the first is the command, then up to two
+            # arguments.
+            if test -z "$command"; then
+                command=$option
+            elif test -z "${arg1+set}"; then
+                arg1=$option
+            elif test -z "${arg2+set}"; then
+                arg2=$option
+            else
+                echo "$option: only two arguments may be specified" >&2
+                exit 1
+            fi
+            ;;
+    esac
+    shift
+done
+# A trailing option that takes an argument was given without one.  $prev
+# holds the name of the variable that was waiting for a value, so map it
+# back to the option the user actually typed.
+if test -n "$prev"; then
+    case $prev in
+        pkidir) option=--dir ;;
+        keytype) option=--key ;;
+        *) option=--$prev ;;
+    esac
+    echo "$0: error: missing argument to $option" >&2
+    exit 1
+fi
+if test -z "$command"; then
+    echo "$0: missing command name; use --help for help" >&2
+    exit 1
+fi
+if test "$keytype" != rsa && test "$keytype" != dsa; then
+    echo "$0: argument to -k or --key must be rsa or dsa" >&2
+    exit 1
+fi
+# Reject non-numeric values first: 'test' reports an error (not "true") for
+# them, which would otherwise let the bad value slip past the range check.
+case $bits in
+    ''|*[!0-9]*)
+        echo "$0: argument to -B or --bits must be at least 1024" >&2
+        exit 1
+        ;;
+esac
+if test "$bits" -lt 1024; then
+    echo "$0: argument to -B or --bits must be at least 1024" >&2
+    exit 1
+fi
+if test -z "$dsaparam"; then
+    dsaparam=$pkidir/dsaparam.pem
+fi
+# Make the log file name absolute, because later code changes directory
+# before opening it.
+case $log in
+    /*) ;;
+    *) log="$PWD/$log" ;;
+esac
+
+if test "$command" = "init"; then
+ if test -e "$pkidir" && test "$force" != "yes"; then
+ echo "$0: $pkidir already exists and --force not specified" >&2
+ exit 1
+ fi
+
+ if test ! -d "$pkidir"; then
+ mkdir -p "$pkidir"
+ fi
+ cd "$pkidir"
+ exec 3>>$log
+
+ if test $keytype = dsa && test ! -e dsaparam.pem; then
+ echo "Generating DSA parameters, please wait..." >&2
+ openssl dsaparam -out dsaparam.pem $bits 1>&3 2>&3
+ fi
+
+ # Create the CAs.
+ for ca in controllerca switchca; do
+ echo "Creating $ca..." >&2
+ oldpwd=$PWD
+ mkdir -p $ca
+ cd $ca
+
+ mkdir -p certs crl newcerts
+ mkdir -p -m 0700 private
+ mkdir -p -m 0733 incoming
+ touch index.txt
+ test -e crlnumber || echo 01 > crlnumber
+ test -e serial || echo 01 > serial
+
+ # Put DSA parameters in directory.
+ if test $keytype = dsa && test ! -e dsaparam.pem; then
+ cp ../dsaparam.pem .
+ fi
+
+ # Write CA configuration file.
+ if test ! -e ca.cnf; then
+ sed "s/@ca@/$ca/g" > ca.cnf <<'EOF'
+[ req ]
+prompt = no
+distinguished_name = req_distinguished_name
+
+[ req_distinguished_name ]
+C = US
+ST = CA
+L = Palo Alto
+O = Open vSwitch
+OU = @ca@
+CN = Open vSwitch @ca@ CA Certificate
+
+[ ca ]
+default_ca = the_ca
+
+[ the_ca ]
+dir = . # top dir
+database = $dir/index.txt # index file.
+new_certs_dir = $dir/newcerts # new certs dir
+certificate = $dir/cacert.pem # The CA cert
+serial = $dir/serial # serial no file
+private_key = $dir/private/cakey.pem# CA private key
+RANDFILE = $dir/private/.rand # random number file
+default_days = 365 # how long to certify for
+default_crl_days= 30 # how long before next CRL
+default_md = md5 # md to use
+policy = policy # default policy
+email_in_dn = no # Don't add the email into cert DN
+name_opt = ca_default # Subject name display option
+cert_opt = ca_default # Certificate display option
+copy_extensions = none # Don't copy extensions from request
+
+# For the CA policy
+[ policy ]
+countryName = optional
+stateOrProvinceName = optional
+organizationName = match
+organizationalUnitName = optional
+commonName = supplied
+emailAddress = optional
+EOF
+ fi
+
+ # Create certificate authority.
+ if test $keytype = dsa; then
+ newkey=dsa:dsaparam.pem
+ else
+ newkey=rsa:$bits
+ fi
+ openssl req -config ca.cnf -nodes \
+ -newkey $newkey -keyout private/cakey.pem -out careq.pem \
+ 1>&3 2>&3
+ openssl ca -config ca.cnf -create_serial -out cacert.pem \
+ -days 1095 -batch -keyfile private/cakey.pem -selfsign \
+ -infiles careq.pem 1>&3 2>&3
+ chmod 0700 private/cakey.pem
+
+ cd "$oldpwd"
+ done
+ exit 0
+fi
+
+# Exits with an error unless the command was given exactly one argument,
+# that is, unless $arg1 is nonempty and $arg2 is empty.
+one_arg() {
+    if test -n "$arg1" && test -z "$arg2"; then
+        return 0
+    fi
+    echo "$0: $command must have exactly one argument; use --help for help" >&2
+    exit 1
+}
+
+zero_or_one_args() {
+ if test -n "$arg2"; then
+ echo "$0: $command must have zero or one arguments; use --help for help" >&2
+ exit 1
+ fi
+}
+
+one_or_two_args() {
+ if test -z "$arg1"; then
+ echo "$0: $command must have one or two arguments; use --help for help" >&2
+ exit 1
+ fi
+}
+
+# Exits with an error if file $1 already exists, unless --force was given.
+must_not_exist() {
+    if test ! -e "$1" || test "$force" = "yes"; then
+        return 0
+    fi
+    echo "$0: $1 already exists and --force not supplied" >&2
+    exit 1
+}
+
+resolve_prefix() {
+ test -n "$type" || exit 123 # Forgot to call check_type?
+
+ case $1 in
+ ????*)
+ ;;
+ *)
+ echo "Prefix $arg1 is too short (less than 4 hex digits)"
+ exit 0
+ ;;
+ esac
+
+ fingerprint=$(cd "$pkidir/${type}ca/incoming" && echo "$1"*-req.pem | sed 's/-req\.pem$//')
+ case $fingerprint in
+ "${1}*")
+ echo "No certificate requests matching $1"
+ exit 1
+ ;;
+ *" "*)
+ echo "$1 matches more than one certificate request:"
+ echo $fingerprint | sed 's/ /\
+/g'
+ exit 1
+ ;;
+ *)
+ # Nothing to do.
+ ;;
+ esac
+ req="$pkidir/${type}ca/incoming/$fingerprint-req.pem"
+ cert="$pkidir/${type}ca/certs/$fingerprint-cert.pem"
+}
+
+# Creates a private temporary directory, stores its name in $TMP, and
+# arranges for it to be removed when the script exits.  mktemp picks an
+# unpredictable name and creates the directory with owner-only permissions,
+# so another user cannot pre-create or guess it.
+make_tmpdir() {
+    TMP=$(mktemp -d "${TMPDIR:-/tmp}/ovs-pki.XXXXXX")
+    trap 'rm -rf "$TMP"' 0
+}
+
+fingerprint() {
+ local file=$1
+ local name=${1-$2}
+ local date=$(date -r $file)
+ local fingerprint
+ if grep -q -e '-BEGIN CERTIFICATE-' "$file"; then
+ fingerprint=$(openssl x509 -noout -in "$file" -fingerprint |
+ sed 's/SHA1 Fingerprint=//' | tr -d ':')
+ else
+ fingerprint=$(sha1sum "$file" | awk '{print $1}')
+ fi
+ printf "$name\\t$date\\n"
+ case $file in
+ $fingerprint*)
+ printf "\\t(correct fingerprint in filename)\\n"
+ ;;
+ *)
+ printf "\\tfingerprint $fingerprint\\n"
+ ;;
+ esac
+}
+
+verify_fingerprint() {
+ fingerprint "$@"
+ if test $batch != yes; then
+ echo "Does fingerprint match? (yes/no)"
+ read answer
+ if test "$answer" != yes; then
+ echo "Match failure, aborting" >&2
+ exit 1
+ fi
+ fi
+}
+
+check_type() {
+ if test x = x"$1"; then
+ type=switch
+ elif test "$1" = switch || test "$1" = controller; then
+ type=$1
+ else
+ echo "$0: type argument must be 'switch' or 'controller'" >&2
+ exit 1
+ fi
+}
+
+# Converts age $1, in one of the forms Ns, Nmin, Nh, or Nday, to a number of
+# seconds, which is printed on stdout.  Exits with status 1 and a message on
+# stderr if the unit is not one of those four.
+#
+# The regular expressions use "x x*" rather than "x\+", because "\+" is a GNU
+# extension that other sed implementations do not accept in a basic regex.
+parse_age() {
+    number=$(echo "$1" | sed 's/^\([0-9][0-9]*\)\([[:alpha:]][[:alpha:]]*\)/\1/')
+    unit=$(echo "$1" | sed 's/^\([0-9][0-9]*\)\([[:alpha:]][[:alpha:]]*\)/\2/')
+    case $unit in
+        s)
+            factor=1
+            ;;
+        min)
+            factor=60
+            ;;
+        h)
+            factor=3600
+            ;;
+        day)
+            factor=86400
+            ;;
+        *)
+            echo "$1: age not in the form Ns, Nmin, Nh, Nday (e.g. 1day)" >&2
+            exit 1
+            ;;
+    esac
+    echo $(($number * $factor))
+}
+
+# Exits with an error if file $1 does not exist.
+must_exist() {
+    if test -e "$1"; then
+        return 0
+    fi
+    echo "$0: $1 does not exist" >&2
+    exit 1
+}
+
+# Exits with an error unless $pkidir names an existing directory.
+pkidir_must_exist() {
+    if test -d "$pkidir"; then
+        return 0
+    fi
+    if test -e "$pkidir"; then
+        echo "$0: $pkidir is not a directory" >&2
+    else
+        echo "$0: $pkidir does not exist (need to run 'init' or use '--dir'?)" >&2
+    fi
+    exit 1
+}
+
+# Generates a new private key and certificate request for name $1, written
+# to $1-privkey.pem and $1-req.pem.  Neither file may already exist unless
+# --force was given.  The key type comes from $keytype; RSA keys use $bits
+# bits, and DSA keys use the parameters in file $dsaparam, which must exist.
+# openssl's output goes to file descriptor 3.
+make_request() {
+    must_not_exist "$1-privkey.pem"
+    must_not_exist "$1-req.pem"
+    make_tmpdir
+    cat > "$TMP/req.cnf" <<EOF
+[ req ]
+prompt = no
+distinguished_name = req_distinguished_name
+
+[ req_distinguished_name ]
+C = US
+ST = CA
+L = Palo Alto
+O = Open vSwitch
+OU = Open vSwitch certifier
+CN = Open vSwitch certificate for $1
+EOF
+    if test "$keytype" = rsa; then
+        newkey=rsa:$bits
+    else
+        must_exist "$dsaparam"
+        newkey=dsa:$dsaparam
+    fi
+    # Quoted so that a DSA parameter file name containing spaces survives.
+    openssl req -config "$TMP/req.cnf" -text -nodes \
+        -newkey "$newkey" -keyout "$1-privkey.pem" -out "$1-req.pem" 1>&3 2>&3
+}
+
+sign_request() {
+ must_exist "$1"
+ must_not_exist "$2"
+ pkidir_must_exist
+
+ (cd "$pkidir/${type}ca" &&
+ openssl ca -config ca.cnf -batch -in /dev/stdin) \
+ < "$1" > "$2.tmp$$" 2>&3
+ mv "$2.tmp$$" "$2"
+}
+
+glob() {
+ local files=$(echo $1)
+ if test "$files" != "$1"; then
+ echo "$files"
+ fi
+}
+
+exec 3>>$log || true
+if test "$command" = req; then
+ one_arg
+
+ make_request "$arg1"
+ fingerprint "$arg1-req.pem"
+elif test "$command" = sign; then
+ one_or_two_args
+ check_type "$arg2"
+ verify_fingerprint "$arg1-req.pem"
+
+ sign_request "$arg1-req.pem" "$arg2-cert.pem"
+elif test "$command" = req+sign; then
+ one_or_two_args
+ check_type "$arg2"
+
+ pkidir_must_exist
+ make_request "$arg1"
+ sign_request "$arg1-req.pem" "$arg1-cert.pem"
+ fingerprint "$arg1-req.pem"
+elif test "$command" = verify; then
+ one_or_two_args
+ must_exist "$arg1-cert.pem"
+ check_type "$arg2"
+
+ pkidir_must_exist
+ openssl verify -CAfile "$pkidir/${type}ca/cacert.pem" "$arg1-cert.pem"
+elif test "$command" = fingerprint; then
+ one_arg
+
+ fingerprint "$arg1"
+elif test "$command" = self-sign; then
+ one_arg
+ must_exist "$arg1-req.pem"
+ must_exist "$arg1-privkey.pem"
+ must_not_exist "$arg1-cert.pem"
+
+ openssl x509 -in "$arg1-req.pem" -out "$arg1-cert.pem" \
+ -signkey "$arg1-privkey.pem" -req -text 2>&3
+elif test "$command" = ls; then
+ check_type "$arg2"
+
+ cd "$pkidir/${type}ca/incoming"
+ for file in $(glob "$arg1*-req.pem"); do
+ fingerprint $file
+ done
+elif test "$command" = flush; then
+ check_type "$arg1"
+
+ rm -f "$pkidir/${type}ca/incoming/"*
+elif test "$command" = reject; then
+ one_or_two_args
+ check_type "$arg2"
+ resolve_prefix "$arg1"
+
+ rm -f "$req"
+elif test "$command" = approve; then
+ one_or_two_args
+ check_type "$arg2"
+ resolve_prefix "$arg1"
+
+ make_tmpdir
+ cp "$req" "$TMP/$req"
+ verify_fingerprint "$TMP/$req"
+ sign_request "$TMP/$req"
+ rm -f "$req" "$TMP/$req"
+elif test "$command" = prompt; then
+ zero_or_one_args
+ check_type "$arg1"
+
+ make_tmpdir
+ cd "$pkidir/${type}ca/incoming"
+ for req in $(glob "*-req.pem"); do
+ cp "$req" "$TMP/$req"
+
+ cert=$(echo "$pkidir/${type}ca/certs/$req" |
+ sed 's/-req.pem/-cert.pem/')
+ if test -f $cert; then
+ echo "Request $req already approved--dropping duplicate request"
+ rm -f "$req" "$TMP/$req"
+ continue
+ fi
+
+ echo
+ echo
+ fingerprint "$TMP/$req" "$req"
+ printf "Disposition for this request (skip/approve/reject)? "
+ read answer
+ case $answer in
+ approve)
+ echo "Approving $req"
+ sign_request "$TMP/$req" "$cert"
+ rm -f "$req" "$TMP/$req"
+ ;;
+ r*)
+ echo "Rejecting $req"
+ rm -f "$req" "$TMP/$req"
+ ;;
+ *)
+ echo "Skipping $req"
+ ;;
+ esac
+ done
+elif test "$command" = expire; then
+ zero_or_one_args
+ cutoff=$(($(date +%s) - $(parse_age ${arg1-1day})))
+ for type in switch controller; do
+ cd "$pkidir/${type}ca/incoming" || exit 1
+ for file in $(glob "*"); do
+ time=$(date -r "$file" +%s)
+ if test "$time" -lt "$cutoff"; then
+ rm -f "$file"
+ fi
+ done
+ done
+else
+ echo "$0: $command command unknown; use --help for help" >&2
+ exit 1
+fi
diff --git a/utilities/ovs-wdt.c b/utilities/ovs-wdt.c
new file mode 100644
index 000000000..3c5d797c0
--- /dev/null
+++ b/utilities/ovs-wdt.c
@@ -0,0 +1,263 @@
+/* Copyright (c) 2008, 2009 Nicira Networks, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <getopt.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <linux/types.h>
+#include <linux/watchdog.h>
+
+
+/* Default values for the interval and timer. In seconds. */
+#define DEFAULT_INTERVAL 1
+#define DEFAULT_TIMEOUT 30
+
+int fd = -1;
+
+/* Opening /dev/watchdog arms the WDT, and the system reboots if the program
+ * exits without first sending the device the magic value.  This function
+ * sends it, so the program can exit without causing a reboot.  It does
+ * nothing if the device is not open.
+ */
+static void
+cleanup(void)
+{
+    if (fd != -1) {
+        /* The magic value "V" announces that the device is about to be
+         * closed, which causes the watchdog to be disabled after the call
+         * to close.
+         */
+        if (write(fd, "V", 1) == 1) {
+            close(fd);
+            fd = -1;
+        } else {
+            fprintf(stderr, "Couldn't write magic val: %d\n", errno);
+        }
+    }
+}
+
+
+/* Handler for the signals registered by setup_signal() (SIGINT and
+ * SIGTERM).  Runs cleanup() first, which disables the watchdog timer, then
+ * restores the signal's default action and re-raises the signal.
+ */
+static void
+sighandler(int signum)
+{
+ cleanup();
+ signal(signum, SIG_DFL);
+ raise(signum);
+}
+
+/* Installs sighandler() as the handler for SIGINT and SIGTERM.  A failure
+ * to install either one is reported on stderr but is not fatal. */
+static void
+setup_signal(void)
+{
+    static const struct {
+        int signo;
+        const char *name;
+    } handled[] = {
+        { SIGINT, "SIGINT" },
+        { SIGTERM, "SIGTERM" },
+    };
+    struct sigaction action;
+    size_t i;
+
+    action.sa_handler = sighandler;
+    sigemptyset(&action.sa_mask);
+    action.sa_flags = 0;
+
+    for (i = 0; i < sizeof handled / sizeof handled[0]; i++) {
+        if (sigaction(handled[i].signo, &action, NULL) != 0) {
+            fprintf(stderr, "Problem setting up %s handler...\n",
+                    handled[i].name);
+        }
+    }
+}
+
+
+/* Queries the open watchdog device with WDIOC_GETSUPPORT and prints its
+ * identity, firmware version, and option flags.  If the query fails, cleans
+ * up and exits. */
+static void
+print_wdt_info(void)
+{
+    struct watchdog_info ident;
+
+    if (ioctl(fd, WDIOC_GETSUPPORT, &ident) != -1) {
+        printf("identity: %s, ver: %d, opt: %#x\n", ident.identity,
+               ident.firmware_version, ident.options);
+        return;
+    }
+
+    fprintf(stderr, "Couldn't get version: %d\n", errno);
+    cleanup();
+    exit(-1);
+}
+
+
+/* Prints a usage message for the program, invoked as 'progname', to
+ * stdout. */
+static void
+print_help(char *progname)
+{
+    printf("%s: Watchdog timer utility\n"
+           "usage: %s [OPTIONS]\n\n"
+           "Options:\n"
+           " -t, --timeout=SECS expiration time of WDT (default: %d)\n"
+           " -i, --interval=SECS interval to send keep-alives (default: %d)\n"
+           " -d, --disable disable the WDT and exit\n"
+           " -h, --help display this help message\n"
+           " -v, --verbose enable verbose printing\n"
+           " -V, --version display version information of WDT and exit\n",
+           progname, progname, DEFAULT_TIMEOUT, DEFAULT_INTERVAL);
+}
+
+
+/* Parses 's' as a decimal number of seconds.  Returns the value if it is
+ * positive and fits in an 'int', otherwise 0. */
+static int
+parse_seconds(const char *s)
+{
+    long value = strtol(s, NULL, 10);
+    int seconds = (int) value;
+
+    return (value > 0 && seconds == value) ? seconds : 0;
+}
+
+/* Opens /dev/watchdog, processes the command-line options, sets the
+ * watchdog timeout, and then sends a keep-alive every 'interval' seconds
+ * until a signal or an error ends the program.
+ *
+ * The device is opened before options are parsed because -d and -V act on
+ * it immediately.  Every exit path after a successful open goes through
+ * cleanup(), so leaving the program does not cause a reboot.
+ *
+ * Timeout and interval values that are zero, negative, or too large for an
+ * 'int' are rejected: a negative interval would become an enormous sleep
+ * and let the watchdog expire.
+ */
+int main(int argc, char *argv[])
+{
+    int arg;
+    int optc;
+    int verbose = 0;
+    int interval = DEFAULT_INTERVAL;
+    int timeout = DEFAULT_TIMEOUT;
+    static struct option const longopts[] =
+    {
+        {"timeout", required_argument, NULL, 't'},
+        {"interval", required_argument, NULL, 'i'},
+        {"disable", no_argument, NULL, 'd'},
+        {"help", no_argument, NULL, 'h'},
+        {"verbose", no_argument, NULL, 'v'},
+        {"version", no_argument, NULL, 'V'},
+        {0, 0, 0, 0}
+    };
+
+    setup_signal();
+
+    fd = open("/dev/watchdog", O_RDWR);
+    if (fd == -1) {
+        fprintf(stderr, "Couldn't open watchdog device: %s\n", strerror(errno));
+        exit(-1);
+    }
+
+    while ((optc = getopt_long(argc, argv, "t:i:dh?vV", longopts, NULL)) != -1) {
+        switch (optc) {
+        case 't':
+            timeout = parse_seconds(optarg);
+            if (!timeout) {
+                fprintf(stderr, "Invalid timeout: %s\n", optarg);
+                goto error;
+            }
+            break;
+
+        case 'i':
+            interval = parse_seconds(optarg);
+            if (!interval) {
+                fprintf(stderr, "Invalid interval: %s\n", optarg);
+                goto error;
+            }
+            break;
+
+        case 'd':
+            arg = WDIOS_DISABLECARD;
+            if (ioctl(fd, WDIOC_SETOPTIONS, &arg) == -1) {
+                fprintf(stderr, "Couldn't disable: %d\n", errno);
+                goto error;
+            }
+            cleanup();
+            exit(0);
+            break;
+
+        case 'h':
+            print_help(argv[0]);
+            cleanup();
+            exit(0);
+            break;
+
+        case 'v':
+            verbose = 1;
+            break;
+
+        case 'V':
+            print_wdt_info();
+            cleanup();
+            exit(0);
+            break;
+
+        default:
+            print_help(argv[0]);
+            goto error;
+            break;
+        }
+    }
+
+    argc -= optind;
+    argv += optind;
+
+    /* Sanity-check the arguments */
+    if (argc != 0) {
+        fprintf(stderr, "Illegal argument: %s\n", argv[0]);
+        goto error;
+    }
+
+    if (verbose) {
+        print_wdt_info();
+        printf("timeout: %d, interval: %d\n", timeout, interval);
+    }
+
+    /* Prevent the interval being greater than the timeout, since it
+     * will always cause a reboot.
+     */
+    if (interval > timeout) {
+        fprintf(stderr, "Interval greater than timeout: %d > %d\n",
+                interval, timeout);
+        goto error;
+    }
+
+    /* Always set the timeout */
+    if (ioctl(fd, WDIOC_SETTIMEOUT, &timeout) == -1) {
+        fprintf(stderr, "Couldn't set timeout: %d\n", errno);
+        goto error;
+    }
+
+    /* Loop and send a keep-alive every "interval" seconds */
+    while (1) {
+        if (verbose) {
+            if (ioctl(fd, WDIOC_GETTIMELEFT, &arg) == -1) {
+                fprintf(stderr, "Couldn't get time left: %d\n", errno);
+                goto error;
+            }
+            printf("Sending keep alive, time remaining: %d\n", arg);
+        }
+
+        /* Send a keep-alive. The argument is ignored */
+        if (ioctl(fd, WDIOC_KEEPALIVE, &arg) == -1) {
+            fprintf(stderr, "Couldn't keepalive: %d\n", errno);
+            goto error;
+        }
+
+        sleep(interval);
+    }
+
+    /* Never directly reached... */
+error:
+    cleanup();
+    exit(-1);
+}
diff --git a/vswitchd/.gitignore b/vswitchd/.gitignore
new file mode 100644
index 000000000..01d57ae70
--- /dev/null
+++ b/vswitchd/.gitignore
@@ -0,0 +1,7 @@
+/Makefile
+/Makefile.in
+/ovs-brcompatd
+/ovs-brcompatd.8
+/ovs-vswitchd
+/ovs-vswitchd.8
+/ovs-vswitchd.conf.5
diff --git a/vswitchd/automake.mk b/vswitchd/automake.mk
new file mode 100644
index 000000000..6883731ea
--- /dev/null
+++ b/vswitchd/automake.mk
@@ -0,0 +1,40 @@
+sbin_PROGRAMS += vswitchd/ovs-vswitchd vswitchd/ovs-brcompatd
+man_MANS += \
+ vswitchd/ovs-vswitchd.conf.5 \
+ vswitchd/ovs-vswitchd.8 \
+ vswitchd/ovs-brcompatd.8
+DISTCLEANFILES += \
+ vswitchd/ovs-vswitchd.conf.5 \
+ vswitchd/ovs-vswitchd.8 \
+ vswitchd/ovs-brcompatd.8
+
+vswitchd_ovs_vswitchd_SOURCES = \
+ vswitchd/bridge.c \
+ vswitchd/bridge.h \
+ vswitchd/mgmt.c \
+ vswitchd/mgmt.h \
+ vswitchd/port.c \
+ vswitchd/port.h \
+ vswitchd/proc-net-compat.c \
+ vswitchd/proc-net-compat.h \
+ vswitchd/ovs-vswitchd.c \
+ vswitchd/ovs-vswitchd.h \
+ vswitchd/xenserver.c \
+ vswitchd/xenserver.h
+vswitchd_ovs_vswitchd_LDADD = \
+ secchan/libsecchan.a \
+ lib/libopenvswitch.a \
+ $(FAULT_LIBS) \
+ $(SSL_LIBS)
+
+vswitchd_ovs_brcompatd_SOURCES = \
+ vswitchd/ovs-brcompatd.c
+
+vswitchd_ovs_brcompatd_LDADD = \
+ lib/libopenvswitch.a \
+ $(FAULT_LIBS)
+
+EXTRA_DIST += \
+ vswitchd/ovs-vswitchd.conf.5.in \
+ vswitchd/ovs-vswitchd.8.in \
+ vswitchd/ovs-brcompatd.8.in
diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c
new file mode 100644
index 000000000..cfd4dcf75
--- /dev/null
+++ b/vswitchd/bridge.c
@@ -0,0 +1,3058 @@
+/* Copyright (c) 2008, 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+#include "bridge.h"
+#include <assert.h>
+#include <errno.h>
+#include <arpa/inet.h>
+#include <ctype.h>
+#include <inttypes.h>
+#include <net/if.h>
+#include <openflow/openflow.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "bitmap.h"
+#include "cfg.h"
+#include "coverage.h"
+#include "dirs.h"
+#include "dpif.h"
+#include "dynamic-string.h"
+#include "flow.h"
+#include "hash.h"
+#include "list.h"
+#include "mac-learning.h"
+#include "netdev.h"
+#include "odp-util.h"
+#include "ofp-print.h"
+#include "ofpbuf.h"
+#include "poll-loop.h"
+#include "port-array.h"
+#include "proc-net-compat.h"
+#include "process.h"
+#include "secchan/ofproto.h"
+#include "socket-util.h"
+#include "stp.h"
+#include "svec.h"
+#include "timeval.h"
+#include "util.h"
+#include "vconn.h"
+#include "vconn-ssl.h"
+#include "xenserver.h"
+#include "xtoxll.h"
+
+#define THIS_MODULE VLM_bridge
+#include "vlog.h"
+
+/* A (VLAN, datapath port index) pair.
+ * NOTE(review): presumably one output destination for a packet -- confirm
+ * against the code that uses 'struct dst'. */
+struct dst {
+ uint16_t vlan;
+ uint16_t dp_ifidx;
+};
+
+extern uint64_t mgmt_id;
+
+/* A network device that belongs to a port. */
+struct iface {
+ struct port *port; /* Containing port. */
+ size_t port_ifidx; /* Index within containing port. */
+
+ char *name; /* Host network device name. */
+ int dp_ifidx; /* Index within kernel datapath. */
+
+ uint8_t mac[ETH_ADDR_LEN]; /* Ethernet address (all zeros if unknown). */
+
+ tag_type tag; /* Tag associated with this interface. */
+ bool enabled; /* May be chosen for flows? */
+ long long delay_expires; /* Time after which 'enabled' may change. */
+};
+
+/* A port's 'bond_hash' is an array of (BOND_MASK + 1) of these entries.
+ * NOTE(review): presumably indexed by a hash masked with BOND_MASK --
+ * confirm against the bonding code. */
+#define BOND_MASK 0xff
+struct bond_entry {
+ int iface_idx; /* Index of assigned iface, or -1 if none. */
+ uint64_t tx_bytes; /* Count of bytes recently transmitted. */
+ tag_type iface_tag; /* Tag associated with iface_idx. */
+};
+
+/* Port mirroring.  A bridge holds up to MAX_MIRRORS mirrors, and a set of
+ * mirrors is represented as a mirror_mask_t; the build-time assertion
+ * below checks that the mask type has at least MAX_MIRRORS bits. */
+#define MAX_MIRRORS 32
+typedef uint32_t mirror_mask_t;
+#define MIRROR_MASK_C(X) UINT32_C(X)
+BUILD_ASSERT_DECL(sizeof(mirror_mask_t) * CHAR_BIT >= MAX_MIRRORS);
+struct mirror {
+ struct bridge *bridge;
+ size_t idx; /* NOTE(review): presumably the index in bridge->mirrors[] -- confirm. */
+ char *name;
+
+ /* Selection criteria. */
+ struct svec src_ports;
+ struct svec dst_ports;
+ int *vlans;
+ size_t n_vlans;
+
+ /* Output. */
+ struct port *out_port;
+ int out_vlan;
+};
+
+/* A bridge port: one interface, or at least two when the port is bonded.
+ * FLOOD_PORT is a sentinel pointer value, not a real port object. */
+#define FLOOD_PORT ((struct port *) 1) /* The 'flood' output port. */
+struct port {
+ struct bridge *bridge;
+ size_t port_idx;
+ int vlan; /* -1=trunk port, else a 12-bit VLAN ID. */
+ unsigned long *trunks; /* Bitmap of trunked VLANs, if 'vlan' == -1. */
+ char *name;
+
+ /* An ordinary bridge port has 1 interface.
+ * A bridge port for bonding has at least 2 interfaces. */
+ struct iface **ifaces;
+ size_t n_ifaces, allocated_ifaces;
+
+ /* Bonding info. */
+ struct bond_entry *bond_hash; /* An array of (BOND_MASK + 1) elements. */
+ int active_iface; /* Ifidx on which bcasts accepted, or -1. */
+ tag_type active_iface_tag; /* Tag for bcast flows. */
+ tag_type no_ifaces_tag; /* Tag for flows when all ifaces disabled. */
+ int updelay, downdelay; /* Delay before iface goes up/down, in ms. */
+
+ /* Port mirroring info. */
+ mirror_mask_t src_mirrors; /* Mirrors triggered when packet received. */
+ mirror_mask_t dst_mirrors; /* Mirrors triggered when packet sent. */
+ bool is_mirror_output_port; /* Does port mirroring send frames here? */
+
+ /* Spanning tree info. */
+ enum stp_state stp_state; /* Always STP_FORWARDING if STP not in use. */
+ tag_type stp_state_tag; /* Tag for STP state change. */
+};
+
+/* A configured bridge: its kernel datapath, OpenFlow switch, ports, and the
+ * bonding, mirroring, and spanning tree state that goes with them. */
+#define DP_MAX_PORTS 255
+struct bridge {
+ struct list node; /* Node in global list of bridges. */
+ char *name; /* User-specified arbitrary name. */
+ struct mac_learning *ml; /* MAC learning table, or null not to learn. */
+ bool sent_config_request; /* Successfully sent config request? */
+ uint8_t default_ea[ETH_ADDR_LEN]; /* Default MAC. */
+
+ /* Support for remote controllers. */
+ char *controller; /* NULL if there is no remote controller;
+ * "discover" to do controller discovery;
+ * otherwise a vconn name. */
+
+ /* OpenFlow switch processing. */
+ struct ofproto *ofproto; /* OpenFlow switch. */
+
+ /* Kernel datapath information. */
+ struct dpif dpif; /* Kernel datapath. */
+ struct port_array ifaces; /* Indexed by kernel datapath port number. */
+
+ /* Bridge ports. */
+ struct port **ports;
+ size_t n_ports, allocated_ports;
+
+ /* Bonding. */
+ bool has_bonded_ports;
+ long long int bond_next_rebalance;
+
+ /* Flow tracking. */
+ bool flush;
+
+ /* Flow statistics gathering. */
+ time_t next_stats_request;
+
+ /* Port mirroring. */
+ struct mirror *mirrors[MAX_MIRRORS];
+
+ /* Spanning tree. */
+ struct stp *stp;
+ long long int stp_last_tick;
+};
+
+/* List of all bridges. */
+static struct list all_bridges = LIST_INITIALIZER(&all_bridges);
+
+/* Maximum number of datapaths. */
+enum { DP_MAX = 256 };
+
+static struct bridge *bridge_create(const char *name);
+static void bridge_destroy(struct bridge *);
+static struct bridge *bridge_lookup(const char *name);
+static int bridge_run_one(struct bridge *);
+static void bridge_reconfigure_one(struct bridge *);
+static void bridge_reconfigure_controller(struct bridge *);
+static void bridge_get_all_ifaces(const struct bridge *, struct svec *ifaces);
+static void bridge_fetch_dp_ifaces(struct bridge *);
+static void bridge_flush(struct bridge *);
+static void bridge_pick_local_hw_addr(struct bridge *,
+ uint8_t ea[ETH_ADDR_LEN],
+ const char **devname);
+static uint64_t bridge_pick_datapath_id(struct bridge *,
+ const uint8_t bridge_ea[ETH_ADDR_LEN],
+ const char *devname);
+static uint64_t dpid_from_hash(const void *, size_t nbytes);
+
+static void bond_run(struct bridge *);
+static void bond_wait(struct bridge *);
+static void bond_rebalance_port(struct port *);
+
+static void port_create(struct bridge *, const char *name);
+static void port_reconfigure(struct port *);
+static void port_destroy(struct port *);
+static struct port *port_lookup(const struct bridge *, const char *name);
+static struct port *port_from_dp_ifidx(const struct bridge *,
+ uint16_t dp_ifidx);
+static void port_update_bond_compat(struct port *);
+static void port_update_vlan_compat(struct port *);
+
+static void mirror_create(struct bridge *, const char *name);
+static void mirror_destroy(struct mirror *);
+static void mirror_reconfigure(struct bridge *);
+static void mirror_reconfigure_one(struct mirror *);
+static bool vlan_is_mirrored(const struct mirror *, int vlan);
+
+static void brstp_reconfigure(struct bridge *);
+static void brstp_adjust_timers(struct bridge *);
+static void brstp_run(struct bridge *);
+static void brstp_wait(struct bridge *);
+
+static void iface_create(struct port *, const char *name);
+static void iface_destroy(struct iface *);
+static struct iface *iface_lookup(const struct bridge *, const char *name);
+static struct iface *iface_from_dp_ifidx(const struct bridge *,
+ uint16_t dp_ifidx);
+
+/* Hooks into ofproto processing. */
+static struct ofhooks bridge_ofhooks;
+
+/* Public functions. */
+
+/* Adds to 'svec' the name of each interface on each bridge that has a
+ * datapath index other than ODPP_LOCAL.  An interface with a negative
+ * datapath index is not added; an error is logged for it instead. */
+void
+bridge_get_ifaces(struct svec *svec)
+{
+    struct bridge *br;
+
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        size_t port_no;
+
+        for (port_no = 0; port_no < br->n_ports; port_no++) {
+            struct port *port = br->ports[port_no];
+            size_t iface_no;
+
+            for (iface_no = 0; iface_no < port->n_ifaces; iface_no++) {
+                struct iface *iface = port->ifaces[iface_no];
+
+                if (iface->dp_ifidx < 0) {
+                    VLOG_ERR("%s interface not in dp%u, ignoring",
+                             iface->name, dpif_id(&br->dpif));
+                } else if (iface->dp_ifidx != ODPP_LOCAL) {
+                    svec_add(svec, iface->name);
+                }
+            }
+        }
+    }
+}
+
+/* Tries to open each of the datapaths dp0 through dp(DP_MAX - 1) and
+ * deletes every one whose name cannot be obtained or that has no
+ * "bridge.<name>.port" configuration key, then runs bridge_reconfigure().
+ *
+ * The caller must already have called cfg_read(). */
+void
+bridge_init(void)
+{
+    int dp_idx;
+
+    for (dp_idx = 0; dp_idx < DP_MAX; dp_idx++) {
+        char dpif_name[IF_NAMESIZE];
+        char devname[16];
+        struct dpif dpif;
+        int error;
+
+        sprintf(devname, "dp%d", dp_idx);
+        error = dpif_open(devname, &dpif);
+        if (error) {
+            /* ENODEV is not reported. */
+            if (error != ENODEV) {
+                VLOG_ERR("failed to delete datapath dp%d: %s",
+                         dp_idx, strerror(error));
+            }
+            continue;
+        }
+
+        if (dpif_get_name(&dpif, dpif_name, sizeof dpif_name)
+            || !cfg_has("bridge.%s.port", dpif_name)) {
+            dpif_delete(&dpif);
+        }
+        dpif_close(&dpif);
+    }
+
+    bridge_reconfigure();
+}
+
+#ifdef HAVE_OPENSSL
+/* Compares the configured string value of 'key' with '*valuep'.  If the
+ * key has a value and it differs from '*valuep' (or '*valuep' is null),
+ * frees '*valuep', replaces it with a copy of the new value, and returns
+ * true.  Otherwise leaves '*valuep' alone and returns false. */
+static bool
+config_string_change(const char *key, char **valuep)
+{
+    const char *value = cfg_get_string(0, "%s", key);
+
+    if (!value || (*valuep && !strcmp(value, *valuep))) {
+        return false;
+    }
+
+    free(*valuep);
+    *valuep = xstrdup(value);
+    return true;
+}
+
+/* Passes the configured SSL private key, certificate, and CA certificate
+ * file names to the vconn SSL layer.  Each is passed on only when its
+ * configured value has changed since the last one that was applied. */
+static void
+bridge_configure_ssl(void)
+{
+    /* XXX SSL should be configurable on a per-bridge basis.
+     * XXX should be possible to de-configure SSL. */
+    static char *cur_private_key;
+    static char *cur_certificate;
+    static char *cur_ca_cert;
+
+    if (config_string_change("ssl.private-key", &cur_private_key)) {
+        vconn_ssl_set_private_key_file(cur_private_key);
+    }
+
+    if (config_string_change("ssl.certificate", &cur_certificate)) {
+        vconn_ssl_set_certificate_file(cur_certificate);
+    }
+
+    if (config_string_change("ssl.ca-cert", &cur_ca_cert)) {
+        bool bootstrap = cfg_get_bool(0, "ssl.bootstrap-ca-cert");
+
+        vconn_ssl_set_ca_cert_file(cur_ca_cert, bootstrap);
+    }
+}
+#endif
+
+/* Brings every bridge in line with the current configuration.
+ *
+ * In order, this function:
+ *
+ *   - Destroys bridges that are no longer configured and creates bridges
+ *     that are newly configured (rejecting names that look like "dpN" or
+ *     "nl:N").
+ *
+ *   - Applies SSL settings, if built with OpenSSL.
+ *
+ *   - Calls bridge_reconfigure_one() on each bridge.
+ *
+ *   - Deletes unwanted ports from every datapath, then adds missing ports to
+ *     every datapath.
+ *
+ *   - Re-reads datapath port numbers, dropping interfaces that are not in
+ *     the datapath and ports left without interfaces.
+ *
+ *   - Picks each bridge's local Ethernet address and datapath ID, and
+ *     applies its NetFlow and controller settings.
+ *
+ *   - Calls port_update_vlan_compat() on every port and brstp_reconfigure()
+ *     on every bridge. */
+void
+bridge_reconfigure(void)
+{
+    struct svec old_br, new_br, raw_new_br;
+    struct bridge *br, *next;
+    size_t i, j;
+
+    COVERAGE_INC(bridge_reconfigure);
+
+    /* Collect old bridges. */
+    svec_init(&old_br);
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        svec_add(&old_br, br->name);
+    }
+
+    /* Collect new bridges. */
+    svec_init(&raw_new_br);
+    cfg_get_subsections(&raw_new_br, "bridge");
+    svec_init(&new_br);
+    for (i = 0; i < raw_new_br.n; i++) {
+        const char *name = raw_new_br.names[i];
+
+        /* isdigit() requires a value representable as unsigned char (or
+         * EOF), so cast to avoid undefined behavior on negative 'char's. */
+        if ((!strncmp(name, "dp", 2) && isdigit((unsigned char) name[2])) ||
+            (!strncmp(name, "nl:", 3) && isdigit((unsigned char) name[3]))) {
+            VLOG_ERR("%s is not a valid bridge name (bridges may not be "
+                     "named \"dp\" or \"nl:\" followed by a digit)", name);
+        } else {
+            svec_add(&new_br, name);
+        }
+    }
+    svec_destroy(&raw_new_br);
+
+    /* Get rid of deleted bridges and add new bridges. */
+    svec_sort(&old_br);
+    svec_sort(&new_br);
+    assert(svec_is_unique(&old_br));
+    assert(svec_is_unique(&new_br));
+    LIST_FOR_EACH_SAFE (br, next, struct bridge, node, &all_bridges) {
+        if (!svec_contains(&new_br, br->name)) {
+            bridge_destroy(br);
+        }
+    }
+    for (i = 0; i < new_br.n; i++) {
+        const char *name = new_br.names[i];
+        if (!svec_contains(&old_br, name)) {
+            bridge_create(name);
+        }
+    }
+    svec_destroy(&old_br);
+    svec_destroy(&new_br);
+
+#ifdef HAVE_OPENSSL
+    /* Configure SSL. */
+    bridge_configure_ssl();
+#endif
+
+    /* Reconfigure all bridges. */
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        bridge_reconfigure_one(br);
+    }
+
+    /* Add and delete ports on all datapaths.
+     *
+     * The kernel will reject any attempt to add a given port to a datapath if
+     * that port already belongs to a different datapath, so we must do all
+     * port deletions before any port additions. */
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        struct odp_port *dpif_ports;
+        size_t n_dpif_ports;
+        struct svec want_ifaces;
+
+        dpif_port_list(&br->dpif, &dpif_ports, &n_dpif_ports);
+        bridge_get_all_ifaces(br, &want_ifaces);
+        for (i = 0; i < n_dpif_ports; i++) {
+            const struct odp_port *p = &dpif_ports[i];
+
+            /* Never remove the port named after the bridge itself. */
+            if (!svec_contains(&want_ifaces, p->devname)
+                && strcmp(p->devname, br->name)) {
+                int retval = dpif_port_del(&br->dpif, p->port);
+                if (retval) {
+                    VLOG_ERR("failed to remove %s interface from dp%u: %s",
+                             p->devname, dpif_id(&br->dpif), strerror(retval));
+                }
+            }
+        }
+        svec_destroy(&want_ifaces);
+        free(dpif_ports);
+    }
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        struct odp_port *dpif_ports;
+        size_t n_dpif_ports;
+        struct svec cur_ifaces, want_ifaces, add_ifaces;
+        int next_port_no;
+
+        dpif_port_list(&br->dpif, &dpif_ports, &n_dpif_ports);
+        svec_init(&cur_ifaces);
+        for (i = 0; i < n_dpif_ports; i++) {
+            svec_add(&cur_ifaces, dpif_ports[i].devname);
+        }
+        free(dpif_ports);
+        svec_sort_unique(&cur_ifaces);
+        bridge_get_all_ifaces(br, &want_ifaces);
+        svec_diff(&want_ifaces, &cur_ifaces, &add_ifaces, NULL, NULL);
+
+        next_port_no = 1;
+        for (i = 0; i < add_ifaces.n; i++) {
+            const char *if_name = add_ifaces.names[i];
+            int internal = cfg_get_bool(0, "iface.%s.internal", if_name);
+            int error;
+
+            /* Try successive port numbers until we find one that is not
+             * already in use.  Check for exhaustion before each attempt, so
+             * that a successful add (or an unrelated failure) on the last
+             * valid port number is not misreported as running out, and so
+             * that the retry loop is bounded regardless of what the datapath
+             * returns. */
+            do {
+                if (next_port_no >= 256) {
+                    VLOG_ERR("ran out of valid port numbers on dp%u",
+                             dpif_id(&br->dpif));
+                    goto out;
+                }
+                error = dpif_port_add(&br->dpif, if_name, next_port_no++,
+                                      internal ? ODP_PORT_INTERNAL : 0);
+            } while (error == EEXIST);
+
+            if (error) {
+                VLOG_ERR("failed to add %s interface to dp%u: %s",
+                         if_name, dpif_id(&br->dpif), strerror(error));
+            }
+        }
+    out:
+        svec_destroy(&cur_ifaces);
+        svec_destroy(&want_ifaces);
+        svec_destroy(&add_ifaces);
+    }
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        uint8_t ea[8];
+        uint64_t dpid;
+        struct iface *local_iface = NULL;
+        const char *devname;
+        uint8_t engine_type = br->dpif.minor;
+        uint8_t engine_id = br->dpif.minor;
+        bool add_id_to_iface = false;
+        struct svec nf_hosts;
+
+        /* Refresh datapath port numbers, then drop every interface that did
+         * not make it into the datapath and every port that is left with no
+         * interfaces.  The indexes advance only when nothing was destroyed,
+         * because destruction removes the current element. */
+        bridge_fetch_dp_ifaces(br);
+        for (i = 0; i < br->n_ports; ) {
+            struct port *port = br->ports[i];
+
+            for (j = 0; j < port->n_ifaces; ) {
+                struct iface *iface = port->ifaces[j];
+                if (iface->dp_ifidx < 0) {
+                    VLOG_ERR("%s interface not in dp%u, dropping",
+                             iface->name, dpif_id(&br->dpif));
+                    iface_destroy(iface);
+                } else {
+                    if (iface->dp_ifidx == ODPP_LOCAL) {
+                        local_iface = iface;
+                    }
+                    VLOG_DBG("dp%u has interface %s on port %d",
+                             dpif_id(&br->dpif), iface->name, iface->dp_ifidx);
+                    j++;
+                }
+            }
+            if (!port->n_ifaces) {
+                VLOG_ERR("%s port has no interfaces, dropping", port->name);
+                port_destroy(port);
+                continue;
+            }
+            i++;
+        }
+
+        /* Pick local port hardware address, datapath ID. */
+        bridge_pick_local_hw_addr(br, ea, &devname);
+        if (local_iface) {
+            int error = netdev_nodev_set_etheraddr(local_iface->name, ea);
+            if (error) {
+                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+                VLOG_ERR_RL(&rl, "bridge %s: failed to set bridge "
+                            "Ethernet address: %s",
+                            br->name, strerror(error));
+            }
+        }
+
+        dpid = bridge_pick_datapath_id(br, ea, devname);
+        ofproto_set_datapath_id(br->ofproto, dpid);
+
+        /* Set NetFlow configuration on this bridge. */
+        if (cfg_has("netflow.%s.engine-type", br->name)) {
+            engine_type = cfg_get_int(0, "netflow.%s.engine-type",
+                                      br->name);
+        }
+        if (cfg_has("netflow.%s.engine-id", br->name)) {
+            engine_id = cfg_get_int(0, "netflow.%s.engine-id", br->name);
+        }
+        if (cfg_has("netflow.%s.add-id-to-iface", br->name)) {
+            add_id_to_iface = cfg_get_bool(0, "netflow.%s.add-id-to-iface",
+                                           br->name);
+        }
+        if (add_id_to_iface && engine_id > 0x7f) {
+            VLOG_WARN("bridge %s: netflow port mangling may conflict with "
+                      "another vswitch, choose an engine id less than 128",
+                      br->name);
+        }
+        if (add_id_to_iface && br->n_ports > 0x1ff) {
+            VLOG_WARN("bridge %s: netflow port mangling will conflict with "
+                      "another port when 512 or more ports are used",
+                      br->name);
+        }
+        svec_init(&nf_hosts);
+        cfg_get_all_keys(&nf_hosts, "netflow.%s.host", br->name);
+        if (ofproto_set_netflow(br->ofproto, &nf_hosts, engine_type,
+                                engine_id, add_id_to_iface)) {
+            VLOG_ERR("bridge %s: problem setting netflow collectors",
+                     br->name);
+        }
+        /* Release the collector list; otherwise it leaks on every
+         * reconfiguration. */
+        svec_destroy(&nf_hosts);
+
+        /* Update the controller and related settings.  It would be more
+         * straightforward to call this from bridge_reconfigure_one(), but we
+         * can't do it there for two reasons.  First, and most importantly, at
+         * that point we don't know the dp_ifidx of any interfaces that have
+         * been added to the bridge (because we haven't actually added them to
+         * the datapath).  Second, at that point we haven't set the datapath ID
+         * yet; when a controller is configured, resetting the datapath ID will
+         * immediately disconnect from the controller, so it's better to set
+         * the datapath ID before the controller. */
+        bridge_reconfigure_controller(br);
+    }
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        for (i = 0; i < br->n_ports; i++) {
+            struct port *port = br->ports[i];
+            port_update_vlan_compat(port);
+        }
+    }
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        brstp_reconfigure(br);
+    }
+}
+
+/* Chooses the Ethernet address for the local port of bridge 'br' and stores
+ * it in 'ea'.
+ *
+ * If the configuration requests a MAC address ("bridge.<name>.mac") that is
+ * neither multicast nor zero, that address is used and '*devname' is set to
+ * null.  Otherwise the numerically smallest usable address among the bridge's
+ * interfaces is chosen (skipping mirror output ports, the local port, and
+ * internal interfaces) and '*devname' is set to the name of the interface it
+ * came from.  If no such address is found, 'br->default_ea' is used and
+ * '*devname' is set to null. */
+static void
+bridge_pick_local_hw_addr(struct bridge *br, uint8_t ea[ETH_ADDR_LEN],
+                          const char **devname)
+{
+    uint64_t requested_ea;
+    size_t i, j;
+    int error;
+
+    *devname = NULL;
+
+    /* Did the user request a particular MAC? */
+    requested_ea = cfg_get_mac(0, "bridge.%s.mac", br->name);
+    if (requested_ea) {
+        eth_addr_from_uint64(requested_ea, ea);
+        if (eth_addr_is_multicast(ea)) {
+            VLOG_ERR("bridge %s: cannot set MAC address to multicast "
+                     "address "ETH_ADDR_FMT, br->name, ETH_ADDR_ARGS(ea));
+        } else if (eth_addr_is_zero(ea)) {
+            VLOG_ERR("bridge %s: cannot set MAC address to zero", br->name);
+        } else {
+            return;
+        }
+    }
+
+    /* Otherwise choose the minimum MAC address among all of the interfaces.
+     * (Xen uses FE:FF:FF:FF:FF:FF for virtual interfaces so this will get the
+     * MAC of the physical interface in such an environment.)
+     *
+     * 'ea' is an array parameter and therefore really a pointer, so
+     * 'sizeof ea' would be the size of a pointer, not of the address.  Use
+     * ETH_ADDR_LEN explicitly. */
+    memset(ea, 0xff, ETH_ADDR_LEN);
+    for (i = 0; i < br->n_ports; i++) {
+        struct port *port = br->ports[i];
+        if (port->is_mirror_output_port) {
+            continue;
+        }
+        for (j = 0; j < port->n_ifaces; j++) {
+            struct iface *iface = port->ifaces[j];
+            uint8_t iface_ea[ETH_ADDR_LEN];
+            if (iface->dp_ifidx == ODPP_LOCAL
+                || cfg_get_bool(0, "iface.%s.internal", iface->name)) {
+                continue;
+            }
+            error = netdev_nodev_get_etheraddr(iface->name, iface_ea);
+            if (!error) {
+                if (!eth_addr_is_multicast(iface_ea) &&
+                    !eth_addr_is_reserved(iface_ea) &&
+                    !eth_addr_is_zero(iface_ea) &&
+                    memcmp(iface_ea, ea, ETH_ADDR_LEN) < 0) {
+                    memcpy(ea, iface_ea, ETH_ADDR_LEN);
+                    *devname = iface->name;
+                }
+            } else {
+                static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+                VLOG_ERR_RL(&rl, "failed to obtain Ethernet address of %s: %s",
+                            iface->name, strerror(error));
+            }
+        }
+    }
+    if (eth_addr_is_multicast(ea) || eth_addr_is_vif(ea)) {
+        /* Nothing usable was found (the all-ones initial value is itself
+         * multicast), so fall back to the bridge's default address. */
+        memcpy(ea, br->default_ea, ETH_ADDR_LEN);
+        *devname = NULL;
+        VLOG_WARN("bridge %s: using default bridge Ethernet "
+                  "address "ETH_ADDR_FMT, br->name, ETH_ADDR_ARGS(ea));
+    } else {
+        VLOG_DBG("bridge %s: using bridge Ethernet address "ETH_ADDR_FMT,
+                 br->name, ETH_ADDR_ARGS(ea));
+    }
+}
+
+/* Chooses and returns the datapath ID for bridge 'br', given that the
+ * bridge's Ethernet address is 'bridge_ea'.  If 'bridge_ea' is the Ethernet
+ * address of a network device, then that network device's name must be passed
+ * in as 'devname'; if 'bridge_ea' was derived some other way, then 'devname'
+ * must be passed in as a null pointer. */
+static uint64_t
+bridge_pick_datapath_id(struct bridge *br,
+                        const uint8_t bridge_ea[ETH_ADDR_LEN],
+                        const char *devname)
+{
+    /*
+     * The procedure for choosing a bridge MAC address will, in the most
+     * ordinary case, also choose a unique MAC that we can use as a datapath
+     * ID.  In some special cases, though, multiple bridges will end up with
+     * the same MAC address.  This is OK for the bridges, but it will confuse
+     * the OpenFlow controller, because each datapath needs a unique datapath
+     * ID.
+     *
+     * Datapath IDs must be unique.  It is also very desirable that they be
+     * stable from one run to the next, so that policy set on a datapath
+     * "sticks".
+     */
+    uint64_t configured_dpid;
+
+    /* An explicitly configured datapath ID always wins. */
+    configured_dpid = cfg_get_dpid(0, "bridge.%s.datapath-id", br->name);
+    if (configured_dpid) {
+        return configured_dpid;
+    }
+
+    if (!devname) {
+        /*
+         * A purely internal bridge, that is, one that has no non-virtual
+         * network devices on it at all, is more difficult because it has no
+         * natural unique identifier at all.
+         *
+         * When the host is a XenServer, we handle this case by hashing the
+         * host's UUID with the name of the bridge.  Names of bridges are
+         * persistent across XenServer reboots, although they can be reused if
+         * an internal network is destroyed and then a new one is later
+         * created, so this is fairly effective.
+         *
+         * When the host is not a XenServer, we punt by using a random MAC
+         * address on each run.
+         */
+        const char *host_uuid = xenserver_get_host_uuid();
+        if (host_uuid) {
+            char *combined = xasprintf("%s,%s", host_uuid, br->name);
+            uint64_t hashed_dpid = dpid_from_hash(combined, strlen(combined));
+            free(combined);
+            return hashed_dpid;
+        }
+    } else {
+        int vlan;
+        if (!netdev_get_vlan_vid(devname, &vlan)) {
+            /*
+             * A bridge whose MAC address is taken from a VLAN network device
+             * (that is, a network device created with vconfig(8) or similar
+             * tool) will have the same MAC address as a bridge on the VLAN
+             * device's physical network device.
+             *
+             * Handle this case by hashing the physical network device MAC
+             * along with the VLAN identifier.
+             */
+            uint8_t buf[ETH_ADDR_LEN + 2];
+            memcpy(buf, bridge_ea, ETH_ADDR_LEN);
+            buf[ETH_ADDR_LEN] = vlan >> 8;
+            buf[ETH_ADDR_LEN + 1] = vlan;
+            return dpid_from_hash(buf, sizeof buf);
+        }
+        /*
+         * Otherwise assume that this bridge's MAC address is unique, since it
+         * doesn't fit any of the cases we handle specially.
+         */
+    }
+
+    return eth_addr_to_uint64(bridge_ea);
+}
+
+/* Derives a datapath ID from the 'n' bytes at 'data': computes their SHA-1
+ * digest, passes the digest to eth_addr_mark_random(), and converts its
+ * leading bytes to a 64-bit integer as an Ethernet address. */
+static uint64_t
+dpid_from_hash(const void *data, size_t n)
+{
+    uint8_t digest[SHA1HashSize];
+
+    /* The digest must be large enough to be treated as an Ethernet
+     * address. */
+    BUILD_ASSERT_DECL(sizeof digest >= ETH_ADDR_LEN);
+
+    SHA1Bytes(data, n, digest);
+    eth_addr_mark_random(digest);
+    return eth_addr_to_uint64(digest);
+}
+
+/* Runs bridge_run_one() on every bridge.  Returns 0 if every bridge ran
+ * without error; otherwise logs a rate-limited error for each failing bridge
+ * and returns the first nonzero error encountered. */
+int
+bridge_run(void)
+{
+    struct bridge *br, *next;
+    int first_error = 0;
+
+    LIST_FOR_EACH_SAFE (br, next, struct bridge, node, &all_bridges) {
+        int error = bridge_run_one(br);
+
+        if (error) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+            VLOG_ERR_RL(&rl, "bridge %s: datapath was destroyed externally, "
+                        "forcing reconfiguration", br->name);
+            if (first_error == 0) {
+                first_error = error;
+            }
+        }
+    }
+    return first_error;
+}
+
+/* Registers wakeup events for every bridge.  The ofproto wait is always
+ * registered; the MAC learning, bonding, and STP waits are registered only
+ * for bridges that have no controller. */
+void
+bridge_wait(void)
+{
+    struct bridge *br;
+
+    LIST_FOR_EACH (br, struct bridge, node, &all_bridges) {
+        ofproto_wait(br->ofproto);
+
+        if (!br->controller) {
+            if (br->ml) {
+                mac_learning_wait(br->ml);
+            }
+            bond_wait(br);
+            brstp_wait(br);
+        }
+    }
+}
+
+/* Marks 'br' so that all of its flows get revalidated, and flushes its MAC
+ * learning table if it has one.  This is appropriate when the configuration
+ * of 'br' changes. */
+static void
+bridge_flush(struct bridge *br)
+{
+    COVERAGE_INC(bridge_flush);
+
+    br->flush = true;
+    if (br->ml != NULL) {
+        mac_learning_flush(br->ml);
+    }
+}
+
+/* Bridge reconfiguration functions. */
+
+/* Creates a bridge named 'name', which must not already exist as a bridge.
+ *
+ * A datapath with the same name is created, or, if one already exists, it is
+ * opened and its flows are flushed.  An ofproto switch is then created on top
+ * of it and the new bridge is appended to 'all_bridges'.
+ *
+ * Returns the new bridge, or a null pointer (after logging an error) if the
+ * datapath or the switch could not be set up. */
+static struct bridge *
+bridge_create(const char *name)
+{
+    struct bridge *br;
+    int error;
+
+    assert(!bridge_lookup(name));
+    br = xcalloc(1, sizeof *br);
+
+    error = dpif_create(name, &br->dpif);
+    if (error && error != EEXIST) {
+        VLOG_ERR("failed to create datapath %s: %s", name, strerror(error));
+        free(br);
+        return NULL;
+    }
+    if (error == EEXIST) {
+        /* Reuse the existing datapath, starting from an empty flow table. */
+        error = dpif_open(name, &br->dpif);
+        if (error) {
+            VLOG_ERR("datapath %s already exists but cannot be opened: %s",
+                     name, strerror(error));
+            free(br);
+            return NULL;
+        }
+        dpif_flow_flush(&br->dpif);
+    }
+
+    error = ofproto_create(name, &bridge_ofhooks, br, &br->ofproto);
+    if (error) {
+        VLOG_ERR("failed to create switch %s: %s", name, strerror(error));
+        dpif_delete(&br->dpif);
+        dpif_close(&br->dpif);
+        free(br);
+        return NULL;
+    }
+
+    br->name = xstrdup(name);
+    br->ml = mac_learning_create();
+    br->sent_config_request = false;
+    eth_addr_random(br->default_ea);
+
+    port_array_init(&br->ifaces);
+
+    br->flush = false;
+    br->bond_next_rebalance = time_msec() + 10000;
+
+    list_push_back(&all_bridges, &br->node);
+
+    VLOG_INFO("created bridge %s on dp%u", br->name, dpif_id(&br->dpif));
+
+    return br;
+}
+
+/* Destroys 'br': destroys all of its ports, unlinks it from its list, deletes
+ * and closes its datapath, and frees its resources.  Does nothing if 'br' is
+ * null.  A failure to delete the datapath is logged unless the error is
+ * ENOENT. */
+static void
+bridge_destroy(struct bridge *br)
+{
+    int error;
+
+    if (!br) {
+        return;
+    }
+
+    /* port_destroy() shrinks 'br->ports', so always take the last one. */
+    while (br->n_ports > 0) {
+        port_destroy(br->ports[br->n_ports - 1]);
+    }
+    list_remove(&br->node);
+
+    error = dpif_delete(&br->dpif);
+    if (error && error != ENOENT) {
+        VLOG_ERR("failed to delete dp%u: %s",
+                 dpif_id(&br->dpif), strerror(error));
+    }
+    dpif_close(&br->dpif);
+
+    ofproto_destroy(br->ofproto);
+    free(br->controller);
+    mac_learning_destroy(br->ml);
+    port_array_destroy(&br->ifaces);
+    free(br->ports);
+    free(br->name);
+    free(br);
+}
+
+/* Searches 'all_bridges' for a bridge whose name is exactly 'name'.  Returns
+ * the bridge if found, otherwise a null pointer. */
+static struct bridge *
+bridge_lookup(const char *name)
+{
+    struct bridge *candidate;
+
+    LIST_FOR_EACH (candidate, struct bridge, node, &all_bridges) {
+        if (strcmp(candidate->name, name) == 0) {
+            return candidate;
+        }
+    }
+    return NULL;
+}
+
+/* Returns true if a bridge named 'name' exists, false otherwise. */
+bool
+bridge_exists(const char *name)
+{
+    return bridge_lookup(name) != NULL;
+}
+
+/* Returns the datapath ID of the bridge named 'name', as reported by its
+ * ofproto, or 0 if there is no bridge by that name. */
+uint64_t
+bridge_get_datapathid(const char *name)
+{
+    struct bridge *br = bridge_lookup(name);
+
+    if (!br) {
+        return 0;
+    }
+    return ofproto_get_datapath_id(br->ofproto);
+}
+
+/* Performs one round of processing for 'br': the first ofproto stage, then
+ * MAC learning, bonding, and STP, then the second ofproto stage (passing
+ * along and then clearing the pending flush request).
+ *
+ * Returns 0 on success.  If the first ofproto stage fails, its error is
+ * returned immediately and nothing else runs; otherwise the result of the
+ * second ofproto stage is returned. */
+static int
+bridge_run_one(struct bridge *br)
+{
+    int error = ofproto_run1(br->ofproto);
+
+    if (!error) {
+        if (br->ml) {
+            mac_learning_run(br->ml, ofproto_get_revalidate_set(br->ofproto));
+        }
+        bond_run(br);
+        brstp_run(br);
+
+        error = ofproto_run2(br->ofproto, br->flush);
+        br->flush = false;
+    }
+    return error;
+}
+
+/* Returns the controller configured for 'br': the value of
+ * "bridge.<name>.controller" if that key is set, otherwise the value of
+ * "mgmt.controller".  Returns a null pointer if the selected value is missing
+ * or is the empty string. */
+static const char *
+bridge_get_controller(const struct bridge *br)
+{
+    const char *target = cfg_get_string(0, "bridge.%s.controller", br->name);
+
+    if (!target) {
+        target = cfg_get_string(0, "mgmt.controller");
+    }
+    if (!target || !target[0]) {
+        return NULL;
+    }
+    return target;
+}
+
+/* Reconfigures the ports, OpenFlow listeners, OpenFlow snoops, and mirrors of
+ * bridge 'br' from the current configuration.
+ *
+ * This function only updates the bridge's own port and interface structures;
+ * it does not add ports to or remove ports from the datapath. */
+static void
+bridge_reconfigure_one(struct bridge *br)
+{
+ struct svec old_ports, new_ports, ifaces;
+ struct svec listeners, old_listeners;
+ struct svec snoops, old_snoops;
+ size_t i, j;
+
+ /* Collect old ports. */
+ svec_init(&old_ports);
+ for (i = 0; i < br->n_ports; i++) {
+ svec_add(&old_ports, br->ports[i]->name);
+ }
+ svec_sort(&old_ports);
+ assert(svec_is_unique(&old_ports));
+
+ /* Collect new ports. */
+ svec_init(&new_ports);
+ cfg_get_all_keys(&new_ports, "bridge.%s.port", br->name);
+ svec_sort(&new_ports);
+ /* When a controller is configured, make sure that a port named after the
+ * bridge itself is present. */
+ if (bridge_get_controller(br) && !svec_contains(&new_ports, br->name)) {
+ svec_add(&new_ports, br->name);
+ svec_sort(&new_ports);
+ }
+ if (!svec_is_unique(&new_ports)) {
+ VLOG_WARN("bridge %s: %s specified twice as bridge port",
+ br->name, svec_get_duplicate(&new_ports));
+ svec_unique(&new_ports);
+ }
+
+ ofproto_set_mgmt_id(br->ofproto, mgmt_id);
+
+ /* Get rid of deleted ports and add new ports. */
+ /* 'i' advances only when the port is kept, because port_destroy() is
+ * called on the current element. */
+ for (i = 0; i < br->n_ports; ) {
+ struct port *port = br->ports[i];
+ if (!svec_contains(&new_ports, port->name)) {
+ port_destroy(port);
+ } else {
+ i++;
+ }
+ }
+ for (i = 0; i < new_ports.n; i++) {
+ const char *name = new_ports.names[i];
+ if (!svec_contains(&old_ports, name)) {
+ port_create(br, name);
+ }
+ }
+ svec_destroy(&old_ports);
+ svec_destroy(&new_ports);
+
+ /* Reconfigure all ports. */
+ for (i = 0; i < br->n_ports; i++) {
+ port_reconfigure(br->ports[i]);
+ }
+
+ /* Check and delete duplicate interfaces. */
+ /* 'ifaces' holds the names seen so far and is re-sorted after every
+ * addition so that svec_contains() can be used on it. */
+ svec_init(&ifaces);
+ for (i = 0; i < br->n_ports; ) {
+ struct port *port = br->ports[i];
+ for (j = 0; j < port->n_ifaces; ) {
+ struct iface *iface = port->ifaces[j];
+ if (svec_contains(&ifaces, iface->name)) {
+ VLOG_ERR("bridge %s: %s interface is on multiple ports, "
+ "removing from %s",
+ br->name, iface->name, port->name);
+ iface_destroy(iface);
+ } else {
+ svec_add(&ifaces, iface->name);
+ svec_sort(&ifaces);
+ j++;
+ }
+ }
+ if (!port->n_ifaces) {
+ VLOG_ERR("%s port has no interfaces, dropping", port->name);
+ port_destroy(port);
+ } else {
+ i++;
+ }
+ }
+ svec_destroy(&ifaces);
+
+ /* Delete all flows if we're switching from connected to standalone or vice
+ * versa. (XXX Should we delete all flows if we are switching from one
+ * controller to another?) */
+ /* NOTE(review): no code follows the comment above in this function; the
+ * flush on a connected/standalone switch appears to be done in
+ * bridge_reconfigure_controller() -- confirm. */
+
+ /* Configure OpenFlow management listeners. */
+ /* With no listeners configured, default to a Unix domain socket in
+ * 'ovs_rundir'; a single listener named "none" disables listening. */
+ svec_init(&listeners);
+ cfg_get_all_strings(&listeners, "bridge.%s.openflow.listeners", br->name);
+ if (!listeners.n) {
+ svec_add_nocopy(&listeners, xasprintf("punix:%s/%s.mgmt",
+ ovs_rundir, br->name));
+ } else if (listeners.n == 1 && !strcmp(listeners.names[0], "none")) {
+ svec_clear(&listeners);
+ }
+ svec_sort_unique(&listeners);
+
+ svec_init(&old_listeners);
+ ofproto_get_listeners(br->ofproto, &old_listeners);
+ svec_sort_unique(&old_listeners);
+
+ /* Only touch the ofproto when the set of listeners actually changed. */
+ if (!svec_equal(&listeners, &old_listeners)) {
+ ofproto_set_listeners(br->ofproto, &listeners);
+ }
+ svec_destroy(&listeners);
+ svec_destroy(&old_listeners);
+
+ /* Configure OpenFlow controller connection snooping. */
+ /* Same defaulting rules as for the listeners above, with a ".snoop"
+ * socket name. */
+ svec_init(&snoops);
+ cfg_get_all_strings(&snoops, "bridge.%s.openflow.snoops", br->name);
+ if (!snoops.n) {
+ svec_add_nocopy(&snoops, xasprintf("punix:%s/%s.snoop",
+ ovs_rundir, br->name));
+ } else if (snoops.n == 1 && !strcmp(snoops.names[0], "none")) {
+ svec_clear(&snoops);
+ }
+ svec_sort_unique(&snoops);
+
+ svec_init(&old_snoops);
+ ofproto_get_snoops(br->ofproto, &old_snoops);
+ svec_sort_unique(&old_snoops);
+
+ if (!svec_equal(&snoops, &old_snoops)) {
+ ofproto_set_snoops(br->ofproto, &snoops);
+ }
+ svec_destroy(&snoops);
+ svec_destroy(&old_snoops);
+
+ mirror_reconfigure(br);
+}
+
+/* Applies the controller-related configuration of 'br' to its ofproto.
+ *
+ * If the bridge switches between having a controller and not having one, all
+ * of its flows are flushed first.
+ *
+ * With a controller configured, this sets up discovery or in-band control
+ * (optionally configuring the local port's IP address, netmask, and gateway),
+ * the fail mode, inactivity probe interval, maximum backoff, rate limits,
+ * STP, and remote command execution.  Per-bridge settings under
+ * "bridge.<name>.controller" take precedence over the global "mgmt" settings
+ * where both are consulted.
+ *
+ * Without a controller, this installs a single flow that sends every packet
+ * to OFPP_NORMAL and resets the controller-related settings.
+ *
+ * In both cases the controller target is finally handed to the ofproto. */
+static void
+bridge_reconfigure_controller(struct bridge *br)
+{
+    char *pfx = xasprintf("bridge.%s.controller", br->name);
+    const char *controller;
+
+    controller = bridge_get_controller(br);
+    if ((br->controller != NULL) != (controller != NULL)) {
+        /* Switching between connected and standalone operation. */
+        ofproto_flush_flows(br->ofproto);
+    }
+    free(br->controller);
+    br->controller = controller ? xstrdup(controller) : NULL;
+
+    if (controller) {
+        const char *fail_mode;
+        int max_backoff, probe;
+        int rate_limit, burst_limit;
+
+        if (!strcmp(controller, "discover")) {
+            ofproto_set_discovery(br->ofproto, true,
+                                  cfg_get_string(0, "%s.accept-regex", pfx),
+                                  cfg_get_bool(0, "%s.update-resolv.conf",
+                                               pfx));
+        } else {
+            struct netdev *netdev;
+            bool in_band;
+            int error;
+
+            /* In-band control is the default unless it is validly
+             * configured otherwise. */
+            in_band = (!cfg_is_valid(CFG_BOOL | CFG_REQUIRED,
+                                     "%s.in-band", pfx)
+                       || cfg_get_bool(0, "%s.in-band", pfx));
+            /* The last argument is a boolean (see the call above), so pass
+             * 'false' rather than NULL. */
+            ofproto_set_discovery(br->ofproto, false, NULL, false);
+            ofproto_set_in_band(br->ofproto, in_band);
+
+            error = netdev_open(br->name, NETDEV_ETH_TYPE_NONE, &netdev);
+            if (!error) {
+                if (cfg_is_valid(CFG_IP | CFG_REQUIRED, "%s.ip", pfx)) {
+                    struct in_addr ip, mask, gateway;
+                    ip.s_addr = cfg_get_ip(0, "%s.ip", pfx);
+                    mask.s_addr = cfg_get_ip(0, "%s.netmask", pfx);
+                    gateway.s_addr = cfg_get_ip(0, "%s.gateway", pfx);
+
+                    netdev_turn_flags_on(netdev, NETDEV_UP, true);
+                    if (!mask.s_addr) {
+                        mask.s_addr = guess_netmask(ip.s_addr);
+                    }
+                    if (!netdev_set_in4(netdev, ip, mask)) {
+                        VLOG_INFO("bridge %s: configured IP address "IP_FMT", "
+                                  "netmask "IP_FMT,
+                                  br->name, IP_ARGS(&ip.s_addr),
+                                  IP_ARGS(&mask.s_addr));
+                    }
+
+                    if (gateway.s_addr) {
+                        if (!netdev_add_router(gateway)) {
+                            VLOG_INFO("bridge %s: configured gateway "IP_FMT,
+                                      br->name, IP_ARGS(&gateway.s_addr));
+                        }
+                    }
+                }
+                netdev_close(netdev);
+            }
+        }
+
+        /* A missing fail mode is treated like "standalone" or "open". */
+        fail_mode = cfg_get_string(0, "%s.fail-mode", pfx);
+        if (!fail_mode) {
+            fail_mode = cfg_get_string(0, "mgmt.fail-mode");
+        }
+        ofproto_set_failure(br->ofproto,
+                            (!fail_mode
+                             || !strcmp(fail_mode, "standalone")
+                             || !strcmp(fail_mode, "open")));
+
+        probe = cfg_get_int(0, "%s.inactivity-probe", pfx);
+        ofproto_set_probe_interval(br->ofproto,
+            probe ? probe : cfg_get_int(0, "mgmt.inactivity-probe"));
+
+        max_backoff = cfg_get_int(0, "%s.max-backoff", pfx);
+        if (!max_backoff) {
+            max_backoff = cfg_get_int(0, "mgmt.max-backoff");
+            if (!max_backoff) {
+                max_backoff = 15;
+            }
+        }
+        ofproto_set_max_backoff(br->ofproto, max_backoff);
+
+        rate_limit = cfg_get_int(0, "%s.rate-limit", pfx);
+        if (!rate_limit) {
+            rate_limit = cfg_get_int(0, "mgmt.rate-limit");
+        }
+        burst_limit = cfg_get_int(0, "%s.burst-limit", pfx);
+        if (!burst_limit) {
+            burst_limit = cfg_get_int(0, "mgmt.burst-limit");
+        }
+        ofproto_set_rate_limit(br->ofproto, rate_limit, burst_limit);
+
+        ofproto_set_stp(br->ofproto, cfg_get_bool(0, "%s.stp", pfx));
+
+        if (cfg_has("%s.commands.acl", pfx)) {
+            struct svec command_acls;
+            char *command_acl;
+
+            svec_init(&command_acls);
+            cfg_get_all_strings(&command_acls, "%s.commands.acl", pfx);
+            command_acl = svec_join(&command_acls, ",", "");
+
+            ofproto_set_remote_execution(br->ofproto, command_acl,
+                                         cfg_get_string(0, "%s.commands.dir",
+                                                        pfx));
+
+            svec_destroy(&command_acls);
+            free(command_acl);
+        } else {
+            ofproto_set_remote_execution(br->ofproto, NULL, NULL);
+        }
+    } else {
+        union ofp_action action;
+        flow_t flow;
+
+        /* Set up a flow that matches every packet and directs them to
+         * OFPP_NORMAL (which goes to us). */
+        memset(&action, 0, sizeof action);
+        action.type = htons(OFPAT_OUTPUT);
+        action.output.len = htons(sizeof action);
+        action.output.port = htons(OFPP_NORMAL);
+        memset(&flow, 0, sizeof flow);
+        ofproto_add_flow(br->ofproto, &flow, OFPFW_ALL, 0,
+                         &action, 1, 0);
+
+        ofproto_set_in_band(br->ofproto, false);
+        ofproto_set_max_backoff(br->ofproto, 1);
+        ofproto_set_probe_interval(br->ofproto, 5);
+        ofproto_set_failure(br->ofproto, false);
+        ofproto_set_stp(br->ofproto, false);
+    }
+    free(pfx);
+
+    ofproto_set_controller(br->ofproto, br->controller);
+}
+
+/* Initializes 'ifaces' and fills it with the name of every interface on every
+ * port of 'br', in sorted order.  The names are asserted to be unique.  The
+ * caller is responsible for destroying 'ifaces'. */
+static void
+bridge_get_all_ifaces(const struct bridge *br, struct svec *ifaces)
+{
+    size_t port_idx;
+
+    svec_init(ifaces);
+    for (port_idx = 0; port_idx < br->n_ports; port_idx++) {
+        const struct port *port = br->ports[port_idx];
+        size_t iface_idx;
+
+        for (iface_idx = 0; iface_idx < port->n_ifaces; iface_idx++) {
+            svec_add(ifaces, port->ifaces[iface_idx]->name);
+        }
+    }
+    svec_sort(ifaces);
+    assert(svec_is_unique(ifaces));
+}
+
+/* Re-reads the datapath port number of every interface on 'br' from the
+ * datapath.  We do this for robustness, in case the administrator moves
+ * datapath ports around behind our back.
+ *
+ * Interfaces that the datapath no longer reports end up with a 'dp_ifidx' of
+ * -1, so only call this function from a context where those 'struct iface's
+ * will be removed from the bridge.  Otherwise, the -1 'dp_ifidx'es will cause
+ * trouble later when we try to send them to the datapath, which doesn't
+ * support UINT16_MAX+1 ports. */
+static void
+bridge_fetch_dp_ifaces(struct bridge *br)
+{
+    struct odp_port *dpif_ports;
+    size_t n_dpif_ports;
+    size_t port_idx, iface_idx, k;
+
+    /* Forget every interface number that we knew about. */
+    for (port_idx = 0; port_idx < br->n_ports; port_idx++) {
+        struct port *port = br->ports[port_idx];
+
+        for (iface_idx = 0; iface_idx < port->n_ifaces; iface_idx++) {
+            port->ifaces[iface_idx]->dp_ifidx = -1;
+        }
+    }
+    port_array_clear(&br->ifaces);
+
+    /* Record the number of each port that the datapath reports. */
+    dpif_port_list(&br->dpif, &dpif_ports, &n_dpif_ports);
+    for (k = 0; k < n_dpif_ports; k++) {
+        struct odp_port *p = &dpif_ports[k];
+        struct iface *iface = iface_lookup(br, p->devname);
+
+        if (!iface) {
+            /* Not one of our interfaces. */
+            continue;
+        }
+
+        if (iface->dp_ifidx >= 0) {
+            VLOG_WARN("dp%u reported interface %s twice",
+                      dpif_id(&br->dpif), p->devname);
+        } else if (iface_from_dp_ifidx(br, p->port)) {
+            VLOG_WARN("dp%u reported interface %"PRIu16" twice",
+                      dpif_id(&br->dpif), p->port);
+        } else {
+            port_array_set(&br->ifaces, p->port, iface);
+            iface->dp_ifidx = p->port;
+        }
+    }
+    free(dpif_ports);
+}
+
+/* Bridge packet processing functions. */
+
+/* Returns the entry in the bond hash table of 'port' that the Ethernet
+ * address 'mac' hashes to. */
+static struct bond_entry *
+lookup_bond_entry(const struct port *port, const uint8_t mac[ETH_ADDR_LEN])
+{
+    size_t hash = hash_bytes(mac, ETH_ADDR_LEN, 0);
+    size_t bucket = hash & BOND_MASK;
+
+    return &port->bond_hash[bucket];
+}
+
+/* Returns the index within 'port->ifaces' of the first enabled interface, or
+ * -1 if no interface on 'port' is enabled. */
+static int
+bond_choose_iface(const struct port *port)
+{
+    size_t idx = 0;
+
+    while (idx < port->n_ifaces) {
+        if (port->ifaces[idx]->enabled) {
+            return idx;
+        }
+        idx++;
+    }
+    return -1;
+}
+
+/* Chooses the interface on 'port' that 'flow' should be sent out on.
+ *
+ * A port with a single interface always uses that interface.  A bonded port
+ * uses the interface recorded in the bond hash entry for the flow's source
+ * MAC, assigning the entry a new enabled interface (and a fresh tag) if its
+ * current one is unset, out of range, or disabled.
+ *
+ * On success, stores the interface's datapath port number in '*dp_ifidx',
+ * ORs the relevant tags into '*tags', and returns true.  Returns false, after
+ * ORing 'port->no_ifaces_tag' into '*tags', if no interface is enabled. */
+static bool
+choose_output_iface(const struct port *port, const flow_t *flow,
+                    uint16_t *dp_ifidx, tag_type *tags)
+{
+    struct iface *chosen;
+
+    assert(port->n_ifaces);
+    if (port->n_ifaces != 1) {
+        struct bond_entry *entry = lookup_bond_entry(port, flow->dl_src);
+        bool stale = (entry->iface_idx < 0
+                      || entry->iface_idx >= port->n_ifaces
+                      || !port->ifaces[entry->iface_idx]->enabled);
+
+        if (stale) {
+            /* XXX select interface properly.  The current interface selection
+             * is only good for testing the rebalancing code. */
+            entry->iface_idx = bond_choose_iface(port);
+            if (entry->iface_idx < 0) {
+                *tags |= port->no_ifaces_tag;
+                return false;
+            }
+            entry->iface_tag = tag_create_random();
+        }
+        *tags |= entry->iface_tag;
+        chosen = port->ifaces[entry->iface_idx];
+    } else {
+        chosen = port->ifaces[0];
+    }
+
+    *dp_ifidx = chosen->dp_ifidx;
+    *tags |= chosen->tag;       /* Currently only used for bonding. */
+    return true;
+}
+
+/* Reacts to a carrier report of 'carrier' for bonded interface 'iface'.
+ *
+ * If the carrier state now agrees with the interface's enabled state, any
+ * pending enable/disable is canceled.  If it disagrees, an enable/disable is
+ * scheduled after the port's 'updelay' or 'downdelay'.  Nothing happens when
+ * the report does not change what is already pending. */
+static void
+bond_link_status_update(struct iface *iface, bool carrier)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+    struct port *port = iface->port;
+    bool matches_state = carrier == iface->enabled;
+    bool nothing_pending = iface->delay_expires == LLONG_MAX;
+
+    if (matches_state == nothing_pending) {
+        /* Nothing to do. */
+        return;
+    }
+
+    VLOG_INFO_RL(&rl, "interface %s: carrier %s",
+                 iface->name, carrier ? "detected" : "dropped");
+    if (!matches_state) {
+        int delay = carrier ? port->updelay : port->downdelay;
+
+        iface->delay_expires = time_msec() + delay;
+        if (delay) {
+            VLOG_INFO_RL(&rl,
+                         "interface %s: will be %s if it stays %s for %d ms",
+                         iface->name,
+                         carrier ? "enabled" : "disabled",
+                         carrier ? "up" : "down",
+                         delay);
+        }
+    } else {
+        iface->delay_expires = LLONG_MAX;
+        VLOG_INFO_RL(&rl, "interface %s: will not be %s",
+                     iface->name, carrier ? "disabled" : "enabled");
+    }
+}
+
+/* Picks a new active interface for bonded 'port' using bond_choose_iface(),
+ * gives the port a fresh active-interface tag, and logs the outcome. */
+static void
+bond_choose_active_iface(struct port *port)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+
+    port->active_iface = bond_choose_iface(port);
+    port->active_iface_tag = tag_create_random();
+
+    if (port->active_iface < 0) {
+        VLOG_WARN_RL(&rl, "port %s: all ports disabled, no active interface",
+                     port->name);
+    } else {
+        VLOG_INFO_RL(&rl, "port %s: active interface is now %s",
+                     port->name, port->ifaces[port->active_iface]->name);
+    }
+}
+
+/* Processes expired enable/disable delays on the bonded ports (those with at
+ * least two interfaces) of 'br'.
+ *
+ * When an interface's delay has expired, its enabled state is toggled and the
+ * flows that depend on it are revalidated.  A new active interface is chosen
+ * when the active one was disabled or when the port had none. */
+static void
+bond_run(struct bridge *br)
+{
+    size_t port_idx, iface_idx;
+
+    for (port_idx = 0; port_idx < br->n_ports; port_idx++) {
+        struct port *port = br->ports[port_idx];
+
+        if (port->n_ifaces < 2) {
+            continue;
+        }
+        for (iface_idx = 0; iface_idx < port->n_ifaces; iface_idx++) {
+            struct iface *iface = port->ifaces[iface_idx];
+
+            if (time_msec() < iface->delay_expires) {
+                continue;
+            }
+
+            iface->delay_expires = LLONG_MAX;
+            iface->enabled = !iface->enabled;
+            VLOG_WARN("interface %s: %s",
+                      iface->name,
+                      iface->enabled ? "enabled" : "disabled");
+
+            if (iface->enabled) {
+                if (port->active_iface < 0) {
+                    ofproto_revalidate(br->ofproto, port->no_ifaces_tag);
+                    bond_choose_active_iface(port);
+                }
+                iface->tag = tag_create_random();
+            } else {
+                ofproto_revalidate(br->ofproto, iface->tag);
+                if (iface->port_ifidx == port->active_iface) {
+                    ofproto_revalidate(br->ofproto,
+                                       port->active_iface_tag);
+                    bond_choose_active_iface(port);
+                }
+            }
+        }
+    }
+}
+
+/* Arranges for the poll loop to wake up when the next pending enable/disable
+ * delay expires on any interface of a bonded port (one with at least two
+ * interfaces) of 'br'. */
+static void
+bond_wait(struct bridge *br)
+{
+    size_t port_idx;
+
+    for (port_idx = 0; port_idx < br->n_ports; port_idx++) {
+        struct port *port = br->ports[port_idx];
+        size_t iface_idx;
+
+        if (port->n_ifaces >= 2) {
+            for (iface_idx = 0; iface_idx < port->n_ifaces; iface_idx++) {
+                struct iface *iface = port->ifaces[iface_idx];
+
+                if (iface->delay_expires != LLONG_MAX) {
+                    poll_timer_wait(iface->delay_expires - time_msec());
+                }
+            }
+        }
+    }
+}
+
+/* Fills in '*p' as the destination for sending 'flow', received on 'in_port',
+ * out of 'out_port'.  ORs the tags that the decision depends on into '*tags'.
+ *
+ * Returns false if 'out_port' is in an STP state other than disabled or
+ * forwarding, or if no output interface could be chosen; otherwise returns
+ * true with 'p->vlan' and 'p->dp_ifidx' set. */
+static bool
+set_dst(struct dst *p, const flow_t *flow,
+        const struct port *in_port, const struct port *out_port,
+        tag_type *tags)
+{
+    /* STP handling.
+     *
+     * XXX This uses too many tags: any broadcast flow will get one tag per
+     * destination port, and thus a broadcast on a switch of any size is likely
+     * to have all tag bits set.  We should figure out a way to be smarter.
+     *
+     * This is OK when STP is disabled, because stp_state_tag is 0 then. */
+    *tags |= out_port->stp_state_tag;
+    if ((out_port->stp_state & (STP_DISABLED | STP_FORWARDING)) == 0) {
+        return false;
+    }
+
+    if (out_port->vlan >= 0) {
+        /* 'out_port' has a VLAN of its own: send without a VLAN. */
+        p->vlan = OFP_VLAN_NONE;
+    } else if (in_port->vlan >= 0) {
+        p->vlan = in_port->vlan;
+    } else {
+        p->vlan = ntohs(flow->dl_vlan);
+    }
+    return choose_output_iface(out_port, flow, &p->dp_ifidx, tags);
+}
+
+/* Exchanges the contents of '*p' and '*q'. */
+static void
+swap_dst(struct dst *p, struct dst *q)
+{
+    struct dst saved = *q;
+
+    *q = *p;
+    *p = saved;
+}
+
+/* Moves all the dsts with vlan == 'vlan' to the front of the 'n_dsts' in
+ * 'dsts'. (This may help performance by reducing the number of VLAN changes
+ * that we push to the datapath. We could in fact fully sort the array by
+ * vlan, but in most cases there are at most two different vlan tags so that's
+ * possibly overkill.)
+ *
+ * The partition is done in place by swapping elements from the two ends of
+ * the array, so the relative order of the elements within each group is not
+ * necessarily preserved. An empty array ('n_dsts' == 0) is left alone. */
+static void
+partition_dsts(struct dst *dsts, size_t n_dsts, int vlan)
+{
+ /* 'first' scans forward from the start; 'last' points one past the
+ * element that the backward scan examines next. */
+ struct dst *first = dsts;
+ struct dst *last = dsts + n_dsts;
+
+ while (first != last) {
+ /* Invariants:
+ * - All dsts < first have vlan == 'vlan'.
+ * - All dsts >= last have vlan != 'vlan'.
+ * - first < last. */
+ while (first->vlan == vlan) {
+ if (++first == last) {
+ return;
+ }
+ }
+
+ /* Same invariants, plus one additional:
+ * - first->vlan != vlan.
+ */
+ while (last[-1].vlan != vlan) {
+ if (--last == first) {
+ return;
+ }
+ }
+
+ /* Same invariants, plus one additional:
+ * - last[-1].vlan == vlan.*/
+ /* Exchange the misplaced pair and narrow the range from both ends. */
+ swap_dst(first++, --last);
+ }
+}
+
+/* Returns the result of ffs() on 'mask'.  The build-time assertion checks
+ * that a mirror mask is no wider than 'unsigned int', so that no bits are
+ * lost in the call. */
+static int
+mirror_mask_ffs(mirror_mask_t mask)
+{
+    BUILD_ASSERT_DECL(sizeof(unsigned int) >= sizeof(mask));
+    int lowest_set = ffs(mask);
+
+    return lowest_set;
+}
+
+/* Returns true if any of the 'n_dsts' elements of 'dsts' has the same VLAN
+ * and the same datapath port number as '*test', false otherwise. */
+static bool
+dst_is_duplicate(const struct dst *dsts, size_t n_dsts,
+                 const struct dst *test)
+{
+    const struct dst *end = dsts + n_dsts;
+    const struct dst *d;
+
+    for (d = dsts; d != end; d++) {
+        if (d->vlan == test->vlan && d->dp_ifidx == test->dp_ifidx) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Returns true if 'port' has no implicit VLAN of its own (port->vlan < 0)
+ * and the bit for 'vlan' is set in its trunk bitmap. */
+static bool
+port_trunks_vlan(const struct port *port, uint16_t vlan)
+{
+    if (port->vlan >= 0) {
+        /* Ports with an implicit VLAN are not trunks. */
+        return false;
+    }
+    return bitmap_is_set(port->trunks, vlan) != 0;
+}
+
+/* Returns true if 'port' carries 'vlan', either because 'vlan' is the port's
+ * own VLAN or because the port trunks it. */
+static bool
+port_includes_vlan(const struct port *port, uint16_t vlan)
+{
+    if (vlan == port->vlan) {
+        return true;
+    }
+    return port_trunks_vlan(port, vlan);
+}
+
+/* Fills 'dsts' with the (datapath port, VLAN) pairs to which a packet in
+ * 'flow', received on 'in_port' in VLAN 'vlan', should be sent on bridge
+ * 'br', and returns the number of elements stored.
+ *
+ * 'out_port' selects the normal forwarding destination:
+ *
+ *    - FLOOD_PORT: every port other than 'in_port' that includes 'vlan' and
+ *      is not a mirror output port.
+ *
+ *    - A null pointer: no normal destination.
+ *
+ *    - Anything else: that one port.
+ *
+ * Destinations for the mirrors selected by 'in_port''s src_mirrors and by the
+ * dst_mirrors of each port actually output to are then appended, skipping
+ * ones that duplicate an earlier element.  Finally the elements whose vlan
+ * equals the flow's own dl_vlan are moved to the front.
+ *
+ * Tags that the result depends on are ORed into '*tags'.
+ *
+ * No bounds checking is done on 'dsts'; the caller must make it big enough. */
+static size_t
+compose_dsts(const struct bridge *br, const flow_t *flow, uint16_t vlan,
+             const struct port *in_port, const struct port *out_port,
+             struct dst dsts[], tag_type *tags)
+{
+    mirror_mask_t mirrors = in_port->src_mirrors;
+    struct dst *dst = dsts;
+    size_t i;
+
+    *tags |= in_port->stp_state_tag;
+    if (out_port == FLOOD_PORT) {
+        /* XXX use ODP_FLOOD if no vlans or bonding. */
+        /* XXX even better, define each VLAN as a datapath port group */
+        for (i = 0; i < br->n_ports; i++) {
+            struct port *port = br->ports[i];
+            if (port != in_port && port_includes_vlan(port, vlan)
+                && !port->is_mirror_output_port
+                && set_dst(dst, flow, in_port, port, tags)) {
+                mirrors |= port->dst_mirrors;
+                dst++;
+            }
+        }
+    } else if (out_port && set_dst(dst, flow, in_port, out_port, tags)) {
+        mirrors |= out_port->dst_mirrors;
+        dst++;
+    }
+
+    /* Visit each selected mirror once, lowest bit first. */
+    while (mirrors) {
+        struct mirror *m = br->mirrors[mirror_mask_ffs(mirrors) - 1];
+        if (!m->n_vlans || vlan_is_mirrored(m, vlan)) {
+            if (m->out_port) {
+                if (set_dst(dst, flow, in_port, m->out_port, tags)
+                    && !dst_is_duplicate(dsts, dst - dsts, dst)) {
+                    dst++;
+                }
+            } else {
+                for (i = 0; i < br->n_ports; i++) {
+                    struct port *port = br->ports[i];
+                    if (port_includes_vlan(port, m->out_vlan)
+                        && set_dst(dst, flow, in_port, port, tags))
+                    {
+                        if (port->vlan < 0) {
+                            dst->vlan = m->out_vlan;
+                        }
+
+                        /* The duplicate check has to look at the final vlan,
+                         * so it must come after the assignment above. */
+                        if (dst_is_duplicate(dsts, dst - dsts, dst)) {
+                            continue;
+                        }
+                        if (dst->dp_ifidx == flow->in_port
+                            && dst->vlan == vlan) {
+                            /* Don't send out input port on same VLAN. */
+                            continue;
+                        }
+                        dst++;
+                    }
+                }
+            }
+        }
+        mirrors &= mirrors - 1;     /* Clear the lowest set bit. */
+    }
+
+    partition_dsts(dsts, dst - dsts, ntohs(flow->dl_vlan));
+    return dst - dsts;
+}
+
+/* Debugging aid: writes each of the 'n' elements of 'dsts' to stdout as
+ * ">p<port>", followed by "v<vlan>" unless the element's vlan is
+ * OFP_VLAN_NONE. */
+static void UNUSED
+print_dsts(const struct dst *dsts, size_t n)
+{
+    size_t i;
+
+    for (i = 0; i < n; i++) {
+        const struct dst *d = &dsts[i];
+
+        printf(">p%"PRIu16, d->dp_ifidx);
+        if (d->vlan == OFP_VLAN_NONE) {
+            continue;
+        }
+        printf("v%"PRIu16, d->vlan);
+    }
+}
+
+/* Appends to 'actions' the datapath actions that deliver 'flow' to every
+ * destination chosen by compose_dsts() for the same arguments.
+ *
+ * Each destination becomes an ODPAT_OUTPUT action.  Whenever a destination's
+ * vlan differs from the VLAN currently in effect (initially the flow's own
+ * dl_vlan), an ODPAT_STRIP_VLAN or ODPAT_SET_VLAN_VID action is emitted first
+ * to switch to it. */
+static void
+compose_actions(struct bridge *br, const flow_t *flow, uint16_t vlan,
+                const struct port *in_port, const struct port *out_port,
+                tag_type *tags, struct odp_actions *actions)
+{
+    struct dst dsts[DP_MAX_PORTS * (MAX_MIRRORS + 1)];
+    uint16_t vlan_now;
+    size_t n_dsts;
+    size_t i;
+
+    n_dsts = compose_dsts(br, flow, vlan, in_port, out_port, dsts, tags);
+    vlan_now = ntohs(flow->dl_vlan);
+
+    for (i = 0; i < n_dsts; i++) {
+        const struct dst *d = &dsts[i];
+        union odp_action *act;
+
+        if (d->vlan != vlan_now) {
+            if (d->vlan != OFP_VLAN_NONE) {
+                act = odp_actions_add(actions, ODPAT_SET_VLAN_VID);
+                act->vlan_vid.vlan_vid = htons(d->vlan);
+            } else {
+                odp_actions_add(actions, ODPAT_STRIP_VLAN);
+            }
+            vlan_now = d->vlan;
+        }
+
+        act = odp_actions_add(actions, ODPAT_OUTPUT);
+        act->output.port = d->dp_ifidx;
+    }
+}
+
+/* Returns true if 'flow' has the ARP Ethernet type and a broadcast
+ * destination, 'packet' holds at least sizeof(struct arp_eth_header) bytes,
+ * and the 'ar_op' field read from the start of 'packet''s data equals
+ * ARP_OP_REQUEST.
+ *
+ * 'packet' must be nonnull: packet->data is read unconditionally.
+ *
+ * NOTE(review): the name says "reply" but the opcode compared against is
+ * ARP_OP_REQUEST, the comparison does no byte-order conversion, and the ARP
+ * header is assumed to begin at packet->data -- confirm that all three are
+ * intended. */
+static bool
+is_bcast_arp_reply(const flow_t *flow, const struct ofpbuf *packet)
+{
+ struct arp_eth_header *arp = (struct arp_eth_header *) packet->data;
+ return (flow->dl_type == htons(ETH_TYPE_ARP)
+ && eth_addr_is_broadcast(flow->dl_dst)
+ && packet->size >= sizeof(struct arp_eth_header)
+ && arp->ar_op == ARP_OP_REQUEST);
+}
+
+/* Decides how bridge 'br' should handle 'flow' and appends the resulting
+ * datapath actions to 'actions'. Tags that the decision depends on are ORed
+ * into '*tags'.
+ *
+ * 'packet' is the packet being processed, or NULL if there is none (the code
+ * below treats that case as revalidation: MAC learning is skipped and some of
+ * the warnings are suppressed).
+ *
+ * If the composed actions may be applied to any packet in the given 'flow',
+ * returns true. Otherwise, the actions should only be applied to 'packet', or
+ * not at all, if 'packet' was NULL. */
+static bool
+process_flow(struct bridge *br, const flow_t *flow,
+ const struct ofpbuf *packet, struct odp_actions *actions,
+ tag_type *tags)
+{
+ struct iface *in_iface;
+ struct port *in_port;
+ struct port *out_port = NULL; /* By default, drop the packet/flow. */
+ int vlan;
+
+ /* Find the interface and port structure for the received packet. */
+ in_iface = iface_from_dp_ifidx(br, flow->in_port);
+ if (!in_iface) {
+ /* No interface? Something fishy... */
+ if (packet != NULL) {
+ /* Odd. A few possible reasons here:
+ *
+ * - We deleted an interface but there are still a few packets
+ * queued up from it.
+ *
+ * - Someone externally added an interface (e.g. with "ovs-dpctl
+ * add-if") that we don't know about.
+ *
+ * - Packet arrived on the local port but the local port is not
+ * one of our bridge ports.
+ */
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+ VLOG_WARN_RL(&rl, "bridge %s: received packet on unknown "
+ "interface %"PRIu16, br->name, flow->in_port);
+ }
+
+ /* Return without adding any actions, to drop packets on this flow. */
+ return true;
+ }
+ in_port = in_iface->port;
+
+ /* Figure out what VLAN this packet belongs to.
+ *
+ * Note that dl_vlan of 0 and of OFP_VLAN_NONE both mean that the packet
+ * belongs to VLAN 0, so we should treat both cases identically. (In the
+ * former case, the packet has an 802.1Q header that specifies VLAN 0,
+ * presumably to allow a priority to be specified. In the latter case, the
+ * packet does not have any 802.1Q header.) */
+ vlan = ntohs(flow->dl_vlan);
+ if (vlan == OFP_VLAN_NONE) {
+ vlan = 0;
+ }
+ if (in_port->vlan >= 0) {
+ /* Port with an implicit VLAN: only untagged (VLAN 0) traffic is
+ * accepted, and it is treated as belonging to the port's VLAN. */
+ if (vlan) {
+ /* XXX support double tagging? */
+ if (packet != NULL) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+ VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %"PRIu16" tagged "
+ "packet received on port %s configured with "
+ "implicit VLAN %"PRIu16,
+ br->name, ntohs(flow->dl_vlan),
+ in_port->name, in_port->vlan);
+ }
+ goto done;
+ }
+ vlan = in_port->vlan;
+ } else {
+ /* Trunk port: the packet's VLAN must be one that the port trunks. */
+ if (!port_includes_vlan(in_port, vlan)) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+ VLOG_WARN_RL(&rl, "bridge %s: dropping VLAN %d tagged "
+ "packet received on port %s not configured for "
+ "trunking VLAN %d",
+ br->name, vlan, in_port->name, vlan);
+ goto done;
+ }
+ }
+
+ /* Drop frames for ports that STP wants entirely killed (both for
+ * forwarding and for learning). Later, after we do learning, we'll drop
+ * the frames that STP wants to do learning but not forwarding on. */
+ if (in_port->stp_state & (STP_LISTENING | STP_BLOCKING)) {
+ goto done;
+ }
+
+ /* Drop frames for reserved multicast addresses. */
+ if (eth_addr_is_reserved(flow->dl_dst)) {
+ goto done;
+ }
+
+ /* Drop frames on ports reserved for mirroring. */
+ if (in_port->is_mirror_output_port) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+ VLOG_WARN_RL(&rl, "bridge %s: dropping packet received on port %s, "
+ "which is reserved exclusively for mirroring",
+ br->name, in_port->name);
+ goto done;
+ }
+
+ /* Drop multicast and broadcast packets on inactive bonded interfaces, to
+ * avoid receiving duplicates. */
+ if (in_port->n_ifaces > 1 && eth_addr_is_multicast(flow->dl_dst)) {
+ *tags |= in_port->active_iface_tag;
+ if (in_port->active_iface != in_iface->port_ifidx) {
+ goto done;
+ }
+ }
+
+ /* MAC learning. */
+ out_port = FLOOD_PORT;
+ if (br->ml) {
+ int out_port_idx;
+ bool may_learn;
+
+ if (!packet) {
+ /* Don't try to learn from revalidation. */
+ may_learn = false;
+ } else if (in_port->n_ifaces > 1) {
+ /* If the packet arrived on a bonded port, don't learn from it
+ * unless we haven't learned any port at all for that address
+ * (because we probably sent the packet on one bonded interface and
+ * got it back on the other). Broadcast ARP replies are an
+ * exception to this rule: the host has moved to another switch. */
+ int src_idx = mac_learning_lookup(br->ml, flow->dl_src, vlan);
+ may_learn = (src_idx < 0
+ || src_idx == in_port->port_idx
+ || is_bcast_arp_reply(flow, packet));
+ } else {
+ may_learn = true;
+ }
+
+ /* Learn source MAC. */
+ if (may_learn) {
+ tag_type rev_tag = mac_learning_learn(br->ml, flow->dl_src,
+ vlan, in_port->port_idx);
+ if (rev_tag) {
+ /* The log messages here could actually be useful in debugging,
+ * so keep the rate limit relatively high. */
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(30,
+ 300);
+ VLOG_DBG_RL(&rl, "bridge %s: learned that "ETH_ADDR_FMT" is "
+ "on port %s in VLAN %d",
+ br->name, ETH_ADDR_ARGS(flow->dl_src),
+ in_port->name, vlan);
+ ofproto_revalidate(br->ofproto, rev_tag);
+ }
+ }
+
+ /* Determine output port. */
+ out_port_idx = mac_learning_lookup_tag(br->ml, flow->dl_dst, vlan,
+ tags);
+ /* An index outside the current port array leaves out_port set to
+ * FLOOD_PORT. */
+ if (out_port_idx >= 0 && out_port_idx < br->n_ports) {
+ out_port = br->ports[out_port_idx];
+ }
+ }
+
+ /* Don't send packets out their input ports. Don't forward frames that STP
+ * wants us to discard. */
+ if (in_port == out_port || in_port->stp_state == STP_LEARNING) {
+ out_port = NULL;
+ }
+
+done:
+ /* Every "goto done" above arrives with out_port still NULL, so in those
+ * cases only mirroring destinations, if any, produce actions. */
+ compose_actions(br, flow, vlan, in_port, out_port, tags, actions);
+
+ /*
+ * We send out only a single packet, instead of setting up a flow, if the
+ * packet is an ARP directed to broadcast that arrived on a bonded
+ * interface. In such a situation ARP requests and replies must be handled
+ * differently, but OpenFlow unfortunately can't distinguish them.
+ */
+ return (in_port->n_ifaces < 2
+ || flow->dl_type != htons(ETH_TYPE_ARP)
+ || !eth_addr_is_broadcast(flow->dl_dst));
+}
+
+/* Handles a change, of kind 'reason', to the datapath port described by
+ * 'opp' on the bridge passed as 'br_'.  Ports that do not correspond to one
+ * of the bridge's interfaces are ignored.
+ *
+ * For OFPPR_DELETE the interface is destroyed, along with its port if that
+ * leaves the port without interfaces.  For any other reason the interface's
+ * MAC address is refreshed from 'opp' and, on a bonded port, the link status
+ * is updated.
+ *
+ * Careful: 'opp' is in host byte order and opp->port_no is an OFP port
+ * number. */
+static void
+bridge_port_changed_ofhook_cb(enum ofp_port_reason reason,
+                              const struct ofp_phy_port *opp,
+                              void *br_)
+{
+    struct bridge *br = br_;
+    struct iface *iface;
+    struct port *port;
+
+    iface = iface_from_dp_ifidx(br, ofp_port_to_odp_port(opp->port_no));
+    if (iface == NULL) {
+        return;
+    }
+    port = iface->port;
+
+    if (reason != OFPPR_DELETE) {
+        memcpy(iface->mac, opp->hw_addr, ETH_ADDR_LEN);
+        if (port->n_ifaces > 1) {
+            bool up = (opp->state & OFPPS_LINK_DOWN) == 0;
+
+            bond_link_status_update(iface, up);
+            port_update_bond_compat(port);
+        }
+        return;
+    }
+
+    VLOG_WARN("bridge %s: interface %s deleted unexpectedly",
+              br->name, iface->name);
+    iface_destroy(iface);
+    if (port->n_ifaces == 0) {
+        VLOG_WARN("bridge %s: port %s has no interfaces, dropping",
+                  br->name, port->name);
+        port_destroy(port);
+    }
+
+    bridge_flush(br);
+}
+
+/* ofhooks callback that handles 'flow' for the bridge passed as 'br_': it
+ * increments the bridge_process_flow coverage counter and returns whatever
+ * process_flow() returns for the same arguments.
+ *
+ * The "#if 0" region is compiled out; note that it refers to a 'payload'
+ * variable that is not declared in this function. */
+static bool
+bridge_normal_ofhook_cb(const flow_t *flow, const struct ofpbuf *packet,
+ struct odp_actions *actions, tag_type *tags, void *br_)
+{
+ struct bridge *br = br_;
+
+#if 0
+ if (flow->dl_type == htons(OFP_DL_TYPE_NOT_ETH_TYPE)
+ && eth_addr_equals(flow->dl_dst, stp_eth_addr)) {
+ brstp_receive(br, flow, payload);
+ return true;
+ }
+#endif
+
+ COVERAGE_INC(bridge_process_flow);
+ return process_flow(br, flow, packet, actions, tags);
+}
+
+/* ofhooks callback that credits 'n_bytes' of traffic for 'flow' to bonding
+ * statistics on the bridge passed as 'br_'.
+ *
+ * For every ODPAT_OUTPUT action among the 'n_actions' elements of 'actions'
+ * whose port belongs to a bonded port (one with at least two interfaces),
+ * 'n_bytes' is added to the bond entry looked up for flow->dl_src.  Does
+ * nothing if the bridge has no bonded ports. */
+static void
+bridge_account_flow_ofhook_cb(const flow_t *flow,
+                              const union odp_action *actions,
+                              size_t n_actions, unsigned long long int n_bytes,
+                              void *br_)
+{
+    struct bridge *br = br_;
+    size_t i;
+
+    if (!br->has_bonded_ports) {
+        return;
+    }
+
+    for (i = 0; i < n_actions; i++) {
+        const union odp_action *act = &actions[i];
+        struct bond_entry *entry;
+        struct port *port;
+
+        if (act->type != ODPAT_OUTPUT) {
+            continue;
+        }
+
+        port = port_from_dp_ifidx(br, act->output.port);
+        if (!port || port->n_ifaces < 2) {
+            continue;
+        }
+
+        entry = lookup_bond_entry(port, flow->dl_src);
+        entry->tx_bytes += n_bytes;
+    }
+}
+
+/* ofhooks callback that rebalances every bonded port on the bridge passed as
+ * 'br_', at most once per 10000 ms.  Does nothing if the bridge has no bonded
+ * ports or if the next rebalance time has not yet arrived. */
+static void
+bridge_account_checkpoint_ofhook_cb(void *br_)
+{
+    struct bridge *br = br_;
+    size_t idx;
+
+    /* The current ofproto implementation calls this callback at least once a
+     * second, so this timer implementation is sufficient. */
+    if (!br->has_bonded_ports || time_msec() < br->bond_next_rebalance) {
+        return;
+    }
+    br->bond_next_rebalance = time_msec() + 10000;
+
+    for (idx = 0; idx < br->n_ports; idx++) {
+        struct port *candidate = br->ports[idx];
+
+        if (candidate->n_ifaces <= 1) {
+            continue;
+        }
+        bond_rebalance_port(candidate);
+    }
+}
+
+/* The bridge's ofhooks callback table. The initializer is positional, so the
+ * entries must stay in the order in which struct ofhooks declares its
+ * members. */
+static struct ofhooks bridge_ofhooks = {
+ bridge_port_changed_ofhook_cb,
+ bridge_normal_ofhook_cb,
+ bridge_account_flow_ofhook_cb,
+ bridge_account_checkpoint_ofhook_cb,
+};
+
+/* Statistics for a single interface on a bonded port, used for load-based
+ * bond rebalancing.
+ *
+ * These are short-lived working structures: bond_rebalance_port() builds an
+ * array of them on its stack for one rebalancing run. */
+struct slave_balance {
+ struct iface *iface; /* The interface. */
+ uint64_t tx_bytes; /* Sum of hashes[*]->tx_bytes. */
+
+ /* All the "bond_entry"s that are assigned to this interface, in order of
+ * increasing tx_bytes.
+ *
+ * 'hashes' is not separately allocated: bond_rebalance_port() points it
+ * into its own array of bond_entry pointers. NULL if no entry has been
+ * assigned. 'n_hashes' is the number of elements. */
+ struct bond_entry **hashes;
+ size_t n_hashes;
+};
+
+/* qsort() comparison function for an array of pointers to bond_entries.
+ * Orders entries in ascending order by the interface to which they are
+ * assigned, and within a single interface in ascending order of bytes
+ * transmitted. */
+static int
+compare_bond_entries(const void *a_, const void *b_)
+{
+    const struct bond_entry *lhs = *(const struct bond_entry *const *) a_;
+    const struct bond_entry *rhs = *(const struct bond_entry *const *) b_;
+
+    if (lhs->iface_idx < rhs->iface_idx) {
+        return -1;
+    }
+    if (lhs->iface_idx > rhs->iface_idx) {
+        return 1;
+    }
+
+    /* Same interface: fall back to traffic volume. */
+    if (lhs->tx_bytes < rhs->tx_bytes) {
+        return -1;
+    }
+    if (lhs->tx_bytes > rhs->tx_bytes) {
+        return 1;
+    }
+    return 0;
+}
+
+/* qsort() comparison function for an array of slave_balances.  Orders them
+ * so that enabled interfaces come first, and otherwise in *descending* order
+ * by number of bytes transmitted. */
+static int
+compare_slave_balance(const void *a_, const void *b_)
+{
+    const struct slave_balance *x = a_;
+    const struct slave_balance *y = b_;
+
+    if (x->iface->enabled != y->iface->enabled) {
+        return x->iface->enabled ? -1 : 1;
+    }
+
+    /* Heavier load sorts earlier. */
+    if (x->tx_bytes > y->tx_bytes) {
+        return -1;
+    }
+    if (x->tx_bytes < y->tx_bytes) {
+        return 1;
+    }
+    return 0;
+}
+
+/* Exchanges the contents of '*a' and '*b'. */
+static void
+swap_bals(struct slave_balance *a, struct slave_balance *b)
+{
+    struct slave_balance saved;
+
+    saved = *b;
+    *b = *a;
+    *a = saved;
+}
+
+/* Puts the 'n_bals' slave_balance structures in 'bals' back into descending
+ * order of tx_bytes, given that 'p' (and only 'p') might be out of place.
+ * 'p' is bubbled toward the front while it is larger than its predecessor,
+ * then toward the back while it is smaller than its successor.
+ *
+ * This function invalidates 'p', since the structure it designated might now
+ * be in a different memory location. */
+static void
+resort_bals(struct slave_balance *p,
+            struct slave_balance bals[], size_t n_bals)
+{
+    struct slave_balance *final;
+
+    if (n_bals <= 1) {
+        return;
+    }
+
+    while (p > bals && p[-1].tx_bytes < p->tx_bytes) {
+        swap_bals(p - 1, p);
+        p--;
+    }
+
+    final = &bals[n_bals - 1];
+    while (p < final && p[1].tx_bytes > p->tx_bytes) {
+        swap_bals(p + 1, p);
+        p++;
+    }
+}
+
+/* If debug logging is enabled, logs one line that describes the 'n_bals'
+ * elements of 'bals' for bonded 'port': for each interface its name, its
+ * load in kB, whether it is disabled, and the index and load of each bond
+ * hash entry assigned to it. */
+static void
+log_bals(const struct slave_balance *bals, size_t n_bals, struct port *port)
+{
+    struct ds text = DS_EMPTY_INITIALIZER;
+    size_t slave, h;
+
+    if (!VLOG_IS_DBG_ENABLED()) {
+        return;
+    }
+
+    for (slave = 0; slave < n_bals; slave++) {
+        const struct slave_balance *bal = &bals[slave];
+
+        if (slave > 0) {
+            ds_put_char(&text, ',');
+        }
+        ds_put_format(&text, " %s %"PRIu64"kB",
+                      bal->iface->name, bal->tx_bytes / 1024);
+
+        if (!bal->iface->enabled) {
+            ds_put_cstr(&text, " (disabled)");
+        }
+        if (bal->n_hashes == 0) {
+            continue;
+        }
+
+        ds_put_cstr(&text, " (");
+        for (h = 0; h < bal->n_hashes; h++) {
+            const struct bond_entry *entry = bal->hashes[h];
+
+            if (h > 0) {
+                ds_put_cstr(&text, " + ");
+            }
+            ds_put_format(&text, "h%td: %"PRIu64"kB",
+                          entry - port->bond_hash, entry->tx_bytes / 1024);
+        }
+        ds_put_cstr(&text, ")");
+    }
+    VLOG_DBG("bond %s:%s", port->name, ds_cstr(&text));
+    ds_destroy(&text);
+}
+
+/* Shifts 'hash' from 'from' to 'to' within the bonded port that owns them.
+ *
+ * 'hash' must be one of the entries listed in from->hashes; if it is not,
+ * nothing is changed.  Otherwise:
+ *
+ *    - 'hash' is removed from from->hashes (it is deliberately not added to
+ *      to->hashes; see below).
+ *
+ *    - hash->tx_bytes is subtracted from from->tx_bytes and added to
+ *      to->tx_bytes.
+ *
+ *    - Flows tagged with the entry's old iface_tag are scheduled for
+ *      revalidation, and the entry is pointed at 'to''s interface with a
+ *      fresh random tag.
+ *
+ * The caller is responsible for re-sorting its slave_balance array. */
+static void
+bond_shift_load(struct slave_balance *from, struct slave_balance *to,
+                struct bond_entry *hash)
+{
+    struct port *port = from->iface->port;
+    uint64_t delta = hash->tx_bytes;
+    size_t idx;
+
+    /* Find where 'hash' sits in from->hashes.  The position has to come from
+     * a search of that array: from->hashes holds pointers sorted by load, so
+     * the distance between two bond_entry addresses says nothing about their
+     * positions in it. */
+    for (idx = 0; idx < from->n_hashes; idx++) {
+        if (from->hashes[idx] == hash) {
+            break;
+        }
+    }
+    if (idx >= from->n_hashes) {
+        /* 'hash' is not assigned to 'from', so there is nothing to shift. */
+        return;
+    }
+
+    VLOG_INFO("bond %s: shift %"PRIu64"kB of load (with hash %td) "
+              "from %s to %s (now carrying %"PRIu64"kB and "
+              "%"PRIu64"kB load, respectively)",
+              port->name, delta / 1024, hash - port->bond_hash,
+              from->iface->name, to->iface->name,
+              (from->tx_bytes - delta) / 1024,
+              (to->tx_bytes + delta) / 1024);
+
+    /* Delete element from from->hashes.
+     *
+     * We don't bother to add the element to to->hashes because not only would
+     * it require more work, the only purpose it would be to allow that hash to
+     * be migrated to another slave in this rebalancing run, and there is no
+     * point in doing that. */
+    if (idx == 0) {
+        /* Dropping the first element only needs the start to move up. */
+        from->hashes++;
+    } else {
+        memmove(from->hashes + idx, from->hashes + idx + 1,
+                (from->n_hashes - (idx + 1)) * sizeof *from->hashes);
+    }
+    from->n_hashes--;
+
+    /* Shift load away from 'from' to 'to'. */
+    from->tx_bytes -= delta;
+    to->tx_bytes += delta;
+
+    /* Arrange for flows to be revalidated. */
+    ofproto_revalidate(port->bridge->ofproto, hash->iface_tag);
+    hash->iface_idx = to->iface->port_ifidx;
+    hash->iface_tag = tag_create_random();
+}
+
+/* Rebalances load across the enabled interfaces of bonded 'port' by moving
+ * bond hash entries away from more heavily loaded interfaces to the least
+ * loaded one, then halves every entry's tx_bytes so that older traffic counts
+ * for progressively less.
+ *
+ * If none of the port's interfaces is enabled, returns early without moving
+ * anything and without halving the counters.
+ *
+ * NOTE(review): bals[n_bals - 1] is read unconditionally and 'bals' has
+ * DP_MAX_PORTS elements, so this assumes 1 <= port->n_ifaces <= DP_MAX_PORTS
+ * -- confirm against callers. */
+static void
+bond_rebalance_port(struct port *port)
+{
+ struct slave_balance bals[DP_MAX_PORTS];
+ size_t n_bals;
+ struct bond_entry *hashes[BOND_MASK + 1];
+ struct slave_balance *b, *from, *to;
+ struct bond_entry *e;
+ size_t i;
+
+ /* Sets up 'bals' to describe each of the port's interfaces, sorted in
+ * descending order of tx_bytes, so that bals[0] represents the most
+ * heavily loaded slave and bals[n_bals - 1] represents the least heavily
+ * loaded slave.
+ *
+ * The code is a bit tricky: to avoid dynamically allocating a 'hashes'
+ * array for each slave_balance structure, we sort our local array of
+ * hashes in order by slave, so that all of the hashes for a given slave
+ * become contiguous in memory, and then we point each 'hashes' members of
+ * a slave_balance structure to the start of a contiguous group. */
+ n_bals = port->n_ifaces;
+ for (b = bals; b < &bals[n_bals]; b++) {
+ b->iface = port->ifaces[b - bals];
+ b->tx_bytes = 0;
+ b->hashes = NULL;
+ b->n_hashes = 0;
+ }
+ for (i = 0; i <= BOND_MASK; i++) {
+ hashes[i] = &port->bond_hash[i];
+ }
+ qsort(hashes, BOND_MASK + 1, sizeof *hashes, compare_bond_entries);
+ for (i = 0; i <= BOND_MASK; i++) {
+ e = hashes[i];
+ /* Entries whose iface_idx is out of range (e.g. -1) are not counted
+ * against any slave. At this point 'bals' is still in interface
+ * order, so iface_idx indexes it directly. */
+ if (e->iface_idx >= 0 && e->iface_idx < port->n_ifaces) {
+ b = &bals[e->iface_idx];
+ b->tx_bytes += e->tx_bytes;
+ if (!b->hashes) {
+ b->hashes = &hashes[i];
+ }
+ b->n_hashes++;
+ }
+ }
+ qsort(bals, n_bals, sizeof *bals, compare_slave_balance);
+ log_bals(bals, n_bals, port);
+
+ /* Discard slaves that aren't enabled (which were sorted to the back of the
+ * array earlier). */
+ while (!bals[n_bals - 1].iface->enabled) {
+ n_bals--;
+ if (!n_bals) {
+ return;
+ }
+ }
+
+ /* Shift load from the most-loaded slaves to the least-loaded slaves. */
+ to = &bals[n_bals - 1];
+ for (from = bals; from < to; ) {
+ uint64_t overload = from->tx_bytes - to->tx_bytes;
+ if (overload < to->tx_bytes >> 5 || overload < 100000) {
+ /* The extra load on 'from' (and all less-loaded slaves), compared
+ * to that of 'to' (the least-loaded slave), is less than ~3%, or
+ * it is less than ~1Mbps. No point in rebalancing. */
+ break;
+ } else if (from->n_hashes == 1) {
+ /* 'from' only carries a single MAC hash, so we can't shift any
+ * load away from it, even though we want to. */
+ from++;
+ } else {
+ /* 'from' is carrying significantly more load than 'to', and that
+ * load is split across at least two different hashes. Pick a hash
+ * to migrate to 'to' (the least-loaded slave), given that doing so
+ * must not cause 'to''s load to exceed 'from''s load.
+ *
+ * The sort order we use means that we prefer to shift away the
+ * smallest hashes instead of the biggest ones. There is little
+ * reason behind this decision; we could use the opposite sort
+ * order to shift away big hashes ahead of small ones. */
+ size_t i;
+
+ for (i = 0; i < from->n_hashes; i++) {
+ uint64_t delta = from->hashes[i]->tx_bytes;
+ if (to->tx_bytes + delta < from->tx_bytes - delta) {
+ break;
+ }
+ }
+ if (i < from->n_hashes) {
+ bond_shift_load(from, to, from->hashes[i]);
+
+ /* Re-sort 'bals'. Note that this may make 'from' and 'to'
+ * point to different slave_balance structures. It is only
+ * valid to do these two operations in a row at all because we
+ * know that 'from' will not move past 'to' and vice versa. */
+ resort_bals(from, bals, n_bals);
+ resort_bals(to, bals, n_bals);
+ } else {
+ /* No single hash can be moved without overshooting; give up
+ * on this slave. */
+ from++;
+ }
+ }
+ }
+
+ /* Implement exponentially weighted moving average. A weight of 1/2 causes
+ * historical data to decay to <1% in 7 rebalancing runs. */
+ for (e = &port->bond_hash[0]; e <= &port->bond_hash[BOND_MASK]; e++) {
+ e->tx_bytes /= 2;
+ }
+}
+
+/* Port functions. */
+
+/* Creates a new port named 'name', appends it to 'br''s port array (growing
+ * the array if necessary), and flushes the bridge.  The new port starts with
+ * no implicit VLAN, no trunk bitmap, no active interface, and STP disabled. */
+static void
+port_create(struct bridge *br, const char *name)
+{
+    struct port *new_port = xcalloc(1, sizeof *new_port);
+
+    new_port->name = xstrdup(name);
+    new_port->bridge = br;
+    new_port->port_idx = br->n_ports;
+    new_port->vlan = -1;
+    new_port->trunks = NULL;
+    new_port->active_iface = -1;
+    new_port->stp_state = STP_DISABLED;
+    new_port->stp_state_tag = 0;
+
+    if (br->allocated_ports <= br->n_ports) {
+        br->ports = x2nrealloc(br->ports, &br->allocated_ports,
+                               sizeof *br->ports);
+    }
+    br->ports[br->n_ports] = new_port;
+    br->n_ports++;
+
+    VLOG_INFO("created port %s on bridge %s", new_port->name, br->name);
+    bridge_flush(br);
+}
+
+/* Brings 'port' in line with the current configuration:
+ *
+ *    - Interfaces: a port with a "bonding.<name>" section takes its
+ *      interfaces from "bonding.<name>.slave" (and its updelay and downdelay
+ *      from the keys of those names, clamped to be nonnegative); any other
+ *      port has exactly one interface, named after the port.  Interfaces that
+ *      are no longer wanted are destroyed and missing ones are created.
+ *
+ *    - VLAN tag: taken from "vlan.<name>.tag" on non-bonded ports, otherwise
+ *      -1 (none).
+ *
+ *    - Trunks: only for ports without a VLAN tag, a 4096-bit bitmap built
+ *      from "vlan.<name>.trunks".  If no valid trunk is configured, all VLANs
+ *      are trunked.
+ *
+ * The bridge is flushed whenever the VLAN tag or the trunk set changes. */
+static void
+port_reconfigure(struct port *port)
+{
+    bool bonded = cfg_has_section("bonding.%s", port->name);
+    struct svec old_ifaces, new_ifaces;
+    unsigned long *trunks;
+    int vlan;
+    size_t i;
+
+    /* Collect old and new interfaces. */
+    svec_init(&old_ifaces);
+    svec_init(&new_ifaces);
+    for (i = 0; i < port->n_ifaces; i++) {
+        svec_add(&old_ifaces, port->ifaces[i]->name);
+    }
+    svec_sort(&old_ifaces);
+    if (bonded) {
+        cfg_get_all_keys(&new_ifaces, "bonding.%s.slave", port->name);
+        if (!new_ifaces.n) {
+            VLOG_ERR("port %s: no interfaces specified for bonded port",
+                     port->name);
+        } else if (new_ifaces.n == 1) {
+            VLOG_WARN("port %s: only 1 interface specified for bonded port",
+                      port->name);
+        }
+
+        port->updelay = cfg_get_int(0, "bonding.%s.updelay", port->name);
+        if (port->updelay < 0) {
+            port->updelay = 0;
+        }
+        port->downdelay = cfg_get_int(0, "bonding.%s.downdelay", port->name);
+        if (port->downdelay < 0) {
+            port->downdelay = 0;
+        }
+    } else {
+        svec_add(&new_ifaces, port->name);
+    }
+
+    /* Get rid of deleted interfaces and add new interfaces. */
+    for (i = 0; i < port->n_ifaces; ) {
+        struct iface *iface = port->ifaces[i];
+        if (!svec_contains(&new_ifaces, iface->name)) {
+            /* iface_destroy() moves the port's last interface into slot 'i',
+             * so examine this slot again instead of advancing. */
+            iface_destroy(iface);
+        } else {
+            i++;
+        }
+    }
+    for (i = 0; i < new_ifaces.n; i++) {
+        const char *name = new_ifaces.names[i];
+        if (!svec_contains(&old_ifaces, name)) {
+            iface_create(port, name);
+        }
+    }
+
+    /* Get VLAN tag. */
+    vlan = -1;
+    if (cfg_has("vlan.%s.tag", port->name)) {
+        if (!bonded) {
+            vlan = cfg_get_vlan(0, "vlan.%s.tag", port->name);
+            if (vlan >= 0 && vlan <= 4095) {
+                VLOG_DBG("port %s: assigning VLAN tag %d", port->name, vlan);
+            }
+        } else {
+            /* It's possible that bonded, VLAN-tagged ports make sense.  Maybe
+             * they even work as-is.  But they have not been tested. */
+            VLOG_WARN("port %s: VLAN tags not supported on bonded ports",
+                      port->name);
+        }
+    }
+    if (port->vlan != vlan) {
+        port->vlan = vlan;
+        bridge_flush(port->bridge);
+    }
+
+    /* Get trunked VLANs. */
+    trunks = NULL;
+    if (vlan < 0) {
+        size_t n_trunks, n_errors;
+
+        trunks = bitmap_allocate(4096);
+        n_trunks = cfg_count("vlan.%s.trunks", port->name);
+        n_errors = 0;
+        for (i = 0; i < n_trunks; i++) {
+            int trunk = cfg_get_vlan(i, "vlan.%s.trunks", port->name);
+            if (trunk >= 0) {
+                bitmap_set1(trunks, trunk);
+            } else {
+                n_errors++;
+            }
+        }
+        if (n_errors) {
+            /* Report how many values were bad, not how many were given. */
+            VLOG_ERR("port %s: invalid values for %zu trunk VLANs",
+                     port->name, n_errors);
+        }
+        if (n_errors == n_trunks) {
+            /* Either nothing was configured or everything was invalid. */
+            if (n_errors) {
+                VLOG_ERR("port %s: no valid trunks, trunking all VLANs",
+                         port->name);
+            }
+            bitmap_set_multiple(trunks, 0, 4096, 1);
+        }
+    } else {
+        if (cfg_has("vlan.%s.trunks", port->name)) {
+            VLOG_ERR("ignoring vlan.%s.trunks in favor of vlan.%s.tag",
+                     port->name, port->name);
+        }
+    }
+    if (trunks == NULL
+        ? port->trunks != NULL
+        : port->trunks == NULL || !bitmap_equal(trunks, port->trunks, 4096)) {
+        bridge_flush(port->bridge);
+    }
+    bitmap_free(port->trunks);
+    port->trunks = trunks;
+
+    svec_destroy(&old_ifaces);
+    svec_destroy(&new_ifaces);
+}
+
+/* Destroys 'port' and everything it owns: mirrors that output to it, its
+ * interfaces, its trunk bitmap, and its name.  The bridge's last port takes
+ * over the destroyed port's slot in the bridge's port array, and the bridge
+ * is flushed.  Does nothing if 'port' is null. */
+static void
+port_destroy(struct port *port)
+{
+    struct bridge *br;
+    struct port *moved;
+    size_t slot;
+
+    if (!port) {
+        return;
+    }
+    br = port->bridge;
+
+    proc_net_compat_update_vlan(port->name, NULL, 0);
+
+    for (slot = 0; slot < MAX_MIRRORS; slot++) {
+        struct mirror *m = br->mirrors[slot];
+
+        if (!m || m->out_port != port) {
+            continue;
+        }
+        mirror_destroy(m);
+    }
+
+    /* Each iface_destroy() call decrements port->n_ifaces. */
+    while (port->n_ifaces > 0) {
+        iface_destroy(port->ifaces[port->n_ifaces - 1]);
+    }
+
+    /* Fill the hole in the bridge's port array with its last element. */
+    br->n_ports--;
+    moved = br->ports[br->n_ports];
+    br->ports[port->port_idx] = moved;
+    moved->port_idx = port->port_idx;
+
+    free(port->ifaces);
+    bitmap_free(port->trunks);
+    free(port->name);
+    free(port);
+    bridge_flush(br);
+}
+
+/* Returns the port that owns the interface with datapath index 'dp_ifidx' on
+ * 'br', or a null pointer if 'br' has no such interface. */
+static struct port *
+port_from_dp_ifidx(const struct bridge *br, uint16_t dp_ifidx)
+{
+    struct iface *owner = iface_from_dp_ifidx(br, dp_ifidx);
+
+    if (!owner) {
+        return NULL;
+    }
+    return owner->port;
+}
+
+/* Returns the first port on 'br' whose name is 'name', or a null pointer if
+ * there is none. */
+static struct port *
+port_lookup(const struct bridge *br, const char *name)
+{
+    size_t idx = 0;
+
+    while (idx < br->n_ports) {
+        struct port *candidate = br->ports[idx++];
+
+        if (strcmp(candidate->name, name) == 0) {
+            return candidate;
+        }
+    }
+    return NULL;
+}
+
+/* Makes 'port''s bonding state match its interface count.
+ *
+ * With fewer than two interfaces the port is not bonded: any bond hash table
+ * is freed and the compat layer is told that the bond is gone.  Otherwise a
+ * bond hash table is created if the port lacks one (every entry unassigned,
+ * with zero tx_bytes) and an active interface is chosen; in either bonded
+ * case the compat layer is then updated. */
+static void
+port_update_bonding(struct port *port)
+{
+    size_t slot;
+
+    if (port->n_ifaces < 2) {
+        /* Not a bonded port. */
+        if (!port->bond_hash) {
+            return;
+        }
+        free(port->bond_hash);
+        port->bond_hash = NULL;
+        proc_net_compat_update_bond(port->name, NULL);
+        return;
+    }
+
+    if (port->bond_hash == NULL) {
+        port->bond_hash = xcalloc(BOND_MASK + 1, sizeof *port->bond_hash);
+        for (slot = 0; slot <= BOND_MASK; slot++) {
+            struct bond_entry *entry = &port->bond_hash[slot];
+
+            entry->iface_idx = -1;
+            entry->tx_bytes = 0;
+        }
+        port->no_ifaces_tag = tag_create_random();
+        bond_choose_active_iface(port);
+    }
+    port_update_bond_compat(port);
+}
+
+/* Reports the state of bonded 'port' to the compat layer through
+ * proc_net_compat_update_bond().  Does nothing if 'port' has fewer than two
+ * interfaces.
+ *
+ * A slave is reported as up if it is enabled with no delay pending
+ * (delay_expires == LLONG_MAX) or disabled with a delay pending.  The bond is
+ * reported as up if any slave is. */
+static void
+port_update_bond_compat(struct port *port)
+{
+    struct compat_bond bond;
+    bool any_up = false;
+    size_t idx;
+
+    if (port->n_ifaces < 2) {
+        return;
+    }
+
+    bond.n_slaves = port->n_ifaces;
+    bond.slaves = xmalloc(port->n_ifaces * sizeof *bond.slaves);
+    bond.updelay = port->updelay;
+    bond.downdelay = port->downdelay;
+
+    for (idx = 0; idx < port->n_ifaces; idx++) {
+        struct compat_bond_slave *slave = &bond.slaves[idx];
+        struct iface *iface = port->ifaces[idx];
+        bool settled = iface->delay_expires == LLONG_MAX;
+
+        slave->name = iface->name;
+        slave->up = iface->enabled ? settled : !settled;
+        if (slave->up) {
+            any_up = true;
+        }
+        memcpy(slave->mac, iface->mac, ETH_ADDR_LEN);
+    }
+    bond.up = any_up;
+
+    proc_net_compat_update_bond(port->name, &bond);
+    free(bond.slaves);
+}
+
+/* Reports 'port''s VLAN to the compat layer through
+ * proc_net_compat_update_vlan(), together with the name of the trunk port
+ * that a corresponding VLAN device would be based on, or a null name if
+ * 'port' has no positive VLAN or no suitable trunk port exists. */
+static void
+port_update_vlan_compat(struct port *port)
+{
+    struct bridge *br = port->bridge;
+    char *trunk_name = NULL;
+    size_t idx;
+
+    if (port->vlan > 0) {
+        /* Figure out the name that the VLAN device should actually have, if
+         * it existed.  This takes some work because the VLAN device would not
+         * have port->name in its name; rather, it would have the trunk port's
+         * name, and 'port' would be attached to a bridge that also had the
+         * VLAN device one of its ports.  So we need to find a trunk port that
+         * includes port->vlan.
+         *
+         * There might be more than one candidate.  This doesn't happen on
+         * XenServer, so if it happens we just pick the first choice in
+         * alphabetical order instead of creating multiple VLAN devices. */
+        for (idx = 0; idx < br->n_ports; idx++) {
+            struct port *p = br->ports[idx];
+            const uint8_t *ea;
+
+            if (!port_trunks_vlan(p, port->vlan) || !p->n_ifaces) {
+                continue;
+            }
+            if (trunk_name && strcmp(p->name, trunk_name) > 0) {
+                continue;
+            }
+
+            /* Only a trunk whose first interface has a usable MAC counts. */
+            ea = p->ifaces[0]->mac;
+            if (eth_addr_is_multicast(ea)
+                || eth_addr_is_reserved(ea)
+                || eth_addr_is_zero(ea)) {
+                continue;
+            }
+            trunk_name = p->name;
+        }
+    }
+    proc_net_compat_update_vlan(port->name, trunk_name, port->vlan);
+}
+
+/* Interface functions. */
+
+/* Creates a new interface named 'name' and appends it to 'port''s interface
+ * array (growing the array if necessary).
+ *
+ * The interface starts with no datapath index (-1), a fresh random tag, and
+ * no pending delay.  Its MAC address is read from the network device of the
+ * same name, and it is enabled unless the device's flags can be read and say
+ * that it is not up.  Afterward the port's bonding state is refreshed and the
+ * bridge is flushed. */
+static void
+iface_create(struct port *port, const char *name)
+{
+    struct iface *new_iface = xcalloc(1, sizeof *new_iface);
+    enum netdev_flags dev_flags;
+
+    new_iface->name = xstrdup(name);
+    new_iface->port = port;
+    new_iface->port_ifidx = port->n_ifaces;
+    new_iface->dp_ifidx = -1;
+    new_iface->tag = tag_create_random();
+    new_iface->delay_expires = LLONG_MAX;
+    new_iface->enabled = true;
+
+    netdev_nodev_get_etheraddr(name, new_iface->mac);
+
+    if (netdev_nodev_get_flags(name, &dev_flags) == 0) {
+        new_iface->enabled = (dev_flags & NETDEV_UP) != 0;
+    }
+
+    if (port->allocated_ifaces <= port->n_ifaces) {
+        port->ifaces = x2nrealloc(port->ifaces, &port->allocated_ifaces,
+                                  sizeof *port->ifaces);
+    }
+    port->ifaces[port->n_ifaces] = new_iface;
+    port->n_ifaces++;
+    if (port->n_ifaces > 1) {
+        port->bridge->has_bonded_ports = true;
+    }
+
+    VLOG_DBG("attached network device %s to port %s", new_iface->name, port->name);
+
+    port_update_bonding(port);
+    bridge_flush(port->bridge);
+}
+
+/* Destroys 'iface': removes it from its bridge's datapath-index map (if it
+ * has a datapath index) and from its port's interface array, where the
+ * port's last interface takes over its slot, then frees it.
+ *
+ * If 'iface' was the port's active interface, flows tagged with the port's
+ * active_iface_tag are revalidated and a new active interface is chosen.
+ * Finally the port's bonding state is refreshed and the bridge is flushed.
+ * Does nothing if 'iface' is null. */
+static void
+iface_destroy(struct iface *iface)
+{
+    struct port *port;
+    struct bridge *br;
+    struct iface *moved;
+    bool was_active;
+
+    if (!iface) {
+        return;
+    }
+    port = iface->port;
+    br = port->bridge;
+    was_active = port->active_iface == iface->port_ifidx;
+
+    if (iface->dp_ifidx >= 0) {
+        port_array_set(&br->ifaces, iface->dp_ifidx, NULL);
+    }
+
+    /* Fill the hole in the port's interface array with its last element. */
+    port->n_ifaces--;
+    moved = port->ifaces[port->n_ifaces];
+    port->ifaces[iface->port_ifidx] = moved;
+    moved->port_ifidx = iface->port_ifidx;
+
+    free(iface->name);
+    free(iface);
+
+    if (was_active) {
+        ofproto_revalidate(br->ofproto, port->active_iface_tag);
+        bond_choose_active_iface(port);
+    }
+
+    port_update_bonding(port);
+    bridge_flush(br);
+}
+
+/* Returns the first interface named 'name' on any port of 'br', searching
+ * ports and their interfaces in array order, or a null pointer if there is
+ * none. */
+static struct iface *
+iface_lookup(const struct bridge *br, const char *name)
+{
+    size_t p, k;
+
+    for (p = 0; p < br->n_ports; p++) {
+        const struct port *owner = br->ports[p];
+
+        for (k = 0; k < owner->n_ifaces; k++) {
+            struct iface *candidate = owner->ifaces[k];
+
+            if (strcmp(candidate->name, name) == 0) {
+                return candidate;
+            }
+        }
+    }
+    return NULL;
+}
+
+/* Returns whatever 'br''s datapath-index map holds for 'dp_ifidx'. */
+static struct iface *
+iface_from_dp_ifidx(const struct bridge *br, uint16_t dp_ifidx)
+{
+    struct iface *found = port_array_get(&br->ifaces, dp_ifidx);
+
+    return found;
+}
+
+/* Port mirroring. */
+
+/* Brings 'br''s port mirrors in line with the subsections of
+ * "mirror.<bridge>" in the configuration: destroys mirrors that are no
+ * longer configured, creates newly configured ones, reconfigures every
+ * mirror that then exists, and finally recomputes each port's
+ * is_mirror_output_port flag from the mirrors' output ports.
+ *
+ * The phases depend on one another and must stay in this order. */
+static void
+mirror_reconfigure(struct bridge *br)
+{
+ struct svec old_mirrors, new_mirrors;
+ size_t i;
+
+ /* Collect old and new mirrors. */
+ svec_init(&old_mirrors);
+ svec_init(&new_mirrors);
+ cfg_get_subsections(&new_mirrors, "mirror.%s", br->name);
+ for (i = 0; i < MAX_MIRRORS; i++) {
+ if (br->mirrors[i]) {
+ svec_add(&old_mirrors, br->mirrors[i]->name);
+ }
+ }
+
+ /* Get rid of deleted mirrors and add new mirrors. */
+ svec_sort(&old_mirrors);
+ assert(svec_is_unique(&old_mirrors));
+ svec_sort(&new_mirrors);
+ assert(svec_is_unique(&new_mirrors));
+ for (i = 0; i < MAX_MIRRORS; i++) {
+ struct mirror *m = br->mirrors[i];
+ if (m && !svec_contains(&new_mirrors, m->name)) {
+ mirror_destroy(m);
+ }
+ }
+ for (i = 0; i < new_mirrors.n; i++) {
+ const char *name = new_mirrors.names[i];
+ if (!svec_contains(&old_mirrors, name)) {
+ mirror_create(br, name);
+ }
+ }
+ svec_destroy(&old_mirrors);
+ svec_destroy(&new_mirrors);
+
+ /* Reconfigure all mirrors. */
+ for (i = 0; i < MAX_MIRRORS; i++) {
+ if (br->mirrors[i]) {
+ mirror_reconfigure_one(br->mirrors[i]);
+ }
+ }
+
+ /* Update port reserved status. */
+ /* Clear every flag first, then set it again for each port that some
+ * mirror outputs to. */
+ for (i = 0; i < br->n_ports; i++) {
+ br->ports[i]->is_mirror_output_port = false;
+ }
+ for (i = 0; i < MAX_MIRRORS; i++) {
+ struct mirror *m = br->mirrors[i];
+ if (m && m->out_port) {
+ m->out_port->is_mirror_output_port = true;
+ }
+ }
+}
+
+/* Creates a new, empty mirror named 'name' in the first free slot of
+ * 'br''s mirror table and flushes the bridge.  If all MAX_MIRRORS slots are
+ * occupied, logs a warning and creates nothing.
+ *
+ * The new mirror has no source or destination ports, no selected VLANs, no
+ * output port, and an output VLAN of -1. */
+static void
+mirror_create(struct bridge *br, const char *name)
+{
+    struct mirror *created;
+    size_t slot;
+
+    /* Look for the first unused slot. */
+    slot = 0;
+    while (slot < MAX_MIRRORS && br->mirrors[slot]) {
+        slot++;
+    }
+    if (slot >= MAX_MIRRORS) {
+        VLOG_WARN("bridge %s: maximum of %d port mirrors reached, "
+                  "cannot create %s", br->name, MAX_MIRRORS, name);
+        return;
+    }
+
+    VLOG_INFO("created port mirror %s on bridge %s", name, br->name);
+    bridge_flush(br);
+
+    created = xcalloc(1, sizeof *created);
+    br->mirrors[slot] = created;
+    created->bridge = br;
+    created->idx = slot;
+    created->name = xstrdup(name);
+    svec_init(&created->src_ports);
+    svec_init(&created->dst_ports);
+    created->vlans = NULL;
+    created->n_vlans = 0;
+    created->out_vlan = -1;
+    created->out_port = NULL;
+}
+
+/* Destroys mirror 'm': clears its bit from the src_mirrors and dst_mirrors
+ * masks of every port on its bridge, releases its port lists and VLAN array,
+ * empties its slot in the bridge's mirror table, and flushes the bridge.
+ * Does nothing if 'm' is null. */
+static void
+mirror_destroy(struct mirror *m)
+{
+    struct bridge *br;
+    size_t idx;
+
+    if (!m) {
+        return;
+    }
+    br = m->bridge;
+
+    for (idx = 0; idx < br->n_ports; idx++) {
+        struct port *p = br->ports[idx];
+
+        p->src_mirrors &= ~(MIRROR_MASK_C(1) << m->idx);
+        p->dst_mirrors &= ~(MIRROR_MASK_C(1) << m->idx);
+    }
+
+    svec_destroy(&m->src_ports);
+    svec_destroy(&m->dst_ports);
+    free(m->vlans);
+
+    br->mirrors[m->idx] = NULL;
+    free(m);
+
+    bridge_flush(br);
+}
+
+/* Sorts 'ports' and removes duplicates, then removes every name that does
+ * not belong to a port on 'm''s bridge, logging a warning for each name
+ * removed that way. */
+static void
+prune_ports(struct mirror *m, struct svec *ports)
+{
+    struct svec existing;
+    size_t idx;
+
+    svec_sort_unique(ports);
+
+    svec_init(&existing);
+    for (idx = 0; idx < ports->n; idx++) {
+        const char *name = ports->names[idx];
+
+        if (!port_lookup(m->bridge, name)) {
+            VLOG_WARN("mirror.%s.%s: cannot match on nonexistent port %s",
+                      m->bridge->name, m->name, name);
+            continue;
+        }
+        svec_add(&existing, name);
+    }
+
+    /* Hand the filtered list to the caller and dispose of the old one. */
+    svec_swap(ports, &existing);
+    svec_destroy(&existing);
+}
+
+/* Converts the VLAN strings in 'vlan_strings' to integers.
+ *
+ * 'vlan_strings' is sorted and de-duplicated in place.  A newly allocated
+ * array with room for one int per remaining string is stored in '*vlans'; the
+ * caller owns it and must free() it.  Strings that are not decimal integers
+ * in the range 0...4095 are skipped with a warning.
+ *
+ * Returns the number of valid VLANs stored in '*vlans'. */
+static size_t
+prune_vlans(struct mirror *m, struct svec *vlan_strings, int **vlans)
+{
+    size_t n_vlans, i;
+
+    /* This isn't perfect: it won't combine "0" and "00", and the textual sort
+     * order won't give us numeric sort order.  But that's good enough for what
+     * we need right now. */
+    svec_sort_unique(vlan_strings);
+
+    /* The elements are ints, so size the array by '**vlans' (an int), not by
+     * '*vlans' (a pointer to int). */
+    *vlans = xmalloc(sizeof **vlans * vlan_strings->n);
+    n_vlans = 0;
+    for (i = 0; i < vlan_strings->n; i++) {
+        const char *name = vlan_strings->names[i];
+        int vlan;
+
+        if (!str_to_int(name, 10, &vlan) || vlan < 0 || vlan > 4095) {
+            VLOG_WARN("mirror.%s.%s.select.vlan: ignoring invalid VLAN %s",
+                      m->bridge->name, m->name, name);
+        } else {
+            (*vlans)[n_vlans++] = vlan;
+        }
+    }
+    return n_vlans;
+}
+
+/* Returns true if 'vlan' appears in m's list of selected VLANs, false
+ * otherwise (including when the list is empty). */
+static bool
+vlan_is_mirrored(const struct mirror *m, int vlan)
+{
+    size_t remaining = m->n_vlans;
+    const int *candidate = m->vlans;
+
+    while (remaining-- > 0) {
+        if (*candidate++ == vlan) {
+            return true;
+        }
+    }
+    return false;
+}
+
+/* Returns true if port_trunks_vlan() reports that 'p' trunks at least one of
+ * the VLANs selected by 'm', false otherwise. */
+static bool
+port_trunks_any_mirrored_vlan(const struct mirror *m, const struct port *p)
+{
+    size_t idx = 0;
+
+    while (idx < m->n_vlans) {
+        if (port_trunks_vlan(p, m->vlans[idx])) {
+            return true;
+        }
+        idx++;
+    }
+    return false;
+}
+
+/* Re-reads the "mirror.<bridge>.<name>.*" configuration keys for 'm' and
+ * applies them.
+ *
+ * Exactly one of "output.port" and "output.vlan" is required; if the named
+ * output port does not exist on the bridge, or neither key is present, 'm' is
+ * destroyed and must not be used by the caller afterward.
+ *
+ * Otherwise the mirror's source/destination port lists, VLAN list and output
+ * are replaced, the bridge is flushed if any of them changed, and every
+ * port's src_mirrors/dst_mirrors bit for this mirror is recomputed. */
+static void
+mirror_reconfigure_one(struct mirror *m)
+{
+    char *pfx = xasprintf("mirror.%s.%s", m->bridge->name, m->name);
+    struct svec src_ports, dst_ports, ports;
+    struct svec vlan_strings;
+    mirror_mask_t mirror_bit;
+    const char *out_port_name;
+    struct port *out_port;
+    int out_vlan;
+    size_t n_vlans;
+    int *vlans;
+    size_t i;
+    bool mirror_all_ports;
+
+    /* Get output port. */
+    out_port_name = cfg_get_key(0, "mirror.%s.%s.output.port",
+                                m->bridge->name, m->name);
+    if (out_port_name) {
+        out_port = port_lookup(m->bridge, out_port_name);
+        if (!out_port) {
+            VLOG_ERR("%s.output.port: bridge %s does not have a port "
+                     "named %s", pfx, m->bridge->name, out_port_name);
+            mirror_destroy(m);
+            free(pfx);
+            return;
+        }
+        out_vlan = -1;
+
+        if (cfg_has("%s.output.vlan", pfx)) {
+            VLOG_ERR("%s.output.port and %s.output.vlan both specified; "
+                     "ignoring %s.output.vlan", pfx, pfx, pfx);
+        }
+    } else if (cfg_has("%s.output.vlan", pfx)) {
+        out_port = NULL;
+        out_vlan = cfg_get_vlan(0, "%s.output.vlan", pfx);
+    } else {
+        VLOG_ERR("%s: neither %s.output.port nor %s.output.vlan specified, "
+                 "but exactly one is required; disabling port mirror %s",
+                 pfx, pfx, pfx, pfx);
+        mirror_destroy(m);
+        free(pfx);
+        return;
+    }
+
+    /* Get all the ports, and drop duplicates and ports that don't exist.
+     * "select.port" entries count as both source and destination. */
+    svec_init(&src_ports);
+    svec_init(&dst_ports);
+    svec_init(&ports);
+    cfg_get_all_keys(&src_ports, "%s.select.src-port", pfx);
+    cfg_get_all_keys(&dst_ports, "%s.select.dst-port", pfx);
+    cfg_get_all_keys(&ports, "%s.select.port", pfx);
+    svec_append(&src_ports, &ports);
+    svec_append(&dst_ports, &ports);
+    svec_destroy(&ports);
+    prune_ports(m, &src_ports);
+    prune_ports(m, &dst_ports);
+
+    /* Get all the vlans, and drop duplicate and invalid vlans. */
+    svec_init(&vlan_strings);
+    cfg_get_all_keys(&vlan_strings, "%s.select.vlan", pfx);
+    n_vlans = prune_vlans(m, &vlan_strings, &vlans);
+    svec_destroy(&vlan_strings);
+
+    /* Update mirror data.
+     *
+     * m->vlans is null for a mirror that has never had any VLANs, so only
+     * call memcmp() when there is something to compare: passing a null
+     * pointer to memcmp() is undefined even for a zero length.  When the
+     * memcmp() is reached the two counts are already known to be equal. */
+    if (!svec_equal(&m->src_ports, &src_ports)
+        || !svec_equal(&m->dst_ports, &dst_ports)
+        || m->n_vlans != n_vlans
+        || (n_vlans && memcmp(m->vlans, vlans, sizeof *vlans * n_vlans))
+        || m->out_port != out_port
+        || m->out_vlan != out_vlan) {
+        bridge_flush(m->bridge);
+    }
+    svec_swap(&m->src_ports, &src_ports);
+    svec_swap(&m->dst_ports, &dst_ports);
+    free(m->vlans);
+    m->vlans = vlans;
+    m->n_vlans = n_vlans;
+    m->out_port = out_port;
+    m->out_vlan = out_vlan;
+
+    /* If no selection criteria have been given, mirror for all ports. */
+    mirror_all_ports = (!m->src_ports.n) && (!m->dst_ports.n) && (!m->n_vlans);
+
+    /* Update ports. */
+    mirror_bit = MIRROR_MASK_C(1) << m->idx;
+    for (i = 0; i < m->bridge->n_ports; i++) {
+        struct port *port = m->bridge->ports[i];
+
+        /* A port is a mirror source if it is selected by name or if it
+         * carries a selected VLAN: a port with no VLAN of its own is checked
+         * against its trunks, any other port against its own VLAN. */
+        if (mirror_all_ports
+            || svec_contains(&m->src_ports, port->name)
+            || (m->n_vlans
+                && (!port->vlan
+                    ? port_trunks_any_mirrored_vlan(m, port)
+                    : vlan_is_mirrored(m, port->vlan)))) {
+            port->src_mirrors |= mirror_bit;
+        } else {
+            port->src_mirrors &= ~mirror_bit;
+        }
+
+        if (mirror_all_ports || svec_contains(&m->dst_ports, port->name)) {
+            port->dst_mirrors |= mirror_bit;
+        } else {
+            port->dst_mirrors &= ~mirror_bit;
+        }
+    }
+
+    /* Clean up.  After the swaps above these hold the mirror's previous
+     * port lists. */
+    svec_destroy(&src_ports);
+    svec_destroy(&dst_ports);
+    free(pfx);
+}
+
+/* Spanning tree protocol. */
+
+static void brstp_update_port_state(struct port *);
+
+/* Callback handed to stp_create(): transmits BPDU 'pkt' on datapath port
+ * 'port_no' of the bridge passed as 'br_'.  The Ethernet source address is
+ * overwritten with the MAC of the interface on that port.  If the port is
+ * unknown or its MAC is all-zeros, a rate-limited warning is logged and
+ * nothing is sent.  'pkt' is always deleted before returning. */
+static void
+brstp_send_bpdu(struct ofpbuf *pkt, int port_no, void *br_)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+    struct bridge *br = br_;
+    struct iface *iface;
+    struct eth_header *eth;
+    union ofp_action action;
+    flow_t flow;
+
+    iface = iface_from_dp_ifidx(br, port_no);
+    if (!iface) {
+        VLOG_WARN_RL(&rl, "%s: cannot send BPDU on unknown port %d",
+                     br->name, port_no);
+        ofpbuf_delete(pkt);
+        return;
+    }
+    if (eth_addr_is_zero(iface->mac)) {
+        VLOG_WARN_RL(&rl, "%s: cannot send BPDU on port %d with unknown MAC",
+                     br->name, port_no);
+        ofpbuf_delete(pkt);
+        return;
+    }
+
+    /* Stamp the outgoing frame with the interface's own address. */
+    eth = pkt->l2;
+    memcpy(eth->eth_src, iface->mac, ETH_ADDR_LEN);
+
+    /* Single action: output on 'port_no'. */
+    memset(&action, 0, sizeof action);
+    action.type = htons(OFPAT_OUTPUT);
+    action.output.len = htons(sizeof action);
+    action.output.port = htons(port_no);
+
+    flow_extract(pkt, ODPP_NONE, &flow);
+    ofproto_send_packet(br->ofproto, &flow, &action, 1, pkt);
+
+    ofpbuf_delete(pkt);
+}
+
+/* Applies the "stp.<bridge>.*" configuration to 'br'.
+ *
+ * When STP is disabled in the configuration, any existing STP instance is
+ * destroyed.  When it is enabled, the STP instance is created if necessary,
+ * its bridge ID is brought up to date, each port with a usable datapath
+ * index has its enabled state, path cost and priority set, and the timers
+ * are adjusted.  In both cases every port's cached STP state is refreshed at
+ * the end. */
+static void
+brstp_reconfigure(struct bridge *br)
+{
+    size_t i;
+
+    if (cfg_get_bool(0, "stp.%s.enabled", br->name)) {
+        uint64_t address, id;
+        int br_priority;
+
+        /* Bridge address: configured value, else the one already in use,
+         * else a random one. */
+        address = cfg_get_mac(0, "stp.%s.address", br->name);
+        if (!address) {
+            if (br->stp) {
+                address = (stp_get_bridge_id(br->stp)
+                           & ((UINT64_C(1) << 48) - 1));
+            } else {
+                uint8_t mac[ETH_ADDR_LEN];
+                eth_addr_random(mac);
+                address = eth_addr_to_uint64(mac);
+            }
+        }
+
+        br_priority = (cfg_is_valid(CFG_INT | CFG_REQUIRED,
+                                    "stp.%s.priority", br->name)
+                       ? cfg_get_int(0, "stp.%s.priority", br->name)
+                       : STP_DEFAULT_BRIDGE_PRIORITY);
+
+        /* Bridge ID: priority above bit 48, address in the low 48 bits. */
+        id = address | ((uint64_t) br_priority << 48);
+        if (br->stp) {
+            if (id != stp_get_bridge_id(br->stp)) {
+                stp_set_bridge_id(br->stp, id);
+                bridge_flush(br);
+            }
+        } else {
+            br->stp = stp_create(br->name, id, brstp_send_bpdu, br);
+            br->stp_last_tick = time_msec();
+            bridge_flush(br);
+        }
+
+        for (i = 0; i < br->n_ports; i++) {
+            struct port *port = br->ports[i];
+            struct stp_port *stp_port;
+            int ifidx, cost, port_priority;
+            bool want_enabled, is_enabled;
+
+            /* The STP port number is the datapath index of the port's
+             * first interface; skip ports that don't have a usable one. */
+            if (!port->n_ifaces) {
+                continue;
+            }
+            ifidx = port->ifaces[0]->dp_ifidx;
+            if (ifidx < 0 || ifidx >= STP_MAX_PORTS) {
+                continue;
+            }
+            stp_port = stp_get_port(br->stp, ifidx);
+
+            /* Ports default to enabled, but a mirror output port never
+             * takes part in STP. */
+            want_enabled = (!cfg_is_valid(CFG_BOOL | CFG_REQUIRED,
+                                          "stp.%s.port.%s.enabled",
+                                          br->name, port->name)
+                            || cfg_get_bool(0, "stp.%s.port.%s.enabled",
+                                            br->name, port->name));
+            if (port->is_mirror_output_port) {
+                want_enabled = false;
+            }
+            is_enabled = stp_port_get_state(stp_port) != STP_DISABLED;
+            if (want_enabled != is_enabled) {
+                bridge_flush(br);       /* Might not be necessary. */
+                if (want_enabled) {
+                    stp_port_enable(stp_port);
+                } else {
+                    stp_port_disable(stp_port);
+                }
+            }
+
+            cost = cfg_get_int(0, "stp.%s.port.%s.path-cost",
+                               br->name, port->name);
+            stp_port_set_path_cost(stp_port, cost ? cost : 19 /* XXX */);
+
+            if (cfg_is_valid(CFG_INT | CFG_REQUIRED,
+                             "stp.%s.port.%s.priority",
+                             br->name, port->name)) {
+                port_priority = cfg_get_int(0, "stp.%s.port.%s.priority",
+                                            br->name, port->name);
+            } else {
+                port_priority = STP_DEFAULT_PORT_PRIORITY;
+            }
+            stp_port_set_priority(stp_port, port_priority);
+        }
+
+        brstp_adjust_timers(br);
+    } else if (br->stp) {
+        stp_destroy(br->stp);
+        br->stp = NULL;
+
+        bridge_flush(br);
+    }
+
+    for (i = 0; i < br->n_ports; i++) {
+        brstp_update_port_state(br->ports[i]);
+    }
+}
+
+/* Synchronizes p->stp_state with the state reported by the bridge's STP
+ * instance for the port's first interface.  The state is STP_DISABLED when
+ * the bridge has no STP instance, the port has no interfaces, or the
+ * datapath index is out of range.
+ *
+ * On a change, logs it, then either flushes the whole bridge (when leaving
+ * STP_DISABLED) or revalidates flows carrying the port's old state tag, and
+ * finally assigns a fresh tag (0 for STP_DISABLED). */
+static void
+brstp_update_port_state(struct port *p)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
+    struct bridge *br = p->bridge;
+    enum stp_state new_state = STP_DISABLED;
+
+    /* Figure out new state. */
+    if (br->stp && p->n_ifaces > 0) {
+        int ifidx = p->ifaces[0]->dp_ifidx;
+
+        if (ifidx >= 0 && ifidx < STP_MAX_PORTS) {
+            new_state = stp_port_get_state(stp_get_port(br->stp, ifidx));
+        }
+    }
+
+    /* Nothing to do if the state did not change. */
+    if (p->stp_state == new_state) {
+        return;
+    }
+
+    VLOG_INFO_RL(&rl, "port %s: STP state changed from %s to %s",
+                 p->name, stp_state_name(p->stp_state),
+                 stp_state_name(new_state));
+    if (p->stp_state == STP_DISABLED) {
+        bridge_flush(br);
+    } else {
+        ofproto_revalidate(p->bridge->ofproto, p->stp_state_tag);
+    }
+
+    p->stp_state = new_state;
+    if (p->stp_state == STP_DISABLED) {
+        p->stp_state_tag = 0;
+    } else {
+        p->stp_state_tag = tag_create_random();
+    }
+}
+
+/* Sets the hello time, max age and forward delay of br's STP instance from
+ * the "stp.<bridge>.hello-time", ".max-age" and ".forward-delay" keys.  A
+ * value of 0 from cfg_get_int() selects the built-in default of 2000, 20000
+ * or 15000 respectively. */
+static void
+brstp_adjust_timers(struct bridge *br)
+{
+    int hello = cfg_get_int(0, "stp.%s.hello-time", br->name);
+    int age = cfg_get_int(0, "stp.%s.max-age", br->name);
+    int delay = cfg_get_int(0, "stp.%s.forward-delay", br->name);
+
+    if (!hello) {
+        hello = 2000;
+    }
+    if (!age) {
+        age = 20000;
+    }
+    if (!delay) {
+        delay = 15000;
+    }
+
+    stp_set_hello_time(br->stp, hello);
+    stp_set_max_age(br->stp, age);
+    stp_set_forward_delay(br->stp, delay);
+}
+
+/* Periodic STP processing for 'br'; a no-op when STP is not running.
+ * Advances the STP instance by the time elapsed since the previous tick
+ * (capped at INT_MAX), then refreshes the cached state of every port that
+ * the STP instance reports as changed. */
+static void
+brstp_run(struct bridge *br)
+{
+    long long int now, delta;
+    struct stp_port *changed;
+
+    if (!br->stp) {
+        return;
+    }
+
+    now = time_msec();
+    delta = now - br->stp_last_tick;
+    if (delta > 0) {
+        stp_tick(br->stp, MIN(INT_MAX, delta));
+        br->stp_last_tick = now;
+    }
+
+    while (stp_get_changed_port(br->stp, &changed)) {
+        struct port *port = port_from_dp_ifidx(br, stp_port_no(changed));
+
+        if (port) {
+            brstp_update_port_state(port);
+        }
+    }
+}
+
+/* If STP is running on 'br', arranges for the poll loop to wake up within
+ * 1000 (the argument passed to poll_timer_wait()). */
+static void
+brstp_wait(struct bridge *br)
+{
+    if (!br->stp) {
+        return;
+    }
+    poll_timer_wait(1000);
+}
diff --git a/vswitchd/bridge.h b/vswitchd/bridge.h
new file mode 100644
index 000000000..b9435370a
--- /dev/null
+++ b/vswitchd/bridge.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2008, 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#ifndef VSWITCHD_BRIDGE_H
+#define VSWITCHD_BRIDGE_H 1
+
+/* Public interface of the vswitchd bridge module. */
+
+/* <stdbool.h> and <stdint.h> supply 'bool' and 'uint64_t' used in the
+ * prototypes below, so that this header does not depend on what its
+ * includers happen to have included first. */
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include "list.h"
+
+struct svec;
+
+void bridge_init(void);
+void bridge_reconfigure(void);
+int bridge_run(void);
+void bridge_wait(void);
+bool bridge_exists(const char *);
+uint64_t bridge_get_datapathid(const char *name);
+void bridge_get_ifaces(struct svec *svec);
+
+#endif /* bridge.h */
diff --git a/vswitchd/mgmt.c b/vswitchd/mgmt.c
new file mode 100644
index 000000000..f5dcd1840
--- /dev/null
+++ b/vswitchd/mgmt.c
@@ -0,0 +1,679 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+
+#include <arpa/inet.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+
+#include "bridge.h"
+#include "cfg.h"
+#include "coverage.h"
+#include "list.h"
+#include "mgmt.h"
+#include "openflow/nicira-ext.h"
+#include "openflow/openflow.h"
+#include "openflow/openflow-mgmt.h"
+#include "ofpbuf.h"
+#include "ovs-vswitchd.h"
+#include "packets.h"
+#include "rconn.h"
+#include "svec.h"
+#include "vconn.h"
+#include "vconn-ssl.h"
+#include "xtoxll.h"
+
+#define THIS_MODULE VLM_mgmt
+#include "vlog.h"
+
+#define MAX_BACKOFF_DEFAULT 15
+#define INACTIVITY_PROBE_DEFAULT 15
+
+static struct svec mgmt_cfg;
+static uint8_t cfg_cookie[CFG_COOKIE_LEN];
+static struct rconn *mgmt_rconn;
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(60, 60);
+static struct svec capabilities;
+uint64_t mgmt_id;
+
+
+#define TXQ_LIMIT 128 /* Max number of packets to queue for tx. */
+struct rconn_packet_counter *txqlen; /* # pkts queued for tx on mgmt_rconn. */
+
+static uint64_t pick_fallback_mgmt_id(void);
+static void send_config_update(uint32_t xid, bool use_xid);
+static void send_resources_update(uint32_t xid, bool use_xid);
+
+/* One-time initialization of the management module: creates the transmit
+ * queue counter, initializes the cached "mgmt" configuration and the
+ * capability list, and chooses the management ID (the configured "mgmt.id"
+ * if cfg_get_dpid() returns a nonzero value, otherwise a random fallback). */
+void
+mgmt_init(void)
+{
+    uint64_t configured_id;
+
+    txqlen = rconn_packet_counter_create();
+
+    svec_init(&mgmt_cfg);
+    svec_init(&capabilities);
+    svec_add_nocopy(&capabilities,
+                    xasprintf("com.nicira.mgmt.manager=true\n"));
+
+    configured_id = cfg_get_dpid(0, "mgmt.id");
+    /* Fall back to a randomly generated mgmt id when none is configured. */
+    mgmt_id = configured_id ? configured_id : pick_fallback_mgmt_id();
+}
+
+#ifdef HAVE_OPENSSL
+/* Compares the current string value of configuration key 'key' with the
+ * cached copy in '*valuep'.  If the key has a value that differs from the
+ * cache (or nothing is cached yet), replaces the cache with a fresh copy and
+ * returns true.  Returns false if the key has no value or is unchanged. */
+static bool
+config_string_change(const char *key, char **valuep)
+{
+    const char *current = cfg_get_string(0, "%s", key);
+
+    if (!current) {
+        return false;
+    }
+    if (*valuep && !strcmp(current, *valuep)) {
+        return false;
+    }
+
+    free(*valuep);
+    *valuep = xstrdup(current);
+    return true;
+}
+
+/* Pushes the "ssl.private-key", "ssl.certificate" and "ssl.ca-cert"
+ * settings to the SSL vconn layer.  Each file is applied only when its
+ * configured value differs from the one last applied. */
+static void
+mgmt_configure_ssl(void)
+{
+    static char *key_path;
+    static char *cert_path;
+    static char *ca_path;
+
+    /* XXX SSL should be configurable separate from the bridges.
+     * XXX should be possible to de-configure SSL. */
+    if (config_string_change("ssl.private-key", &key_path)) {
+        vconn_ssl_set_private_key_file(key_path);
+    }
+
+    if (config_string_change("ssl.certificate", &cert_path)) {
+        vconn_ssl_set_certificate_file(cert_path);
+    }
+
+    if (config_string_change("ssl.ca-cert", &ca_path)) {
+        bool bootstrap = cfg_get_bool(0, "ssl.bootstrap-ca-cert");
+
+        vconn_ssl_set_ca_cert_file(ca_path, bootstrap);
+    }
+}
+#endif
+
+/* Brings the management connection in line with the current configuration.
+ *
+ * - With no "mgmt" section, any existing connection is torn down.
+ * - With an unchanged "mgmt" section, the connection is left alone; a
+ *   resources update is sent, plus a config update if the configuration
+ *   cookie changed.
+ * - With a changed "mgmt" section, the connection is destroyed and
+ *   re-created using the new controller, backoff and probe settings. */
+void
+mgmt_reconfigure(void)
+{
+    struct svec new_cfg;
+    uint8_t new_cookie[CFG_COOKIE_LEN];
+    bool cfg_updated = false;
+    const char *controller_name;
+    int max_backoff;
+    int inactivity_probe;
+    int retval;
+
+    if (!cfg_has_section("mgmt")) {
+        /* Forget the cached section too.  Otherwise, re-adding an identical
+         * "mgmt" section later would compare equal to the stale cache below
+         * and the connection would never be re-established. */
+        svec_clear(&mgmt_cfg);
+        if (mgmt_rconn) {
+            rconn_destroy(mgmt_rconn);
+            mgmt_rconn = NULL;
+        }
+        return;
+    }
+
+    /* If this is an established connection, send a resources update. */
+    /* xxx This is wasteful if there were no resource changes!!! */
+    if (mgmt_rconn) {
+        send_resources_update(0, false);
+    }
+
+    cfg_get_cookie(new_cookie);
+    if (memcmp(cfg_cookie, new_cookie, sizeof(cfg_cookie))) {
+        memcpy(cfg_cookie, new_cookie, sizeof(cfg_cookie));
+        cfg_updated = true;
+    }
+
+    svec_init(&new_cfg);
+    cfg_get_section(&new_cfg, "mgmt");
+    if (svec_equal(&mgmt_cfg, &new_cfg)) {
+        /* Reconnecting to the controller causes the config file to be
+         * resent automatically.  If we're not reconnecting and the
+         * config file has changed, we need to notify the controller of
+         * changes. */
+        if (cfg_updated && mgmt_rconn) {
+            send_config_update(0, false);
+        }
+        svec_destroy(&new_cfg);
+        return;
+    }
+
+    controller_name = cfg_get_string(0, "mgmt.controller");
+    if (!controller_name) {
+        VLOG_ERR("no controller specified for management");
+        svec_destroy(&new_cfg);
+        return;
+    }
+
+    /* Clamp the backoff to 1...3600, using the default for values below 1. */
+    max_backoff = cfg_get_int(0, "mgmt.max-backoff");
+    if (max_backoff < 1) {
+        max_backoff = MAX_BACKOFF_DEFAULT;
+    } else if (max_backoff > 3600) {
+        max_backoff = 3600;
+    }
+
+    inactivity_probe = cfg_get_int(0, "mgmt.inactivity-probe");
+    if (inactivity_probe < 5) {
+        inactivity_probe = INACTIVITY_PROBE_DEFAULT;
+    }
+
+    /* xxx If this changes, we need to restart bridges to use new id,
+     * xxx but they need the id before the connect to controller, but we
+     * xxx need their dpids. */
+    /* Pick up a newly assigned mgmt id, if one is configured. */
+    if (cfg_has("mgmt.id")) {
+        mgmt_id = cfg_get_dpid(0, "mgmt.id");
+    }
+
+    /* Remember the section we are about to act on. */
+    svec_swap(&new_cfg, &mgmt_cfg);
+    svec_destroy(&new_cfg);
+
+#ifdef HAVE_OPENSSL
+    /* Configure SSL. */
+    mgmt_configure_ssl();
+#endif
+
+    if (mgmt_rconn) {
+        rconn_destroy(mgmt_rconn);
+        mgmt_rconn = NULL;
+    }
+    mgmt_rconn = rconn_create(inactivity_probe, max_backoff);
+    retval = rconn_connect(mgmt_rconn, controller_name);
+    if (retval == EAFNOSUPPORT) {
+        VLOG_ERR("no support for %s vconn", controller_name);
+    }
+}
+
+/* Fills in the OpenFlow length field of 'buffer' and queues it on the
+ * management connection, subject to the TXQ_LIMIT queue limit.
+ *
+ * Takes ownership of 'buffer' in every case, including when there is no
+ * management connection.  (NOTE(review): this relies on
+ * rconn_send_with_limit() disposing of the buffer on failure -- confirm
+ * against rconn.c.)
+ *
+ * Returns 0 on success, otherwise a positive errno value. */
+static int
+send_openflow_buffer(struct ofpbuf *buffer)
+{
+    int retval;
+
+    if (!mgmt_rconn) {
+        VLOG_ERR("attempt to send openflow packet with no rconn\n");
+        /* Callers never free the buffer themselves, so release it here
+         * rather than leaking it. */
+        ofpbuf_delete(buffer);
+        return EINVAL;
+    }
+
+    update_openflow_length(buffer);
+    retval = rconn_send_with_limit(mgmt_rconn, buffer, txqlen, TXQ_LIMIT);
+    if (retval) {
+        VLOG_WARN_RL(&rl, "send to %s failed: %s",
+                     rconn_get_name(mgmt_rconn), strerror(retval));
+    }
+    return retval;
+}
+
+/* Sends an OFPT_FEATURES_REPLY with transaction id 'xid' in which the
+ * datapath id, table count, buffer count, capabilities and actions are all
+ * zero. */
+static void
+send_features_reply(uint32_t xid)
+{
+    struct ofp_switch_features *reply;
+    struct ofpbuf *msg;
+
+    reply = make_openflow_xid(sizeof *reply, OFPT_FEATURES_REPLY, xid, &msg);
+
+    /* The management channel has no datapath of its own to describe. */
+    reply->datapath_id = 0;
+    reply->n_tables = 0;
+    reply->n_buffers = 0;
+    reply->capabilities = 0;
+    reply->actions = 0;
+
+    send_openflow_buffer(msg);
+}
+
+/* Allocates an OFPT_VENDOR message of 'ofmp_len' bytes with transaction id
+ * 'xid', stores its buffer in '*bufferp', and fills in the Nicira vendor id,
+ * the NXT_MGMT subtype and management message type 'type'.  Returns a
+ * pointer to the start of the message. */
+static void *
+make_ofmp_xid(size_t ofmp_len, uint16_t type, uint32_t xid,
+              struct ofpbuf **bufferp)
+{
+    struct ofmp_header *msg = make_openflow_xid(ofmp_len, OFPT_VENDOR, xid,
+                                                bufferp);
+
+    msg->type = htons(type);
+    msg->header.subtype = htonl(NXT_MGMT);
+    msg->header.vendor = htonl(NX_VENDOR_ID);
+
+    return msg;
+}
+
+/* Like make_ofmp_xid(), but obtains the message from make_openflow(), which
+ * takes no transaction id argument.  Stores the buffer in '*bufferp' and
+ * returns a pointer to the start of the message. */
+static void *
+make_ofmp(size_t ofmp_len, uint16_t type, struct ofpbuf **bufferp)
+{
+    struct ofmp_header *msg = make_openflow(ofmp_len, OFPT_VENDOR, bufferp);
+
+    msg->type = htons(type);
+    msg->header.subtype = htonl(NXT_MGMT);
+    msg->header.vendor = htonl(NX_VENDOR_ID);
+
+    return msg;
+}
+
+/* Sends an OFMPT_CAPABILITY_REPLY with transaction id 'xid' carrying the
+ * management id followed by every string in 'capabilities', appended
+ * back-to-back without terminators. */
+static void
+send_capability_reply(uint32_t xid)
+{
+    size_t i;
+    struct ofpbuf *buffer;
+    struct ofmp_capability_reply *ofmpcr;
+
+    ofmpcr = make_ofmp_xid(sizeof *ofmpcr, OFMPT_CAPABILITY_REPLY,
+                           xid, &buffer);
+    /* Use the capability-format constant, matching what
+     * recv_ofmp_capability_request() checks for, rather than the
+     * config-format constant OFMPCOF_SIMPLE. */
+    ofmpcr->format = htonl(OFMPCAF_SIMPLE);
+    ofmpcr->mgmt_id = htonll(mgmt_id);
+    for (i = 0; i < capabilities.n; i++) {
+        ofpbuf_put(buffer, capabilities.names[i],
+                   strlen(capabilities.names[i]));
+    }
+    send_openflow_buffer(buffer);
+}
+
+/* Sends an OFMPT_RESOURCES_UPDATE listing one OFMPTSR_DP TLV (datapath id
+ * and name) per configured bridge, followed by an OFMPTSR_END TLV.
+ *
+ * If 'use_xid' is true the message carries transaction id 'xid' (a reply to
+ * a request); otherwise 'xid' is ignored.  Bridges for which
+ * bridge_get_datapathid() returns 0 are skipped with a warning. */
+static void
+send_resources_update(uint32_t xid, bool use_xid)
+{
+    struct ofpbuf *buffer;
+    struct ofmp_resources_update *ofmpru;
+    struct ofmp_tlv *tlv;
+    struct svec br_list;
+    size_t i;
+
+    if (use_xid) {
+        ofmpru = make_ofmp_xid(sizeof *ofmpru, OFMPT_RESOURCES_UPDATE,
+                               xid, &buffer);
+    } else {
+        ofmpru = make_ofmp(sizeof *ofmpru, OFMPT_RESOURCES_UPDATE, &buffer);
+    }
+
+    svec_init(&br_list);
+    cfg_get_subsections(&br_list, "bridge");
+    for (i = 0; i < br_list.n; i++) {
+        const char *br_name = br_list.names[i];
+        struct ofmptsr_dp *dp_tlv;
+        size_t name_len;
+        uint64_t dp_id = bridge_get_datapathid(br_name);
+
+        if (!dp_id) {
+            VLOG_WARN_RL(&rl, "bridge %s doesn't seem to exist", br_name);
+            continue;
+        }
+        dp_tlv = ofpbuf_put_zeros(buffer, sizeof(*dp_tlv));
+        dp_tlv->type = htons(OFMPTSR_DP);
+        dp_tlv->len = htons(sizeof(*dp_tlv));
+
+        dp_tlv->dp_id = htonll(dp_id);
+
+        /* Never write past the TLV's name field.  The TLV was zero-filled
+         * above, so copying at most size-1 bytes leaves it terminated.
+         * (Assumes 'name' is a fixed-size array member of struct
+         * ofmptsr_dp -- confirm against openflow-mgmt.h.) */
+        name_len = strlen(br_name);
+        if (name_len >= sizeof dp_tlv->name) {
+            name_len = sizeof dp_tlv->name - 1;
+        }
+        memcpy(dp_tlv->name, br_name, name_len);
+    }
+    /* The bridge list is no longer needed; it was previously leaked. */
+    svec_destroy(&br_list);
+
+    /* Put end marker. */
+    tlv = ofpbuf_put_zeros(buffer, sizeof(*tlv));
+    tlv->type = htons(OFMPTSR_END);
+    tlv->len = htons(sizeof(*tlv));
+    send_openflow_buffer(buffer);
+}
+
+/* Sends an OFMPT_CONFIG_UPDATE containing the cached configuration cookie
+ * followed by whatever cfg_buf_put() appends to the message.  The message
+ * carries transaction id 'xid' only if 'use_xid' is true. */
+static void
+send_config_update(uint32_t xid, bool use_xid)
+{
+    struct ofmp_config_update *update;
+    struct ofpbuf *msg;
+
+    update = (use_xid
+              ? make_ofmp_xid(sizeof *update, OFMPT_CONFIG_UPDATE, xid, &msg)
+              : make_ofmp(sizeof *update, OFMPT_CONFIG_UPDATE, &msg));
+
+    update->format = htonl(OFMPCOF_SIMPLE);
+    memcpy(update->cookie, cfg_cookie, sizeof(update->cookie));
+
+    cfg_buf_put(msg);
+    send_openflow_buffer(msg);
+}
+
+/* Sends an OFMPT_CONFIG_UPDATE_ACK with transaction id 'xid'.  The
+ * OFMPCUAF_SUCCESS flag is set only if 'success' is true, and the cookie
+ * field is filled in by cfg_get_cookie(). */
+static void
+send_config_update_ack(uint32_t xid, bool success)
+{
+    struct ofmp_config_update_ack *ack;
+    struct ofpbuf *msg;
+
+    ack = make_ofmp_xid(sizeof *ack, OFMPT_CONFIG_UPDATE_ACK, xid, &msg);
+    ack->format = htonl(OFMPCOF_SIMPLE);
+    if (success) {
+        ack->flags = htonl(OFMPCUAF_SUCCESS);
+    }
+    cfg_get_cookie(ack->cookie);
+
+    send_openflow_buffer(msg);
+}
+
+static void
+send_ofmp_error_msg(uint32_t xid, uint16_t type, uint16_t code,
+ const void *data, size_t len)
+{
+ struct ofpbuf *buffer;
+ struct ofmp_error_msg *oem;
+
+ oem = make_ofmp_xid(sizeof(*oem)+len, OFMPT_ERROR, xid, &buffer);
+ oem->type = htons(type);
+ oem->code = htons(code);
+ memcpy(oem->data, data, len);
+ send_openflow_buffer(buffer);
+}
+
+static void
+send_error_msg(uint32_t xid, uint16_t type, uint16_t code,
+ const void *data, size_t len)
+{
+ struct ofpbuf *buffer;
+ struct ofp_error_msg *oem;
+
+ oem = make_openflow_xid(sizeof(*oem)+len, OFPT_ERROR, xid, &buffer);
+ oem->type = htons(type);
+ oem->code = htons(code);
+ memcpy(oem->data, data, len);
+ send_openflow_buffer(buffer);
+}
+
+/* Handles an OFPT_ECHO_REQUEST by sending back the reply built by
+ * make_echo_reply() from 'msg'.  Always returns 0. */
+static int
+recv_echo_request(uint32_t xid UNUSED, const void *msg)
+{
+    const struct ofp_header *request = msg;
+    struct ofpbuf *reply = make_echo_reply(request);
+
+    send_openflow_buffer(reply);
+    return 0;
+}
+
+/* Handles an OFPT_FEATURES_REQUEST by sending a features reply with the
+ * same transaction id.  The request body is not examined.  Always returns
+ * 0. */
+static int
+recv_features_request(uint32_t xid, const void *msg UNUSED)
+{
+    send_features_reply(xid);
+
+    return 0;
+}
+
+/* Handles an OFPT_SET_CONFIG message.  There is nothing to configure on the
+ * management channel, so the message is accepted and ignored.  Always
+ * returns 0. */
+static int
+recv_set_config(uint32_t xid UNUSED, const void *msg UNUSED)
+{
+    return 0;
+}
+
+/* Handles an OFMPT_CAPABILITY_REQUEST.  The message must be exactly
+ * sizeof(struct ofmp_capability_request) bytes long and use the
+ * OFMPCAF_SIMPLE format; otherwise -EINVAL is returned and nothing is sent.
+ * On success sends a capability reply with transaction id 'xid' and returns
+ * 0. */
+static int
+recv_ofmp_capability_request(uint32_t xid, const struct ofmp_header *ofmph)
+{
+    struct ofmp_capability_request *ofmpcr;
+
+    /* The length arrives in network byte order, so decode it with ntohs()
+     * (same result as the htons() used before, but the right direction). */
+    if (ntohs(ofmph->header.header.length) != sizeof(*ofmpcr)) {
+        /* xxx Send error */
+        return -EINVAL;
+    }
+
+    ofmpcr = (struct ofmp_capability_request *)ofmph;
+    if (ofmpcr->format != htonl(OFMPCAF_SIMPLE)) {
+        /* xxx Send error */
+        return -EINVAL;
+    }
+
+    send_capability_reply(xid);
+
+    return 0;
+}
+
+/* Handles an OFMPT_RESOURCES_REQUEST by sending a resources update that
+ * carries the request's transaction id.  The request body is not examined.
+ * Always returns 0. */
+static int
+recv_ofmp_resources_request(uint32_t xid, const void *msg UNUSED)
+{
+    const bool reply_with_xid = true;
+
+    send_resources_update(xid, reply_with_xid);
+    return 0;
+}
+
+/* Handles an OFMPT_CONFIG_REQUEST.  The message must be exactly
+ * sizeof(struct ofmp_config_request) bytes long and use the OFMPCOF_SIMPLE
+ * format; otherwise -EINVAL is returned and nothing is sent.  On success
+ * sends a config update with transaction id 'xid' and returns 0. */
+static int
+recv_ofmp_config_request(uint32_t xid, const struct ofmp_header *ofmph)
+{
+    struct ofmp_config_request *ofmpcr;
+
+    /* The length arrives in network byte order, so decode it with ntohs()
+     * (same result as the htons() used before, but the right direction). */
+    if (ntohs(ofmph->header.header.length) != sizeof(*ofmpcr)) {
+        /* xxx Send error */
+        return -EINVAL;
+    }
+
+    ofmpcr = (struct ofmp_config_request *)ofmph;
+    if (ofmpcr->format != htonl(OFMPCOF_SIMPLE)) {
+        /* xxx Send error */
+        return -EINVAL;
+    }
+
+    send_config_update(xid, true);
+
+    return 0;
+}
+
+/* Handles an OFMPT_CONFIG_UPDATE: replaces the configuration with the data
+ * carried in the message, acknowledges it, and triggers a reconfigure.
+ *
+ * Returns -EINVAL if the message carries no data beyond the fixed header or
+ * does not use the OFMPCOF_SIMPLE format.  If cfg_lock() rejects the
+ * supplied cookie, sends a negative ack and returns 0 without changing
+ * anything.  Otherwise returns 0 after applying the update. */
+static int
+recv_ofmp_config_update(uint32_t xid, const struct ofmp_header *ofmph)
+{
+    struct ofmp_config_update *ofmpcu;
+    size_t msg_len;
+    int data_len;
+
+    /* Validate the total length before subtracting.  The previous check
+     * compared the payload length against sizeof(*ofmpcu) in unsigned
+     * arithmetic, which let a too-short message (negative payload length)
+     * through and rejected valid payloads no longer than the header. */
+    msg_len = ntohs(ofmph->header.header.length);
+    if (msg_len <= sizeof(*ofmpcu)) {
+        /* xxx Send error. */
+        return -EINVAL;
+    }
+    data_len = msg_len - sizeof(*ofmpcu);
+
+    ofmpcu = (struct ofmp_config_update *)ofmph;
+    if (ofmpcu->format != htonl(OFMPCOF_SIMPLE)) {
+        /* xxx Send error */
+        return -EINVAL;
+    }
+
+    /* Check if the supplied cookie matches our current understanding of
+     * it.  If they don't match, tell the controller and let it sort
+     * things out. */
+    if (cfg_lock(ofmpcu->cookie, 0)) {
+        /* xxx cfg_lock can fail for other reasons, such as being
+         * xxx locked... */
+        VLOG_WARN_RL(&rl, "config update failed due to bad cookie\n");
+        send_config_update_ack(xid, false);
+        return 0;
+    }
+
+    /* xxx We should probably do more sanity checking than this. */
+
+    cfg_write_data(ofmpcu->data, data_len);
+    cfg_unlock();
+
+    /* Send the ACK before running reconfigure, since our management
+     * connection settings may have changed. */
+    send_config_update_ack(xid, true);
+
+    reconfigure();
+
+    return 0;
+}
+
+/* Dispatches management (NXT_MGMT) message 'ofmph' with transaction id
+ * 'xid' to the handler for its type.  Returns the handler's result, or
+ * -EINVAL if the message is too short to hold a management header or its
+ * type is unknown. */
+static int
+recv_ofmp(uint32_t xid, struct ofmp_header *ofmph)
+{
+    /* Make sure the 'type' field read below lies inside the message.  The
+     * OpenFlow header itself, which holds the length, has already been
+     * validated by the caller chain. */
+    if (ntohs(ofmph->header.header.length) < sizeof *ofmph) {
+        VLOG_WARN_RL(&rl, "mgmt message too short (%d bytes)",
+                     ntohs(ofmph->header.header.length));
+        return -EINVAL;
+    }
+
+    /* xxx Should sanity-check for max length */
+    switch (ntohs(ofmph->type)) {
+    case OFMPT_CAPABILITY_REQUEST:
+        return recv_ofmp_capability_request(xid, ofmph);
+    case OFMPT_RESOURCES_REQUEST:
+        return recv_ofmp_resources_request(xid, ofmph);
+    case OFMPT_CONFIG_REQUEST:
+        return recv_ofmp_config_request(xid, ofmph);
+    case OFMPT_CONFIG_UPDATE:
+        return recv_ofmp_config_update(xid, ofmph);
+    default:
+        VLOG_WARN_RL(&rl, "unknown mgmt message: %d",
+                     ntohs(ofmph->type));
+        return -EINVAL;
+    }
+}
+
+/* Handles a Nicira vendor message 'oh' with transaction id 'xid'.
+ * NXT_MGMT messages are passed to recv_ofmp(); any other subtype is
+ * answered with an OFPBRC_BAD_SUBTYPE error and -EINVAL is returned.
+ * Returns -EINVAL without replying if the message is too short to contain a
+ * subtype. */
+static int
+recv_nx_msg(uint32_t xid, const void *oh)
+{
+    const struct nicira_header *nh = oh;
+
+    /* The caller only guarantees a struct ofp_vendor_header; make sure the
+     * 'subtype' field is really there before reading it. */
+    if (ntohs(nh->header.length) < sizeof *nh) {
+        VLOG_WARN_RL(&rl, "Nicira vendor message too short (%d bytes)",
+                     ntohs(nh->header.length));
+        return -EINVAL;
+    }
+
+    switch (ntohl(nh->subtype)) {
+
+    case NXT_MGMT:
+        return recv_ofmp(xid, (struct ofmp_header *)oh);
+
+    default:
+        send_error_msg(xid, OFPET_BAD_REQUEST, OFPBRC_BAD_SUBTYPE,
+                       oh, ntohs(nh->header.length));
+        return -EINVAL;
+    }
+}
+
+/* Handles an OFPT_VENDOR message 'oh' with transaction id 'xid'.  Messages
+ * with the Nicira vendor id go to recv_nx_msg(); any other vendor is logged,
+ * answered with an OFPBRC_BAD_VENDOR error, and -EINVAL is returned. */
+static int
+recv_vendor(uint32_t xid, const void *oh)
+{
+    const struct ofp_vendor_header *vendor_hdr = oh;
+
+    if (ntohl(vendor_hdr->vendor) == NX_VENDOR_ID) {
+        return recv_nx_msg(xid, oh);
+    }
+
+    VLOG_WARN_RL(&rl, "unknown vendor: 0x%x", ntohl(vendor_hdr->vendor));
+    send_error_msg(xid, OFPET_BAD_REQUEST, OFPBRC_BAD_VENDOR,
+                   oh, ntohs(vendor_hdr->header.length));
+    return -EINVAL;
+}
+
+/* Processes one OpenFlow message received on the management connection.
+ * 'msg' points to 'length' bytes, which the caller guarantees is at least
+ * sizeof(struct ofp_header); 'xid' is the message's transaction id.
+ *
+ * Returns 0 on success or for messages that need no action, -EINVAL for a
+ * bad length field, unexpected version or unknown type, -EFAULT if the
+ * message is shorter than its type requires, or otherwise the value
+ * returned by the type's handler. */
+static int
+handle_msg(uint32_t xid, const void *msg, size_t length)
+{
+    int (*handler)(uint32_t, const void *);
+    const struct ofp_header *oh;
+    size_t min_size;
+
+    COVERAGE_INC(mgmt_received);
+
+    /* Check encapsulated length. */
+    oh = msg;
+    if (ntohs(oh->length) > length) {
+        return -EINVAL;
+    }
+
+    /* The version comes from the peer, so reject a mismatch instead of
+     * asserting on it. */
+    if (oh->version != OFP_VERSION) {
+        VLOG_WARN_RL(&rl, "received OpenFlow version 0x%02x, expected 0x%02x",
+                     (unsigned int) oh->version, (unsigned int) OFP_VERSION);
+        return -EINVAL;
+    }
+
+    /* Figure out how to handle it. */
+    switch (oh->type) {
+    case OFPT_ECHO_REQUEST:
+        min_size = sizeof(struct ofp_header);
+        handler = recv_echo_request;
+        break;
+    case OFPT_ECHO_REPLY:
+        return 0;
+    case OFPT_FEATURES_REQUEST:
+        min_size = sizeof(struct ofp_header);
+        handler = recv_features_request;
+        break;
+    case OFPT_SET_CONFIG:
+        min_size = sizeof(struct ofp_switch_config);
+        handler = recv_set_config;
+        break;
+    case OFPT_VENDOR:
+        min_size = sizeof(struct ofp_vendor_header);
+        handler = recv_vendor;
+        break;
+    default:
+        VLOG_WARN_RL(&rl, "unknown openflow type: %d", oh->type);
+        send_error_msg(xid, OFPET_BAD_REQUEST, OFPBRC_BAD_TYPE,
+                       msg, length);
+        return -EINVAL;
+    }
+
+    /* Handle it. */
+    if (length < min_size) {
+        return -EFAULT;
+    }
+    return handler(xid, msg);
+}
+
+/* Periodic processing for the management connection: runs the connection's
+ * state machine and handles up to 50 received messages.  Does nothing if
+ * there is no management connection. */
+void
+mgmt_run(void)
+{
+    int i;
+
+    if (!mgmt_rconn) {
+        return;
+    }
+
+    rconn_run(mgmt_rconn);
+
+    /* Do some processing, but cap it at a reasonable amount so that
+     * other processing doesn't starve.
+     *
+     * Handling a config update calls reconfigure(), after which the
+     * management connection may have been replaced or removed, so re-check
+     * 'mgmt_rconn' on every iteration.  (NOTE(review): assumes
+     * reconfigure() ends up in mgmt_reconfigure() -- confirm in
+     * ovs-vswitchd.c.) */
+    for (i = 0; i < 50 && mgmt_rconn; i++) {
+        struct ofpbuf *buffer;
+
+        buffer = rconn_recv(mgmt_rconn);
+        if (!buffer) {
+            break;
+        }
+
+        if (buffer->size >= sizeof(struct ofp_header)) {
+            const struct ofp_header *oh = buffer->data;
+
+            handle_msg(oh->xid, buffer->data, buffer->size);
+        } else {
+            VLOG_WARN_RL(&rl, "received too-short OpenFlow message");
+        }
+
+        /* We own every received buffer, including too-short ones, which
+         * were previously leaked. */
+        ofpbuf_delete(buffer);
+    }
+}
+
+/* Registers the management connection's wake-up conditions with the poll
+ * loop, so that it wakes when rconn_run() or rconn_recv() would have work
+ * to do.  Does nothing if there is no management connection. */
+void
+mgmt_wait(void)
+{
+    if (mgmt_rconn) {
+        rconn_run_wait(mgmt_rconn);
+        rconn_recv_wait(mgmt_rconn);
+    }
+}
+
+/* Returns a management id built from a random Ethernet address whose first
+ * three bytes are replaced by the Nicira OUI, 00:23:20. */
+static uint64_t
+pick_fallback_mgmt_id(void)
+{
+    static const uint8_t nicira_oui[3] = { 0x00, 0x23, 0x20 };
+    uint8_t ea[ETH_ADDR_LEN];
+    size_t i;
+
+    eth_addr_random(ea);
+    for (i = 0; i < sizeof nicira_oui; i++) {
+        ea[i] = nicira_oui[i];          /* Set Nicira OUI. */
+    }
+    return eth_addr_to_uint64(ea);
+}
diff --git a/vswitchd/mgmt.h b/vswitchd/mgmt.h
new file mode 100644
index 000000000..6db66c69f
--- /dev/null
+++ b/vswitchd/mgmt.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#ifndef VSWITCHD_MGMT_H
+#define VSWITCHD_MGMT_H 1
+
+/* Public interface of the vswitchd management-connection module. */
+
+/* <stdint.h> supplies 'uint64_t' used below, so that this header does not
+ * depend on what its includers happen to have included first. */
+#include <stdint.h>
+
+void mgmt_init(void);
+void mgmt_reconfigure(void);
+void mgmt_run(void);
+void mgmt_wait(void);
+
+/* NOTE(review): mgmt.c defines no mgmt_get_mgmt_id(); it exports the global
+ * variable 'mgmt_id' instead.  Confirm whether callers exist and either add
+ * the definition or drop this prototype. */
+uint64_t mgmt_get_mgmt_id(void);
+
+#endif /* mgmt.h */
diff --git a/vswitchd/ovs-brcompatd.8.in b/vswitchd/ovs-brcompatd.8.in
new file mode 100644
index 000000000..ebd67028f
--- /dev/null
+++ b/vswitchd/ovs-brcompatd.8.in
@@ -0,0 +1,49 @@
+.TH ovs\-brcompatd 8 "March 2009" "Open vSwitch" "Open vSwitch Manual"
+.ds PN ovs\-brcompatd
+.
+.SH NAME
+ovs\-brcompatd \- Bridge compatibility front-end for ovs\-vswitchd
+.
+.SH SYNOPSIS
+.B ovs\-brcompatd
+[\fIoptions\fR] \fIconfig\fR
+.
+.SH DESCRIPTION
+A daemon that provides a legacy bridge front-end for \fBovs\-vswitchd\fR. It
+does this by listening for bridge ioctl commands (e.g., those generated by
+the \fBbrctl\fR program) to add or remove datapaths and the interfaces
+that attach to them. It modifies \fIconfig\fR and forces
+\fBovs\-vswitchd\fR to reload its configuration file.
+.PP
+.SH OPTIONS
+.IP "\fB--reload-command=\fIcommand\fR"
+Sets \fIcommand\fR as the command that \fBovs\-brcompatd\fR runs to force
+\fBovs\-vswitchd\fR to reload its configuration file.  The command is run in
+a subshell, so it may contain arbitrary shell metacharacters, etc.
+The \fB--help\fR option displays the default reload command.
+.TP
+\fB--prune-timeout=\fIsecs\fR
+.
+Sets the maximum time between pruning port entries to \fIsecs\fR seconds.
+Pruning ports is the process of removing port entries from \fIconfig\fR
+that no longer exist. If \fIsecs\fR is zero, then entries are never
+pruned. The default prune timeout is 5 seconds.
+.TP
+\fB--lock-timeout=\fImsecs\fR
+.
+Sets the maximum time to wait for \fIconfig\fR to become unlocked to
+\fImsecs\fR milliseconds. The default lock timeout is 500 milliseconds.
+.
+.so lib/daemon.man
+.so lib/vlog.man
+.so lib/common.man
+.so lib/leak-checker.man
+.
+.SH NOTES
+\fBovs\-brcompatd\fR requires the \fBbrcompat_mod.ko\fR kernel module to be
+loaded.
+.SH "SEE ALSO"
+.BR ovs\-appctl (8),
+.BR ovs\-vswitchd (8),
+.BR ovs\-vswitchd.conf (5),
+\fBINSTALL\fR in the Open vSwitch distribution.
diff --git a/vswitchd/ovs-brcompatd.c b/vswitchd/ovs-brcompatd.c
new file mode 100644
index 000000000..93d9469bd
--- /dev/null
+++ b/vswitchd/ovs-brcompatd.c
@@ -0,0 +1,766 @@
+/* Copyright (c) 2008, 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <config.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <getopt.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <net/if.h>
+#include <linux/genetlink.h>
+#include <linux/rtnetlink.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "cfg.h"
+#include "command-line.h"
+#include "coverage.h"
+#include "daemon.h"
+#include "dirs.h"
+#include "dpif.h"
+#include "fatal-signal.h"
+#include "fault.h"
+#include "leak-checker.h"
+#include "netdev.h"
+#include "netlink.h"
+#include "ofpbuf.h"
+#include "openvswitch/brcompat-netlink.h"
+#include "poll-loop.h"
+#include "process.h"
+#include "signals.h"
+#include "svec.h"
+#include "timeval.h"
+#include "unixctl.h"
+#include "util.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_brcompatd
+
+
+/* XXX The daemon just hangs if the datapath module is removed and reinserted
+ * (rmmod/insmod). Should it learn to reconnect? */
+
+/* Actions to modify bridge compatibility configuration.
+ *
+ * NOTE(review): none of these enumerators is referenced anywhere else in
+ * this file; the request handlers dispatch on the BRC_GENL_C_* command codes
+ * instead. */
+enum bmc_action {
+ BMC_ADD_DP,
+ BMC_DEL_DP,
+ BMC_ADD_PORT,
+ BMC_DEL_PORT
+};
+
+static void parse_options(int argc, char *argv[]);
+static void usage(void) NO_RETURN;
+
+/* Rate limiter shared by every VLOG_*_RL() call in this file. */
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 60);
+
+/* Maximum number of milliseconds to wait for the config file to be
+ * unlocked. If set to zero, no waiting will occur. Overridden by the
+ * --lock-timeout option. */
+static int lock_timeout = 500;
+
+/* Maximum number of milliseconds to wait before pruning port entries that
+ * no longer exist. If set to zero, ports are never pruned. Overridden by
+ * the --prune-timeout option, whose argument is given in seconds. */
+static int prune_timeout = 5000;
+
+/* Config file shared with ovs-vswitchd (usually ovs-vswitchd.conf). Taken
+ * from the single non-option command-line argument. */
+static char *config_file;
+
+/* Command to run (via system()) to reload the ovs-vswitchd configuration
+ * file. parse_options() installs a default that --reload-command can
+ * replace. */
+static char *reload_command;
+
+/* Netlink socket to listen for interface changes. Only created by main()
+ * when 'prune_timeout' is nonzero. */
+static struct nl_sock *rtnl_sock;
+
+/* Netlink socket to bridge compatibility kernel module. */
+static struct nl_sock *brc_sock;
+
+/* The Generic Netlink family number used for bridge compatibility. Filled
+ * in by brc_open(). */
+static int brc_family;
+
+/* Attribute expected in the reply to a BRC_GENL_C_QUERY_MC request (see
+ * lookup_brc_multicast_group()). */
+static const struct nl_policy brc_multicast_policy[] = {
+ [BRC_GENL_A_MC_GROUP] = {.type = NL_A_U32 }
+};
+
+/* Attributes extracted from rtnetlink link messages by rtnl_recv_update():
+ * the interface name is mandatory, the master (bridge) index is optional. */
+static const struct nl_policy rtnlgrp_link_policy[] = {
+ [IFLA_IFNAME] = { .type = NL_A_STRING, .optional = false },
+ [IFLA_MASTER] = { .type = NL_A_U32, .optional = true },
+};
+
+/* Sends a BRC_GENL_C_QUERY_MC request on a temporary Generic Netlink socket
+ * and stores the BRC_GENL_A_MC_GROUP value from the reply in
+ * '*multicast_group'.  'brc_family' must already be set.
+ *
+ * Returns 0 if successful.  Otherwise returns the error reported by
+ * nl_sock_create() or nl_sock_transact(), or EPROTO if the reply does not
+ * match 'brc_multicast_policy'; '*multicast_group' is then left untouched. */
+static int
+lookup_brc_multicast_group(int *multicast_group)
+{
+    struct nlattr *attrs[ARRAY_SIZE(brc_multicast_policy)];
+    struct ofpbuf request, *reply;
+    struct nl_sock *sock;
+    int error;
+
+    error = nl_sock_create(NETLINK_GENERIC, 0, 0, 0, &sock);
+    if (error) {
+        return error;
+    }
+
+    ofpbuf_init(&request, 0);
+    nl_msg_put_genlmsghdr(&request, sock, 0, brc_family,
+                          NLM_F_REQUEST, BRC_GENL_C_QUERY_MC, 1);
+    error = nl_sock_transact(sock, &request, &reply);
+    ofpbuf_uninit(&request);
+    if (error) {
+        /* No reply to release on this path. */
+        goto exit;
+    }
+
+    if (nl_policy_parse(reply, NLMSG_HDRLEN + GENL_HDRLEN,
+                        brc_multicast_policy, attrs,
+                        ARRAY_SIZE(brc_multicast_policy))) {
+        /* Read the attribute before the reply that holds it is freed. */
+        *multicast_group = nl_attr_get_u32(attrs[BRC_GENL_A_MC_GROUP]);
+    } else {
+        error = EPROTO;
+    }
+    ofpbuf_delete(reply);
+
+exit:
+    nl_sock_destroy(sock);
+    return error;
+}
+
+/* Opens in '*sock' a Generic Netlink socket that receives brcompat
+ * notifications: looks up the brcompat family number (stored in
+ * 'brc_family'), asks for its multicast group, and creates a socket joined
+ * to that group.  Returns 0 if successful, otherwise a positive errno
+ * value. */
+static int
+brc_open(struct nl_sock **sock)
+{
+    int mc_group = 0;
+    int error;
+
+    error = nl_lookup_genl_family(BRC_GENL_FAMILY_NAME, &brc_family);
+    if (!error) {
+        error = lookup_brc_multicast_group(&mc_group);
+    }
+    if (!error) {
+        error = nl_sock_create(NETLINK_GENERIC, mc_group, 0, 0, sock);
+    }
+    return error;
+}
+
+/* Policy requiring a datapath (bridge) name string attribute.
+ *
+ * NOTE(review): not referenced anywhere else in this file; parse_command()
+ * uses its own local policy. */
+static const struct nl_policy brc_dp_policy[] = {
+ [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING },
+};
+
+/* Returns true if the configuration contains a "bridge.<name>" section,
+ * false otherwise. */
+static bool
+bridge_exists(const char *name)
+{
+    bool found = cfg_has_section("bridge.%s", name);
+
+    return found;
+}
+
+/* If the in-memory configuration has unsaved changes, writes it out with
+ * cfg_write(), re-reads it with cfg_read(), and runs 'reload_command'
+ * through system() so that ovs-vswitchd picks up the change.  All three
+ * steps are attempted even if an earlier one fails.
+ *
+ * Returns 0 if the configuration was not dirty or if every step succeeded.
+ * Otherwise returns the cfg_write() error if there was one, else the
+ * cfg_read() error if there was one, else ECHILD if the reload command could
+ * not be run or reported a nonzero status. */
+static int
+rewrite_and_reload_config(void)
+{
+    long long int reload_start, elapsed;
+    int write_error, read_error;
+    int status, system_errno;
+
+    if (!cfg_is_dirty()) {
+        return 0;
+    }
+
+    write_error = cfg_write();
+    read_error = cfg_read();
+
+    reload_start = time_msec();
+    status = system(reload_command);
+    /* Capture errno right away: the timing, coverage and logging calls
+     * below may overwrite it before the failure is reported. */
+    system_errno = errno;
+    elapsed = time_msec() - reload_start;
+    COVERAGE_INC(brcompatd_reload);
+    if (elapsed > 0) {
+        VLOG_INFO("reload command executed in %lld ms", elapsed);
+    }
+
+    if (status == -1) {
+        VLOG_ERR("failed to execute reload command: %s",
+                 strerror(system_errno));
+    } else if (status != 0) {
+        char *msg = process_status_msg(status);
+        VLOG_ERR("reload command exited with error (%s)", msg);
+        free(msg);
+    }
+
+    return write_error ? write_error
+           : read_error ? read_error
+           : status ? ECHILD
+           : 0;
+}
+
+/* Walks every bridge in the configuration file and removes the entries of
+ * ports whose network devices no longer exist.
+ *
+ * The configuration lock is taken without waiting; if it cannot be obtained
+ * the function returns without doing anything.  Bonded ports are expanded
+ * into their slave interfaces before checking.  If anything was removed, the
+ * configuration is rewritten and ovs-vswitchd is told to reload. */
+static void
+prune_ports(void)
+{
+    struct svec bridges, dead;
+    size_t br_idx, n;
+
+    if (cfg_lock(NULL, 0)) {
+        /* Couldn't lock config file. */
+        return;
+    }
+
+    svec_init(&bridges);
+    svec_init(&dead);
+    cfg_get_subsections(&bridges, "bridge");
+    for (br_idx = 0; br_idx < bridges.n; br_idx++) {
+        const char *br_name = bridges.names[br_idx];
+        struct svec ports, ifaces;
+
+        /* Collect the bridge's interfaces, replacing each bonded port by
+         * the slaves that make it up. */
+        svec_init(&ports);
+        svec_init(&ifaces);
+        cfg_get_all_keys(&ports, "bridge.%s.port", br_name);
+        for (n = 0; n < ports.n; n++) {
+            const char *port_name = ports.names[n];
+
+            if (!cfg_has_section("bonding.%s", port_name)) {
+                svec_add(&ifaces, port_name);
+            } else {
+                struct svec slaves;
+
+                svec_init(&slaves);
+                cfg_get_all_keys(&slaves, "bonding.%s.slave", port_name);
+                svec_append(&ifaces, &slaves);
+                svec_destroy(&slaves);
+            }
+        }
+        svec_destroy(&ports);
+
+        /* Queue for deletion every interface whose device is gone. */
+        for (n = 0; n < ifaces.n; n++) {
+            const char *iface_name = ifaces.names[n];
+            enum netdev_flags flags;
+            int error;
+
+            /* ovs-vswitchd itself creates and destroys the local port and
+             * internal ports, so they are never checked.  They may be
+             * missing simply because ovs-vswitchd has not finished reloading
+             * since the configuration file was last updated. */
+            if (!strcmp(iface_name, br_name)
+                || cfg_get_bool(0, "iface.%s.internal", iface_name)) {
+                continue;
+            }
+
+            error = netdev_nodev_get_flags(iface_name, &flags);
+            if (error == ENODEV) {
+                VLOG_DBG_RL(&rl, "removing dead interface %s from %s",
+                            iface_name, br_name);
+                svec_add(&dead, iface_name);
+            } else if (error) {
+                VLOG_DBG_RL(&rl, "unknown error %d on interface %s from %s",
+                            error, iface_name, br_name);
+            }
+        }
+        svec_destroy(&ifaces);
+    }
+    svec_destroy(&bridges);
+
+    for (n = 0; n < dead.n; n++) {
+        cfg_del_match("bridge.*.port=%s", dead.names[n]);
+        cfg_del_match("bonding.*.slave=%s", dead.names[n]);
+    }
+    if (dead.n) {
+        rewrite_and_reload_config();
+    }
+    cfg_unlock();
+    svec_destroy(&dead);
+}
+
+
+/* Returns true if a network device named 'name' exists, false otherwise.
+ * Existence is determined by whether stat() succeeds on
+ * /sys/class/net/<name>.
+ *
+ * XXX it is possible that this doesn't entirely accomplish what we want in
+ * context, since ovs-vswitchd.conf may cause vswitchd to create or destroy
+ * network devices based on iface.*.internal settings.
+ *
+ * XXX may want to move this to lib/netdev. */
+static bool
+netdev_exists(const char *name)
+{
+    char *path = xasprintf("/sys/class/net/%s", name);
+    struct stat st;
+    bool exists;
+
+    exists = !stat(path, &st);
+    free(path);
+    return exists;
+}
+
+/* Handles "addbr": adds bridge 'br_name' to the in-memory configuration by
+ * creating a "bridge.<br_name>.port=<br_name>" entry.
+ *
+ * Returns 0 on success, and also when a network device of that name already
+ * exists and is configured as a fake bridge (nothing is added then).
+ * Returns EEXIST if the bridge is already configured or if an unrelated
+ * network device already has that name. */
+static int
+add_bridge(const char *br_name)
+{
+    if (bridge_exists(br_name)) {
+        VLOG_WARN("addbr %s: bridge %s exists", br_name, br_name);
+        return EEXIST;
+    }
+
+    if (netdev_exists(br_name)) {
+        if (!cfg_get_bool(0, "iface.%s.fake-bridge", br_name)) {
+            VLOG_WARN("addbr %s: cannot create bridge %s because a network "
+                      "device named %s already exists",
+                      br_name, br_name, br_name);
+            return EEXIST;
+        }
+        VLOG_WARN("addbr %s: %s exists as a fake bridge",
+                  br_name, br_name);
+        return 0;
+    }
+
+    cfg_add_entry("bridge.%s.port=%s", br_name, br_name);
+    VLOG_INFO("addbr %s: success", br_name);
+    return 0;
+}
+
+/* Handles "delbr": removes the whole "bridge.<br_name>" section from the
+ * in-memory configuration.  Returns 0 on success, or ENXIO if no such bridge
+ * is configured. */
+static int
+del_bridge(const char *br_name)
+{
+    if (bridge_exists(br_name)) {
+        cfg_del_section("bridge.%s", br_name);
+        VLOG_INFO("delbr %s: success", br_name);
+        return 0;
+    }
+
+    VLOG_WARN("delbr %s: no bridge named %s", br_name, br_name);
+    return ENXIO;
+}
+
+/* Extracts the fields of a brcompat request from 'buffer'.
+ *
+ * On success stores the Netlink sequence number in '*seq', the datapath
+ * name in '*br_name' and, if 'port_name' is nonnull, the port name in
+ * '*port_name', then returns 0.  The strings are obtained from the parsed
+ * attributes of 'buffer'.
+ *
+ * Returns EINVAL, without storing anything, if the message does not match
+ * the policy or if 'port_name' is nonnull but the message carries no port
+ * name. */
+static int
+parse_command(struct ofpbuf *buffer, uint32_t *seq, const char **br_name,
+              const char **port_name)
+{
+    static const struct nl_policy policy[] = {
+        [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING },
+        [BRC_GENL_A_PORT_NAME] = { .type = NL_A_STRING, .optional = true },
+    };
+    struct nlattr *attrs[ARRAY_SIZE(policy)];
+
+    if (!nl_policy_parse(buffer, NLMSG_HDRLEN + GENL_HDRLEN, policy,
+                         attrs, ARRAY_SIZE(policy))) {
+        return EINVAL;
+    }
+
+    if (port_name) {
+        /* The port name is optional in the policy but mandatory for callers
+         * that ask for it. */
+        if (!attrs[BRC_GENL_A_PORT_NAME]) {
+            return EINVAL;
+        }
+        *port_name = nl_attr_get_string(attrs[BRC_GENL_A_PORT_NAME]);
+    }
+
+    *seq = ((struct nlmsghdr *) buffer->data)->nlmsg_seq;
+    *br_name = nl_attr_get_string(attrs[BRC_GENL_A_DP_NAME]);
+    return 0;
+}
+
+/* Reports the outcome of a brcompat request over 'brc_sock'.  The reply is
+ * a BRC_GENL_C_DP_RESULT message that carries 'error' in a
+ * BRC_GENL_A_ERR_CODE attribute and reuses the request's sequence number
+ * 'seq'.  A failure to send is logged (rate-limited) and otherwise
+ * ignored. */
+static void
+send_reply(uint32_t seq, int error)
+{
+    struct nlmsghdr *nlmsghdr;
+    struct ofpbuf reply;
+    int send_error;
+
+    /* Build the message. */
+    ofpbuf_init(&reply, 0);
+    nl_msg_put_genlmsghdr(&reply, brc_sock, 32, brc_family, NLM_F_REQUEST,
+                          BRC_GENL_C_DP_RESULT, 1);
+    /* Overwrite the sequence number before anything else is appended. */
+    nlmsghdr = (struct nlmsghdr *) reply.data;
+    nlmsghdr->nlmsg_seq = seq;
+    nl_msg_put_u32(&reply, BRC_GENL_A_ERR_CODE, error);
+
+    /* Transmit it. */
+    send_error = nl_sock_send(brc_sock, &reply, false);
+    if (send_error) {
+        VLOG_WARN_RL(&rl, "replying to brcompat request: %s",
+                     strerror(send_error));
+    }
+    ofpbuf_uninit(&reply);
+}
+
+/* Handles a bridge add ('add' true) or delete ('add' false) request held in
+ * 'buffer'.
+ *
+ * If the request cannot be parsed, returns the parse error and sends no
+ * reply.  Otherwise updates the configuration, rewrites and reloads it if
+ * the update succeeded, replies to the requester with the resulting error
+ * code, and returns that code (0 on success). */
+static int
+handle_bridge_cmd(struct ofpbuf *buffer, bool add)
+{
+    const char *br_name;
+    uint32_t seq;
+    int error;
+
+    error = parse_command(buffer, &seq, &br_name, NULL);
+    if (error) {
+        return error;
+    }
+
+    if (add) {
+        error = add_bridge(br_name);
+    } else {
+        error = del_bridge(br_name);
+    }
+    if (!error) {
+        error = rewrite_and_reload_config();
+    }
+    send_reply(seq, error);
+    return error;
+}
+
+/* Policy requiring both a datapath (bridge) name and a port name string
+ * attribute.
+ *
+ * NOTE(review): not referenced anywhere else in this file; parse_command()
+ * uses its own local policy. */
+static const struct nl_policy brc_port_policy[] = {
+ [BRC_GENL_A_DP_NAME] = { .type = NL_A_STRING },
+ [BRC_GENL_A_PORT_NAME] = { .type = NL_A_STRING },
+};
+
+/* Removes port 'port_name' of bridge 'br_name' from the in-memory
+ * configuration. The caller is responsible for writing the configuration
+ * back out (see rewrite_and_reload_config()). */
+static void
+del_port(const char *br_name, const char *port_name)
+{
+ /* Drop the port from this specific bridge... */
+ cfg_del_entry("bridge.%s.port=%s", br_name, port_name);
+ /* ...from any bond that lists it as a slave... */
+ cfg_del_match("bonding.*.slave=%s", port_name);
+ /* ...and discard every "vlan.<port_name>.*" key. */
+ cfg_del_match("vlan.%s.*", port_name);
+}
+
+/* Handles a port add ('add' true) or delete ('add' false) request held in
+ * 'buffer'.
+ *
+ * If the request cannot be parsed, returns the parse error and sends no
+ * reply.  Otherwise the named bridge must be configured and the named
+ * network device must exist, or the result is EINVAL; when both checks pass
+ * the configuration is updated, rewritten and reloaded.  The result is sent
+ * back to the requester and returned (0 on success). */
+static int
+handle_port_cmd(struct ofpbuf *buffer, bool add)
+{
+    const char *br_name, *port_name;
+    const char *cmd_name;
+    uint32_t seq;
+    int error;
+
+    error = parse_command(buffer, &seq, &br_name, &port_name);
+    if (error) {
+        return error;
+    }
+
+    /* Command name used only in log messages. */
+    cmd_name = add ? "add-if" : "del-if";
+
+    if (!bridge_exists(br_name)) {
+        VLOG_WARN("%s %s %s: no bridge named %s",
+                  cmd_name, br_name, port_name, br_name);
+        error = EINVAL;
+    } else if (!netdev_exists(port_name)) {
+        VLOG_WARN("%s %s %s: no network device named %s",
+                  cmd_name, br_name, port_name, port_name);
+        error = EINVAL;
+    } else {
+        if (!add) {
+            del_port(br_name, port_name);
+        } else {
+            cfg_add_entry("bridge.%s.port=%s", br_name, port_name);
+        }
+        VLOG_INFO("%s %s %s: success", cmd_name, br_name, port_name);
+        error = rewrite_and_reload_config();
+    }
+
+    send_reply(seq, error);
+    return error;
+}
+
+/* Receives at most one brcompat request from 'brc_sock' without blocking
+ * and dispatches it to the bridge or port handler.
+ *
+ * Returns the error from nl_sock_recv() if no message could be received
+ * (EAGAIN is not logged), EAGAIN if the configuration file could not be
+ * locked within 'lock_timeout', EPROTO for an unknown command, and otherwise
+ * the handler's result.  Messages that are too short for a Generic Netlink
+ * header or that belong to a different family are logged and dropped,
+ * returning 0. */
+static int
+brc_recv_update(void)
+{
+    struct genlmsghdr *genl;
+    struct ofpbuf *buffer = NULL;
+    int retval;
+
+    /* Fetch the next message of interest: retry on ENOBUFS and skip Netlink
+     * error and NLMSG_DONE messages.  Each discarded buffer is freed at the
+     * top of the next iteration. */
+    for (;;) {
+        ofpbuf_delete(buffer);
+        retval = nl_sock_recv(brc_sock, &buffer, false);
+        if (retval == ENOBUFS) {
+            continue;
+        }
+        if (retval
+            || (!nl_msg_nlmsgerr(buffer, NULL)
+                && nl_msg_nlmsghdr(buffer)->nlmsg_type != NLMSG_DONE)) {
+            break;
+        }
+    }
+    if (retval) {
+        if (retval != EAGAIN) {
+            VLOG_WARN_RL(&rl, "brc_recv_update: %s", strerror(retval));
+        }
+        return retval;
+    }
+
+    genl = nl_msg_genlmsghdr(buffer);
+    if (!genl) {
+        VLOG_WARN_RL(&rl, "received packet too short for generic NetLink");
+        goto done;
+    }
+
+    if (nl_msg_nlmsghdr(buffer)->nlmsg_type != brc_family) {
+        VLOG_DBG_RL(&rl, "received type (%"PRIu16") != brcompat family (%d)",
+                    nl_msg_nlmsghdr(buffer)->nlmsg_type, brc_family);
+        goto done;
+    }
+
+    if (cfg_lock(NULL, lock_timeout)) {
+        /* Couldn't lock config file. */
+        retval = EAGAIN;
+        goto done;
+    }
+
+    switch (genl->cmd) {
+    case BRC_GENL_C_DP_ADD:
+    case BRC_GENL_C_DP_DEL:
+        retval = handle_bridge_cmd(buffer, genl->cmd == BRC_GENL_C_DP_ADD);
+        break;
+
+    case BRC_GENL_C_PORT_ADD:
+    case BRC_GENL_C_PORT_DEL:
+        retval = handle_port_cmd(buffer, genl->cmd == BRC_GENL_C_PORT_ADD);
+        break;
+
+    default:
+        retval = EPROTO;
+    }
+
+    cfg_unlock();
+
+done:
+    ofpbuf_delete(buffer);
+    return retval;
+}
+
+/* Checks for interface configuration changes announced through RTNL.
+ *
+ * Receives at most one message from 'rtnl_sock' without blocking.  If it is
+ * an RTM_DELLINK for an interface that has a master (IFLA_MASTER) and that
+ * interface is listed as a port of the master bridge in the configuration
+ * file, removes the port from the configuration and reloads ovs-vswitchd.
+ *
+ * Receive errors other than EAGAIN are logged.  Malformed messages are
+ * logged and dropped.  If the master's name cannot be resolved or the
+ * configuration file cannot be locked within 'lock_timeout', the message is
+ * silently dropped. */
+static void
+rtnl_recv_update(void)
+{
+    struct nlattr *attrs[ARRAY_SIZE(rtnlgrp_link_policy)];
+    struct nlmsghdr *nlh;
+    struct ifinfomsg *iim;
+    struct ofpbuf *buf;
+    int error;
+
+    error = nl_sock_recv(rtnl_sock, &buf, false);
+    if (error) {
+        if (error == ENOBUFS) {
+            VLOG_WARN_RL(&rl, "network monitor socket overflowed");
+        } else if (error != EAGAIN) {
+            VLOG_WARN_RL(&rl, "error on network monitor socket: %s",
+                         strerror(error));
+        }
+        /* EAGAIN: nothing to do. */
+        return;
+    }
+
+    nlh = ofpbuf_at(buf, 0, NLMSG_HDRLEN);
+    iim = ofpbuf_at(buf, NLMSG_HDRLEN, sizeof *iim);
+    if (!iim) {
+        VLOG_WARN_RL(&rl, "received bad rtnl message (no ifinfomsg)");
+        goto done;
+    }
+
+    if (!nl_policy_parse(buf, NLMSG_HDRLEN + sizeof(struct ifinfomsg),
+                         rtnlgrp_link_policy,
+                         attrs, ARRAY_SIZE(rtnlgrp_link_policy))) {
+        VLOG_WARN_RL(&rl,"received bad rtnl message (policy)");
+        goto done;
+    }
+
+    if (nlh->nlmsg_type == RTM_DELLINK && attrs[IFLA_MASTER]) {
+        const char *port_name = nl_attr_get_string(attrs[IFLA_IFNAME]);
+        char br_name[IFNAMSIZ];
+        uint32_t br_idx = nl_attr_get_u32(attrs[IFLA_MASTER]);
+        struct svec ports;
+
+        if (!if_indextoname(br_idx, br_name)) {
+            goto done;
+        }
+
+        if (cfg_lock(NULL, lock_timeout)) {
+            /* Couldn't lock config file. */
+            /* xxx this should try again and print error msg. */
+            goto done;
+        }
+
+        svec_init(&ports);
+        cfg_get_all_keys(&ports, "bridge.%s.port", br_name);
+        svec_sort(&ports);
+        if (svec_contains(&ports, port_name)) {
+            del_port(br_name, port_name);
+            rewrite_and_reload_config();
+        }
+        /* Release the key list; it was previously leaked on every
+         * RTM_DELLINK for an enslaved interface. */
+        svec_destroy(&ports);
+        cfg_unlock();
+    }
+
+done:
+    ofpbuf_delete(buf);
+}
+
+/* Entry point for ovs-brcompatd.
+ *
+ * Parses the command line, daemonizes, opens the unixctl server, the
+ * brcompat Netlink socket and (when pruning is enabled) an rtnetlink socket,
+ * reads the configuration file, and then loops forever servicing them. The
+ * loop has no exit, so the final "return 0" is never reached. */
+int
+main(int argc, char *argv[])
+{
+ struct unixctl_server *unixctl;
+ int retval;
+
+ set_program_name(argv[0]);
+ register_fault_handlers();
+ time_init();
+ vlog_init();
+ parse_options(argc, argv);
+ signal(SIGPIPE, SIG_IGN);
+ process_init();
+
+ die_if_already_running();
+ daemonize();
+
+ retval = unixctl_server_create(NULL, &unixctl);
+ if (retval) {
+ ovs_fatal(retval, "could not listen for vlog connections");
+ }
+
+ if (brc_open(&brc_sock)) {
+ ovs_fatal(0, "could not open brcompat socket. Check "
+ "\"brcompat\" kernel module.");
+ }
+
+ /* The rtnetlink socket is only needed for pruning, so 'rtnl_sock' stays
+ * unset when pruning is disabled. */
+ if (prune_timeout) {
+ if (nl_sock_create(NETLINK_ROUTE, RTNLGRP_LINK, 0, 0, &rtnl_sock)) {
+ ovs_fatal(0, "could not create rtnetlink socket");
+ }
+ }
+
+ /* NOTE(review): the result of this initial cfg_read() is not checked. */
+ cfg_read();
+
+ for (;;) {
+ unixctl_server_run(unixctl);
+ /* The return value is ignored; failures are logged inside
+ * brc_recv_update() itself. */
+ brc_recv_update();
+
+ /* If 'prune_timeout' is non-zero, we actively prune from the
+ * config file any 'bridge.<br_name>.port' entries that are no
+ * longer valid. We use two methods:
+ *
+ * 1) The kernel explicitly notifies us of removed ports
+ * through the RTNL messages.
+ *
+ * 2) We periodically check all ports associated with bridges
+ * to see if they no longer exist.
+ */
+ if (prune_timeout) {
+ rtnl_recv_update();
+ /* Runs on every pass through the loop, not only when the prune
+ * timer below is what woke us up. */
+ prune_ports();
+
+ nl_sock_wait(rtnl_sock, POLLIN);
+ poll_timer_wait(prune_timeout);
+ }
+
+ /* Register the remaining wake-up sources, then sleep until one of
+ * them is ready. */
+ nl_sock_wait(brc_sock, POLLIN);
+ unixctl_server_wait(unixctl);
+ poll_block();
+ }
+
+ return 0;
+}
+
+/* Parses the command line in 'argc' and 'argv'.
+ *
+ * Sets 'lock_timeout', 'prune_timeout' and 'reload_command' from their
+ * options (installing the default reload command first), handles the
+ * daemon, vlog and leak-checker options through their handler macros, and
+ * registers the single required non-option argument as the configuration
+ * file.  Exits the process on --help, --version, an unrecognized option, a
+ * wrong number of arguments, or a failure to register the configuration
+ * file.
+ *
+ * NOTE(review): the timeout arguments are converted with atoi() and are not
+ * range-checked, so non-numeric or negative input is accepted as-is. */
+static void
+parse_options(int argc, char *argv[])
+{
+    enum {
+        OPT_LOCK_TIMEOUT = UCHAR_MAX + 1,
+        OPT_PRUNE_TIMEOUT,
+        OPT_RELOAD_COMMAND,
+        VLOG_OPTION_ENUMS,
+        LEAK_CHECKER_OPTION_ENUMS
+    };
+    static struct option long_options[] = {
+        {"help", no_argument, 0, 'h'},
+        {"version", no_argument, 0, 'V'},
+        {"lock-timeout", required_argument, 0, OPT_LOCK_TIMEOUT},
+        {"prune-timeout", required_argument, 0, OPT_PRUNE_TIMEOUT},
+        {"reload-command", required_argument, 0, OPT_RELOAD_COMMAND},
+        DAEMON_LONG_OPTIONS,
+        VLOG_LONG_OPTIONS,
+        LEAK_CHECKER_LONG_OPTIONS,
+        {0, 0, 0, 0},
+    };
+    char *short_options = long_options_to_short_options(long_options);
+    int error;
+
+    /* Install the default before parsing, because usage() prints whatever
+     * 'reload_command' holds when --help is encountered. */
+    reload_command = xasprintf("%s/ovs-appctl -t "
+                               "%s/ovs-vswitchd.`cat %s/ovs-vswitchd.pid`.ctl "
+                               "-e vswitchd/reload 2>&1 "
+                               "| /usr/bin/logger -t brcompatd-reload",
+                               ovs_bindir, ovs_rundir, ovs_rundir);
+    for (;;) {
+        int c;
+
+        c = getopt_long(argc, argv, short_options, long_options, NULL);
+        if (c == -1) {
+            break;
+        }
+
+        switch (c) {
+        case 'H':
+        case 'h':
+            usage();
+
+        case 'V':
+            OVS_PRINT_VERSION(0, 0);
+            exit(EXIT_SUCCESS);
+
+        case OPT_LOCK_TIMEOUT:
+            lock_timeout = atoi(optarg);
+            break;
+
+        case OPT_PRUNE_TIMEOUT:
+            /* The option is in seconds; 'prune_timeout' is in ms. */
+            prune_timeout = atoi(optarg) * 1000;
+            break;
+
+        case OPT_RELOAD_COMMAND:
+            /* Release the command being replaced (the default, or an
+             * earlier --reload-command) instead of leaking it.  Keeping a
+             * heap copy makes the free() safe if the option is repeated. */
+            free(reload_command);
+            reload_command = xasprintf("%s", optarg);
+            break;
+
+        VLOG_OPTION_HANDLERS
+        DAEMON_OPTION_HANDLERS
+        LEAK_CHECKER_OPTION_HANDLERS
+
+        case '?':
+            exit(EXIT_FAILURE);
+
+        default:
+            abort();
+        }
+    }
+    free(short_options);
+
+    argc -= optind;
+    argv += optind;
+
+    if (argc != 1) {
+        ovs_fatal(0, "exactly one non-option argument required; "
+                  "use --help for usage");
+    }
+
+    config_file = argv[0];
+    error = cfg_set_file(config_file);
+    if (error) {
+        ovs_fatal(error, "failed to add configuration file \"%s\"",
+                  config_file);
+    }
+}
+
+/* Prints a usage message for ovs-brcompatd to stdout and exits successfully.
+ * The message ends with the current value of 'reload_command', labelled as
+ * the default reload command. */
+static void
+usage(void)
+{
+ printf("%s: bridge compatibility front-end for ovs-vswitchd\n"
+ "usage: %s [OPTIONS] CONFIG\n"
+ "CONFIG is the configuration file used by ovs-vswitchd.\n",
+ program_name, program_name);
+ printf("\nConfiguration options:\n"
+ " --reload-command=COMMAND shell command to reload ovs-vswitchd\n"
+ " --prune-timeout=SECS wait at most SECS before pruning ports\n"
+ " --lock-timeout=MSECS wait at most MSECS for CONFIG to unlock\n"
+ );
+ daemon_usage();
+ vlog_usage();
+ printf("\nOther options:\n"
+ " -h, --help display this help message\n"
+ " -V, --version display version information\n");
+ leak_checker_usage();
+ /* NOTE(review): if --reload-command precedes --help on the command line,
+ * this prints the override rather than the built-in default. */
+ printf("\nThe default reload command is:\n%s\n", reload_command);
+ exit(EXIT_SUCCESS);
+}
diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in
new file mode 100644
index 000000000..28e55ba31
--- /dev/null
+++ b/vswitchd/ovs-vswitchd.8.in
@@ -0,0 +1,87 @@
+.TH ovs\-vswitchd 8 "March 2009" "Open vSwitch" "Open vSwitch Manual"
+.ds PN ovs\-vswitchd
+.
+.SH NAME
+ovs\-vswitchd \- virtual switch daemon
+.
+.SH SYNOPSIS
+.B ovs\-vswitchd
+\fIconfig\fR
+.
+.SH DESCRIPTION
+A daemon that manages and controls any number of virtual switches on
+the local machine.
+.PP
+The mandatory \fIconfig\fR argument specifies a configuration file.
+For a description of \fBovs\-vswitchd\fR configuration syntax, see
+\fBovs\-vswitchd.conf\fR(5).
+.PP
+At startup or upon receipt of a \fBSIGHUP\fR signal, \fBovs\-vswitchd\fR
+reads the configuration file. It sets up Open vSwitch datapaths and then
+performs switching on each bridge described in its configuration
+file. If a log file was specified on the command line, it will also
+be opened or reopened.
+.PP
+\fBovs\-vswitchd\fR virtual switches may be configured with any of the
+following features:
+.
+.IP \(bu
+L2 switching with MAC learning.
+.
+.IP \(bu
+NIC bonding with automatic fail-over and source MAC-based TX load
+balancing ("SLB").
+.
+.IP \(bu
+802.1Q VLAN support.
+.
+.IP \(bu
+Port mirroring, with optional VLAN tagging.
+.
+.IP \(bu
+NetFlow v5 flow logging.
+.
+.IP \(bu
+Connectivity to an external OpenFlow controller, such as NOX.
+.
+.PP
+Only a single instance of \fBovs\-vswitchd\fR is intended to run at a time.
+A single \fBovs\-vswitchd\fR can manage any number of virtual switches, up
+to the maximum number of supported Open vSwitch datapaths.
+.PP
+\fBovs\-vswitchd\fR does all the necessary management of Open vSwitch datapaths
+itself. Thus, external tools, such as \fBovs\-dpctl\fR(8), are not needed for
+managing datapaths in conjunction with \fBovs\-vswitchd\fR, and their use
+to modify datapaths when \fBovs\-vswitchd\fR is running can interfere with
+its operation. (\fBovs\-dpctl\fR may still be useful for diagnostics.)
+.PP
+An Open vSwitch datapath kernel module must be loaded for \fBovs\-vswitchd\fR
+to be useful. Please refer to the \fBINSTALL\fR file included in the
+Open vSwitch distribution for instructions on how to build and load
+the Open vSwitch kernel module.
+.PP
+.SH OPTIONS
+.IP "\fB--fake-proc-net\fR"
+Causes \fBovs\-vswitchd\fR to simulate some files in \fB/proc/net/vlan\fR
+and \fB/proc/net/bonding\fR that some legacy software expects to
+exist. This option should only be used if such legacy software is
+actually in use. It requires the \fBbrcompat_mod.ko\fR kernel module
+to be loaded.
+.
+.so lib/daemon.man
+.so lib/vlog.man
+.so lib/common.man
+.so lib/leak-checker.man
+.
+.SH "BUGS"
+.
+Only Open vSwitch kernel-based datapaths are currently supported. In the
+future, this restriction may be lifted.
+.PP
+Only Linux 2.6.\fIx\fR is currently supported.
+.
+.SH "SEE ALSO"
+.BR ovs\-appctl (8),
+.BR ovs\-vswitchd.conf (5),
+.BR ovs\-brcompatd (8),
+\fBINSTALL\fR in the Open vSwitch distribution.
diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c
new file mode 100644
index 000000000..9528ec5fd
--- /dev/null
+++ b/vswitchd/ovs-vswitchd.c
@@ -0,0 +1,255 @@
+/* Copyright (c) 2008, 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#include <config.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <getopt.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bridge.h"
+#include "cfg.h"
+#include "command-line.h"
+#include "compiler.h"
+#include "daemon.h"
+#include "fault.h"
+#include "leak-checker.h"
+#include "mgmt.h"
+#include "ovs-vswitchd.h"
+#include "poll-loop.h"
+#include "port.h"
+#include "proc-net-compat.h"
+#include "process.h"
+#include "signals.h"
+#include "svec.h"
+#include "timeval.h"
+#include "unixctl.h"
+#include "util.h"
+#include "vconn-ssl.h"
+#include "vconn.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_vswitchd
+
+static void parse_options(int argc, char *argv[]);
+static void usage(void) NO_RETURN;
+static void reload(struct unixctl_conn *, const char *args);
+
+/* Set when the main loop should call reconfigure() on its next pass. */
+static bool need_reconfigure;
+
+/* Connections that issued "vswitchd/reload" and have not been answered yet;
+ * 'conns' holds 'n_conns' entries. reconfigure() replies to them and
+ * empties the array. */
+static struct unixctl_conn **conns;
+static size_t n_conns;
+
+/* Entry point for ovs-vswitchd.
+ *
+ * Parses the command line, daemonizes, starts the unixctl server and
+ * registers the "vswitchd/reload" command, reads the configuration,
+ * initializes the management, bridge and port modules, and then loops
+ * forever running them. The loop has no exit, so the final "return 0" is
+ * never reached. */
+int
+main(int argc, char *argv[])
+{
+ struct unixctl_server *unixctl;
+ struct signal *sighup;
+ int retval;
+
+ set_program_name(argv[0]);
+ register_fault_handlers();
+ time_init();
+ vlog_init();
+ parse_options(argc, argv);
+ signal(SIGPIPE, SIG_IGN);
+ sighup = signal_register(SIGHUP);
+ process_init();
+
+ die_if_already_running();
+ daemonize();
+
+ retval = unixctl_server_create(NULL, &unixctl);
+ if (retval) {
+ ovs_fatal(retval, "could not listen for control connections");
+ }
+ unixctl_command_register("vswitchd/reload", reload);
+
+ /* NOTE(review): the result of this initial cfg_read() is not checked. */
+ cfg_read();
+ mgmt_init();
+ bridge_init();
+ port_init();
+ mgmt_reconfigure();
+
+ need_reconfigure = false;
+ for (;;) {
+ /* Reconfigure when requested by reload() or bridge_run() on an
+ * earlier pass, or when SIGHUP has arrived. */
+ if (need_reconfigure || signal_poll(sighup)) {
+ need_reconfigure = false;
+ vlog_reopen_log_file();
+ reconfigure();
+ }
+ mgmt_run();
+ if (bridge_run()) {
+ need_reconfigure = true;
+ }
+ unixctl_server_run(unixctl);
+
+ /* A reconfiguration requested during this pass must not wait for
+ * some other event, so make poll_block() return immediately. */
+ if (need_reconfigure) {
+ poll_immediate_wake();
+ }
+ signal_wait(sighup);
+ mgmt_wait();
+ bridge_wait();
+ unixctl_server_wait(unixctl);
+ poll_block();
+ }
+
+ return 0;
+}
+
+/* unixctl handler for "vswitchd/reload".  Queues 'conn' so that
+ * reconfigure() can reply to it once the reload has been carried out, and
+ * asks the main loop to reconfigure.  'args' is ignored. */
+static void
+reload(struct unixctl_conn *conn, const char *args UNUSED)
+{
+    size_t n_waiting = n_conns + 1;
+
+    conns = xrealloc(conns, n_waiting * sizeof *conns);
+    conns[n_waiting - 1] = conn;
+    n_conns = n_waiting;
+
+    need_reconfigure = true;
+}
+
+/* Re-reads the configuration file and reconfigures the bridge, management
+ * and port modules from it, in that order.  Afterwards replies with code
+ * 202 to every connection queued by reload() and empties the queue. */
+void
+reconfigure(void)
+{
+    size_t idx = 0;
+
+    cfg_read();
+    bridge_reconfigure();
+    mgmt_reconfigure();
+    port_reconfigure();
+
+    /* Answer pending reload requests in the order they arrived. */
+    while (idx < n_conns) {
+        unixctl_command_reply(conns[idx], 202, NULL);
+        idx++;
+    }
+
+    free(conns);
+    conns = NULL;
+    n_conns = 0;
+}
+
+/* Parses the command line in 'argc' and 'argv'.
+ *
+ * Handles --fake-proc-net and, when built with OpenSSL, --peer-ca-cert
+ * directly, and the daemon, vlog, SSL and leak-checker options through their
+ * handler macros.  The single required non-option argument is registered as
+ * the configuration file.  Exits the process on --help, --version, an
+ * unrecognized option, a wrong number of arguments, or a failure in
+ * /proc/net compatibility setup or configuration file registration. */
+static void
+parse_options(int argc, char *argv[])
+{
+    enum {
+        OPT_PEER_CA_CERT = UCHAR_MAX + 1,
+        OPT_FAKE_PROC_NET,
+        VLOG_OPTION_ENUMS,
+        LEAK_CHECKER_OPTION_ENUMS
+    };
+    static struct option long_options[] = {
+        {"help", no_argument, 0, 'h'},
+        {"version", no_argument, 0, 'V'},
+        {"fake-proc-net", no_argument, 0, OPT_FAKE_PROC_NET},
+        DAEMON_LONG_OPTIONS,
+        VLOG_LONG_OPTIONS,
+        LEAK_CHECKER_LONG_OPTIONS,
+#ifdef HAVE_OPENSSL
+        VCONN_SSL_LONG_OPTIONS
+        {"peer-ca-cert", required_argument, 0, OPT_PEER_CA_CERT},
+#endif
+        {0, 0, 0, 0},
+    };
+    char *short_options = long_options_to_short_options(long_options);
+    int error;
+    int c;
+
+    while ((c = getopt_long(argc, argv, short_options, long_options,
+                            NULL)) != -1) {
+        switch (c) {
+        case 'H':
+        case 'h':
+            usage();
+
+        case 'V':
+            OVS_PRINT_VERSION(OFP_VERSION, OFP_VERSION);
+            exit(EXIT_SUCCESS);
+
+        case OPT_FAKE_PROC_NET:
+            error = proc_net_compat_init();
+            if (error) {
+                ovs_fatal(error, "failed to initialize /proc/net "
+                          "compatibility");
+            }
+            break;
+
+        VLOG_OPTION_HANDLERS
+        DAEMON_OPTION_HANDLERS
+        VCONN_SSL_OPTION_HANDLERS
+        LEAK_CHECKER_OPTION_HANDLERS
+
+#ifdef HAVE_OPENSSL
+        case OPT_PEER_CA_CERT:
+            vconn_ssl_set_peer_ca_cert_file(optarg);
+            break;
+#endif
+
+        case '?':
+            exit(EXIT_FAILURE);
+
+        default:
+            abort();
+        }
+    }
+    free(short_options);
+
+    /* Exactly one argument, the configuration file, must follow the
+     * options. */
+    if (argc - optind != 1) {
+        ovs_fatal(0, "config file is only non-option argument; "
+                  "use --help for usage");
+    }
+
+    error = cfg_set_file(argv[optind]);
+    if (error) {
+        ovs_fatal(error, "failed to add configuration file \"%s\"",
+                  argv[optind]);
+    }
+}
+
+/* Prints a usage message for ovs-vswitchd to stdout, including the daemon,
+ * vlog and leak-checker option summaries, and exits successfully. */
+static void
+usage(void)
+{
+ printf("%s: virtual switch daemon\n"
+ "usage: %s [OPTIONS] CONFIG\n"
+ "CONFIG is a configuration file in ovs-vswitchd.conf(5) format.\n",
+ program_name, program_name);
+ daemon_usage();
+ vlog_usage();
+ printf("\nLegacy compatibility options:\n"
+ " --fake-proc-net simulate some files in /proc/net\n"
+ "\nOther options:\n"
+ " -h, --help display this help message\n"
+ " -V, --version display version information\n");
+ leak_checker_usage();
+ exit(EXIT_SUCCESS);
+}
diff --git a/vswitchd/ovs-vswitchd.conf.5.in b/vswitchd/ovs-vswitchd.conf.5.in
new file mode 100644
index 000000000..898721845
--- /dev/null
+++ b/vswitchd/ovs-vswitchd.conf.5.in
@@ -0,0 +1,642 @@
+.\" -*- nroff -*-
+.de TQ
+. br
+. ns
+. TP "\\$1"
+..
+.de IQ
+. br
+. ns
+. IP "\\$1"
+..
+.de ST
+. PP
+. RS -0.15in
+. I "\\$1"
+. RE
+. PP
+..
+.TH ovs\-vswitchd.conf 5 "April 2009" "Open vSwitch" "Open vSwitch Manual"
+.
+.SH NAME
+ovs\-vswitchd.conf \- configuration file for \fBovs\-vswitchd\fR
+.
+.SH DESCRIPTION
+This manual page describes the syntax for the configuration file used
+by \fBovs\-vswitchd\fR(8), the virtual switch daemon.
+.PP
+The configuration file is based on key-value pairs, which are given
+one per line in the form \fIkey\fB=\fIvalue\fR. Each \fIkey\fR
+consists of one or more parts separated by dots,
+e.g. \fIpart1\fB.\fIpart2\fB.\fIpart3\fR. Each \fIpart\fR may consist
+only of the English letters, digits, and the special characters
+\fB_-@$:+\fR. White space within \fIvalue\fR and at the beginning of a
+line is significant, but is otherwise ignored.
+.PP
+If a single key is specified more than once, that key has multiple
+values, one value for each time the key is specified. The ordering of
+key-value pairs, and the ordering of multiple values for a single key,
+within a configuration file is not significant.
+.PP
+Blank lines, lines that consist only of white space, and lines that
+begin with \fB#\fR (optionally preceded by white space) are ignored.
+Keep in mind that programs that modify the configuration file, such as
+\fBovs\-brcompatd\fR and \fBovs\-cfg\-mod\fR, may alter the order of
+elements and
+strip comments and blank lines.
+.PP
+The following subsections describe how key-value pairs are used to
+configure \fBovs\-vswitchd\fR.
+.SS "Bridge Configuration"
+A bridge (switch) with a given \fIname\fR is configured by specifying
+the names of its network devices as values for key
+\fBbridge.\fIname\fB.port\fR. (The specified \fIname\fR may not begin
+with \fBdp\fR or \fBnl:\fR followed by a digit.)
+.PP
+The names given on \fBbridge.\fIname\fB.port\fR must be the names of
+existing network devices, except for ``internal ports.'' An internal
+port is a simulated network device that receives traffic only
+through the virtual switch and switches any traffic sent to it through
+the virtual switch. An internal port may be configured with an IP address,
+etc. using the usual system tools (e.g. \fBifconfig\fR, \fBip\fR). To
+designate network device \fInetdev\fR as an internal port, add
+\fBiface.\fInetdev\fB.internal=true\fR to the configuration file.
+\fBovs\-vswitchd\fR will honor this configuration setting by automatically
+creating the named internal port.
+.PP
+A bridge with a given \fIname\fR always has an internal port with the
+same \fIname\fR, called the ``local port.'' This network device may
+be included
+in the bridge, by specifying it as one of the values for key
+\fBbridge.\fIname\fB.port\fR, or it may be omitted. If it is
+included, then its MAC address is by default the lowest-numbered MAC
+address among the other bridge ports, ignoring other internal ports
+and bridge ports that are
+used as port mirroring destinations (see \fBPort Mirroring\fR, below). To
+use a specific MAC address instead, set \fBbridge.\fIname\fB.mac\fR to
+a MAC address in the format
+\fIxx\fB:\fIxx\fB:\fIxx\fB:\fIxx\fB:\fIxx\fB:\fIxx\fR, where each
+\fIx\fR is a hex digit. If no valid MAC address can be determined
+either of these ways, then a MAC address is randomly generated.
+.PP
+The following syntax defines a bridge named \fBmybr\fR, configured
+with network devices \fBeth0\fR, \fBeth1\fR, and \fBeth2\fR:
+.RS
+.nf
+
+bridge.mybr.port=eth0
+bridge.mybr.port=eth1
+bridge.mybr.port=eth2
+
+.fi
+.RE
+.SS "802.1Q VLAN support"
+A bridge port may be configured either as a trunk port or as belonging
+to a single, untagged VLAN. These two options are mutually exclusive,
+and a port must be configured in one way or the other.
+.ST "Trunk Ports"
+By default, bridge ports are trunk ports that carry all VLANs. To
+limit the VLANs that a trunk port carries, define
+\fBvlan.\fIport\fB.trunks\fR to one or more integers between 0 and
+4095 designating VLANs. Only frames that have an 802.1Q header with
+one of the listed VLANs are accepted on a trunk port. If 0 is
+included in the list, then frames without an 802.1Q header are also
+accepted. Other frames are discarded.
+.PP
+The following syntax makes network device \fBeth0\fR a trunk port that
+carries VLANs 1, 2, and 3:
+.PP
+.RS
+.nf
+
+vlan.eth0.trunks=1
+vlan.eth0.trunks=2
+vlan.eth0.trunks=3
+
+.fi
+.RE
+.ST "Untagged VLAN Ports"
+A bridge port may be configured with an implicit, untagged VLAN.
+Define key
+\fBvlan.\fIport\fB.tag\fR to an integer value \fIvid\fR between 0 and
+4095, inclusive, to designate the named \fIport\fR as a member
+of 802.1Q VLAN \fIvid\fR. When \fIport\fR is assigned a VLAN tag this
+way, frames arriving on trunk ports will be forwarded to \fIport\fR
+only if they are tagged with VLAN \fIvid\fR, and frames arriving on
+other VLAN ports will be forwarded to \fIport\fR only if their
+\fIvid\fR values are equal. Frames forwarded to \fIport\fR will not
+have an 802.1Q header.
+.PP
+When \fIvid\fR is 0, frames arriving on trunk ports without an 802.1Q
+VLAN header will also be forwarded to \fIport\fR.
+.PP
+When a frame with an 802.1Q header that indicates a nonzero VLAN is
+received on an implicit VLAN port, it is discarded.
+.PP
+The following syntax makes network device \fBeth0\fR a member of VLAN
+101:
+.PP
+.RS
+.nf
+
+vlan.eth0.tag=101
+
+.fi
+.RE
+.SS "Network Device Bonding"
+Bonding allows multiple ``slave'' network devices to be treated as if
+they were a single virtual ``bonded'' network device. It is useful for
+load balancing and fail-over.
+.PP
+\fBovs\-vswitchd\fR supports ``source load balancing'' (SLB) bonding, which
+assigns flows to slaves based on source MAC address, with periodic
+rebalancing as traffic patterns change. This form of bonding does not
+require 802.3ad or other special support from the upstream switch to
+which the slave devices are connected.
+.PP
+To configure bonding, create a virtual bonding device by specifying
+the slave network device names as values for
+\fBbonding.\fIname\fB.slave\fR, then specify \fIname\fR as a bridge
+port. The chosen \fIname\fR should not be the name of any real
+network device on the host system.
+.PP
+By default, bonding interfaces are enabled or disabled immediately
+when a carrier is detected or dropped on the underlying network
+device. To insert a delay when carrier comes up or goes down before
+enabling or disabling an interface, set the value of
+\fBbonding.\fIname\fB.updelay\fR or
+\fBbonding.\fIname\fB.downdelay\fR, respectively, to a positive
+integer, interpreted in milliseconds.
+.PP
+The following syntax bonds \fBeth0\fR and \fBeth1\fR into a bonding
+device named \fBbond0\fR, which is added to bridge \fBmybr\fR along
+with physical network devices \fBeth2\fR and \fBeth3\fR:
+.PP
+.RS
+.nf
+
+bridge.mybr.port=bond0
+bridge.mybr.port=eth2
+bridge.mybr.port=eth3
+
+bonding.bond0.slave=eth0
+bonding.bond0.slave=eth1
+
+.fi
+.RE
+.SS "Port Mirroring (SPAN and RSPAN)"
+\fBovs\-vswitchd\fR may be configured to send selected frames to special
+``mirrored'' ports, in addition to their normal destinations. Mirroring
+traffic may also be referred to as SPAN or RSPAN, depending on the
+mechanism used for delivery.
+.PP
+Up to 32 instances of port mirroring may be configured on a given
+bridge. Each must be given a name that is unique within the bridge.
+The keys associated with port mirroring instance \fIpmname\fR for
+bridge \fIbrname\fR begin with \fBmirror.\fIbrname\fB.\fIpmname\fR.
+.PP
+The selection of the frames to mirror and the form in which they
+should be output are configured separately for each port mirroring
+instance, through subsections of
+\fBmirror.\fIbrname\fB.\fIpmname\fR named \fBselect\fR and
+\fBoutput\fR, respectively.
+.ST "Selecting Frames to Mirror"
+The values for the following keys, if specified, limit the frames that
+are chosen for mirroring. If none of these keys is specified, then
+all frames received by the bridge are mirrored. If more than one of
+these keys is specified, then a frame must meet all specified criteria
+to be mirrored.
+.TP
+\fBmirror.\fIbrname\fB.\fIpmname\fB.select.src-port=\fIport\fR
+.TQ
+\fBmirror.\fIbrname\fB.\fIpmname\fB.select.dst-port=\fIport\fR
+.TQ
+\fBmirror.\fIbrname\fB.\fIpmname\fB.select.port=\fIport\fR
+Frames received on \fIport\fR, output to \fIport\fR, or either received
+on or output to \fIport\fR, respectively. \fIport\fR must be part of
+the bridge \fIbrname\fR; that is, it must be listed on
+\fBbridge.\fIbrname\fB.port\fR.
+.TP
+\fBmirror.\fIbrname\fB.\fIpmname\fB.select.vlan=\fIvid\fR
+.
+\fIvid\fR must be an integer between 0 and 4095, inclusive. A nonzero
+\fIvid\fR selects frames that belong to VLAN \fIvid\fR, that is,
+frames that arrived on a trunk port tagged with VLAN \fIvid\fR or on a
+port that is configured as part of VLAN \fIvid\fR (see \fB802.1Q VLAN
+support\fR, above). A \fIvid\fR of zero selects frames that do not
+belong to a VLAN, that is, frames that arrived on a trunk port without
+a VLAN tag or tagged with VLAN 0.
+.ST "Mirror Output"
+The values of the following keys determine how frames selected for
+mirroring are output. Only one of the keys may be specified.
+.TP
+\fBmirror.\fIbrname\fB.\fIpmname\fB.output.port=\fIport\fR
+.
+Causes the selected frames to be sent out \fIport\fR, which must be
+part of the bridge \fIbrname\fR; that is, it must be listed on
+\fBbridge.\fIbrname\fB.port\fR.
+.IP
+Specifying a \fIport\fR in this way reserves that port exclusively for
+mirroring. No frames other than those selected for mirroring will be
+forwarded to \fIport\fR, and any frames received on \fIport\fR will be
+discarded. This type of mirroring may be referred to as SPAN.
+.TP
+\fBmirror.\fIbrname\fB.\fIpmname\fB.output.vlan=\fIvid\fR
+.
+Causes the selected frames to be sent on the VLAN numbered \fIvid\fR,
+which must be an integer between 0 and 4095, inclusive. The frames
+will be sent out all ports that trunk VLAN \fIvid\fR, as well as any
+ports with implicit VLAN \fIvid\fR. When a mirrored frame is sent out
+a trunk port, the frame's VLAN tag will be set to \fIvid\fR, replacing
+any existing tag; when it is sent out an implicit VLAN port, the frame
+will not be tagged. This type of mirroring may be referred to as
+RSPAN.
+.ST "Example"
+The following \fBovs\-vswitchd\fR configuration copies all frames received
+on \fBeth1\fR or \fBeth2\fR to \fBeth3\fR.
+.PP
+.RS
+.nf
+
+bridge.mybr.port=eth1
+bridge.mybr.port=eth2
+bridge.mybr.port=eth3
+
+mirror.mybr.a.select.src-port=eth1
+mirror.mybr.a.select.src-port=eth2
+mirror.mybr.a.output.port=eth3
+
+.fi
+.RE
+.SS "Port Rate-Limiting"
+Traffic policing and shaping are configured on physical ports. Policing
+defines a hard limit at which traffic that exceeds the specified rate is
+dropped. Shaping uses queues to delay packets so that egress traffic
+leaves at the specified rate.
+
+.ST "Ingress Policing"
+The rate at which traffic is allowed to enter through a particular
+physical port can be configured with ingress policing. The rate is
+specified in kilobits (1000 bits) per second with a maximum burst size
+specified in kilobits (1000 bits). The burst size should be at least
+the size of the port's MTU.
+
+A port may be configured to enforce ingress policing by defining the
+key \fBport.\fIname\fB.ingress.policing-rate\fR with an integer
+indicating the rate. The port \fIname\fR will only allow traffic to be
+received at the rate specified in kilobits per second. If the rate is zero
+or the key is not defined, then ingress policing is disabled.
+
+If ingress policing is enabled, then the burst size may be set by defining
+the key \fBport.\fIname\fB.ingress.policing-burst\fR with an integer
+indicating the burst size in kilobits. If the key is not supplied or is
+zero, then the default burst is 10 kilobits.
+
+.PP
+The following syntax limits port \fBeth1\fR to receiving traffic at
+\fB512\fR kilobits per second with a burst of \fB20\fR kilobits:
+.PP
+.RS
+.nf
+
+port.eth1.ingress.policing-rate=512
+port.eth1.ingress.policing-burst=20
+
+.fi
+.RE
+.SS "NetFlow v5 Flow Logging"
+NetFlow is a protocol that exports a number of details about terminating
+IP flows, such as the principals involved and duration. A bridge may be
+configured to send NetFlow v5 records to NetFlow collectors when flows
+end. To enable, define the key \fBnetflow.\fIbridge\fB.host\fR for each
+collector in the form \fIhost\fB:\fIport\fR. Records from \fIbridge\fR
+will be sent to each \fIhost\fR on UDP \fIport\fR.
+
+The NetFlow messages will use the datapath index for the engine type and id.
+This can be overridden with the \fBnetflow.\fIbridge\fB.engine-type\fR and
+\fBnetflow.\fIbridge\fB.engine-id\fR, respectively. Each takes a value
+between 0 and 255, inclusive.
+
+Many NetFlow collectors do not expect multiple virtual switches to be
+sending messages from the same host, and they do not store the engine
+information which could be used to disambiguate the traffic. To prevent
+flows from multiple switches appearing as if they came on the same interface,
+add \fBnetflow.\fIbridge\fB.add-id-to-iface=true\fR to the configuration
+file. This will place the least significant 7 bits of the engine id
+into the most significant bits of the ingress and egress interface fields
+of flow records. By default, this behavior is disabled.
+
+The following syntax sends NetFlow records for \fBmybr\fR to the NetFlow
+collector \fBnflow.example.com\fR on UDP port \fB9995\fR:
+.PP
+.RS
+.nf
+
+netflow.mybr.host=nflow.example.com:9995
+
+.fi
+.RE
+.SS "Remote Management"
+An \fBovs\-vswitchd\fR instance may be remotely managed by a controller that
+supports the OpenFlow Management Protocol, such as NOX. This
+functionality is enabled by setting the key \fBmgmt.controller\fR to one
+of the following values:
+.
+.TP
+\fBssl:\fIhost\fR[\fB:\fIport\fR]
+The specified SSL \fIport\fR (default: 6633) on the given remote
+\fIhost\fR. SSL must be configured when this form is used (see \fBSSL
+Configuration\fR, below).
+.
+.TP
+\fBtcp:\fIhost\fR[\fB:\fIport\fR]
+The specified TCP \fIport\fR (default: 6633) on the given remote
+\fIhost\fR.
+.PP
+The maximum time between attempts to connect to the controller may be
+specified in integral seconds with the \fBmgmt.max-backoff\fR key. The
+default maximum backoff is 15 seconds, and the minimum value is 1
+second.
+
+An inactivity probe may be configured with the \fBmgmt.inactivity-probe\fR
+key. If \fBovs\-vswitchd\fR does not communicate with the controller for the
+specified number of seconds, it will send a probe. If a response is not
+received for an additional amount of that time, \fBovs\-vswitchd\fR assumes
+the connection has been broken and attempts to reconnect. The default
+is 15 seconds, and the minimum value is 5 seconds.
+
+A management id may be specified with the \fBmgmt.id\fR key. It takes
+an id in the form of exactly 12 hexadecimal digits. If one is not
+specified, a random id is generated each time \fBovs\-vswitchd\fR is started.
+.SS "OpenFlow Controller Connectivity"
+\fBovs\-vswitchd\fR can perform all configured bridging and switching
+locally, or it can be configured to connect a given bridge to an
+external OpenFlow controller, such as NOX. Its behavior depends on
+the \fBbridge.\fIname\fB.controller\fR setting:
+.
+.TP
+\fI\[la]unset\[ra]\fR
+When the key is not set, the behavior depends on whether remote
+management is configured. If management is configured, then the switch
+will connect to the controller specified on \fBmgmt.controller\fR. If
+management is not configured, the switch will perform all configured
+bridging and switching locally.
+.
+.TP
+\fI\[la]empty\[ra]\fR
+Setting an empty string value disables controller connectivity. The
+switch will perform all configured bridging and switching locally.
+.
+.TP
+\fBdiscover\fR
+Use controller discovery to find the local OpenFlow controller.
+Refer to \fBsecchan\fR(8) for information on how to configure a DHCP
+server to support controller discovery. The following additional
+options control the discovery process:
+.
+.RS
+.TP
+\fBbridge.\fIname\fB.controller.accept-regex=\fIregex\fR
+A POSIX extended regular expression against which the discovered
+controller location is validated. Only controllers whose names match
+the regular expression will be accepted.
+.IP
+The default regular expression is \fBssl:.*\fR, meaning that only SSL
+controller connections will be accepted, when SSL is configured (see
+\fBSSL Configuration\fR), and \fB.*\fR otherwise, meaning that any
+controller will be accepted.
+.IP
+The regular expression is implicitly anchored at the beginning of the
+controller location string, as if it begins with \fB^\fR.
+.TP
+\fBbridge.\fIname\fB.controller.update-resolv.conf=\fBtrue\fR|\fBfalse\fR
+By default, or if this is set to \fBtrue\fR, \fBovs\-vswitchd\fR overwrites
+the system's \fB/etc/resolv.conf\fR with domain information and DNS
+servers obtained via DHCP. If this setting is \fBfalse\fR,
+\fBovs\-vswitchd\fR will not modify \fB/etc/resolv.conf\fR.
+.IP
+\fBovs\-vswitchd\fR will only modify \fBresolv.conf\fR if the DHCP response
+that it receives specifies one or more DNS servers.
+.RE
+.
+.TP
+\fBssl:\fIhost\fR[\fB:\fIport\fR]
+The specified SSL \fIport\fR (default: 6633) on the given remote
+\fIhost\fR. SSL must be configured when this form is used (see \fBSSL
+Configuration\fR, below).
+.
+.TP
+\fBtcp:\fIhost\fR[\fB:\fIport\fR]
+The specified TCP \fIport\fR (default: 6633) on the given remote
+\fIhost\fR.
+.
+.TP
+\fBunix:\fIfile\fR
+The Unix domain server socket named \fIfile\fR.
+.PP
+The datapath ID used by the bridge to identify itself to the remote
+controller may be specified as \fBbridge.\fIname\fB.datapath-id\fR,
+in the form of exactly 12 hexadecimal digits. If the datapath ID
+is not specified, then it defaults to the bridge's MAC address (see
+\fBBridge Configuration\fR, above, for information on how the bridge's
+MAC address is chosen).
+.ST "Local Port Network Configuration"
+When an external controller is configured, but controller discovery is
+not in use, the following additional settings are honored:
+.TP
+\fBbridge.\fIname\fB.controller.in-band=\fBtrue\fR|\fBfalse\fR
+By default, or if this is set to \fBtrue\fR, \fBovs\-vswitchd\fR connects
+to the controller in-band. If this is set to \fBfalse\fR,
+\fBovs\-vswitchd\fR connects to the controller out-of-band. Refer to
+\fBsecchan\fR(8) for a description of in-band and out-of-band control.
+.IP "\fBbridge.\fIname\fB.controller.ip=\fIip\fR"
+If specified, the IP address to configure on the bridge's local port.
+.IP "\fBbridge.\fIname\fB.controller.netmask=\fInetmask\fR"
+When an IP is specified, the corresponding netmask. The default is
+255.255.255.0 for a Class C IP address, 255.255.0.0 for Class B, and
+255.0.0.0 for Class A.
+.IP "\fBbridge.\fIname\fB.controller.gateway=\fIip\fR"
+When an IP is specified, the corresponding IP gateway. There is no
+default gateway.
+.ST "Controller Failure Settings"
+The following additional settings take effect when any remote
+controller is configured:
+.IP "\fBbridge.\fIname\fB.controller.inactivity-probe=\fIsecs\fR"
+This optional setting may be set to \fIsecs\fR, a number of seconds.
+The minimum value of \fIsecs\fR is 5 seconds. The default is taken
+from \fBmgmt.inactivity-probe\fR (see above).
+.IP
+When the virtual switch is connected to the controller, it waits for a
+message to be received from the controller for \fIsecs\fR seconds
+before it sends an inactivity probe to the controller. After sending
+the inactivity probe, if no response is received for an additional
+\fIsecs\fR seconds, the secure channel assumes that the connection has
+been broken and attempts to reconnect.
+.IP
+Changing the inactivity probe interval also changes the interval
+before entering standalone mode (see below).
+.IP "\fBbridge.\fIname\fB.controller.fail-mode=\fBstandalone\fR|\fBsecure\fR"
+.IQ "\fBmgmt.fail-mode=standalone\fR|\fBsecure\fR"
+When a controller is configured, it is, ordinarily, responsible for
+setting up all flows on the virtual switch. Thus, if the connection to
+the controller fails, no new network connections can be set up. If
+the connection to the controller stays down long enough, no packets
+can pass through the switch at all.
+.IP
+The first of these that is set takes effect.
+If the value is \fBstandalone\fR, \fBovs\-vswitchd\fR will take over
+responsibility for setting up
+flows when no message has been received from the controller for three
+times the inactivity probe interval (see above). In this mode,
+\fBovs\-vswitchd\fR causes the datapath to act like an ordinary
+MAC-learning switch. \fBovs\-vswitchd\fR will continue to retry connecting
+to the controller in the background and, when the connection succeeds,
+it discontinues its standalone behavior.
+.IP
+If this option is set to \fBsecure\fR, or if neither of these settings
+is set, \fBovs\-vswitchd\fR will not set up flows on its own when the
+controller connection fails.
+.IP "\fBbridge.\fIname\fB.controller.max-backoff=\fIsecs\fR"
+Sets the maximum time between attempts to connect to the controller to
+\fIsecs\fR, which must be at least 1. The actual interval between
+connection attempts starts at 1 second and doubles on each failing
+attempt until it reaches the maximum. The default maximum backoff
+time is taken from \fBmgmt.max-backoff\fR.
+.ST "Controller Rate-Limiting"
+These settings configure how the virtual switch applies a ``token
+bucket'' to limit the rate at which packets in unknown flows are
+forwarded to the OpenFlow controller for flow-setup processing. This
+feature prevents a single bridge from overwhelming a controller.
+.IP "\fBbridge.\fIname\fB.controller.rate-limit=\fIrate\fR"
+.IQ "\fBmgmt.rate-limit=\fIrate\fR"
+Limits the maximum rate at which packets will be forwarded to the
+OpenFlow controller to \fIrate\fR packets per second. A rate specified
+explicitly for \fIname\fR overrides a value configured using the
+\fBmgmt.rate-limit\fR key.
+.IP
+If neither one of these settings is set, then the bridge does not
+limit the rate at which packets are forwarded to the controller.
+.IP "\fBbridge.\fIname\fB.controller.burst-limit=\fIburst\fR"
+.IQ "\fBmgmt.burst-limit=\fIburst\fR"
+Sets the maximum number of unused packet credits that the bridge will
+allow to accumulate during the time in which no packets are being
+forwarded to the OpenFlow controller to \fIburst\fR (measured in
+packets). The default \fIburst\fR is one-quarter of the \fIrate\fR
+specified in the rate-limit setting.
+.IP
+A burst specified explicitly for \fIname\fR overrides a value configured
+using the \fBmgmt.burst-limit\fR key. This option takes effect only
+when a rate-limit is specified.
+.ST "Remote Command Execution Settings"
+These settings configure the commands that remote OpenFlow connections
+are allowed to invoke using (e.g.) \fBovs\-ofctl execute\fR. To be
+permitted, a command name must be whitelisted and must not be
+blacklisted. When the whitelist and blacklist permit a command name,
+\fBovs\-vswitchd\fR looks for a program with the same name as the command
+in the commands directory (see below). Other directories are not
+searched.
+.IP "\fBbridge.\fIname\fB.controller.commands.acl=\fIglob\fR"
+Whitelists commands whose names match shell glob pattern \fIglob\fR,
+allowing those commands to be invoked by the remote controller.
+.IP
+By default, no commands are whitelisted, so this setting is mandatory
+if any remote command execution is to be allowed.
+.IP "\fBbridge.\fIname\fB.controller.commands.acl=\fB!\fR\fIglob\fR"
+Blacklists commands whose names match shell glob pattern \fIglob\fR,
+prohibiting those commands from being invoked by the remote
+controller. Command names that include characters other than upper-
+and lower-case English letters, digits, and the underscore and hyphen
+characters are blacklisted unconditionally.
+.IP "\fBbridge.\fIname\fB.controller.commands.dir=\fIdirectory\fR"
+Sets the directory searched for remote command execution to
+\fIdirectory\fR. The default directory is
+\fB@pkgdatadir@/commands\fR.
+.SS "SSL Configuration"
+When \fBovs\-vswitchd\fR is configured to connect over SSL for management or
+for controller connectivity, the following settings are required:
+.TP
+\fBssl.private-key=\fIprivkey.pem\fR
+Specifies a PEM file containing the private key used as the virtual
+switch's identity for SSL connections to the controller.
+.TP
+\fBssl.certificate=\fIcert.pem\fR
+Specifies a PEM file containing a certificate, signed by the
+certificate authority (CA) used by the controller and manager, that
+certifies the virtual switch's private key, identifying a trustworthy
+switch.
+.TP
+\fBssl.ca-cert=\fIcacert.pem\fR
+Specifies a PEM file containing the CA certificate used to verify that
+the virtual switch is connected to a trustworthy controller.
+.PP
+These files are read only once, at \fBovs\-vswitchd\fR startup time. If
+their contents change, \fBovs\-vswitchd\fR must be killed and restarted.
+.PP
+These SSL settings apply to all SSL connections made by the virtual
+switch.
+.ST "CA Certificate Bootstrap"
+Ordinarily, all of the files named in the SSL configuration must exist
+when \fBovs\-vswitchd\fR starts. However, if \fBssl.bootstrap-ca-cert\fR
+is set to \fBtrue\fR, then \fBovs\-vswitchd\fR will attempt to obtain the
+CA certificate from the controller on its first SSL connection and
+save it to the named PEM file. If it is successful, it will
+immediately drop the connection and reconnect, and from then on all
+SSL connections must be authenticated by a certificate signed by the
+CA certificate thus obtained.
+.PP
+\fBThis option exposes the SSL connection to a man-in-the-middle
+attack while obtaining the initial CA certificate\fR, but it may be useful
+for bootstrapping.
+.PP
+This option is only useful if the controller sends its CA certificate
+as part of the SSL certificate chain. The SSL protocol does not
+require the controller to send the CA certificate, but
+\fBcontroller\fR(8) can be configured to do so with the
+\fB\-\-peer\-ca\-cert\fR option.
+.SS "OpenFlow Management Connections"
+By default, each bridge \fIname\fR listens for OpenFlow management
+connections on a Unix domain socket named
+\fB@RUNDIR@/\fIname\fB.mgmt\fR. This socket can be used to perform
+local OpenFlow monitoring and administration, e.g., \fBovs\-ofctl dump-flows
+unix:@RUNDIR@/\fIname\fB.mgmt\fR to display the flows currently set up
+in bridge \fIname\fR.
+.PP
+If \fBbridge.\fIname\fB.openflow.listeners\fR is set to one or more
+values, \fBovs\-vswitchd\fR instead listens on the specified connection
+methods. Acceptable connection methods include:
+.RS
+.IP "\fBpunix:\fIfile\fR"
+Listens for connections on the Unix domain server socket named \fIfile\fR.
+.IP "\fBpssl:\fR[\fIport\fR]"
+Listens for SSL connections on \fIport\fR (default: 6633). SSL must
+be configured when this form is used (see \fBSSL Configuration\fR,
+above).
+.IP "\fBptcp:\fR[\fIport\fR]"
+Listens for TCP connections on \fIport\fR (default: 6633).
+.RE
+To entirely disable listening for management connections, set
+\fBbridge.\fIname\fB.openflow.listeners\fR to the single value
+\fBnone\fR.
+
+.SS "OpenFlow Controller Connection Snooping"
+By default, each bridge \fIname\fR listens for OpenFlow controller
+connection snooping connections on a Unix domain socket named
+\fB@RUNDIR@/\fIname\fB.snoop\fR. A client that connects to this
+socket, e.g., \fBovs\-ofctl monitor unix:@RUNDIR@/\fIname\fB.snoop\fR, will
+receive a copy of every OpenFlow message sent by the switch to the
+controller, or vice versa, on the primary OpenFlow controller
+connection.
+.PP
+If \fBbridge.\fIname\fB.openflow.snoops\fR is set to one or more
+values, \fBovs\-vswitchd\fR instead listens on the specified connection
+methods. The acceptable connection methods are the same as for
+OpenFlow management connections (see above).
+.PP
+To entirely disable controller connection snooping, set
+\fBbridge.\fIname\fB.openflow.snoops\fR to the single value
+\fBnone\fR.
+.SH "SEE ALSO"
+.BR ovs\-brcompatd (8),
+.BR ovs\-cfg\-mod (8),
+.BR ovs\-vswitchd (8)
diff --git a/vswitchd/ovs-vswitchd.h b/vswitchd/ovs-vswitchd.h
new file mode 100644
index 000000000..f292d6824
--- /dev/null
+++ b/vswitchd/ovs-vswitchd.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#ifndef VSWITCHD_H
+#define VSWITCHD_H 1
+
+void reconfigure(void);
+
+#endif /* ovs-vswitchd.h */
diff --git a/vswitchd/port.c b/vswitchd/port.c
new file mode 100644
index 000000000..f6348f353
--- /dev/null
+++ b/vswitchd/port.c
@@ -0,0 +1,68 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ *
+ */
+
+#include <config.h>
+
+#include "bridge.h"
+#include "cfg.h"
+#include "netdev.h"
+#include "ovs-vswitchd.h"
+#include "port.h"
+#include "svec.h"
+
+#define THIS_MODULE VLM_port
+#include "vlog.h"
+
+/* Applies the ingress policing configuration for the port named 'port_name'.
+ *
+ * Looks up the "port.<name>.ingress.policing-rate" and
+ * "port.<name>.ingress.policing-burst" configuration keys with cfg_get_int()
+ * and hands both values, along with the port name, to
+ * netdev_nodev_set_policing().
+ *
+ * Returns whatever netdev_nodev_set_policing() returns. */
+static int
+set_ingress_policing(const char *port_name)
+{
+    int rate_kbits;
+    int burst_kbits;
+
+    rate_kbits = cfg_get_int(0, "port.%s.ingress.policing-rate", port_name);
+    burst_kbits = cfg_get_int(0, "port.%s.ingress.policing-burst", port_name);
+
+    return netdev_nodev_set_policing(port_name, rate_kbits, burst_kbits);
+}
+
+/* Performs the initial per-port setup by running port_reconfigure() once. */
+void
+port_init(void)
+{
+ port_reconfigure();
+}
+
+/* Re-applies per-port configuration.
+ *
+ * Collects the interface names reported by bridge_get_ifaces() and calls
+ * set_ingress_policing() on each of them.  The result of each individual call
+ * is not checked, so a failure on one interface does not prevent the
+ * remaining interfaces from being configured. */
+void
+port_reconfigure(void)
+{
+    struct svec ports;
+    size_t i;
+
+    svec_init(&ports);
+    bridge_get_ifaces(&ports);
+    for (i = 0; i < ports.n; i++) {
+        set_ingress_policing(ports.names[i]);
+    }
+
+    /* Release the name list built above; previously it was leaked on every
+     * reconfiguration. */
+    svec_destroy(&ports);
+}
diff --git a/vswitchd/port.h b/vswitchd/port.h
new file mode 100644
index 000000000..55c2d7bcd
--- /dev/null
+++ b/vswitchd/port.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#ifndef VSWITCHD_PORT_H
+#define VSWITCHD_PORT_H 1
+
+/* Applies the port configuration once at startup (a single call to
+ * port_reconfigure()). */
+void port_init(void);
+/* Applies the configured ingress policing settings to every interface
+ * reported by bridge_get_ifaces(). */
+void port_reconfigure(void);
+
+#endif /* port.h */
diff --git a/vswitchd/proc-net-compat.c b/vswitchd/proc-net-compat.c
new file mode 100644
index 000000000..3f5cf44ad
--- /dev/null
+++ b/vswitchd/proc-net-compat.c
@@ -0,0 +1,344 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <config.h>
+#include "proc-net-compat.h"
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <string.h>
+#include "dynamic-string.h"
+#include "hash.h"
+#include "netlink-protocol.h"
+#include "netlink.h"
+#include "ofpbuf.h"
+#include "openvswitch/brcompat-netlink.h"
+#include "hmap.h"
+#include "shash.h"
+#include "svec.h"
+
+#define THIS_MODULE VLM_proc_net_compat
+#include "vlog.h"
+
+/* Netlink socket to bridge compatibility kernel module. */
+static struct nl_sock *brc_sock;
+
+/* The Generic Netlink family number used for bridge compatibility. */
+static int brc_family = 0;
+
+/* Rate limiting for log messages. */
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+
+static void flush_dir(const char *dir);
+static int set_proc_file(const char *dir, const char *file, const char *data);
+
+/* Sets up the /proc/net compatibility layer: looks up the bridge
+ * compatibility Generic Netlink family, opens a Netlink socket for it, and
+ * flushes /proc/net/vlan and /proc/net/bonding.  Does nothing if the socket
+ * already exists.
+ *
+ * Returns 0 if successful, otherwise a positive errno value. */
+int
+proc_net_compat_init(void)
+{
+    int error;
+
+    if (brc_sock) {
+        /* Already initialized. */
+        return 0;
+    }
+
+    error = nl_lookup_genl_family(BRC_GENL_FAMILY_NAME, &brc_family);
+    if (!error) {
+        error = nl_sock_create(NETLINK_GENERIC, 0, 0, 0, &brc_sock);
+    }
+    if (error) {
+        return error;
+    }
+
+    flush_dir("/proc/net/vlan");
+    flush_dir("/proc/net/bonding");
+    return 0;
+}
+
+/* Sends a BRC_GENL_C_SET_PROC request over 'brc_sock' naming /proc/'dir'/'file'.
+ * When 'data' is nonnull it is included as the file's contents (an "update");
+ * when 'data' is null no contents are sent (a "remove").
+ *
+ * Returns the result of the Netlink transaction: 0 on success, otherwise the
+ * error value, which is also logged (rate-limited). */
+static int
+set_proc_file(const char *dir, const char *file, const char *data)
+{
+    struct ofpbuf *reply;
+    struct ofpbuf msg;
+    int error;
+
+    ofpbuf_init(&msg, 0);
+    nl_msg_put_genlmsghdr(&msg, brc_sock, 1024, brc_family, NLM_F_REQUEST,
+                          BRC_GENL_C_SET_PROC, 1);
+    nl_msg_put_string(&msg, BRC_GENL_A_PROC_DIR, dir);
+    nl_msg_put_string(&msg, BRC_GENL_A_PROC_NAME, file);
+    if (data != NULL) {
+        nl_msg_put_string(&msg, BRC_GENL_A_PROC_DATA, data);
+    }
+
+    error = nl_sock_transact(brc_sock, &msg, &reply);
+    ofpbuf_uninit(&msg);
+    ofpbuf_delete(reply);
+    if (!error) {
+        return 0;
+    }
+
+    VLOG_WARN_RL(&rl, "failed to %s /proc/%s/%s (%s)",
+                 data ? "update" : "remove", dir, file, strerror(error));
+    return error;
+}
+
+/* Issues a set_proc_file() removal request for every entry of 'dir' other
+ * than "." and "..".  'dir' must begin with "/proc/".  A 'dir' that does not
+ * exist is ignored silently; any other failure to open it is logged. */
+static void
+flush_dir(const char *dir)
+{
+    const char *proc_relative;
+    struct dirent *entry;
+    DIR *dirp;
+
+    assert(!memcmp(dir, "/proc/", 6));
+    /* set_proc_file() takes the directory without the leading "/proc/". */
+    proc_relative = dir + 6;
+
+    dirp = opendir(dir);
+    if (dirp == NULL) {
+        if (errno != ENOENT) {
+            VLOG_WARN_RL(&rl, "%s: open failed (%s)", dir, strerror(errno));
+        }
+        return;
+    }
+
+    for (entry = readdir(dirp); entry != NULL; entry = readdir(dirp)) {
+        if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) {
+            continue;
+        }
+        set_proc_file(proc_relative, entry->d_name, NULL);
+    }
+    closedir(dirp);
+}
+
+/* If 'bond' is nonnull, creates a file in /proc/net/bonding for a bond with
+ * the given 'name' and the details in 'bond'. If 'bond' is null, deletes
+ * the /proc/net/bonding file with the given 'name'.
+ *
+ * The generated text has a fixed header (driver banner, bonding mode, MII
+ * status, up/down delays) followed by one stanza per slave in 'bond->slaves'.
+ *
+ * This function has no effect unless proc_net_compat_init() has been
+ * called. */
+void
+proc_net_compat_update_bond(const char *name, const struct compat_bond *bond)
+{
+ struct ds ds;
+ int i;
+
+ /* No Netlink socket means the compatibility layer is not in use. */
+ if (!brc_sock) {
+ return;
+ }
+
+ /* A null 'bond' requests removal of the file. */
+ if (!bond) {
+ set_proc_file("net/bonding", name, NULL);
+ return;
+ }
+
+ /* Bond-wide header.  Only the MII status and the two delays come from
+ * 'bond'; the remaining fields are fixed text. */
+ ds_init(&ds);
+ ds_put_format(
+ &ds,
+ "Ethernet Channel Bonding Driver: ovs-vswitchd "
+ VERSION BUILDNR" ("__DATE__" "__TIME__")\n"
+ "Bonding Mode: source load balancing\n"
+ "Primary Slave: None\n"
+ "Currently Active Slave: None\n"
+ "MII Status: %s\n"
+ "MII Polling Interval (ms): 100\n"
+ "Up Delay (ms): %d\n"
+ "Down Delay (ms): %d\n"
+ "\n"
+ "Source load balancing info:\n",
+ bond->up ? "up" : "down", bond->updelay, bond->downdelay);
+ /* One stanza per slave; the link failure count is always reported as 0. */
+ for (i = 0; i < bond->n_slaves; i++) {
+ const struct compat_bond_slave *slave = &bond->slaves[i];
+ ds_put_format(
+ &ds,
+ "\n"
+ "Slave Interface: %s\n"
+ "MII Status: %s\n"
+ "Link Failure Count: 0\n"
+ "Permanent HW addr: "ETH_ADDR_FMT"\n",
+ slave->name, slave->up ? "up" : "down",
+ ETH_ADDR_ARGS(slave->mac));
+ }
+ set_proc_file("net/bonding", name, ds_cstr(&ds));
+ ds_destroy(&ds);
+}
+
+/* /proc/net/vlan compatibility.
+ *
+ * This is much more complex than I expected it to be. */
+
+/* One emulated VLAN device, identified by the pair (trunk_dev, vid), together
+ * with the tagged devices currently attached to it.  Instances are created
+ * and attached to in proc_net_compat_update_vlan() and freed by
+ * remove_tagged_dev() once the last tagged device is detached. */
+struct compat_vlan {
+ /* Hash key. */
+ struct hmap_node trunk_node; /* Hash map node. */
+ char *trunk_dev; /* Name of trunk network device. */
+ int vid; /* VLAN number. */
+
+ /* Auxiliary data. */
+ char *vlan_dev; /* sprintf("%s.%d", trunk_dev, vid); */
+ struct svec tagged_devs; /* Name of tagged network device(s). */
+};
+
+/* Current set of VLAN devices, indexed two different ways. */
+static struct hmap vlans_by_trunk = HMAP_INITIALIZER(&vlans_by_trunk);
+static struct shash vlans_by_tagged = SHASH_INITIALIZER(&vlans_by_tagged);
+
+static bool remove_tagged_dev(struct shash_node *, const char *tagged_dev);
+static void update_vlan_config(void);
+static void set_vlan_proc_file(const struct compat_vlan *);
+static uint32_t hash_vlan(const char *trunk_dev, uint32_t vid);
+
+/* Updates the /proc/net/vlan compatibility layer's idea of what trunk device
+ * and VLAN the given 'tagged_dev' is associated with. If 'tagged_dev' has an
+ * implicit VLAN tag, then 'trunk_dev' should be the name of a network device
+ * on the same bridge that trunks that VLAN, and 'vid' should be the VLAN tag
+ * number. If 'tagged_dev' does not have an implicit VLAN tag, then
+ * 'trunk_dev' should be NULL and 'vid' should be -1.
+ *
+ * Any 'vid' less than or equal to 0, or a null 'trunk_dev', is treated as "no
+ * VLAN": 'tagged_dev' is detached from whatever compat_vlan it was on.
+ *
+ * This function has no effect unless proc_net_compat_init() has been
+ * called. */
+void
+proc_net_compat_update_vlan(const char *tagged_dev, const char *trunk_dev,
+ int vid)
+{
+ struct compat_vlan *vlan;
+ struct shash_node *node;
+
+ if (!brc_sock) {
+ return;
+ }
+
+ /* Find the compat_vlan that we currently have for 'tagged_dev' (if
+ * any). */
+ node = shash_find(&vlans_by_tagged, tagged_dev);
+ vlan = node ? node->data : NULL;
+ if (vid <= 0 || !trunk_dev) {
+ /* 'tagged_dev' should have no VLAN.  The config file only needs to be
+ * rewritten if detaching it destroyed its compat_vlan. */
+ if (vlan) {
+ if (remove_tagged_dev(node, tagged_dev)) {
+ update_vlan_config();
+ }
+ }
+ } else {
+ if (vlan) {
+ if (!strcmp(trunk_dev, vlan->trunk_dev) && vid == vlan->vid) {
+ /* No change. */
+ return;
+ } else {
+ /* 'tagged_dev' is attached to the wrong compat_vlan. Start
+ * by removing it from that one. */
+ remove_tagged_dev(node, tagged_dev);
+ node = NULL;
+ vlan = NULL;
+ }
+ }
+
+ /* 'tagged_dev' is not attached to any compat_vlan. Find the
+ * compat_vlan corresponding to (trunk_dev,vid) to attach it to, or
+ * create a new compat_vlan if none exists for (trunk_dev,vid). */
+ HMAP_FOR_EACH_WITH_HASH (vlan, struct compat_vlan, trunk_node,
+ hash_vlan(trunk_dev, vid),
+ &vlans_by_trunk) {
+ if (!strcmp(trunk_dev, vlan->trunk_dev) && vid == vlan->vid) {
+ break;
+ }
+ }
+ /* NOTE(review): this test assumes the loop above leaves 'vlan' null when
+ * no entry matched -- confirm against the definition of
+ * HMAP_FOR_EACH_WITH_HASH. */
+ if (!vlan) {
+ /* Create a new compat_vlan for (trunk_dev,vid). */
+ vlan = xcalloc(1, sizeof *vlan);
+ vlan->trunk_dev = xstrdup(trunk_dev);
+ vlan->vid = vid;
+ vlan->vlan_dev = xasprintf("%s.%d", trunk_dev, vid);
+ svec_init(&vlan->tagged_devs);
+ hmap_insert(&vlans_by_trunk, &vlan->trunk_node,
+ hash_vlan(trunk_dev, vid));
+ set_vlan_proc_file(vlan);
+ }
+
+ /* Attach 'tagged_dev' to 'vlan'. */
+ svec_add(&vlan->tagged_devs, tagged_dev);
+ shash_add(&vlans_by_tagged, tagged_dev, vlan);
+ svec_sort(&vlan->tagged_devs);
+ update_vlan_config();
+ }
+}
+
+/* Detaches 'tagged_dev' from the compat_vlan stored in 'node' and deletes
+ * 'node' from 'vlans_by_tagged'.  If that leaves the compat_vlan without any
+ * tagged devices, also requests removal of its /proc/net/vlan file, unlinks
+ * it from 'vlans_by_trunk', and frees it.
+ *
+ * Returns true if the compat_vlan was destroyed, otherwise false. */
+static bool
+remove_tagged_dev(struct shash_node *node, const char *tagged_dev)
+{
+    struct compat_vlan *cv = node->data;
+
+    svec_del(&cv->tagged_devs, tagged_dev);
+    shash_delete(&vlans_by_tagged, node);
+    if (cv->tagged_devs.n > 0) {
+        /* Other tagged devices still refer to this compat_vlan. */
+        return false;
+    }
+
+    set_proc_file("net/vlan", cv->vlan_dev, NULL);
+
+    hmap_remove(&vlans_by_trunk, &cv->trunk_node);
+    svec_destroy(&cv->tagged_devs);
+    free(cv->trunk_dev);
+    free(cv->vlan_dev);
+    free(cv);
+    return true;
+}
+
+/* Computes the hash value under which the compat_vlan for
+ * ('trunk_dev', 'vid') is kept in 'vlans_by_trunk': the hash of the trunk
+ * device name serves as the basis for hashing the VLAN number. */
+static uint32_t
+hash_vlan(const char *trunk_dev, uint32_t vid)
+{
+    uint32_t name_hash = hash_string(trunk_dev, 0);
+
+    return hash_int(vid, name_hash);
+}
+
+/* Update /proc/net/vlan/<vlan_dev> for 'vlan'.
+ *
+ * Only the device name, the VLAN number and the trunk device name come from
+ * 'vlan'; every counter in the generated text is reported as 0. */
+static void
+set_vlan_proc_file(const struct compat_vlan *vlan)
+{
+ struct ds ds;
+
+ ds_init(&ds);
+ /* NOTE(review): the "EGRESSS" spelling below presumably mirrors the text
+ * the kernel's own VLAN code prints -- confirm before "correcting" it. */
+ ds_put_format(
+ &ds,
+ "%s VID: %d\t REORDER_HDR: 1 dev->priv_flags: 81\n"
+ " total frames received 0\n"
+ " total bytes received 0\n"
+ " Broadcast/Multicast Rcvd 0\n"
+ "\n"
+ " total frames transmitted 0\n"
+ " total bytes transmitted 0\n"
+ " total headroom inc 0\n"
+ " total encap on xmit 0\n"
+ "Device: %s\n"
+ "INGRESS priority mappings: 0:0 1:0 2:0 3:0 4:0 5:0 6:0 7:0\n"
+ "EGRESSS priority Mappings: \n",
+ vlan->vlan_dev, vlan->vid, vlan->trunk_dev);
+ set_proc_file("net/vlan", vlan->vlan_dev, ds_cstr(&ds));
+ ds_destroy(&ds);
+}
+
+/* Regenerates /proc/net/vlan/config: a fixed two-line header followed by one
+ * line (device name, VLAN number, trunk device) for each compat_vlan in
+ * 'vlans_by_trunk'. */
+static void
+update_vlan_config(void)
+{
+    struct compat_vlan *cv;
+    struct ds config;
+
+    ds_init(&config);
+    ds_put_cstr(&config, "VLAN Dev name | VLAN ID\n");
+    ds_put_cstr(&config, "Name-Type: VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD\n");
+    HMAP_FOR_EACH (cv, struct compat_vlan, trunk_node, &vlans_by_trunk) {
+        ds_put_format(&config, "%-15s| %d | %s\n",
+                      cv->vlan_dev, cv->vid, cv->trunk_dev);
+    }
+    set_proc_file("net/vlan", "config", ds_cstr(&config));
+    ds_destroy(&config);
+}
diff --git a/vswitchd/proc-net-compat.h b/vswitchd/proc-net-compat.h
new file mode 100644
index 000000000..ce97176bc
--- /dev/null
+++ b/vswitchd/proc-net-compat.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#ifndef VSWITCHD_PROC_NET_COMPAT_H
+#define VSWITCHD_PROC_NET_COMPAT_H 1
+
+#include "packets.h"
+
+/* Description of a bond, as consumed by proc_net_compat_update_bond() to
+ * generate the bond's /proc/net/bonding file. */
+struct compat_bond {
+ bool up; /* Reported as "MII Status: up" or "down". */
+ int updelay; /* Reported as "Up Delay (ms)". */
+ int downdelay; /* Reported as "Down Delay (ms)". */
+ int n_slaves; /* Number of elements in 'slaves'. */
+ struct compat_bond_slave *slaves; /* One element per slave interface. */
+};
+
+/* Description of one slave interface of a struct compat_bond. */
+struct compat_bond_slave {
+ const char *name; /* Reported as "Slave Interface". */
+ bool up; /* Reported as "MII Status: up" or "down". */
+ uint8_t mac[ETH_ADDR_LEN]; /* Reported as "Permanent HW addr". */
+};
+
+/* Initializes the compatibility layer; returns 0 or a positive errno value.
+ * The two update functions below have no effect until this has succeeded. */
+int proc_net_compat_init(void);
+/* Creates or updates the /proc/net/bonding file for the named bond, or
+ * deletes it when the struct compat_bond pointer is null. */
+void proc_net_compat_update_bond(const char *name, const struct compat_bond *);
+/* Records which trunk device ('vlandev') and VLAN number ('vlan') the tagged
+ * device 'dev' belongs to; a null 'vlandev' or a 'vlan' <= 0 detaches it. */
+void proc_net_compat_update_vlan(const char *dev, const char *vlandev,
+ int vlan);
+
+#endif /* vswitchd/proc-net-compat.h */
diff --git a/vswitchd/xenserver.c b/vswitchd/xenserver.c
new file mode 100644
index 000000000..7a8d255f9
--- /dev/null
+++ b/vswitchd/xenserver.c
@@ -0,0 +1,90 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#include <config.h>
+#include "xenserver.h"
+#include <ctype.h>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include "dynamic-string.h"
+#include "process.h"
+
+#include "vlog.h"
+#define THIS_MODULE VLM_xenserver
+
+/* Looks up this host's UUID in /etc/xensource-inventory.
+ *
+ * The file is scanned for the first line that consists of exactly
+ * INSTALLATION_UUID='<36 characters>' followed by a new-line.  Returns a
+ * newly allocated copy of those 36 characters, or a null pointer if the file
+ * cannot be opened or contains no such line (both cases are logged). */
+static char *
+read_host_uuid(void)
+{
+    static const char filename[] = "/etc/xensource-inventory";
+    static const char leader[] = "INSTALLATION_UUID='";
+    static const char trailer[] = "'\n";
+    const int leader_len = strlen(leader);
+    const int trailer_len = strlen(trailer);
+    const int uuid_len = 36;
+    char *uuid = NULL;
+    char line[128];
+    FILE *file;
+
+    file = fopen(filename, "r");
+    if (!file) {
+        if (errno == ENOENT) {
+            VLOG_INFO("not running on a XenServer");
+        } else {
+            VLOG_INFO("%s: open: %s", filename, strerror(errno));
+        }
+        return NULL;
+    }
+
+    while (fgets(line, sizeof line, file)) {
+        /* Skip anything that is not exactly leader + UUID + trailer. */
+        if (strlen(line) != leader_len + uuid_len + trailer_len
+            || memcmp(line, leader, leader_len)
+            || memcmp(line + leader_len + uuid_len, trailer, trailer_len)) {
+            continue;
+        }
+
+        uuid = xmemdup0(line + leader_len, uuid_len);
+        VLOG_INFO("running on XenServer, host-uuid %s", uuid);
+        break;
+    }
+    fclose(file);
+
+    if (!uuid) {
+        VLOG_ERR("%s: INSTALLATION_UUID not found", filename);
+    }
+    return uuid;
+}
+
+/* Returns the XenServer host UUID, or a null pointer if read_host_uuid()
+ * could not determine one.  The lookup happens on the first call only; its
+ * result, including a failure, is cached for all later calls. */
+const char *
+xenserver_get_host_uuid(void)
+{
+    static bool looked_up;
+    static char *cached_uuid;
+
+    if (looked_up) {
+        return cached_uuid;
+    }
+
+    cached_uuid = read_host_uuid();
+    looked_up = true;
+    return cached_uuid;
+}
+
diff --git a/vswitchd/xenserver.h b/vswitchd/xenserver.h
new file mode 100644
index 000000000..c69b133ac
--- /dev/null
+++ b/vswitchd/xenserver.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2009 Nicira Networks
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * In addition, as a special exception, Nicira Networks gives permission
+ * to link the code of its release of vswitchd with the OpenSSL project's
+ * "OpenSSL" library (or with modified versions of it that use the same
+ * license as the "OpenSSL" library), and distribute the linked
+ * executables. You must obey the GNU General Public License in all
+ * respects for all of the code used other than "OpenSSL". If you modify
+ * this file, you may extend this exception to your version of the file,
+ * but you are not obligated to do so. If you do not wish to do so,
+ * delete this exception statement from your version.
+ */
+
+#ifndef VSWITCHD_XENSERVER_H
+#define VSWITCHD_XENSERVER_H 1
+
+/* Returns the host UUID read from /etc/xensource-inventory, or a null pointer
+ * if none could be read.  The result is cached after the first call. */
+const char *xenserver_get_host_uuid(void);
+
+#endif /* xenserver.h */
diff --git a/xenserver/README b/xenserver/README
new file mode 100644
index 000000000..7cd04ab69
--- /dev/null
+++ b/xenserver/README
@@ -0,0 +1,78 @@
+This directory contains files for seamless integration of vswitch on
+Citrix XenServer hosts managed by the Citrix management tools.
+
+Some of these files are modifications of Citrix's proprietary code.
+Citrix has given permission to distribute these modified files.
+Citrix has not specified a particular license for them. There is no
+guarantee that, should Citrix specify a license, it would be
+DFSG-compliant or GPL-compatible.
+
+Most of the files in this directory are installed on a XenServer system
+under the same name, if underscores are replaced by slashes. The
+files are:
+
+ etc_init.d_vswitch
+
+ Initializes the vswitch at boot and shuts it down at shutdown.
+
+ etc_init.d_vswitch-xapi-update
+
+ Init script to ensure vswitch-cfg-update is called for the
+ current host at boot.
+
+ etc_logrotate.d_vswitch
+
+ Ensures that /var/log/ovs-vswitchd.log is rotated periodically
+ and that ovs-vswitchd reopens its log file at that point.
+
+ etc_profile.d_vswitch.sh
+
+ vswitch-related shell functions for the administrator's
+ convenience.
+
+ etc_sysconfig_vswitch.example
+
+ Example configuration options for vswitch.
+
+ etc_xapi.d_plugins_vswitch-cfg-update
+
+ xapi plugin script to update the cache of configuration items
+ in the ovs-vswitchd configuration file that are managed in the
+ xapi database when integrated with Citrix management tools.
+
+ etc_xensource_scripts_vif
+
+ vswitch-aware replacement for Citrix script of the same name.
+
+ opt_xensource_libexec_interface-reconfigure
+
+ vswitch-aware replacement for Citrix script of the same name.
+
+ usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py
+
+ xsconsole plugin to configure the pool-wide configuration keys
+ used to control vswitch when integrated with Citrix management
+ tools.
+
+ vswitch-xen.spec
+
+ spec file for building RPMs to install on a XenServer host.
+
+To install, build the vswitch RPM with a command like this:
+
+ rpmbuild -D "vswitch_version $full_version" \
+ -D "xen_version $XENKERNEL" \
+ -D "build_number --with-build-number=$buildnr" \
+ -bb vswitch-xen.spec
+
+Then, "rpm -U" the resulting vswitch package on the XenServer hosts in
+question and reboot them. (The vswitch-dbg package that is also
+produced need not be installed, but it is harmless to do so.)
+
+----------------------------------------------------------------------
+Copyright (C) 2009 Nicira Networks, Inc.
+
+Copying and distribution of this file, with or without modification,
+are permitted in any medium without royalty provided the copyright
+notice and this notice are preserved. This file is offered as-is,
+without warranty of any kind.
diff --git a/xenserver/automake.mk b/xenserver/automake.mk
new file mode 100644
index 000000000..53e109d5a
--- /dev/null
+++ b/xenserver/automake.mk
@@ -0,0 +1,19 @@
+# Copyright (C) 2009 Nicira Networks, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved. This file is offered as-is,
+# without warranty of any kind.
+
+# Include the XenServer integration files in the distribution tarball.
+EXTRA_DIST += \
+ xenserver/README \
+ xenserver/etc_init.d_vswitch \
+ xenserver/etc_init.d_vswitch-xapi-update \
+ xenserver/etc_logrotate.d_vswitch \
+ xenserver/etc_profile.d_vswitch.sh \
+ xenserver/etc_sysconfig_vswitch.example \
+ xenserver/etc_xapi.d_plugins_vswitch-cfg-update \
+ xenserver/etc_xensource_scripts_vif \
+ xenserver/opt_xensource_libexec_interface-reconfigure \
+ xenserver/usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py \
+ xenserver/vswitch-xen.spec
diff --git a/xenserver/etc_init.d_vswitch b/xenserver/etc_init.d_vswitch
new file mode 100755
index 000000000..25aca61f9
--- /dev/null
+++ b/xenserver/etc_init.d_vswitch
@@ -0,0 +1,302 @@
+#!/bin/bash
+#
+# vswitch
+#
+# chkconfig: 2345 09 91
+# description: Manage vswitch kernel modules and user-space daemon
+
+# Copyright (C) 2009 Nicira Networks, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. /etc/init.d/functions
+
+test -e /etc/sysconfig/vswitch && . /etc/sysconfig/vswitch
+
+# General config variables in /etc/sysconfig/vswitch
+VSWITCH_BASE="${VSWITCH_BASE:-/root/vswitch}"
+ENABLE_BRCOMPAT="${ENABLE_BRCOMPAT:-y}"
+ENABLE_FAKE_PROC_NET="${ENABLE_FAKE_PROC_NET:-y}"
+FORCE_COREFILES="${FORCE_COREFILES:-n}"
+COREFILE_PATTERN="${COREFILE_PATTERN:-/var/log/%e-%t}"
+
+# Config variables specific to ovs-vswitchd
+VSWITCHD_CONF="${VSWITCHD_CONF:-/etc/ovs-vswitchd.conf}"
+VSWITCHD_PIDFILE="${VSWITCHD_PIDFILE:-/var/run/ovs-vswitchd.pid}"
+VSWITCHD_PRIORITY="${VSWITCHD_PRIORITY:--5}"
+VSWITCHD_LOGFILE="${VSWITCHD_LOGFILE:-/var/log/ovs-vswitchd.log}"
+VSWITCHD_FILE_LOGLEVEL="${VSWITCHD_FILE_LOGLEVEL:-}"
+VSWITCHD_SYSLOG_LOGLEVEL="${VSWITCHD_SYSLOG_LOGLEVEL:-WARN}"
+VSWITCHD_MEMLEAK_LOGFILE="${VSWITCHD_MEMLEAK_LOGFILE:-}"
+VSWITCHD_STRACE_LOG="${VSWITCHD_STRACE_LOG:-}"
+VSWITCHD_STRACE_OPT="${VSWITCHD_STRACE_OPT:-}"
+VSWITCHD_VALGRIND_LOG="${VSWITCHD_VALGRIND_LOG:-}"
+VSWITCHD_VALGRIND_OPT="${VSWITCHD_VALGRIND_OPT:-}"
+
+# Config variables specific to ovs-brcompatd
+BRCOMPATD_PIDFILE="${BRCOMPATD_PIDFILE:-/var/run/ovs-brcompatd.pid}"
+BRCOMPATD_PRIORITY="${BRCOMPATD_PRIORITY:--5}"
+BRCOMPATD_LOGFILE="${BRCOMPATD_LOGFILE:-/var/log/ovs-brcompatd.log}"
+BRCOMPATD_FILE_LOGLEVEL="${BRCOMPATD_FILE_LOGLEVEL:-}"
+BRCOMPATD_SYSLOG_LOGLEVEL="${BRCOMPATD_SYSLOG_LOGLEVEL:-WARN}"
+BRCOMPATD_MEMLEAK_LOGFILE="${BRCOMPATD_MEMLEAK_LOGFILE:-}"
+BRCOMPATD_STRACE_LOG="${BRCOMPATD_STRACE_LOG:-}"
+BRCOMPATD_STRACE_OPT="${BRCOMPATD_STRACE_OPT:-}"
+BRCOMPATD_VALGRIND_LOG="${BRCOMPATD_VALGRIND_LOG:-}"
+BRCOMPATD_VALGRIND_OPT="${BRCOMPATD_VALGRIND_OPT:-}"
+
+
+
+
+# Full paths to executables & modules
+vswitchd="$VSWITCH_BASE/sbin/ovs-vswitchd"
+brcompatd="$VSWITCH_BASE/sbin/ovs-brcompatd"
+dpctl="$VSWITCH_BASE/bin/ovs-dpctl"
+appctl="$VSWITCH_BASE/bin/ovs-appctl"
+ofctl="$VSWITCH_BASE/bin/ovs-ofctl"
+
+
+# ENABLE_FAKE_PROC_NET needs ENABLE_BRCOMPAT: if the former was requested
+# without the latter, warn and switch the latter on.
+if [ "$ENABLE_FAKE_PROC_NET" == "y" ] && [ "$ENABLE_BRCOMPAT" != "y" ]; then
+    warning "FAKE_PROC_NET required BRCOMPAT which was disabled. Force enabling."
+    ENABLE_BRCOMPAT="y"
+fi
+
+# Prints, one per line, the name of each datapath found in the output of
+# "$dpctl show": the leading "dp<digits>" of every line that starts with
+# "dp<digits>:".
+function dp_list {
+    "$dpctl" show | awk -F: '/^dp[0-9]+:/ { print $1 }'
+}
+
+function turn_on_corefiles {
+ # This has global effect so should not normally be used...
+ ulimit -c unlimited
+ echo "$COREFILE_PATTERN" > /proc/sys/kernel/core_pattern
+}
+
+# Deletes every datapath listed by dp_list, reporting each deletion through
+# "action".
+function remove_all_dp {
+    # Keep the loop variable out of the script's global namespace.
+    local dp
+    for dp in $(dp_list); do
+        action "Removing datapath: $dp" "$dpctl" del-dp "$dp"
+    done
+}
+
+# Loads the kernel modules that lsmod does not already list: llc and
+# openvswitch_mod, and, when BRCOMPATD_PIDFILE is non-empty, brcompat_mod.
+# The module files are taken from $VSWITCH_BASE/kernel_modules.
+function insert_modules_if_required {
+    if ! lsmod | grep -q "openvswitch_mod"; then
+        action "Inserting llc module" modprobe llc
+        # Paths are quoted so that a VSWITCH_BASE containing whitespace works.
+        action "Inserting openvswitch module" insmod "$VSWITCH_BASE/kernel_modules/openvswitch_mod.ko"
+    fi
+    if [ -n "$BRCOMPATD_PIDFILE" ] && ! lsmod | grep -q "brcompat_mod"; then
+        action "Inserting brcompat module" insmod "$VSWITCH_BASE/kernel_modules/brcompat_mod.ko"
+    fi
+}
+
+function remove_modules {
+ if lsmod | grep -q "brcompat_mod"; then
+ action "Removing brcompat module" rmmod brcompat_mod.ko
+ fi
+ if lsmod | grep -q "openvswitch_mod"; then
+ action "Removing openvswitch module" rmmod openvswitch_mod.ko
+ fi
+}
+
+function reload_vswitchd {
+ if [ -f "$VSWITCHD_PIDFILE" ]; then
+ "$appctl" \
+ --target=ovs-vswitchd.$(cat "$VSWITCHD_PIDFILE").ctl \
+ --execute=vswitchd/reload
+ fi
+}
+
+# Starts ovs-vswitchd according to the VSWITCHD_* settings.
+#
+# Normally the daemon is started through "action".  When VSWITCHD_STRACE_LOG
+# or VSWITCHD_VALGRIND_LOG is set it is instead started in the background
+# under that tool and a forced "success" message is printed.  Setting both
+# makes the script exit with status 1.
+function start_vswitchd {
+    local syslog_opt="-vANY:SYSLOG:${VSWITCHD_SYSLOG_LOGLEVEL}"
+    local logfile_file_opt=""
+    local logfile_level_opt=""
+    if [ -n "$VSWITCHD_FILE_LOGLEVEL" ]; then
+        logfile_level_opt="-vANY:FILE:${VSWITCHD_FILE_LOGLEVEL}"
+        logfile_file_opt="--log-file=$VSWITCHD_LOGFILE"
+    fi
+    local leak_opt=""
+    if [ -n "$VSWITCHD_MEMLEAK_LOGFILE" ]; then
+        leak_opt="--check-leaks=$VSWITCHD_MEMLEAK_LOGFILE"
+        # Keep the previous leak log around as <file>.prev.
+        if [ -e "$VSWITCHD_MEMLEAK_LOGFILE" ]; then
+            mv "$VSWITCHD_MEMLEAK_LOGFILE" "$VSWITCHD_MEMLEAK_LOGFILE.prev"
+        fi
+    fi
+    local strace_opt=""
+    # Must be local and start out empty: start_brcompatd expands a variable
+    # of the same name, so a global here would put ovs-brcompatd under
+    # ovs-vswitchd's valgrind command line.
+    local valgrind_opt=""
+    local daemonize="y"
+    if [ -n "$VSWITCHD_STRACE_LOG" ] && [ -n "$VSWITCHD_VALGRIND_LOG" ]; then
+        printf "Can not start with both VALGRIND and STRACE\n"
+        exit 1
+    fi
+    if [ -n "$VSWITCHD_STRACE_LOG" ]; then
+        strace_opt="strace -o $VSWITCHD_STRACE_LOG $VSWITCHD_STRACE_OPT"
+        daemonize="n"
+    fi
+    if [ -n "$VSWITCHD_VALGRIND_LOG" ]; then
+        valgrind_opt="valgrind --log-file=$VSWITCHD_VALGRIND_LOG $VSWITCHD_VALGRIND_OPT"
+        daemonize="n"
+    fi
+    local fake_proc_net_opt=""
+    if [ "$ENABLE_FAKE_PROC_NET" == "y" ]; then
+        fake_proc_net_opt="--fake-proc-net"
+    fi
+    # The *_opt variables are expanded unquoted on purpose: each may be empty
+    # or hold several words.
+    if [ "$daemonize" != "y" ]; then
+        # Start in background and force a "success" message
+        action "Starting ovs-vswitchd ($strace_opt$valgrind_opt)" true
+        (nice -n "$VSWITCHD_PRIORITY" $strace_opt $valgrind_opt "$vswitchd" -P"$VSWITCHD_PIDFILE" -D $fake_proc_net_opt -vANY:CONSOLE:EMER $syslog_opt $logfile_level_opt $logfile_file_opt $leak_opt "$VSWITCHD_CONF") &
+    else
+        action "Starting ovs-vswitchd" nice -n "$VSWITCHD_PRIORITY" "$vswitchd" -P"$VSWITCHD_PIDFILE" -D $fake_proc_net_opt -vANY:CONSOLE:EMER $syslog_opt $logfile_level_opt $logfile_file_opt $leak_opt "$VSWITCHD_CONF"
+    fi
+}
+
+# Starts ovs-brcompatd according to the BRCOMPATD_* settings.
+#
+# Normally the daemon is started through "action".  When
+# BRCOMPATD_STRACE_LOG or BRCOMPATD_VALGRIND_LOG is set it is instead started
+# in the background under that tool and a forced "success" message is
+# printed.  Setting both makes the script exit with status 1.
+#
+# ovs-brcompatd is given a reload command that asks the running ovs-vswitchd
+# to reload its configuration and pipes the result to logger.
+function start_brcompatd {
+    local syslog_opt="-vANY:SYSLOG:${BRCOMPATD_SYSLOG_LOGLEVEL}"
+    local logfile_file_opt=""
+    local logfile_level_opt=""
+    if [ -n "$BRCOMPATD_FILE_LOGLEVEL" ]; then
+        logfile_level_opt="-vANY:FILE:${BRCOMPATD_FILE_LOGLEVEL}"
+        logfile_file_opt="--log-file=$BRCOMPATD_LOGFILE"
+    fi
+    local leak_opt=""
+    # Test the variable that is actually configured (..._MEMLEAK_LOGFILE).
+    if [ -n "$BRCOMPATD_MEMLEAK_LOGFILE" ]; then
+        leak_opt="--check-leaks=$BRCOMPATD_MEMLEAK_LOGFILE"
+        # Keep the previous leak log around as <file>.prev.
+        if [ -e "$BRCOMPATD_MEMLEAK_LOGFILE" ]; then
+            mv "$BRCOMPATD_MEMLEAK_LOGFILE" "$BRCOMPATD_MEMLEAK_LOGFILE.prev"
+        fi
+    fi
+    local strace_opt=""
+    # Local and empty by default so that nothing set up for ovs-vswitchd is
+    # picked up here.
+    local valgrind_opt=""
+    local daemonize="y"
+    if [ -n "$BRCOMPATD_STRACE_LOG" ] && [ -n "$BRCOMPATD_VALGRIND_LOG" ]; then
+        printf "Can not start with both VALGRIND and STRACE\n"
+        exit 1
+    fi
+    if [ -n "$BRCOMPATD_STRACE_LOG" ]; then
+        strace_opt="strace -o $BRCOMPATD_STRACE_LOG $BRCOMPATD_STRACE_OPT"
+        daemonize="n"
+    fi
+    if [ -n "$BRCOMPATD_VALGRIND_LOG" ]; then
+        valgrind_opt="valgrind --log-file=$BRCOMPATD_VALGRIND_LOG $BRCOMPATD_VALGRIND_OPT"
+        daemonize="n"
+    fi
+    # Built from the configured appctl path and pidfile.  The backquotes are
+    # escaped so that the pidfile is read each time the command runs, not now.
+    local reload_cmd="$appctl -t /var/run/ovs-vswitchd.\`cat $VSWITCHD_PIDFILE\`.ctl -e vswitchd/reload 2>&1 | /usr/bin/logger -t brcompatd-reload"
+    # The *_opt variables are expanded unquoted on purpose: each may be empty
+    # or hold several words.
+    if [ "$daemonize" != "y" ]; then
+        # Start in background and force a "success" message
+        action "Starting ovs-brcompatd ($strace_opt$valgrind_opt)" true
+        (nice -n "$BRCOMPATD_PRIORITY" $strace_opt $valgrind_opt "$brcompatd" --reload-command="$reload_cmd" -P"$BRCOMPATD_PIDFILE" -vANY:CONSOLE:EMER $syslog_opt $logfile_level_opt $logfile_file_opt $leak_opt "$VSWITCHD_CONF") &
+    else
+        action "Starting ovs-brcompatd" nice -n "$BRCOMPATD_PRIORITY" $strace_opt $valgrind_opt "$brcompatd" --reload-command="$reload_cmd" -P"$BRCOMPATD_PIDFILE" -D -vANY:CONSOLE:EMER $syslog_opt $logfile_level_opt $logfile_file_opt $leak_opt "$VSWITCHD_CONF"
+    fi
+}
+
+function stop_vswitchd {
+ if [ -f "$VSWITCHD_PIDFILE" ]; then
+ local pid=$(cat "$VSWITCHD_PIDFILE")
+ action "Killing ovs-vswitchd ($pid)" kill -TERM $pid
+ rm -f "$VSWITCHD_PIDFILE"
+ fi
+}
+
+function stop_brcompatd {
+ if [ -f "$BRCOMPATD_PIDFILE" ]; then
+ local pid=$(cat "$BRCOMPATD_PIDFILE")
+ action "Killing ovs-brcompatd ($pid)" kill -TERM $pid
+ rm -f "$BRCOMPATD_PIDFILE"
+ fi
+}
+
+function restart_approval {
+ if test ! -t 0; then
+ # Don't prompt if invoked non-interactively.
+ return 0
+ fi
+ cat <<EOF
+
+WARNING!!!
+
+Restarting vswitch on a live server is not guaranteed to work. It is
+provided as a convenience for those situations in which it does work.
+If you just want to reload the configuration file, use "reload"
+instead of restart.
+
+EOF
+ read -s -r -n 1 -p "Countinue with restart (y/N): " response
+ printf "\n"
+ case "$response" in
+ y|Y)
+ return 0
+ ;;
+ *)
+ return 1
+ ;;
+ esac
+}
+
+function start {
+ insert_modules_if_required
+ start_vswitchd
+ start_brcompatd
+ reload_vswitchd # ensures ovs-vswitchd has fully read config file.
+}
+
+function stop {
+ stop_brcompatd
+ stop_vswitchd
+}
+
+# Stops and then starts vswitch, but only once restart_approval agrees;
+# otherwise does nothing.
+function restart {
+    restart_approval || return 0
+    stop
+    start
+}
+
+# Dispatch on the requested command.  Any unrecognized command is reported
+# and makes the script exit with status 1.
+case "$1" in
+    start)
+        if [ "$FORCE_COREFILES" == "y" ]; then
+            turn_on_corefiles
+        fi
+        start
+        ;;
+    stop)
+        stop
+        ;;
+    restart)
+        restart
+        ;;
+    reload)
+        reload_vswitchd
+        ;;
+    strace-vswitchd)
+        # Remaining arguments are passed through to strace.
+        shift
+        strace -p "$(cat "$VSWITCHD_PIDFILE")" "$@"
+        ;;
+    strace-brcompatd)
+        shift
+        strace -p "$(cat "$BRCOMPATD_PIDFILE")" "$@"
+        ;;
+    status)
+        # Use the configured pidfiles rather than bare file names, which
+        # would be looked up relative to the current directory.
+        status -p "$VSWITCHD_PIDFILE" ovs-vswitchd
+        status -p "$BRCOMPATD_PIDFILE" ovs-brcompatd
+        ;;
+    version)
+        "$vswitchd" -V
+        "$brcompatd" -V
+        ;;
+    help)
+        printf "vswitch [start|stop|restart|reload|strace-vswitchd|strace-brcompatd|status|version|help]\n"
+        ;;
+    *)
+        # Never use the argument itself as the printf format string.
+        printf 'Unknown command: %s\n' "$1"
+        exit 1
+        ;;
+esac
diff --git a/xenserver/etc_init.d_vswitch-xapi-update b/xenserver/etc_init.d_vswitch-xapi-update
new file mode 100755
index 000000000..d1be0b3a9
--- /dev/null
+++ b/xenserver/etc_init.d_vswitch-xapi-update
@@ -0,0 +1,71 @@
+#!/bin/bash
+#
+# vswitch-xapi-update
+#
+# chkconfig: 2345 95 01
+# description: Update vswitch configuration from XAPI database at boot
+
+# Copyright (C) 2009 Nicira Networks, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# Pull in the distribution's init-script helpers.
+. /etc/init.d/functions
+
+# Load optional site overrides, then fall back to built-in defaults for
+# anything left unset or empty.
+# NOTE(review): none of the variables below are referenced by the rest
+# of this script; they look like they were carried over from the
+# vswitch init script -- confirm before relying on them here.
+test -e /etc/sysconfig/vswitch && . /etc/sysconfig/vswitch
+VSWITCH_BASE="${VSWITCH_BASE:-/root/vswitch}"
+VSWITCHD_CONF="${VSWITCHD_CONF:-/etc/ovs-vswitchd.conf}"
+VSWITCHD_PIDFILE="${VSWITCHD_PIDFILE:-/var/run/ovs-vswitchd.pid}"
+VSWITCHD_PRIORITY="${VSWITCHD_PRIORITY:--5}"
+VSWITCHD_LOGFILE="${VSWITCHD_LOGFILE:-/var/log/ovs-vswitchd.log}"
+VSWITCHD_FILE_LOGLEVEL="${VSWITCHD_FILE_LOGLEVEL:-}"
+VSWITCHD_SYSLOG_LOGLEVEL="${VSWITCHD_SYSLOG_LOGLEVEL:-WARN}"
+VSWITCHD_MEMLEAK_LOGFILE="${VSWITCHD_MEMLEAK_LOGFILE:-}"
+BRCOMPATD_PIDFILE="${BRCOMPATD_PIDFILE:-/var/run/ovs-brcompatd.pid}"
+BRCOMPATD_PRIORITY="${BRCOMPATD_PRIORITY:--5}"
+BRCOMPATD_LOGFILE="${BRCOMPATD_LOGFILE:-/var/log/ovs-brcompatd.log}"
+BRCOMPATD_FILE_LOGLEVEL="${BRCOMPATD_FILE_LOGLEVEL:-}"
+BRCOMPATD_SYSLOG_LOGLEVEL="${BRCOMPATD_SYSLOG_LOGLEVEL:-WARN}"
+BRCOMPATD_MEMLEAK_LOGFILE="${BRCOMPATD_MEMLEAK_LOGFILE:-}"
+
+# Call the "update" function of the vswitch-cfg-update xapi plugin on
+# the host identified by $INSTALLATION_UUID, discarding its stdout.
+do_host_call() {
+    xe host-call-plugin \
+        host-uuid="$INSTALLATION_UUID" \
+        plugin="vswitch-cfg-update" \
+        fn="update" >/dev/null
+}
+
+# Source /etc/xensource-inventory and run do_host_call under the
+# init-script "action" helper.  Exits with status 1 when the inventory
+# file is missing.
+function start {
+    if [ ! -f /etc/xensource-inventory ]; then
+        # Diagnostics belong on stderr.
+        printf "vswitch-xapi-update ERROR: XenSource inventory not present in /etc/xensource-inventory\n" >&2
+        exit 1
+    fi
+    source /etc/xensource-inventory
+    action "Updating configuration" do_host_call
+}
+
+# Dispatch on the sub-command given as the first argument.
+case "$1" in
+    start)
+        start
+        ;;
+    stop)
+        # Nothing to do here.
+        ;;
+    restart)
+        start
+        ;;
+    help)
+        printf "vswitch-xapi-update [start|stop|restart]\n"
+        ;;
+    *)
+        # Never use caller-supplied text as the printf format string.
+        printf "Unknown command: %s\n" "$1"
+        exit 1
+        ;;
+esac
diff --git a/xenserver/etc_logrotate.d_vswitch b/xenserver/etc_logrotate.d_vswitch
new file mode 100644
index 000000000..dae235bd5
--- /dev/null
+++ b/xenserver/etc_logrotate.d_vswitch
@@ -0,0 +1,14 @@
+# Copyright (C) 2009 Nicira Networks, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved. This file is offered as-is,
+# without warranty of any kind.
+
+/var/log/ovs-vswitchd.log {
+ sharedscripts
+ postrotate
+ # Send sighup to vswitch which will cause it to reopen its log files.
+ /sbin/service vswitch reload
+ endscript
+}
diff --git a/xenserver/etc_profile.d_vswitch.sh b/xenserver/etc_profile.d_vswitch.sh
new file mode 100644
index 000000000..90927547d
--- /dev/null
+++ b/xenserver/etc_profile.d_vswitch.sh
@@ -0,0 +1,56 @@
+# Copyright (C) 2009 Nicira Networks, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved. This file is offered as-is,
+# without warranty of any kind.
+
+# Put the vswitch binaries and manual pages ahead of the system ones.
+PATH=/root/vswitch/bin:$PATH
+export PATH
+MANPATH=/root/vswitch/share/man:$MANPATH
+export MANPATH
+
+# Shorthand so that "vswitch <cmd>" runs "service vswitch <cmd>".
+alias vswitch='service vswitch'
+
+# Continuously redisplay /etc/ovs-vswitchd.conf using watch(1).
+watchconf() {
+    watch cat /etc/ovs-vswitchd.conf
+}
+
+# Continuously redisplay "ovs-dpctl show"; all arguments are passed
+# through to it.
+watchdp() {
+    watch ovs-dpctl show "$@"
+}
+
+# Continuously redisplay the flows of datapath $1.  Any further
+# arguments are appended to a "grep" that filters the output.
+watchdpflows() {
+    local filter=""
+    local dp=$1
+    shift
+    if [ $# -ne 0 ]; then
+        filter="| grep $@"
+    fi
+    watch "ovs-dpctl dump-flows $dp $filter"
+}
+
+# Continuously redisplay the OpenFlow flows of the bridge behind
+# datapath $1.  The bridge name is taken from the 'port 0:' line of
+# "ovs-dpctl show".  Any further arguments are joined with spaces and
+# appended to a "grep" that filters the output.
+function watchflows {
+    local filter=""
+    local dp=$1
+    # This file is sourced into interactive shells, so keep every
+    # working variable local.
+    local bridge
+    shift
+    bridge=$(ovs-dpctl show $dp | grep 'port 0:' | cut -d' ' -f 3)
+    if [ $# -gt 0 ]; then
+        # "$*" makes the join into a single string explicit.
+        filter="| grep $*"
+    fi
+    watch "ovs-ofctl dump-flows unix:/var/run/$bridge.mgmt $filter"
+}
+
+# Follow /var/log/messages, /var/log/ovs-vswitchd.log and
+# /var/log/xensource.log with "tail -F", copying the output to
+# /var/log/monitorlogs.out.  When arguments are given, only tail's
+# "==> file <==" headers and lines matching one of the arguments (used
+# as grep patterns) are shown.
+function monitorlogs {
+    # This file is sourced into interactive shells, so keep every
+    # working variable local.
+    local filter="" pattern cmd
+    if [ $# -gt 0 ]; then
+        filter="| grep --line-buffered '^==> .* <==$"
+        for pattern in "$@"; do
+            filter="$filter\|$pattern"
+        done
+        filter="$filter'"
+    fi
+    cmd="tail -F /var/log/messages /var/log/ovs-vswitchd.log /var/log/xensource.log $filter | tee /var/log/monitorlogs.out"
+    # Print the command as data; patterns may contain '%' or '\'.
+    printf 'cmd: %s\n' "$cmd"
+    # NOTE: the patterns are evaluated by the shell; only pass trusted
+    # text to this helper.
+    eval "$cmd"
+}
diff --git a/xenserver/etc_sysconfig_vswitch.example b/xenserver/etc_sysconfig_vswitch.example
new file mode 100644
index 000000000..cd13b5915
--- /dev/null
+++ b/xenserver/etc_sysconfig_vswitch.example
@@ -0,0 +1,79 @@
+### Configuration options for vswitch
+
+# Copyright (C) 2009 Nicira Networks, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved. This file is offered as-is,
+# without warranty of any kind.
+
+# VSWITCH_BASE: Root directory where vswitch binaries are installed
+# VSWITCH_BASE=/root/vswitch/openvswitch/build
+
+# ENABLE_BRCOMPAT: If 'y' then emulate linux bridging interfaces
+# using the brcompat kernel module and ovs-brcompatd daemon
+# ENABLE_BRCOMPAT=y
+
+# ENABLE_FAKE_PROC_NET: If 'y' then emulate linux bonding and vlan
+# files in /proc as if the bonding and vlan demultiplexing done in
+# ovs-vswitchd were being implemented using existing Linux mechanisms.
+# This is useful in some cases when replacing existing solutions.
+# ENABLE_FAKE_PROC_NET=y
+
+# FORCE_COREFILES: If 'y' then core files will be enabled.
+# FORCE_COREFILES=n
+
+# COREFILE_PATTERN: Pattern used to determine path and filename for
+# core files when FORCE_COREFILES is 'y'. This is Linux specific.
+# See the manpage for "core".
+# COREFILE_PATTERN="/var/log/%e-%t"
+
+# VSWITCHD_CONF: File in which ovs-vswitchd stores its configuration.
+# VSWITCHD_CONF=/etc/ovs-vswitchd.conf
+
+# VSWITCHD_PIDFILE: File in which to store the pid of the running
+# ovs-vswitchd.
+# VSWITCHD_PIDFILE=/var/run/ovs-vswitchd.pid
+
+# VSWITCHD_PRIORITY: "nice" priority at which to run ovs-vswitchd and related
+# processes.
+# VSWITCHD_PRIORITY=-5
+
+# VSWITCHD_LOGFILE: File to send the FILE_LOGLEVEL log messages to.
+# VSWITCHD_LOGFILE=/var/log/ovs-vswitchd.log
+
+# VSWITCHD_FILE_LOGLEVEL: Log level at which to log into the
+# VSWITCHD_LOGFILE file. If this is null or not set the logfile will
+# not be created and nothing will be sent to it. This is the
+# default. The available options are: EMER, WARN, INFO and DBG.
+# VSWITCHD_FILE_LOGLEVEL=""
+
+# VSWITCHD_SYSLOG_LOGLEVEL: Log level at which to log into syslog. If
+# this is null or not set the default is to log to syslog
+# emergency and warning level messages only.
+# VSWITCHD_SYSLOG_LOGLEVEL="WARN"
+
+# BRCOMPATD_PIDFILE: File in which to store the pid of the running
+# ovs-brcompatd (the Linux bridge compatibility daemon for ovs-vswitchd).
+# If this is the empty string, ovs-brcompatd will not be started and
+# the brcompat_mod kernel module will not be inserted. Note that
+# the default is to use brcompat!
+# BRCOMPATD_PIDFILE=/var/run/ovs-brcompatd.pid
+
+# BRCOMPATD_PRIORITY: "nice" priority at which to run ovs-brcompatd and related
+# processes.
+# BRCOMPATD_PRIORITY=-5
+
+# BRCOMPATD_LOGFILE: File to send the FILE_LOGLEVEL log messages to.
+# BRCOMPATD_LOGFILE=/var/log/ovs-brcompatd.log
+
+# BRCOMPATD_FILE_LOGLEVEL: Log level at which to log into the
+# BRCOMPATD_LOGFILE file. If this is null or not set the logfile will
+# not be created and nothing will be sent to it. This is the
+# default. The available options are: EMER, WARN, INFO and DBG.
+# BRCOMPATD_FILE_LOGLEVEL=""
+
+# BRCOMPATD_SYSLOG_LOGLEVEL: Log level at which to log into syslog. If
+# this is null or not set the default is to log to syslog
+# emergency and warning level messages only.
+# BRCOMPATD_SYSLOG_LOGLEVEL="WARN"
diff --git a/xenserver/etc_xapi.d_plugins_vswitch-cfg-update b/xenserver/etc_xapi.d_plugins_vswitch-cfg-update
new file mode 100755
index 000000000..b21cf46d2
--- /dev/null
+++ b/xenserver/etc_xapi.d_plugins_vswitch-cfg-update
@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+#
+# xapi plugin script to update the cache of configuration items in the
+# ovs-vswitchd configuration file that are managed in the xapi database
+# when integrated with Citrix management tools.
+
+# Copyright (C) 2009 Nicira Networks, Inc.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+# TBD: - error handling needs to be improved. Currently this can leave
+# TBD: the system in a bad state if anything goes wrong.
+
+import logging
+log = logging.getLogger("vswitch-cfg-update")
+logging.basicConfig(filename="/var/log/vswitch-cfg-update.log", level=logging.DEBUG)
+
+import XenAPIPlugin
+import XenAPI
+import subprocess
+
+cfg_mod="/root/vswitch/bin/ovs-cfg-mod"
+vswitchd_cfg_filename="/etc/ovs-vswitchd.conf"
+
+def update(session, args):
+    """xapi plugin entry point: sync the vswitch controller with the pool.
+
+    Reads the "vSwitchController" key from the other_config of the
+    single pool visible through `session` (treated as "" when absent)
+    and compares it with the controller currently configured for
+    ovs-vswitchd.  The controller configuration is then removed, set or
+    left alone, and a short status string is returned.  `args` is not
+    used.
+
+    Raises XenAPIPlugin.Failure when the host sees no pool or more than
+    one pool.
+    """
+    pools = session.xenapi.pool.get_all()
+    # We assume there is only ever one pool...
+    if len(pools) == 0:
+        log.error("No pool for host.")
+        raise XenAPIPlugin.Failure("NO_POOL_FOR_HOST", [])
+    if len(pools) > 1:
+        log.error("More than one pool for host.")
+        raise XenAPIPlugin.Failure("MORE_THAN_ONE_POOL_FOR_HOST", [])
+    pool = session.xenapi.pool.get_record(pools[0])
+    try:
+        controller = pool["other_config"]["vSwitchController"]
+    except KeyError:
+        controller = ""
+    currentController = vswitchCurrentController()
+    if controller == "" and currentController != "":
+        log.debug("Removing controller configuration.")
+        removeControllerCfg()
+        return "Successfully removed controller config"
+    elif controller != currentController:
+        # `controller` is never empty on this branch, so what separates
+        # a first-time "set" from a "change" is whether a controller was
+        # configured before.
+        if len(currentController) == 0:
+            log.debug("Setting controller to: %s" % (controller))
+        else:
+            log.debug("Changing controller from %s to %s" % (currentController, controller))
+        setControllerCfg(controller)
+        return "Successfully set controller to " + controller
+    else:
+        log.debug("No change to controller configuration required.")
+        return "No change to configuration"
+
+def vswitchCurrentController():
+    """Return the configured mgmt.controller value without its "ssl:" prefix.
+
+    Returns "" when no controller is configured.  A value that does not
+    start with "ssl:" is returned unchanged after logging a warning.
+    """
+    configured = vswitchCfgQuery("mgmt.controller")
+    if configured == "":
+        return configured
+    if configured.startswith("ssl:"):
+        return configured[4:]
+    log.warning("Controller does not specify ssl connection type, returning entire string.")
+    return configured
+
+def removeControllerCfg():
+    """Delete the controller target and all SSL keys from the vswitchd config."""
+    managed_keys = ["mgmt.controller",
+                    "ssl.bootstrap-ca-cert",
+                    "ssl.ca-cert",
+                    "ssl.private-key",
+                    "ssl.certificate"]
+    deletions = []
+    for key in managed_keys:
+        deletions += ["--del-match", key + "=*"]
+    vswitchCfgMod(deletions)
+
+def setControllerCfg(controller):
+    """Point vswitchd at `controller` over SSL.
+
+    Issues a single ovs-cfg-mod call that first deletes any existing
+    controller and SSL keys and then adds the new values.
+    """
+    settings = [("mgmt.controller", "ssl:" + controller),
+                ("ssl.bootstrap-ca-cert", "true"),
+                ("ssl.ca-cert", "/etc/ovs-vswitchd.cacert"),
+                ("ssl.private-key", "/etc/xensource/xapi-ssl.pem"),
+                ("ssl.certificate", "/etc/xensource/xapi-ssl.pem")]
+    deletions = []
+    additions = []
+    for key, value in settings:
+        deletions += ["--del-match", key + "=*"]
+        additions += ["-a", key + "=" + value]
+    # All deletions come before all additions on the command line.
+    vswitchCfgMod(deletions + additions)
+
+def vswitchCfgQuery(key):
+    """Return the stripped stdout of "ovs-cfg-mod -q key" ("" if none)."""
+    argv = [cfg_mod, "--config-file=" + vswitchd_cfg_filename, "-q", key]
+    # communicate() returns a (stdout, stderr) pair; only stdout is piped.
+    stdout = subprocess.Popen(argv, stdout=subprocess.PIPE).communicate()[0]
+    if stdout == None:
+        return ""
+    return stdout.strip()
+
+def vswitchCfgMod(action_args):
+    """Run ovs-cfg-mod with `action_args`, then reload the vswitch.
+
+    Raises XenAPIPlugin.Failure("VSWITCH_CONFIG_MOD_FAILURE", ...) when
+    ovs-cfg-mod exits with a non-zero status; no reload happens then.
+    """
+    fixed_args = [cfg_mod, "-vANY:console:emer",
+                  "--config-file=" + vswitchd_cfg_filename]
+    status = subprocess.call(fixed_args + action_args)
+    if status != 0:
+        log.error("ovs-cfg-mod failed with exit code "
+                  + str(status) + " for " + repr(action_args))
+        raise XenAPIPlugin.Failure("VSWITCH_CONFIG_MOD_FAILURE",
+                                   [ str(status) , str(action_args) ])
+    vswitchReload()
+
+def vswitchReload():
+    """Run "/sbin/service vswitch reload".
+
+    Raises XenAPIPlugin.Failure("VSWITCH_CFG_RELOAD_FAILURE", ...) when
+    the command exits with a non-zero status.
+    """
+    status = subprocess.call(["/sbin/service", "vswitch", "reload"])
+    if status == 0:
+        return
+    log.error("vswitch reload failed with exit code " + str(status))
+    raise XenAPIPlugin.Failure("VSWITCH_CFG_RELOAD_FAILURE", [ str(status) ])
+
+
+# Register update() as the plugin function named "update".
+if __name__ == "__main__":
+ XenAPIPlugin.dispatch({"update": update})
diff --git a/xenserver/etc_xensource_scripts_vif b/xenserver/etc_xensource_scripts_vif
new file mode 100755
index 000000000..fcf13a690
--- /dev/null
+++ b/xenserver/etc_xensource_scripts_vif
@@ -0,0 +1,130 @@
+#!/bin/sh
+
+# This file is based on /etc/xensource/script/vif from Citrix XenServer 5.0.0.
+# The original file did not contain a copyright notice or license statement.
+#
+# Copyright (C) 2009 Nicira Networks, Inc.
+
+# CA-23900: Warning: when VIFs are added to windows guests with PV drivers the backend vif device is registered,
+# unregistered and then registered again. This causes the udev event to fire twice and this script runs twice.
+# Since the first invocation of the script races with the device unregistration, spurious errors are possible
+# which will be logged but are safe to ignore since the second script invocation should complete the operation.
+# Note that each script invocation is run synchronously from udev and so the scripts don't race with each other.
+
+# Keep other-config/ keys in sync with device.ml:vif_udev_keys
+
+# Helper programs.
+cfg_mod="/root/vswitch/bin/ovs-cfg-mod"
+service="/sbin/service"
+
+# XENBUS_PATH is split on "/": field 2 is the device type, field 3 the
+# domain ID and field 4 the device ID.
+TYPE=$(echo ${XENBUS_PATH} | cut -f 2 -d '/')
+DOMID=$(echo ${XENBUS_PATH} | cut -f 3 -d '/')
+DEVID=$(echo ${XENBUS_PATH} | cut -f 4 -d '/')
+
+# xenstore locations for this device.  XAPI and HOTPLUG name the same
+# path; XAPI and BRCTL are not referenced elsewhere in this script.
+XAPI=/xapi/${DOMID}/hotplug/${TYPE}/${DEVID}
+HOTPLUG=/xapi/${DOMID}/hotplug/${TYPE}/${DEVID}
+PRIVATE=/xapi/${DOMID}/private/${TYPE}/${DEVID}
+BRCTL=/usr/sbin/brctl
+IP=/sbin/ip
+
+
+# Apply the other-config/promiscuous setting of ${vif} from xenstore:
+# "true" or "on" writes 1 to the port's brport/promisc file, any other
+# non-empty value writes 0.  Does nothing when the key cannot be read
+# or is empty.
+handle_promiscuous()
+{
+    local arg
+    # Assign separately from "local": after "local var=$(cmd)" $? holds
+    # the status of "local", not of cmd, so a failed read went unnoticed.
+    arg=$(xenstore-read "${PRIVATE}/other-config/promiscuous") || return 0
+    [ -n "${arg}" ] || return 0
+    case "${arg}" in
+        true|on) echo 1 > /sys/class/net/${vif}/brport/promisc ;;
+        *) echo 0 > /sys/class/net/${vif}/brport/promisc ;;
+    esac
+}
+
+# Apply the other-config/ethtool-<opt> setting of ${vif} from xenstore.
+# $1 is the ethtool -K option name; "true"/"on" and "false"/"off"
+# switch it on or off, any other non-empty value is logged.  Does
+# nothing when the key cannot be read or is empty.
+handle_ethtool()
+{
+    local opt=$1
+    local arg
+    # Assign separately from "local": after "local var=$(cmd)" $? holds
+    # the status of "local", not of cmd, so a failed read went unnoticed.
+    arg=$(xenstore-read "${PRIVATE}/other-config/ethtool-${opt}") || return 0
+    [ -n "${arg}" ] || return 0
+    case "${arg}" in
+        true|on) /sbin/ethtool -K "${vif}" "${opt}" on ;;
+        false|off) /sbin/ethtool -K "${vif}" "${opt}" off ;;
+        *) logger -t scripts-vif "Unknown ethtool argument ${opt}=${arg} on ${vif}/${VIFUUID}" ;;
+    esac
+}
+
+# Set the MTU of ${vif} from the MTU key in xenstore.  Does nothing
+# when the key cannot be read or is empty.
+handle_mtu()
+{
+    local mtu
+    # Assign separately from "local": after "local var=$(cmd)" $? holds
+    # the status of "local", not of cmd, so a failed read went unnoticed.
+    mtu=$(xenstore-read "${PRIVATE}/MTU") || return 0
+    [ -n "${mtu}" ] || return 0
+    echo "${mtu}" > /sys/class/net/${vif}/mtu
+}
+
+# Attach ${vif} to the bridge named in xenstore.
+#
+# Reads the bridge MAC address and bridge name from xenstore, resets
+# the link state of ${vif}, rewrites its entries in
+# /etc/ovs-vswitchd.conf, reloads the vswitch and brings the link up.
+# If /etc/openvswitch/br-<bridge> exists and sets both VLAN_SLAVE and
+# VLAN_VID, the port is added to VLAN_SLAVE with that VLAN tag instead.
+# Failures are logged through logger(1) and processing continues.
+add_to_bridge()
+{
+    local address bridge vid
+
+    # Test the status of xenstore-read itself; after "local var=$(cmd)"
+    # $? only reflects "local".
+    if ! address=$(xenstore-read "${PRIVATE}/bridge-MAC") || [ -z "${address}" ]; then
+        logger -t scripts-vif "Failed to read ${PRIVATE}/bridge-MAC from xenstore"
+    fi
+    if ! bridge=$(xenstore-read "${PRIVATE}/bridge") || [ -z "${bridge}" ]; then
+        logger -t scripts-vif "Failed to read ${PRIVATE}/bridge from xenstore"
+    fi
+    # NOTE(review): a failed read is only logged; the steps below still
+    # run with an empty address or bridge -- confirm this is intended.
+    logger -t scripts-vif "Adding ${vif} to ${bridge} with address ${address}"
+
+    vid=
+    if [ -e "/etc/openvswitch/br-$bridge" ]; then
+        . "/etc/openvswitch/br-$bridge"
+        if [ -n "$VLAN_SLAVE" ] && [ -n "$VLAN_VID" ]; then
+            bridge=$VLAN_SLAVE
+            vid="--add=vlan.$vif.tag=$VLAN_VID"
+        fi
+    fi
+
+    ${IP} link set "${vif}" down || logger -t scripts-vif "Failed to ip link set ${vif} down"
+    ${IP} link set "${vif}" arp off || logger -t scripts-vif "Failed to ip link set ${vif} arp off"
+    ${IP} link set "${vif}" multicast off || logger -t scripts-vif "Failed to ip link set ${vif} multicast off"
+    ${IP} link set "${vif}" address "${address}" || logger -t scripts-vif "Failed to ip link set ${vif} address ${address}"
+    ${IP} addr flush "${vif}" || logger -t scripts-vif "Failed to ip addr flush ${vif}"
+
+    # $vid is deliberately unquoted so that it vanishes when empty.
+    $cfg_mod -F /etc/ovs-vswitchd.conf \
+        --del-match="bridge.*.port=$vif" \
+        --del-match="vlan.$vif.[!0-9]*" \
+        --del-match="port.$vif.[!0-9]*" \
+        --add="bridge.$bridge.port=$vif" \
+        $vid -c
+    $service vswitch reload
+
+    ${IP} link set "${vif}" up || logger -t scripts-vif "Failed to ip link set ${vif} up"
+}
+
+# Record the invocation in syslog, then act on the event in $1.
+echo Called as "$@" "$TYPE" "$DOMID" "$DEVID" | logger -t scripts-vif
+case "$1" in
+online)
+ # NOTE(review): ${vif} is not assigned anywhere in this script before
+ # this branch; presumably it arrives in the environment from the
+ # caller -- confirm.
+ handle_ethtool rx
+ handle_ethtool tx
+ handle_ethtool sg
+ handle_ethtool tso
+ handle_ethtool ufo
+ handle_ethtool gso
+
+ handle_mtu
+ add_to_bridge
+ handle_promiscuous
+
+ # Publish the result of the hotplug operation in xenstore.
+ xenstore-write "${HOTPLUG}/vif" "${vif}"
+ xenstore-write "${HOTPLUG}/hotplug" "online"
+
+ # xs-xen.pq.hq:91e986b8e49f netback-wait-for-hotplug
+ xenstore-write "/local/domain/0/backend/vif/${DOMID}/${DEVID}/hotplug-status" "connected"
+
+ ;;
+remove)
+ xenstore-rm "${HOTPLUG}/hotplug"
+ # Here the interface name is derived from the domain and device IDs.
+ vif=vif${DOMID}.${DEVID}
+ logger -t scripts-vif "${vif} has been removed"
+ # Drop every bridge, vlan and port entry of this vif from the config.
+ $cfg_mod -vANY:console:emer -F /etc/ovs-vswitchd.conf \
+ --del-match="bridge.*.port=${vif}" \
+ --del-match="vlan.${vif}.[!0-9]*" \
+ --del-match="port.${vif}.[!0-9]*" -c
+ ;;
+esac
diff --git a/xenserver/opt_xensource_libexec_interface-reconfigure b/xenserver/opt_xensource_libexec_interface-reconfigure
new file mode 100755
index 000000000..6ea369ffb
--- /dev/null
+++ b/xenserver/opt_xensource_libexec_interface-reconfigure
@@ -0,0 +1,1572 @@
+#!/usr/bin/python
+#
+# Copyright (c) Citrix Systems 2008. All rights reserved.
+# Copyright (c) 2009 Nicira Networks.
+#
+"""Usage:
+
+ %(command-name)s --session <SESSION-REF> --pif <PIF-REF> [up|down|rewrite]
+ %(command-name)s --force <BRIDGE> [up|down|rewrite <CONFIG>]
+ %(command-name)s --force all down
+
+ where,
+ <CONFIG> = --device=<INTERFACE> --mode=dhcp
+ <CONFIG> = --device=<INTERFACE> --mode=static --ip=<IPADDR> --netmask=<NM> [--gateway=<GW>]
+
+ Options:
+ --session A session reference to use to access the xapi DB
+ --pif A PIF reference.
+ --force-interface An interface name. Mutually exclusive with --session/--pif.
+
+ Either both --session and --pif or just --pif-uuid.
+
+ <ACTION> is either "up" or "down" or "rewrite"
+"""
+
+#
+# Undocumented parameters for test & dev:
+#
+# --output-directory=<DIR> Write configuration to <DIR>. Also disables actually
+# raising/lowering the interfaces
+# --pif-uuid A PIF UUID, use instead of --session/--pif.
+#
+#
+#
+# Notes:
+# 1. Every pif belongs to exactly one network
+# 2. Every network has zero or one pifs
+# 3. A network may have an associated bridge, allowing vifs to be attached
+# 4. A network may be bridgeless (there's no point having a bridge over a storage pif)
+
+# XXX: --force-interface=all down
+
+# XXX: --force-interface rewrite
+
+# XXX: Sometimes this leaves "orphaned" datapaths, e.g. a datapath whose
+# only port is the local port. Should delete those.
+
+# XXX: This can leave crud in ovs-vswitchd.conf in this scenario:
+# - Create bond in XenCenter.
+# - Create VLAN on bond in XenCenter.
+# - Attempt to delete bond in XenCenter (this will fail because there
+# is a VLAN on the bond, although the error may not be reported
+# until the next step)
+# - Delete VLAN in XenCenter.
+# - Delete bond in XenCenter.
+# At this point there will still be some configuration data for the bond
+# or the VLAN in ovs-vswitchd.conf.
+
+import XenAPI
+import os, sys, getopt, time, signal
+import syslog
+import traceback
+import time
+import re
+import pickle
+
+# Directory that configuration files are written to in debug mode;
+# None means normal operation (see debug_mode()).
+output_directory = None
+
+# Database object queried by the helper functions below
+# (db.get_pif_record() and friends); None until it is set up.
+db = None
+management_pif = None
+
+dbcache_file = "/etc/vswitch.dbcache"
+vswitch_config_dir = "/etc/openvswitch"
+
+class Usage(Exception):
+ """Exception carrying a message in its `msg` attribute.
+
+ The message is not passed to Exception.__init__, so it is only
+ available through `msg`.
+ """
+ def __init__(self, msg):
+ Exception.__init__(self)
+ self.msg = msg
+
+class Error(Exception):
+ """Exception carrying a message in its `msg` attribute.
+
+ The message is not passed to Exception.__init__, so it is only
+ available through `msg`.
+ """
+ def __init__(self, msg):
+ Exception.__init__(self)
+ self.msg = msg
+
+class ConfigurationFile(object):
+ """Write a file, tracking old and new versions.
+
+ Supports writing a new version of a file and applying and
+ reverting those changes.
+
+ New content is written to ".<fname>.xapi-new" next to the target
+ file.  apply() moves the current file to ".<fname>.xapi-old" and
+ installs the new one; revert() undoes that; commit() deletes the
+ backup.  The object moves through these states:
+
+ OPEN --close()/unlink()--> NOT-APPLIED --apply()--> APPLIED
+ APPLIED --revert()--> REVERTED, or APPLIED --commit()--> COMMITTED
+
+ Calling a method in the wrong state raises Error.
+ """
+
+ __STATE = {"OPEN":"OPEN",
+ "NOT-APPLIED":"NOT-APPLIED", "APPLIED":"APPLIED",
+ "REVERTED":"REVERTED", "COMMITTED": "COMMITTED"}
+
+ def __init__(self, fname, path="/etc/sysconfig/network-scripts"):
+ """Open ".<fname>.xapi-new" for writing in `path`.
+
+ In debug mode the global output_directory is used instead of `path`.
+ """
+
+ self.__state = self.__STATE['OPEN']
+ self.__fname = fname
+ self.__children = []
+
+ if debug_mode():
+ dirname = output_directory
+ else:
+ dirname = path
+
+ self.__path = os.path.join(dirname, fname)
+ self.__oldpath = os.path.join(dirname, "." + fname + ".xapi-old")
+ self.__newpath = os.path.join(dirname, "." + fname + ".xapi-new")
+ # Set by unlink(): apply() then removes the file instead of replacing it.
+ self.__unlink = False
+
+ self.__f = open(self.__newpath, "w")
+
+ def attach_child(self, child):
+ """Register `child`; apply(), revert() and commit() are forwarded to it first."""
+ self.__children.append(child)
+
+ def path(self):
+ """Return the path of the target file."""
+ return self.__path
+
+ def readlines(self):
+ """Return the lines of the current target file, or "" if it cannot be read."""
+ try:
+ return open(self.path()).readlines()
+ except:
+ return ""
+
+ def write(self, args):
+ """Append `args` to the new version of the file (state OPEN only)."""
+ if self.__state != self.__STATE['OPEN']:
+ raise Error("Attempt to write to file in state %s" % self.__state)
+ self.__f.write(args)
+
+ def unlink(self):
+ """Finish writing and mark the target file for removal on apply()."""
+ if self.__state != self.__STATE['OPEN']:
+ raise Error("Attempt to unlink file in state %s" % self.__state)
+ self.__unlink = True
+ self.__f.close()
+ self.__state = self.__STATE['NOT-APPLIED']
+
+ def close(self):
+ """Finish writing the new version; the file is then ready for apply()."""
+ if self.__state != self.__STATE['OPEN']:
+ raise Error("Attempt to close file in state %s" % self.__state)
+
+ self.__f.close()
+ self.__state = self.__STATE['NOT-APPLIED']
+
+ def changed(self):
+ """Always returns True; no comparison with the old file is made."""
+ if self.__state != self.__STATE['NOT-APPLIED']:
+ raise Error("Attempt to compare file in state %s" % self.__state)
+
+ return True
+
+ def apply(self):
+ """Install the new version, keeping the previous one as ".xapi-old"."""
+ if self.__state != self.__STATE['NOT-APPLIED']:
+ raise Error("Attempt to apply configuration from state %s" % self.__state)
+
+ # Children are applied before this file.
+ for child in self.__children:
+ child.apply()
+
+ log("Applying changes to %s configuration" % self.__fname)
+
+ # Remove previous backup.
+ if os.access(self.__oldpath, os.F_OK):
+ os.unlink(self.__oldpath)
+
+ # Save current configuration.
+ if os.access(self.__path, os.F_OK):
+ os.link(self.__path, self.__oldpath)
+ os.unlink(self.__path)
+
+ # Apply new configuration.
+ assert(os.path.exists(self.__newpath))
+ if not self.__unlink:
+ os.link(self.__newpath, self.__path)
+ else:
+ pass # implicit unlink of original file
+
+ # Remove temporary file.
+ os.unlink(self.__newpath)
+
+ self.__state = self.__STATE['APPLIED']
+
+ def revert(self):
+ """Undo apply(): restore the ".xapi-old" version as the target file."""
+ if self.__state != self.__STATE['APPLIED']:
+ raise Error("Attempt to revert configuration from state %s" % self.__state)
+
+ # Children are reverted before this file.
+ for child in self.__children:
+ child.revert()
+
+ log("Reverting changes to %s configuration" % self.__fname)
+
+ # Remove existing new configuration
+ if os.access(self.__newpath, os.F_OK):
+ os.unlink(self.__newpath)
+
+ # Revert new configuration.
+ if os.access(self.__path, os.F_OK):
+ os.link(self.__path, self.__newpath)
+ os.unlink(self.__path)
+
+ # Revert to old configuration.
+ if os.access(self.__oldpath, os.F_OK):
+ os.link(self.__oldpath, self.__path)
+ os.unlink(self.__oldpath)
+
+ # Leave .*.xapi-new as an aid to debugging.
+
+ self.__state = self.__STATE['REVERTED']
+
+ def commit(self):
+ """Make an applied change permanent by deleting the backup and temp files."""
+ if self.__state != self.__STATE['APPLIED']:
+ raise Error("Attempt to commit configuration from state %s" % self.__state)
+
+ # Children are committed before this file.
+ for child in self.__children:
+ child.commit()
+
+ log("Committing changes to %s configuration" % self.__fname)
+
+ if os.access(self.__oldpath, os.F_OK):
+ os.unlink(self.__oldpath)
+ if os.access(self.__newpath, os.F_OK):
+ os.unlink(self.__newpath)
+
+ self.__state = self.__STATE['COMMITTED']
+
+def debug_mode():
+ """Return True when an output directory has been set."""
+ return output_directory is not None
+
+def log(s):
+    """Send message `s` to syslog, or to stderr when in debug mode."""
+    if not debug_mode():
+        syslog.syslog(s)
+        return
+    print >>sys.stderr, s
+
+def check_allowed(pif):
+    """Return False when `pif` is the device listed in /proc/ardence.
+
+    The PIF is rejected only if /proc/ardence contains exactly one
+    "HWaddr:" line and that line contains the PIF's MAC address
+    (case-insensitively, delimited by whitespace).  Returns True in
+    every other case, including when /proc/ardence cannot be read.
+    """
+    pifrec = db.get_pif_record(pif)
+    try:
+        ardence = open("/proc/ardence")
+        hwaddr_lines = [line for line in ardence.readlines()
+                        if line.startswith("HWaddr:")]
+        ardence.close()
+        if len(hwaddr_lines) != 1:
+            return True
+        mac_pattern = re.compile(".*\s%(MAC)s\s.*" % pifrec, re.IGNORECASE)
+        if mac_pattern.match(hwaddr_lines[0]):
+            log("Skipping PVS device %(device)s (%(MAC)s)" % pifrec)
+            return False
+    except IOError:
+        pass
+    return True
+
+def interface_exists(i):
+ """Return True if /sys/class/net/<i> exists."""
+ return os.path.exists("/sys/class/net/" + i)
+
+class DatabaseCache(object):
+    """Snapshot of the xapi VLAN, Bond, PIF and network records.
+
+    The records are either fetched from xapi (reusing `session_ref`
+    when given, otherwise logging in locally as root) or unpickled from
+    `cache_file`; passing both raises Error.  When loaded from a cache
+    file, the second pickled object is exposed as `self.extras`.
+    """
+
+    def __init__(self, session_ref=None, cache_file=None):
+        if session_ref and cache_file:
+            raise Error("can't specify session reference and cache file")
+
+        if cache_file == None:
+            session = XenAPI.xapi_local()
+
+            if not session_ref:
+                log("No session ref given on command line, logging in.")
+                session.xenapi.login_with_password("root", "")
+            else:
+                session._session = session_ref
+
+            try:
+                self.__vlans = session.xenapi.VLAN.get_all_records()
+                self.__bonds = session.xenapi.Bond.get_all_records()
+                self.__pifs = session.xenapi.PIF.get_all_records()
+                self.__networks = session.xenapi.network.get_all_records()
+            finally:
+                # Only log out of sessions that were opened here.
+                if not session_ref:
+                    session.xenapi.session.logout()
+        else:
+            log("Loading xapi database cache from %s" % cache_file)
+            # NOTE: unpickling can execute arbitrary code, so the cache
+            # file must only ever be writable by trusted users.
+            f = open(cache_file, 'r')
+            try:
+                members = pickle.load(f)
+                self.extras = pickle.load(f)
+            finally:
+                f.close()
+
+            self.__vlans = members['vlans']
+            self.__bonds = members['bonds']
+            self.__pifs = members['pifs']
+            self.__networks = members['networks']
+
+    def save(self, cache_file, extras):
+        """Pickle the four record tables, followed by `extras`, to cache_file."""
+        f = open(cache_file, 'w')
+        try:
+            pickle.dump({'vlans': self.__vlans,
+                         'bonds': self.__bonds,
+                         'pifs': self.__pifs,
+                         'networks': self.__networks}, f)
+            pickle.dump(extras, f)
+        finally:
+            f.close()
+
+    def get_pif_by_uuid(self, uuid):
+        """Return the reference of the PIF with this uuid.
+
+        Raises Error when no PIF, or more than one PIF, matches.
+        """
+        pifs = [ref for (ref, rec) in self.__pifs.items()
+                if uuid == rec['uuid']]
+        if len(pifs) == 0:
+            raise Error("Unknown PIF \"%s\"" % uuid)
+        elif len(pifs) > 1:
+            raise Error("Non-unique PIF \"%s\"" % uuid)
+
+        return pifs[0]
+
+    def get_pifs_by_record(self, record):
+        """record is partial pif record.
+        Get the pif(s) whose record matches.
+        """
+        def match(pifrec):
+            for key in record:
+                if record[key] != pifrec[key]:
+                    return False
+            return True
+
+        return [ref for (ref, rec) in self.__pifs.items() if match(rec)]
+
+    def get_pif_by_record(self, record):
+        """record is partial pif record.
+        Get the pif whose record matches.
+
+        Raises Error when no PIF, or more than one PIF, matches.
+        """
+        pifs = self.get_pifs_by_record(record)
+        if len(pifs) == 0:
+            raise Error("No matching PIF \"%s\"" % str(record))
+        elif len(pifs) > 1:
+            raise Error("Multiple matching PIFs \"%s\"" % str(record))
+
+        return pifs[0]
+
+    def get_pif_by_bridge(self, host, bridge):
+        """Return the single PIF on `host` attached to a network whose
+        bridge is `bridge`.
+
+        Raises Error when no network uses that bridge, or when the host
+        has no PIF or more than one PIF on such networks.
+        """
+        networks = [ref for (ref, rec) in self.__networks.items()
+                    if rec['bridge'] == bridge]
+        if len(networks) == 0:
+            # The bridge name was previously never interpolated here.
+            raise Error("No matching network \"%s\"" % bridge)
+
+        answer = None
+        for network in networks:
+            nwrec = self.get_network_record(network)
+            for pif in nwrec['PIFs']:
+                pifrec = self.get_pif_record(pif)
+                if pifrec['host'] != host:
+                    continue
+                if answer:
+                    raise Error("Multiple PIFs on %s for network %s" % (host, bridge))
+                answer = pif
+        if not answer:
+            raise Error("No PIF on %s for network %s" % (host, bridge))
+        return answer
+
+    def get_pif_record(self, pif):
+        """Return the record of `pif`; raises Error for an unknown PIF."""
+        if pif in self.__pifs:
+            return self.__pifs[pif]
+        raise Error("Unknown PIF \"%s\"" % pif)
+
+    def get_all_pifs(self):
+        """Return the mapping of PIF reference to PIF record."""
+        return self.__pifs
+
+    def pif_exists(self, pif):
+        """Return True if `pif` is a known PIF reference."""
+        return pif in self.__pifs
+
+    def get_management_pif(self, host):
+        """ Returns the management pif on host, or None if there is none
+        """
+        for pif in self.get_all_pifs():
+            pifrec = self.get_pif_record(pif)
+            if pifrec['management'] and pifrec['host'] == host:
+                return pif
+        return None
+
+    def get_network_record(self, network):
+        """Return the record of `network`; raises Error if it is unknown."""
+        if network in self.__networks:
+            return self.__networks[network]
+        raise Error("Unknown network \"%s\"" % network)
+
+    def get_all_networks(self):
+        """Return the mapping of network reference to network record."""
+        return self.__networks
+
+    def get_bond_record(self, bond):
+        """Return the record of `bond`, or None if it is unknown."""
+        if bond in self.__bonds:
+            return self.__bonds[bond]
+        else:
+            return None
+
+    def get_vlan_record(self, vlan):
+        """Return the record of `vlan`, or None if it is unknown."""
+        if vlan in self.__vlans:
+            return self.__vlans[vlan]
+        else:
+            return None
+
+def bridge_name(pif):
+    """Return the bridge name of pif's network, or None if it has none."""
+    network = db.get_pif_record(pif)['network']
+    bridge = db.get_network_record(network)['bridge']
+    # TODO: sanity check that nwrec['bridgeless'] != 'true' when a
+    # bridge is set, and == 'true' when it is not.
+    return bridge or None
+
+def interface_name(pif):
+    """Return the interface name for pif.
+
+    This is the device name, with ".<VLAN>" appended when the PIF's VLAN
+    field is not '-1'.
+    """
+    pifrec = db.get_pif_record(pif)
+    if pifrec['VLAN'] != '-1':
+        return "%(device)s.%(VLAN)s" % pifrec
+    return pifrec['device']
+
+def datapath_name(pif):
+    """Return the OpenFlow datapath name associated with pif.
+
+    For a non-VLAN PIF this is the PIF's own bridge name.  For a VLAN
+    PIF it is the bridge name of the PIF's VLAN slave.  (xapi will
+    create a datapath named with the bridge name even though we won't
+    use it.)
+    """
+    is_vlan = db.get_pif_record(pif)['VLAN'] != '-1'
+    if is_vlan:
+        return bridge_name(get_vlan_slave_of_pif(pif))
+    return bridge_name(pif)
+
+def ipdev_name(pif):
+    """Return the name of the network device that carries the
+    IP configuration (if any) associated with pif.
+    The ipdev name is the same as the bridge name.
+    """
+    # bridge_name() looks the PIF record up itself (and raises Error for
+    # an unknown PIF), so no separate lookup is needed here.
+    return bridge_name(pif)
+
+def physdev_names(pif):
+    """Return the list of physical network device names behind pif.
+
+    A VLAN PIF resolves to the physical devices of its VLAN slave, a
+    bond master PIF to the physical devices of all its bond slaves, and
+    any other PIF to its own device.
+    """
+    pifrec = db.get_pif_record(pif)
+
+    if pifrec['VLAN'] != '-1':
+        return physdev_names(get_vlan_slave_of_pif(pif))
+    if len(pifrec['bond_master_of']) == 0:
+        return [pifrec['device']]
+
+    devices = []
+    for slave in get_bond_slaves_of_pif(pif):
+        devices.extend(physdev_names(slave))
+    return devices
+
+def log_pif_action(action, pif):
+    """Log that `action` is being applied to pif.
+
+    The message names the PIF's uuid, interface name and IP
+    configuration mode; the action "rewrite" gets its own wording.
+    """
+    # Work on a copy: the record returned by the database is the cached
+    # object itself, and the formatting keys added below must not end
+    # up in the cache.
+    pifrec = dict(db.get_pif_record(pif))
+    pifrec['action'] = action
+    pifrec['interface-name'] = interface_name(pif)
+    if action == "rewrite":
+        pifrec['message'] = "Rewrite PIF %(uuid)s configuration" % pifrec
+    else:
+        pifrec['message'] = "Bring %(action)s PIF %(uuid)s" % pifrec
+    log("%(message)s: %(interface-name)s configured as %(ip_configuration_mode)s" % pifrec)
+
+def get_bond_masters_of_pif(pif):
+    """Returns a list of PIFs which are bond masters of this PIF"""
+
+    bso = db.get_pif_record(pif)['bond_slave_of']
+
+    # bond-slave-of is currently a single reference but in principle a
+    # PIF could be a member of several bonds which are not
+    # concurrently attached. Be robust to this possibility.
+    if not bso or bso == "OpaqueRef:NULL":
+        bond_refs = []
+    elif type(bso) != list:
+        bond_refs = [bso]
+    else:
+        bond_refs = bso
+
+    # Bonds without a record in the database are skipped.
+    masters = []
+    for bond in bond_refs:
+        bondrec = db.get_bond_record(bond)
+        if bondrec:
+            masters.append(bondrec['master'])
+    return masters
+
+def get_bond_slaves_of_pif(pif):
+    """Returns a list of PIFs which make up the given bonded pif.
+
+    Returns [] when pif is not a bond master.  Raises Error when the
+    PIF is master of more than one bond or the bond has no record.
+    """
+
+    pifrec = db.get_pif_record(pif)
+
+    bmo = pifrec['bond_master_of']
+    if len(bmo) > 1:
+        raise Error("Bond-master-of contains too many elements")
+
+    if len(bmo) == 0:
+        return []
+
+    bondrec = db.get_bond_record(bmo[0])
+    if not bondrec:
+        raise Error("No bond record for bond master PIF")
+
+    return bondrec['slaves']
+
+def get_vlan_slave_of_pif(pif):
+    """Find the PIF which is the VLAN slave of pif.
+
+    Returns the tagged PIF underneath the VLAN PIF `pif`.  Raises Error
+    when pif is not a VLAN master or its VLAN has no record.
+    """
+
+    vlan = db.get_pif_record(pif)['VLAN_master_of']
+    if vlan == "OpaqueRef:NULL" or not vlan:
+        raise Error("PIF is not a VLAN master")
+
+    vlanrec = db.get_vlan_record(vlan)
+    if vlanrec:
+        return vlanrec['tagged_PIF']
+    raise Error("No VLAN record found for PIF")
+
+def get_vlan_masters_of_pif(pif):
+ """Returns a list of PIFs which are VLANs on top of the given pif."""
+
+ pifrec = db.get_pif_record(pif)
+ vlans = [db.get_vlan_record(v) for v in pifrec['VLAN_slave_of']]
+ return [v['untagged_PIF'] for v in vlans if v and db.pif_exists(v['untagged_PIF'])]
+
+def interface_deconfigure_commands(interface):
+ # The use of [!0-9] keeps an interface of 'eth0' from matching
+ # VLANs attached to eth0 (such as 'eth0.123'), which are distinct
+ # interfaces.
+ return ['--del-match=bridge.*.port=%s' % interface,
+ '--del-match=bonding.%s.[!0-9]*' % interface,
+ '--del-match=bonding.*.slave=%s' % interface,
+ '--del-match=vlan.%s.[!0-9]*' % interface,
+ '--del-match=port.%s.[!0-9]*' % interface,
+ '--del-match=iface.%s.[!0-9]*' % interface]
+
+def run_command(command):
+ log("Running command: " + ' '.join(command))
+ if os.spawnl(os.P_WAIT, command[0], *command) != 0:
+ log("Command failed: " + ' '.join(command))
+ return False
+ return True
+
+def down_netdev(interface, deconfigure=True):
+ if not interface_exists(interface):
+ log("down_netdev: interface %s does not exist, ignoring" % interface)
+ return
+ argv = ["/sbin/ifconfig", interface, 'down']
+ if deconfigure:
+ argv += ['0.0.0.0']
+
+ # Kill dhclient.
+ pidfile_name = '/var/run/dhclient-%s.pid' % interface
+ pidfile = None
+ try:
+ pidfile = open(pidfile_name, 'r')
+ os.kill(int(pidfile.readline()), signal.SIGTERM)
+ except:
+ pass
+ if pidfile != None:
+ pidfile.close()
+
+ # Remove dhclient pidfile.
+ try:
+ os.remove(pidfile_name)
+ except:
+ pass
+ run_command(argv)
+
+def up_netdev(interface):
+ run_command(["/sbin/ifconfig", interface, 'up'])
+
+def find_distinguished_pifs(pif):
+ """Returns the PIFs on host that own DNS and the default route.
+The peerdns pif will be the one with pif::other-config:peerdns=true, or the mgmt pif if none have this set.
+The gateway pif will be the one with pif::other-config:defaultroute=true, or the mgmt pif if none have this set.
+
+Note: we prune out the bond master pif (if it exists).
+This is because when we are called to bring up an interface with a bond master, it is implicit that
+we should bring down that master."""
+
+ pifrec = db.get_pif_record(pif)
+ host = pifrec['host']
+
+ pifs_on_host = [ __pif for __pif in db.get_all_pifs() if
+ db.get_pif_record(__pif)['host'] == host and
+ (not __pif in get_bond_masters_of_pif(pif)) ]
+
+ peerdns_pif = None
+ defaultroute_pif = None
+
+ # loop through all the pifs on this host looking for one with
+ # other-config:peerdns = true, and one with
+ # other-config:default-route=true
+ for __pif in pifs_on_host:
+ __pifrec = db.get_pif_record(__pif)
+ __oc = __pifrec['other_config']
+ if __oc.has_key('peerdns') and __oc['peerdns'] == 'true':
+ if peerdns_pif == None:
+ peerdns_pif = __pif
+ else:
+ log('Warning: multiple pifs with "peerdns=true" - choosing %s and ignoring %s' % \
+ (db.get_pif_record(peerdns_pif)['device'], __pifrec['device']))
+ if __oc.has_key('defaultroute') and __oc['defaultroute'] == 'true':
+ if defaultroute_pif == None:
+ defaultroute_pif = __pif
+ else:
+ log('Warning: multiple pifs with "defaultroute=true" - choosing %s and ignoring %s' % \
+ (db.get_pif_record(defaultroute_pif)['device'], __pifrec['device']))
+
+ # If no pif is explicitly specified then use the mgmt pif for peerdns/defaultroute
+ if peerdns_pif == None:
+ peerdns_pif = management_pif
+ if defaultroute_pif == None:
+ defaultroute_pif = management_pif
+
+ return peerdns_pif, defaultroute_pif
+
+def ethtool_settings(oc):
+ # Options for "ethtool -s"
+ settings = []
+ if oc.has_key('ethtool-speed'):
+ val = oc['ethtool-speed']
+ if val in ["10", "100", "1000"]:
+ settings += ['speed', val]
+ else:
+ log("Invalid value for ethtool-speed = %s. Must be 10|100|1000." % val)
+ if oc.has_key('ethtool-duplex'):
+ val = oc['ethtool-duplex']
+ if val in ["10", "100", "1000"]:
+ settings += ['duplex', 'val']
+ else:
+ log("Invalid value for ethtool-duplex = %s. Must be half|full." % val)
+ if oc.has_key('ethtool-autoneg'):
+ val = oc['ethtool-autoneg']
+ if val in ["true", "on"]:
+ settings += ['autoneg', 'on']
+ elif val in ["false", "off"]:
+ settings += ['autoneg', 'off']
+ else:
+ log("Invalid value for ethtool-autoneg = %s. Must be on|true|off|false." % val)
+
+ # Options for "ethtool -K"
+ offload = []
+ for opt in ("rx", "tx", "sg", "tso", "ufo", "gso"):
+ if oc.has_key("ethtool-" + opt):
+ val = oc["ethtool-" + opt]
+ if val in ["true", "on"]:
+ offload += [opt, 'on']
+ elif val in ["false", "off"]:
+ offload += [opt, 'off']
+ else:
+ log("Invalid value for ethtool-%s = %s. Must be on|true|off|false." % (opt, val))
+
+ return settings, offload
+
+def configure_netdev(pif):
+ pifrec = db.get_pif_record(pif)
+ datapath = datapath_name(pif)
+ ipdev = ipdev_name(pif)
+
+ host = pifrec['host']
+ nw = pifrec['network']
+ nwrec = db.get_network_record(nw)
+
+ ifconfig_argv = ['/sbin/ifconfig', ipdev, 'up']
+ gateway = ''
+ if pifrec['ip_configuration_mode'] == "DHCP":
+ pass
+ elif pifrec['ip_configuration_mode'] == "Static":
+ ifconfig_argv += [pifrec['IP']]
+ ifconfig_argv += ['netmask', pifrec['netmask']]
+ gateway = pifrec['gateway']
+ elif pifrec['ip_configuration_mode'] == "None":
+ # Nothing to do.
+ pass
+ else:
+ raise Error("Unknown IP-configuration-mode %s" % pifrec['ip_configuration_mode'])
+
+ oc = {}
+ if pifrec.has_key('other_config'):
+ oc = pifrec['other_config']
+ if oc.has_key('mtu'):
+ int(oc['mtu']) # Check that the value is an integer
+ ifconfig_argv += ['mtu', oc['mtu']]
+
+ run_command(ifconfig_argv)
+
+ (peerdns_pif, defaultroute_pif) = find_distinguished_pifs(pif)
+
+ if peerdns_pif == pif:
+ f = ConfigurationFile('resolv.conf', "/etc")
+ if oc.has_key('domain'):
+ f.write("search %s\n" % oc['domain'])
+ for dns in pifrec['DNS'].split(","):
+ f.write("nameserver %s\n" % dns)
+ f.close()
+ f.apply()
+ f.commit()
+
+ if defaultroute_pif == pif and gateway != '':
+ run_command(['/sbin/ip', 'route', 'replace', 'default',
+ 'via', gateway, 'dev', ipdev])
+
+ if oc.has_key('static-routes'):
+ for line in oc['static-routes'].split(','):
+ network, masklen, gateway = line.split('/')
+ run_command(['/sbin/ip', 'route', 'add',
+ '%s/%s' % (netmask, masklen), 'via', gateway,
+ 'dev', ipdev])
+
+ settings, offload = ethtool_settings(oc)
+ if settings:
+ run_command(['/sbin/ethtool', '-s', ipdev] + settings)
+ if offload:
+ run_command(['/sbin/ethtool', '-K', ipdev] + offload)
+
+ if pifrec['ip_configuration_mode'] == "DHCP":
+ print
+ print "Determining IP information for %s..." % ipdev,
+ argv = ['/sbin/dhclient', '-q',
+ '-lf', '/var/lib/dhclient/dhclient-%s.leases' % ipdev,
+ '-pf', '/var/run/dhclient-%s.pid' % ipdev,
+ ipdev]
+ if run_command(argv):
+ print 'done.'
+ else:
+ print 'failed.'
+
+def modify_config(commands):
+ run_command(['/root/vswitch/bin/ovs-cfg-mod', '-vANY:console:emer',
+ '-F', '/etc/ovs-vswitchd.conf']
+ + commands + ['-c'])
+ run_command(['/sbin/service', 'vswitch', 'reload'])
+
+def is_bond_pif(pif):
+ pifrec = db.get_pif_record(pif)
+ return len(pifrec['bond_master_of']) != 0
+
+def configure_bond(pif):
+ pifrec = db.get_pif_record(pif)
+ interface = interface_name(pif)
+ ipdev = ipdev_name(pif)
+ datapath = datapath_name(pif)
+ physdevs = physdev_names(pif)
+
+ argv = ['--del-match=bonding.%s.[!0-9]*' % interface]
+ argv += ["--add=bonding.%s.slave=%s" % (interface, slave)
+ for slave in physdevs]
+
+ # Bonding options.
+ bond_options = {
+ "mode": "balance-slb",
+ "miimon": "100",
+ "downdelay": "200",
+ "updelay": "31000",
+ "use_carrier": "1",
+ }
+ # override defaults with values from other-config whose keys
+ # being with "bond-"
+ oc = pifrec['other_config']
+ overrides = filter(lambda (key,val):
+ key.startswith("bond-"), oc.items())
+ overrides = map(lambda (key,val): (key[5:], val), overrides)
+ bond_options.update(overrides)
+ for (name,val) in bond_options.items():
+ argv += ["--add=bonding.%s.%s=%s" % (interface, name, val)]
+ return argv
+
+def action_up(pif):
+ pifrec = db.get_pif_record(pif)
+
+ bridge = bridge_name(pif)
+ interface = interface_name(pif)
+ ipdev = ipdev_name(pif)
+ datapath = datapath_name(pif)
+ physdevs = physdev_names(pif)
+ vlan_slave = None
+ if pifrec['VLAN'] != '-1':
+ vlan_slave = get_vlan_slave_of_pif(pif)
+ if vlan_slave and is_bond_pif(vlan_slave):
+ bond_master = vlan_slave
+ elif is_bond_pif(pif):
+ bond_master = pif
+ else:
+ bond_master = None
+ bond_masters = get_bond_masters_of_pif(pif)
+
+ # Support "rpm -e vswitch" gracefully by keeping Centos configuration
+ # files up-to-date, even though we don't use them or need them.
+ f = configure_pif(pif)
+ mode = pifrec['ip_configuration_mode']
+ if bridge:
+ log("Configuring %s using %s configuration" % (bridge, mode))
+ br = open_network_ifcfg(pif)
+ configure_network(pif, br)
+ br.close()
+ f.attach_child(br)
+ else:
+ log("Configuring %s using %s configuration" % (interface, mode))
+ configure_network(pif, f)
+ f.close()
+ for master in bond_masters:
+ master_bridge = bridge_name(master)
+ removed = unconfigure_pif(master)
+ f.attach_child(removed)
+ if master_bridge:
+ removed = open_network_ifcfg(master)
+ log("Unlinking stale file %s" % removed.path())
+ removed.unlink()
+ f.attach_child(removed)
+
+ # /etc/xensource/scripts/vif needs to know where to add VIFs.
+ if vlan_slave:
+ if not os.path.exists(vswitch_config_dir):
+ os.mkdir(vswitch_config_dir)
+ br = ConfigurationFile("br-%s" % bridge, vswitch_config_dir)
+ br.write("VLAN_SLAVE=%s\n" % datapath)
+ br.write("VLAN_VID=%s\n" % pifrec['VLAN'])
+ br.close()
+ f.attach_child(br)
+
+ # Update all configuration files (both ours and Centos's).
+ f.apply()
+ f.commit()
+
+ # "ifconfig down" the network device and delete its IP address, etc.
+ down_netdev(ipdev)
+ for physdev in physdevs:
+ down_netdev(physdev)
+
+ # Remove all keys related to pif and any bond masters linked to PIF.
+ del_ports = [ipdev] + physdevs + bond_masters
+ if vlan_slave and bond_master:
+ del_ports += [interface_name(bond_master)]
+
+ # What ports do we need to add to the datapath?
+ #
+ # We definitely need the ipdev, and ordinarily we want the
+ # physical devices too, but for bonds we need the bond as bridge
+ # port.
+ add_ports = [ipdev, datapath]
+ if not bond_master:
+ add_ports += physdevs
+ else:
+ add_ports += [interface_name(bond_master)]
+
+ # What ports do we need to delete?
+ #
+ # - All the ports that we add, to avoid duplication and to drop
+ # them from another datapath in case they're misassigned.
+ #
+ # - The physical devices, since they will either be in add_ports
+ # or added to the bonding device (see below).
+ #
+ # - The bond masters for pif. (Ordinarily pif shouldn't have any
+ # bond masters. If it does then interface-reconfigure is
+ # implicitly being asked to take them down.)
+ del_ports = add_ports + physdevs + bond_masters
+
+ # What networks does this datapath carry?
+ #
+ # - The network corresponding to the datapath's PIF.
+ #
+ # - The networks corresponding to any VLANs attached to the
+ # datapath's PIF.
+ network_uuids = []
+ for nwpif in db.get_pifs_by_record({'device': pifrec['device'],
+ 'host': pifrec['host']}):
+ net = db.get_pif_record(nwpif)['network']
+ network_uuids += [db.get_network_record(net)['uuid']]
+
+ # Now modify the ovs-vswitchd config file.
+ argv = []
+ for port in set(del_ports):
+ argv += interface_deconfigure_commands(port)
+ for port in set(add_ports):
+ argv += ['--add=bridge.%s.port=%s' % (datapath, port)]
+ if vlan_slave:
+ argv += ['--add=vlan.%s.tag=%s' % (ipdev, pifrec['VLAN'])]
+ argv += ['--add=iface.%s.internal=true' % (ipdev)]
+
+ # xapi creates a bridge by the name of the ipdev and requires
+ # that the IP address will be on it. We need to delete this
+ # bridge because we need that device to be a member of our
+ # datapath.
+ argv += ['--del-match=bridge.%s.[!0-9]*' % ipdev]
+
+ # xapi insists that its attempts to create the bridge succeed,
+ # so force that to happen.
+ argv += ['--add=iface.%s.fake-bridge=true' % (ipdev)]
+ else:
+ try:
+ os.unlink("%s/br-%s" % (vswitch_config_dir, bridge))
+ except OSError:
+ pass
+ argv += ['--del-match=bridge.%s.xs-network-uuids=*' % datapath]
+ argv += ['--add=bridge.%s.xs-network-uuids=%s' % (datapath, uuid)
+ for uuid in set(network_uuids)]
+ if bond_master:
+ argv += configure_bond(bond_master)
+ modify_config(argv)
+
+ # Configure network devices.
+ configure_netdev(pif)
+
+ # Bring up VLAN slave and bond slaves.
+ if vlan_slave:
+ up_netdev(ipdev_name(vlan_slave))
+ for physdev in physdevs:
+ up_netdev(physdev)
+
+ # Update /etc/issue (which contains the IP address of the management interface)
+ os.system("/sbin/update-issue")
+
+def action_down(pif):
+ rec = db.get_pif_record(pif)
+ interface = interface_name(pif)
+ bridge = bridge_name(pif)
+ ipdev = ipdev_name(pif)
+
+ # Support "rpm -e vswitch" gracefully by keeping Centos configuration
+ # files up-to-date, even though we don't use them or need them.
+ f = unconfigure_pif(pif)
+ if bridge:
+ br = open_network_ifcfg(pif)
+ log("Unlinking stale file %s" % br.path())
+ br.unlink()
+ f.attach_child(br)
+ try:
+ f.apply()
+ f.commit()
+ except Error, e:
+ log("action_down failed to apply changes: %s" % e.msg)
+ f.revert()
+ raise
+
+ argv = []
+ if rec['VLAN'] != '-1':
+ # Get rid of the VLAN device itself.
+ down_netdev(ipdev)
+ argv += interface_deconfigure_commands(ipdev)
+
+ # If the VLAN's slave is attached, stop here.
+ slave = get_vlan_slave_of_pif(pif)
+ if db.get_pif_record(slave)['currently_attached']:
+ log("VLAN slave is currently attached")
+ modify_config(argv)
+ return
+
+ # If the VLAN's slave has other VLANs that are attached, stop here.
+ masters = get_vlan_masters_of_pif(slave)
+ for m in masters:
+ if m != pif and db.get_pif_record(m)['currently_attached']:
+ log("VLAN slave has other master %s" % interface_naem(m))
+ modify_config(argv)
+ return
+
+ # Otherwise, take down the VLAN's slave too.
+ log("No more masters, bring down vlan slave %s" % interface_name(slave))
+ pif = slave
+ else:
+ # Stop here if this PIF has attached VLAN masters.
+ vlan_masters = get_vlan_masters_of_pif(pif)
+ log("VLAN masters of %s - %s" % (rec['device'], [interface_name(m) for m in vlan_masters]))
+ for m in vlan_masters:
+ if db.get_pif_record(m)['currently_attached']:
+ log("Leaving %s up due to currently attached VLAN master %s" % (interface, interface_name(m)))
+ return
+
+ # pif is now either a bond or a physical device which needs to be
+ # brought down. pif might have changed so re-check all its attributes.
+ rec = db.get_pif_record(pif)
+ interface = interface_name(pif)
+ bridge = bridge_name(pif)
+ ipdev = ipdev_name(pif)
+
+
+ bond_slaves = get_bond_slaves_of_pif(pif)
+ log("bond slaves of %s - %s" % (rec['device'], [interface_name(s) for s in bond_slaves]))
+ for slave in bond_slaves:
+ slave_interface = interface_name(slave)
+ log("bring down bond slave %s" % slave_interface)
+ argv += interface_deconfigure_commands(slave_interface)
+ down_netdev(slave_interface)
+
+ argv += interface_deconfigure_commands(ipdev)
+ down_netdev(ipdev)
+
+ argv += ['--del-match', 'bridge.%s.*' % datapath_name(pif)]
+ argv += ['--del-match', 'bonding.%s.[!0-9]*' % interface]
+ modify_config(argv)
+
+def action_rewrite(pif):
+ # Support "rpm -e vswitch" gracefully by keeping Centos configuration
+ # files up-to-date, even though we don't use them or need them.
+ pifrec = db.get_pif_record(pif)
+ f = configure_pif(pif)
+ interface = interface_name(pif)
+ bridge = bridge_name(pif)
+ mode = pifrec['ip_configuration_mode']
+ if bridge:
+ log("Configuring %s using %s configuration" % (bridge, mode))
+ br = open_network_ifcfg(pif)
+ configure_network(pif, br)
+ br.close()
+ f.attach_child(br)
+ else:
+ log("Configuring %s using %s configuration" % (interface, mode))
+ configure_network(pif, f)
+ f.close()
+ try:
+ f.apply()
+ f.commit()
+ except Error, e:
+ log("failed to apply changes: %s" % e.msg)
+ f.revert()
+ raise
+
+ # We have no code of our own to run here.
+ pass
+
+def main(argv=None):
+ global output_directory, management_pif
+
+ session = None
+ pif_uuid = None
+ pif = None
+
+ force_interface = None
+ force_management = False
+
+ if argv is None:
+ argv = sys.argv
+
+ try:
+ try:
+ shortops = "h"
+ longops = [ "output-directory=",
+ "pif=", "pif-uuid=",
+ "session=",
+ "force=",
+ "force-interface=",
+ "management",
+ "test-mode",
+ "device=", "mode=", "ip=", "netmask=", "gateway=",
+ "help" ]
+ arglist, args = getopt.gnu_getopt(argv[1:], shortops, longops)
+ except getopt.GetoptError, msg:
+ raise Usage(msg)
+
+ force_rewrite_config = {}
+
+ for o,a in arglist:
+ if o == "--output-directory":
+ output_directory = a
+ elif o == "--pif":
+ pif = a
+ elif o == "--pif-uuid":
+ pif_uuid = a
+ elif o == "--session":
+ session = a
+ elif o == "--force-interface" or o == "--force":
+ force_interface = a
+ elif o == "--management":
+ force_management = True
+ elif o in ["--device", "--mode", "--ip", "--netmask", "--gateway"]:
+ force_rewrite_config[o[2:]] = a
+ elif o == "-h" or o == "--help":
+ print __doc__ % {'command-name': os.path.basename(argv[0])}
+ return 0
+
+ if not debug_mode():
+ syslog.openlog(os.path.basename(argv[0]))
+ log("Called as " + str.join(" ", argv))
+ if len(args) < 1:
+ raise Usage("Required option <action> not present")
+ if len(args) > 1:
+ raise Usage("Too many arguments")
+
+ action = args[0]
+ # backwards compatibility
+ if action == "rewrite-configuration": action = "rewrite"
+
+ if output_directory and ( session or pif ):
+ raise Usage("--session/--pif cannot be used with --output-directory")
+ if ( session or pif ) and pif_uuid:
+ raise Usage("--session/--pif and --pif-uuid are mutually exclusive.")
+ if ( session and not pif ) or ( not session and pif ):
+ raise Usage("--session and --pif must be used together.")
+ if force_interface and ( session or pif or pif_uuid ):
+ raise Usage("--force is mutually exclusive with --session, --pif and --pif-uuid")
+ if len(force_rewrite_config) and not (force_interface and action == "rewrite"):
+ raise Usage("\"--force rewrite\" needed for --device, --mode, --ip, --netmask, and --gateway")
+
+ global db
+ if force_interface:
+ log("Force interface %s %s" % (force_interface, action))
+
+ if action == "rewrite":
+ action_force_rewrite(force_interface, force_rewrite_config)
+ else:
+ db = DatabaseCache(cache_file=dbcache_file)
+ host = db.extras['host']
+ pif = db.get_pif_by_bridge(host, force_interface)
+ management_pif = db.get_management_pif(host)
+
+ if action == "up":
+ action_up(pif)
+ elif action == "down":
+ action_down(pif)
+ else:
+ raise Usage("Unknown action %s" % action)
+ else:
+ db = DatabaseCache(session_ref=session)
+
+ if pif_uuid:
+ pif = db.get_pif_by_uuid(pif_uuid)
+
+ if not pif:
+ raise Usage("No PIF given")
+
+ if force_management:
+ # pif is going to be the management pif
+ management_pif = pif
+ else:
+ # pif is not going to be the management pif.
+ # Search DB cache for pif on same host with management=true
+ pifrec = db.get_pif_record(pif)
+ host = pifrec['host']
+ management_pif = db.get_management_pif(host)
+
+ log_pif_action(action, pif)
+
+ if not check_allowed(pif):
+ return 0
+
+ if action == "up":
+ action_up(pif)
+ elif action == "down":
+ action_down(pif)
+ elif action == "rewrite":
+ action_rewrite(pif)
+ else:
+ raise Usage("Unknown action %s" % action)
+
+ # Save cache.
+ pifrec = db.get_pif_record(pif)
+ db.save(dbcache_file, {'host': pifrec['host']})
+
+ except Usage, err:
+ print >>sys.stderr, err.msg
+ print >>sys.stderr, "For help use --help."
+ return 2
+ except Error, err:
+ log(err.msg)
+ return 1
+
+ return 0
+
+# The following code allows interface-reconfigure to keep Centos
+# network configuration files up-to-date, even though the vswitch
+# never uses them. In turn, that means that "rpm -e vswitch" does not
+# have to update any configuration files.
+
+def configure_ethtool(oc, f):
+ # Options for "ethtool -s"
+ settings = None
+ setting_opts = ["autoneg", "speed", "duplex"]
+ # Options for "ethtool -K"
+ offload = None
+ offload_opts = ["rx", "tx", "sg", "tso", "ufo", "gso"]
+
+ for opt in [opt for opt in setting_opts + offload_opts if oc.has_key("ethtool-" + opt)]:
+ val = oc["ethtool-" + opt]
+
+ if opt in ["speed"]:
+ if val in ["10", "100", "1000"]:
+ val = "speed " + val
+ else:
+ log("Invalid value for ethtool-speed = %s. Must be 10|100|1000." % val)
+ val = None
+ elif opt in ["duplex"]:
+ if val in ["half", "full"]:
+ val = "duplex " + val
+ else:
+ log("Invalid value for ethtool-duplex = %s. Must be half|full." % val)
+ val = None
+ elif opt in ["autoneg"] + offload_opts:
+ if val in ["true", "on"]:
+ val = opt + " on"
+ elif val in ["false", "off"]:
+ val = opt + " off"
+ else:
+ log("Invalid value for ethtool-%s = %s. Must be on|true|off|false." % (opt, val))
+ val = None
+
+ if opt in setting_opts:
+ if val and settings:
+ settings = settings + " " + val
+ else:
+ settings = val
+ elif opt in offload_opts:
+ if val and offload:
+ offload = offload + " " + val
+ else:
+ offload = val
+
+ if settings:
+ f.write("ETHTOOL_OPTS=\"%s\"\n" % settings)
+ if offload:
+ f.write("ETHTOOL_OFFLOAD_OPTS=\"%s\"\n" % offload)
+
+def configure_mtu(oc, f):
+ if not oc.has_key('mtu'):
+ return
+
+ try:
+ mtu = int(oc['mtu'])
+ f.write("MTU=%d\n" % mtu)
+ except ValueError, x:
+ log("Invalid value for mtu = %s" % mtu)
+
+def configure_static_routes(interface, oc, f):
+ """Open a route-<interface> file for static routes.
+
+ Opens the static routes configuration file for interface and writes one
+ line for each route specified in the network's other config "static-routes" value.
+ E.g. if
+ interface ( RO): xenbr1
+ other-config (MRW): static-routes: 172.16.0.0/15/192.168.0.3,172.18.0.0/16/192.168.0.4;...
+
+ Then route-xenbr1 should be
+ 172.16.0.0/15 via 192.168.0.3 dev xenbr1
+ 172.18.0.0/16 via 192.168.0.4 dev xenbr1
+ """
+ fname = "route-%s" % interface
+ if oc.has_key('static-routes'):
+ # The key is present - extract comma seperates entries
+ lines = oc['static-routes'].split(',')
+ else:
+ # The key is not present, i.e. there are no static routes
+ lines = []
+
+ child = ConfigurationFile(fname)
+ child.write("# DO NOT EDIT: This file (%s) was autogenerated by %s\n" % \
+ (os.path.basename(child.path()), os.path.basename(sys.argv[0])))
+
+ try:
+ for l in lines:
+ network, masklen, gateway = l.split('/')
+ child.write("%s/%s via %s dev %s\n" % (network, masklen, gateway, interface))
+
+ f.attach_child(child)
+ child.close()
+
+ except ValueError, e:
+ log("Error in other-config['static-routes'] format for network %s: %s" % (interface, e))
+
+def __open_ifcfg(interface):
+ """Open a network interface configuration file.
+
+ Opens the configuration file for interface, writes a header and
+ common options and returns the file object.
+ """
+ fname = "ifcfg-%s" % interface
+ f = ConfigurationFile(fname)
+
+ f.write("# DO NOT EDIT: This file (%s) was autogenerated by %s\n" % \
+ (os.path.basename(f.path()), os.path.basename(sys.argv[0])))
+ f.write("XEMANAGED=yes\n")
+ f.write("DEVICE=%s\n" % interface)
+ f.write("ONBOOT=no\n")
+
+ return f
+
+def open_network_ifcfg(pif):
+ bridge = bridge_name(pif)
+ interface = interface_name(pif)
+ if bridge:
+ return __open_ifcfg(bridge)
+ else:
+ return __open_ifcfg(interface)
+
+
+def open_pif_ifcfg(pif):
+ pifrec = db.get_pif_record(pif)
+
+ log("Configuring %s (%s)" % (interface_name(pif), pifrec['MAC']))
+
+ f = __open_ifcfg(interface_name(pif))
+
+ if pifrec.has_key('other_config'):
+ configure_ethtool(pifrec['other_config'], f)
+ configure_mtu(pifrec['other_config'], f)
+
+ return f
+
+def configure_network(pif, f):
+ """Write the configuration file for a network.
+
+ Writes configuration derived from the network object into the relevant
+ ifcfg file. The configuration file is passed in, but if the network is
+ bridgeless it will be ifcfg-<interface>, otherwise it will be ifcfg-<bridge>.
+
+ This routine may also write ifcfg files of the networks corresponding to other PIFs
+ in order to maintain consistency.
+
+ params:
+ pif: Opaque_ref of pif
+ f : ConfigurationFile(/path/to/ifcfg) to which we append network configuration
+ """
+
+ pifrec = db.get_pif_record(pif)
+ host = pifrec['host']
+ nw = pifrec['network']
+ nwrec = db.get_network_record(nw)
+ oc = None
+ bridge = bridge_name(pif)
+ interface = interface_name(pif)
+ if bridge:
+ device = bridge
+ else:
+ device = interface
+
+ if nwrec.has_key('other_config'):
+ configure_ethtool(nwrec['other_config'], f)
+ configure_mtu(nwrec['other_config'], f)
+ configure_static_routes(device, nwrec['other_config'], f)
+
+
+ if pifrec.has_key('other_config'):
+ oc = pifrec['other_config']
+
+ if device == bridge:
+ f.write("TYPE=Bridge\n")
+ f.write("DELAY=0\n")
+ f.write("STP=off\n")
+ f.write("PIFDEV=%s\n" % interface_name(pif))
+
+ if pifrec['ip_configuration_mode'] == "DHCP":
+ f.write("BOOTPROTO=dhcp\n")
+ f.write("PERSISTENT_DHCLIENT=yes\n")
+ elif pifrec['ip_configuration_mode'] == "Static":
+ f.write("BOOTPROTO=none\n")
+ f.write("NETMASK=%(netmask)s\n" % pifrec)
+ f.write("IPADDR=%(IP)s\n" % pifrec)
+ f.write("GATEWAY=%(gateway)s\n" % pifrec)
+ elif pifrec['ip_configuration_mode'] == "None":
+ f.write("BOOTPROTO=none\n")
+ else:
+ raise Error("Unknown ip-configuration-mode %s" % pifrec['ip_configuration_mode'])
+
+ if pifrec.has_key('DNS') and pifrec['DNS'] != "":
+ ServerList = pifrec['DNS'].split(",")
+ for i in range(len(ServerList)): f.write("DNS%d=%s\n" % (i+1, ServerList[i]))
+ if oc and oc.has_key('domain'):
+ f.write("DOMAIN='%s'\n" % oc['domain'].replace(',', ' '))
+
+ # We only allow one ifcfg-xenbr* to have PEERDNS=yes and there can be only one GATEWAYDEV in /etc/sysconfig/network.
+ # The peerdns pif will be the one with pif::other-config:peerdns=true, or the mgmt pif if none have this set.
+ # The gateway pif will be the one with pif::other-config:defaultroute=true, or the mgmt pif if none have this set.
+
+ # Work out which pif on this host should be the one with PEERDNS=yes and which should be the GATEWAYDEV
+ #
+ # Note: we prune out the bond master pif (if it exists).
+ # This is because when we are called to bring up an interface with a bond master, it is implicit that
+ # we should bring down that master.
+ pifs_on_host = [ __pif for __pif in db.get_all_pifs() if
+ db.get_pif_record(__pif)['host'] == host and
+ (not __pif in get_bond_masters_of_pif(pif)) ]
+ other_pifs_on_host = [ __pif for __pif in pifs_on_host if __pif != pif ]
+
+ peerdns_pif = None
+ defaultroute_pif = None
+
+ # loop through all the pifs on this host looking for one with
+ # other-config:peerdns = true, and one with
+ # other-config:default-route=true
+ for __pif in pifs_on_host:
+ __pifrec = db.get_pif_record(__pif)
+ __oc = __pifrec['other_config']
+ if __oc.has_key('peerdns') and __oc['peerdns'] == 'true':
+ if peerdns_pif == None:
+ peerdns_pif = __pif
+ else:
+ log('Warning: multiple pifs with "peerdns=true" - choosing %s and ignoring %s' % \
+ (db.get_pif_record(peerdns_pif)['device'], __pifrec['device']))
+ if __oc.has_key('defaultroute') and __oc['defaultroute'] == 'true':
+ if defaultroute_pif == None:
+ defaultroute_pif = __pif
+ else:
+ log('Warning: multiple pifs with "defaultroute=true" - choosing %s and ignoring %s' % \
+ (db.get_pif_record(defaultroute_pif)['device'], __pifrec['device']))
+
+ # If no pif is explicitly specified then use the mgmt pif for peerdns/defaultroute
+ if peerdns_pif == None:
+ peerdns_pif = management_pif
+ if defaultroute_pif == None:
+ defaultroute_pif = management_pif
+
+ # Update all the other network's ifcfg files and ensure consistency
+ for __pif in other_pifs_on_host:
+ __f = open_network_ifcfg(__pif)
+ peerdns_line_wanted = 'PEERDNS=%s\n' % ((__pif == peerdns_pif) and 'yes' or 'no')
+ lines = __f.readlines()
+
+ if not peerdns_line_wanted in lines:
+ # the PIF selected for DNS has changed and as a result this ifcfg file needs rewriting
+ for line in lines:
+ if not line.lstrip().startswith('PEERDNS'):
+ __f.write(line)
+ log("Setting %s in %s" % (peerdns_line_wanted.strip(), __f.path()))
+ __f.write(peerdns_line_wanted)
+ __f.close()
+ f.attach_child(__f)
+
+ else:
+ # There is no need to change this ifcfg file. So don't attach_child.
+ pass
+
+ # ... and for this pif too
+ f.write('PEERDNS=%s\n' % ((pif == peerdns_pif) and 'yes' or 'no'))
+
+ # Update gatewaydev
+ fnetwork = ConfigurationFile("network", "/etc/sysconfig")
+ for line in fnetwork.readlines():
+ if line.lstrip().startswith('GATEWAY') :
+ continue
+ fnetwork.write(line)
+ if defaultroute_pif:
+ gatewaydev = bridge_name(defaultroute_pif)
+ if not gatewaydev:
+ gatewaydev = interface_name(defaultroute_pif)
+ fnetwork.write('GATEWAYDEV=%s\n' % gatewaydev)
+ fnetwork.close()
+ f.attach_child(fnetwork)
+
+ return
+
+
+def configure_physical_interface(pif):
+ """Write the configuration for a physical interface.
+
+ Writes the configuration file for the physical interface described by
+ the pif object.
+
+ Returns the open file handle for the interface configuration file.
+ """
+
+ pifrec = db.get_pif_record(pif)
+
+ f = open_pif_ifcfg(pif)
+
+ f.write("TYPE=Ethernet\n")
+ f.write("HWADDR=%(MAC)s\n" % pifrec)
+
+ return f
+
+def configure_bond_interface(pif):
+ """Write the configuration for a bond interface.
+
+ Writes the configuration file for the bond interface described by
+ the pif object. Handles writing the configuration for the slave
+ interfaces.
+
+ Returns the open file handle for the bond interface configuration
+ file.
+ """
+
+ pifrec = db.get_pif_record(pif)
+ oc = pifrec['other_config']
+ f = open_pif_ifcfg(pif)
+
+ if pifrec['MAC'] != "":
+ f.write("MACADDR=%s\n" % pifrec['MAC'])
+
+ for slave in get_bond_slaves_of_pif(pif):
+ s = configure_physical_interface(slave)
+ s.write("MASTER=%(device)s\n" % pifrec)
+ s.write("SLAVE=yes\n")
+ s.close()
+ f.attach_child(s)
+
+ # The bond option defaults
+ bond_options = {
+ "mode": "balance-slb",
+ "miimon": "100",
+ "downdelay": "200",
+ "updelay": "31000",
+ "use_carrier": "1",
+ }
+
+ # override defaults with values from other-config whose keys being with "bond-"
+ overrides = filter(lambda (key,val): key.startswith("bond-"), oc.items())
+ overrides = map(lambda (key,val): (key[5:], val), overrides)
+ bond_options.update(overrides)
+
+ # write the bond options to ifcfg-bondX
+ f.write('BONDING_OPTS="')
+ for (name,val) in bond_options.items():
+ f.write("%s=%s " % (name,val))
+ f.write('"\n')
+ return f
+
+def configure_vlan_interface(pif):
+ """Write the configuration for a VLAN interface.
+
+ Writes the configuration file for the VLAN interface described by
+ the pif object. Handles writing the configuration for the master
+ interface if necessary.
+
+ Returns the open file handle for the VLAN interface configuration
+ file.
+ """
+
+ slave = configure_pif(get_vlan_slave_of_pif(pif))
+ slave.close()
+
+ f = open_pif_ifcfg(pif)
+ f.write("VLAN=yes\n")
+ f.attach_child(slave)
+
+ return f
+
+def configure_pif(pif):
+ """Write the configuration for a PIF object.
+
+ Writes the configuration file the PIF and all dependent
+ interfaces (bond slaves and VLAN masters etc).
+
+ Returns the open file handle for the interface configuration file.
+ """
+
+ pifrec = db.get_pif_record(pif)
+
+ if pifrec['VLAN'] != '-1':
+ f = configure_vlan_interface(pif)
+ elif len(pifrec['bond_master_of']) != 0:
+ f = configure_bond_interface(pif)
+ else:
+ f = configure_physical_interface(pif)
+
+ bridge = bridge_name(pif)
+ if bridge:
+ f.write("BRIDGE=%s\n" % bridge)
+
+ return f
+
+def unconfigure_pif(pif):
+ """Clear up the files created by configure_pif"""
+ f = open_pif_ifcfg(pif)
+ log("Unlinking stale file %s" % f.path())
+ f.unlink()
+ return f
+
+if __name__ == "__main__":
+ rc = 1
+ try:
+ rc = main()
+ except:
+ ex = sys.exc_info()
+ err = traceback.format_exception(*ex)
+ for exline in err:
+ log(exline)
+
+ if not debug_mode():
+ syslog.closelog()
+
+ sys.exit(rc)
diff --git a/xenserver/usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py b/xenserver/usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py
new file mode 100644
index 000000000..8f4be3139
--- /dev/null
+++ b/xenserver/usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py
@@ -0,0 +1,296 @@
+# Copyright (c) Citrix Systems 2008. All rights reserved.
+# xsconsole is proprietary software.
+#
+# Xen, the Xen logo, XenCenter, XenMotion are trademarks or registered
+# trademarks of Citrix Systems, Inc., in the United States and other
+# countries.
+
+# Copyright (c) 2009 Nicira Networks.
+
+# Module-wide logger; everything at DEBUG level and above is written to
+# /var/log/vswitch-xsplugin.log.
+import logging
+log = logging.getLogger("vswitch-cfg-update")
+logging.basicConfig(filename="/var/log/vswitch-xsplugin.log", level=logging.DEBUG)
+
+import os
+import subprocess
+
+# Path of the ovs-cfg-mod tool and of the configuration file handed to it
+# (both are used by VSwitchConfig.Get).
+cfg_mod="/root/vswitch/bin/ovs-cfg-mod"
+vswitchd_cfg_filename="/etc/ovs-vswitchd.conf"
+
+# Refuse to run as a standalone script: this module only works when
+# imported by xsconsole.
+if __name__ == "__main__":
+ raise Exception("This script is a plugin for xsconsole and cannot run independently")
+
+# Star import: presumably supplies the xsconsole names used below
+# (ShellPipe, Dialogue, Lang, Layout, ...) -- TODO confirm.
+from XSConsoleStandard import *
+
+class VSwitchService:
+ service = {}
+
+ def __init__(self, name, processname=None):
+ self.name = name
+ self.processname = processname
+ if self.processname == None:
+ self.processname = name
+
+ def status(self):
+ try:
+ output = ShellPipe(["service", self.name, "status"]).Stdout()
+ except StandardError, e:
+ log.error("status retrieval error: " + str(e))
+ return "<unknown>"
+ if len(output) == 0:
+ return "<unknown>"
+ for l in output:
+ if self.processname not in l:
+ continue
+ elif "running" in l:
+ return "Running"
+ elif "stop" in l:
+ return "Stopped"
+ else:
+ return "<unknown>"
+ return "<unknown>"
+
+ def restart(self):
+ try:
+ ShellPipe(["service", self.name, "restart"]).Call()
+ except StandardError, e:
+ log.error("restart error: " + str(e))
+
+ @classmethod
+ def Inst(cls, name, processname=None):
+ key = name
+ if processname != None:
+ key = key + "-" + processname
+ if name not in cls.service:
+ cls.service[key] = VSwitchService(name, processname)
+ return cls.service[key]
+
+class VSwitchConfig:
+
+ @staticmethod
+ def Get(key):
+ try:
+ output = ShellPipe([cfg_mod, "-vANY:console:emer", "-F",
+ vswitchd_cfg_filename, "-q", key]).Stdout()
+ except StandardError, e:
+ log.error("config retrieval error: " + str(e))
+ return "<unknown>"
+
+ if len(output) == 0:
+ output = ""
+ else:
+ output = output[0].strip()
+ return output
+
+
+class VSwitchControllerDialogue(Dialogue):
+ def __init__(self):
+ Dialogue.__init__(self)
+ data=Data.Inst()
+
+ self.hostsInPool = 0
+ self.hostsUpdated = 0
+ self.controller = data.GetPoolForThisHost().get("other_config", {}).get("vSwitchController", "")
+
+ choiceDefs = [
+ ChoiceDef(Lang("Set pool-wide controller"),
+ lambda: self.getController()),
+ ChoiceDef(Lang("Delete pool-wide controller"),
+ lambda: self.deleteController()),
+ ChoiceDef(Lang("Resync server controller config"),
+ lambda: self.syncController()),
+# ChoiceDef(Lang("Restart ovs-vswitchd"),
+# lambda: self.restartService("vswitch")),
+# ChoiceDef(Lang("Restart ovs-brcompatd"),
+# lambda: self.restartService("vswitch-brcompatd"))
+ ]
+ self.menu = Menu(self, None, Lang("Configure vSwitch"), choiceDefs)
+
+ self.ChangeState("INITIAL")
+
+ def BuildPane(self):
+ pane = self.NewPane(DialoguePane(self.parent))
+ pane.TitleSet(Lang("Configure vSwitch"))
+ pane.AddBox()
+
+ def ChangeState(self, inState):
+ self.state = inState
+ self.BuildPane()
+ self.UpdateFields()
+
+ def UpdateFields(self):
+ self.Pane().ResetPosition()
+ getattr(self, "UpdateFields" + self.state)() # Dispatch method named 'UpdateFields'+self.state
+
+ def UpdateFieldsINITIAL(self):
+ pane = self.Pane()
+ pane.AddTitleField(Lang("Select an action"))
+ pane.AddMenuField(self.menu)
+ pane.AddKeyHelpField( { Lang("<Enter>") : Lang("OK"), Lang("<Esc>") : Lang("Cancel") } )
+
+ def UpdateFieldsGETCONTROLLER(self):
+ pane = self.Pane()
+ pane.ResetFields()
+
+ pane.AddTitleField(Lang("Enter IP address of controller"))
+ pane.AddInputField(Lang("Address", 16), self.controller, "address")
+ pane.AddKeyHelpField( { Lang("<Enter>") : Lang("OK"), Lang("<Esc>") : Lang("Exit") } )
+ if pane.CurrentInput() is None:
+ pane.InputIndexSet(0)
+
+ def HandleKey(self, inKey):
+ handled = False
+ if hasattr(self, "HandleKey" + self.state):
+ handled = getattr(self, "HandleKey" + self.state)(inKey)
+ if not handled and inKey == 'KEY_ESCAPE':
+ Layout.Inst().PopDialogue()
+ handled = True
+ return handled
+
+ def HandleKeyINITIAL(self, inKey):
+ return self.menu.HandleKey(inKey)
+
+ def HandleKeyGETCONTROLLER(self, inKey):
+ pane = self.Pane()
+ if pane.CurrentInput() is None:
+ pane.InputIndexSet(0)
+ if inKey == 'KEY_ENTER':
+ inputValues = pane.GetFieldValues()
+ self.controller = inputValues['address']
+ Layout.Inst().PopDialogue()
+ Layout.Inst().TransientBanner(Lang("Setting controller..."))
+ try:
+ self.SetController(self.controller)
+ Layout.Inst().PushDialogue(InfoDialogue(Lang("Setting controller successful")))
+ except Exception, e:
+ Layout.Inst().PushDialogue(InfoDialogue(Lang("Setting controller failed")))
+
+ self.ChangeState("INITIAL")
+ return True
+ else:
+ return pane.CurrentInput().HandleKey(inKey)
+
+ def restartService(self, name):
+ s = VSwitchService.Inst(name)
+ s.restart()
+ Layout.Inst().PopDialogue()
+
+ def getController(self):
+ self.ChangeState("GETCONTROLLER")
+ self.Pane().InputIndexSet(0)
+
+ def deleteController(self):
+ self.controller = ""
+ Layout.Inst().PopDialogue()
+ Layout.Inst().TransientBanner(Lang("Deleting controller..."))
+ try:
+ self.SetController(None)
+ Layout.Inst().PushDialogue(InfoDialogue(Lang("Controller deletion successful")))
+ except Exception, e:
+ Layout.Inst().PushDialogue(InfoDialogue(Lang("Controller deletion failed")))
+
+ def syncController(self):
+ Layout.Inst().PopDialogue()
+ Layout.Inst().TransientBanner(Lang("Resyncing controller setting..."))
+ try:
+ Task.Sync(lambda s: self._updateThisServer(s))
+ Layout.Inst().PushDialogue(InfoDialogue(Lang("Resyncing controller config successful")))
+ except Exception, e:
+ Layout.Inst().PushDialogue(InfoDialogue(Lang("Resyncing controller config failed")))
+
+ def SetController(self, ip):
+ self.hostsInPool = 0
+ self.hostsUpdated = 0
+ Task.Sync(lambda s: self._modifyPoolConfig(s, "vSwitchController", ip))
+ # Should be done asynchronously, maybe with an external script?
+ Task.Sync(lambda s: self._updateActiveServers(s))
+
+ def _modifyPoolConfig(self, session, key, value):
+ """Modify pool configuration.
+
+ If value == None then delete key, otherwise set key to value."""
+ pools = session.xenapi.pool.get_all()
+ # We assume there is only ever one pool...
+ if len(pools) == 0:
+ log.error("No pool for host.")
+ raise XenAPIPlugin.Failure("NO_POOL_FOR_HOST", [])
+ if len(pools) > 1:
+ log.error("More than one pool for host.")
+ raise XenAPIPlugin.Failure("MORE_THAN_ONE_POOL_FOR_HOST", [])
+ session.xenapi.pool.remove_from_other_config(pools[0], key)
+ if value != None:
+ session.xenapi.pool.add_to_other_config(pools[0], key, value)
+ Data.Inst().Update()
+
+ def _updateActiveServers(self, session):
+ hosts = session.xenapi.host.get_all()
+ self.hostsUpdated = 0
+ self.hostsInPool = len(hosts)
+ self.UpdateFields()
+ for host in hosts:
+ Layout.Inst().TransientBanner("Updating host %d out of %d"
+ % (self.hostsUpdated + 1, self.hostsInPool))
+ session.xenapi.host.call_plugin(host, "vswitch-cfg-update", "update", {})
+ self.hostsUpdated = self.hostsUpdated + 1
+
+ def _updateThisServer(self, session):
+ data = Data.Inst()
+ host = data.host.opaqueref()
+ session.xenapi.host.call_plugin(host, "vswitch-cfg-update", "update", {})
+
+
+class XSFeatureVSwitch:
+
+ @classmethod
+ def StatusUpdateHandler(cls, inPane):
+ data = Data.Inst()
+
+ inPane.AddTitleField(Lang("vSwitch"))
+
+ inPane.NewLine()
+
+ versionStr = data.host.other_config({}).get("vSwitchVersion", "<Unknown>")
+ inPane.AddStatusField(Lang("Version", 20), versionStr)
+
+ inPane.NewLine()
+ dbController = data.GetPoolForThisHost().get("other_config", {}).get("vSwitchController", "")
+ if dbController == "":
+ dbController = Lang("<None>")
+ inPane.AddStatusField(Lang("Controller (config)", 20), dbController)
+ controller = VSwitchConfig.Get("mgmt.controller")
+ if controller == "":
+ controller = Lang("<None>")
+ elif controller[0:4] == "ssl:":
+ controller = controller[4:]
+ inPane.AddStatusField(Lang("Controller (in-use)", 20), controller)
+
+ inPane.NewLine()
+ inPane.AddStatusField(Lang("ovs-vswitchd status", 20),
+ VSwitchService.Inst("vswitch", "ovs-vswitchd").status())
+ inPane.AddStatusField(Lang("ovs-brcompatd status", 20),
+ VSwitchService.Inst("vswitch", "ovs-brcompatd").status())
+
+ inPane.AddKeyHelpField( {
+ Lang("<Enter>") : Lang("Reconfigure"),
+ Lang("<F5>") : Lang("Refresh")
+ })
+
+ @classmethod
+ def ActivateHandler(cls):
+ DialogueUtils.AuthenticatedOnly(lambda: Layout.Inst().PushDialogue(VSwitchControllerDialogue()))
+
+ def Register(self):
+ Importer.RegisterNamedPlugIn(
+ self,
+ 'VSwitch', # Key of this plugin for replacement, etc.
+ {
+ 'menuname' : 'MENU_NETWORK',
+ 'menupriority' : 800,
+ 'menutext' : Lang('vSwitch') ,
+ 'statusupdatehandler' : self.StatusUpdateHandler,
+ 'activatehandler' : self.ActivateHandler
+ }
+ )
+
+# Register this plugin when the module is imported: instantiate the
+# feature class and call its Register() method.
+XSFeatureVSwitch().Register()
diff --git a/xenserver/vswitch-xen.spec b/xenserver/vswitch-xen.spec
new file mode 100644
index 000000000..58e35dec9
--- /dev/null
+++ b/xenserver/vswitch-xen.spec
@@ -0,0 +1,310 @@
+# Spec file for vswitch and related programs.
+
+# Copyright (C) 2009 Nicira Networks, Inc.
+#
+# Copying and distribution of this file, with or without modification,
+# are permitted in any medium without royalty provided the copyright
+# notice and this notice are preserved. This file is offered as-is,
+# without warranty of any kind.
+
+# When building, the rpmbuild command line should define
+# vswitch_version, xen_version, and build_number using -D arguments.
+# For example:
+#
+# rpmbuild -D "vswitch_version 0.8.9~1+build123" -D "xen_version 2.6.18-128.1.1.el5.xs5.1.0.483.1000xen" -D "build_number --with-build-number=123" -bb /usr/src/redhat/SPECS/vswitch-xen.spec
+#
+# Package version string and installation prefix.  The vswitch_version
+# and xen_version values come from the rpmbuild command line (see the
+# example above).
+%define version %{vswitch_version}-%{xen_version}
+%define _prefix /root/vswitch
+
+# Package metadata.
+Name: vswitch
+Summary: Virtual switch
+Group: System Environment/Daemons
+URL: http://www.vswitch.org/
+Version: %{vswitch_version}
+License: GPL3
+Release: 1
+Source: openvswitch-%{vswitch_version}.tar.gz
+Buildroot: /tmp/vswitch-xen-rpm
+
+%description
+The vswitch provides standard network bridging functions augmented with
+support for the OpenFlow protocol for remote per-flow control of
+traffic.
+
+# Unpack the source tarball quietly into its openvswitch-<version> directory.
+%prep
+%setup -q -n openvswitch-%{vswitch_version}
+
+# Configure against the kernel build tree of the given Xen kernel, with
+# SSL enabled; build_number supplies an extra configure argument.
+%build
+./configure --prefix=%{_prefix} --localstatedir=%{_localstatedir} --with-l26=/lib/modules/%{xen_version}/build --enable-ssl %{build_number}
+make
+
+%install
+# Start from an empty build root, then stage the regular "make install".
+rm -rf "$RPM_BUILD_ROOT"
+make install DESTDIR="$RPM_BUILD_ROOT" prefix=%{_prefix}
+
+# XenServer integration files: init scripts, sysconfig example,
+# logrotate and profile snippets, and the xapi plugin.
+install -d -m 755 "$RPM_BUILD_ROOT/etc"
+install -d -m 755 "$RPM_BUILD_ROOT/etc/init.d"
+install -m 755 xenserver/etc_init.d_vswitch \
+         "$RPM_BUILD_ROOT/etc/init.d/vswitch"
+install -m 755 xenserver/etc_init.d_vswitch-xapi-update \
+         "$RPM_BUILD_ROOT/etc/init.d/vswitch-xapi-update"
+install -d -m 755 "$RPM_BUILD_ROOT/etc/sysconfig"
+install -m 755 xenserver/etc_sysconfig_vswitch.example \
+         "$RPM_BUILD_ROOT/etc/sysconfig/vswitch.example"
+install -d -m 755 "$RPM_BUILD_ROOT/etc/logrotate.d"
+install -m 755 xenserver/etc_logrotate.d_vswitch \
+         "$RPM_BUILD_ROOT/etc/logrotate.d/vswitch"
+install -d -m 755 "$RPM_BUILD_ROOT/etc/profile.d"
+install -m 755 xenserver/etc_profile.d_vswitch.sh \
+         "$RPM_BUILD_ROOT/etc/profile.d/vswitch.sh"
+install -d -m 755 "$RPM_BUILD_ROOT/etc/xapi.d/plugins"
+install -m 755 xenserver/etc_xapi.d_plugins_vswitch-cfg-update \
+         "$RPM_BUILD_ROOT/etc/xapi.d/plugins/vswitch-cfg-update"
+
+# Replacement XenServer scripts and the xsconsole plugin are staged under
+# the package prefix; the post-install scriptlet links them into place.
+install -d -m 755 "$RPM_BUILD_ROOT%{_prefix}/scripts"
+install -m 755 xenserver/opt_xensource_libexec_interface-reconfigure \
+             "$RPM_BUILD_ROOT%{_prefix}/scripts/interface-reconfigure"
+install -m 755 xenserver/etc_xensource_scripts_vif \
+             "$RPM_BUILD_ROOT%{_prefix}/scripts/vif"
+install -m 755 \
+        xenserver/usr_lib_xsconsole_plugins-base_XSFeatureVSwitch.py \
+               "$RPM_BUILD_ROOT%{_prefix}/scripts/XSFeatureVSwitch.py"
+
+# Kernel modules.  The pattern is quoted so that find, not the shell,
+# matches it; unquoted, a stray .ko file in the current directory would
+# be substituted into the command line.
+install -d -m 755 "$RPM_BUILD_ROOT%{_prefix}/kernel_modules"
+find datapath/linux-2.6 -name '*.ko' -exec install -m 755 \{\} "$RPM_BUILD_ROOT%{_prefix}/kernel_modules/" \;
+
+# Get rid of stuff we don't want to make RPM happy.
+rm -rf \
+    "$RPM_BUILD_ROOT/root/vswitch/bin/ezio-term" \
+    "$RPM_BUILD_ROOT/root/vswitch/bin/ovs-controller" \
+    "$RPM_BUILD_ROOT/root/vswitch/bin/ovs-discover" \
+    "$RPM_BUILD_ROOT/root/vswitch/bin/ovs-kill" \
+    "$RPM_BUILD_ROOT/root/vswitch/bin/ovs-pki" \
+    "$RPM_BUILD_ROOT/root/vswitch/bin/ovs-switchui" \
+    "$RPM_BUILD_ROOT/root/vswitch/bin/ovs-wdt" \
+    "$RPM_BUILD_ROOT/root/vswitch/bin/secchan" \
+    "$RPM_BUILD_ROOT/root/vswitch/sbin/ovs-monitor" \
+    "$RPM_BUILD_ROOT/root/vswitch/share/man/man8/ovs-controller.8" \
+    "$RPM_BUILD_ROOT/root/vswitch/share/man/man8/ovs-discover.8" \
+    "$RPM_BUILD_ROOT/root/vswitch/share/man/man8/ovs-kill.8" \
+    "$RPM_BUILD_ROOT/root/vswitch/share/man/man8/ovs-pki.8" \
+    "$RPM_BUILD_ROOT/root/vswitch/share/man/man8/secchan.8" \
+    "$RPM_BUILD_ROOT/root/vswitch/share/openvswitch"
+
+%clean
+# Remove the staging tree; quoted so a build root containing spaces or
+# glob characters is passed to rm as a single path.
+rm -rf "$RPM_BUILD_ROOT"
+
+%pre
+# Only install on a host that has the XenServer inventory file.
+if [ ! -f /etc/xensource-inventory ]; then
+    printf "XenSource inventory not present in /etc/xensource-inventory\n"
+    exit 1
+fi
+
+# On a first install, warn (without aborting) when the XenServer scripts
+# about to be replaced do not match the expected checksums.
+if [ "$1" = "1" ]; then
+    if ! md5sum -c --status <<EOF
+b8e9835862ef1a9cec2a3f477d26c989  /etc/xensource/scripts/vif
+51970ad613a3996d5997e18e44db47da  /opt/xensource/libexec/interface-reconfigure
+EOF
+    then
+        printf "\nThe original XenServer scripts replaced by this package\n"
+        printf "are different than expected.  This could lead to unexpected\n"
+        printf "behavior of your server.  Unless you are sure you know what\n"
+        printf "you are doing, it is highly recommended that you remove this\n"
+        printf "package immediately after the install completes, which\n"
+        printf "will restore the XenServer scripts that you were previously\n"
+        printf "using.\n\n"
+    fi
+fi
+
+# Create the xapi database cache if it does not exist yet.  This needs a
+# working xapi connection; the install is aborted when it fails.
+if test ! -e /etc/vswitch.dbcache; then
+    if test "$1" = 1; then
+        printf "Creating xapi database cache...  "
+    else
+        printf "warning: Open vSwitch is being re-installed or upgraded,\n"
+        printf "         but the xapi database cache is missing.\n"
+        printf "Re-creating xapi database cache...  "
+    fi
+
+    source /etc/xensource-inventory
+    if python - "$INSTALLATION_UUID" <<EOF
+import XenAPI
+import pickle
+import sys
+
+session = XenAPI.xapi_local()
+try:
+    session.xenapi.login_with_password("root", "")
+
+    vlans = session.xenapi.VLAN.get_all_records()
+    bonds = session.xenapi.Bond.get_all_records()
+    pifs = session.xenapi.PIF.get_all_records()
+    networks = session.xenapi.network.get_all_records()
+    host = session.xenapi.host.get_by_uuid(sys.argv[1])
+finally:
+    session.xenapi.session.logout()
+
+dbcache_file = "/etc/vswitch.dbcache"
+f = open(dbcache_file, 'w')
+pickle.dump({'vlans': vlans,
+             'bonds': bonds,
+             'pifs': pifs,
+             'networks': networks}, f)
+pickle.dump({'host': host}, f)
+f.close()
+EOF
+    then
+        printf "done.\n"
+    else
+        printf "FAILED\n"
+        printf "Open vSwitch can only be installed on a XenServer that\n"
+        printf "has connectivity to xapi on the pool master.  Please\n"
+        printf "fix connectivity to the pool master, then try again.\n"
+        exit 1
+    fi
+fi
+
+%post
+source /etc/xensource-inventory
+
+# Record the installed version in this host's other-config.
+xe host-param-set \
+    "other-config:vSwitchVersion=%{version}" uuid="$INSTALLATION_UUID" ||
+    echo "Could not set vSwitchVersion config parameter"
+
+# Ensure ovs-vswitchd.conf exists
+touch /etc/ovs-vswitchd.conf
+
+# Replace original XenServer files
+mkdir -p %{_prefix}/xs-original \
+    || printf "Could not create script backup directory.\n"
+for f in \
+    /opt/xensource/libexec/interface-reconfigure \
+    /etc/xensource/scripts/vif
+do
+    s=$(basename "$f")
+    t=$(readlink "$f")
+    # Skip files that already link to our script (e.g. on upgrade), so
+    # the saved original is not overwritten with our own symlink.
+    if [ "$t" != "%{_prefix}/scripts/$s" ]; then
+        mv "$f" %{_prefix}/xs-original/ \
+            || printf "Could not save original XenServer $s script\n"
+        ln -s "%{_prefix}/scripts/$s" "$f" \
+            || printf "Could not link to vSwitch $s script\n"
+    fi
+done
+
+# Install xsconsole plugin
+plugin=$(readlink /usr/lib/xsconsole/plugins-base/XSFeatureVSwitch.py)
+if [ "$plugin" != "/root/vswitch/scripts/XSFeatureVSwitch.py" ]; then
+    rm -f /usr/lib/xsconsole/plugins-base/XSFeatureVSwitch.py
+    ln -s /root/vswitch/scripts/XSFeatureVSwitch.py /usr/lib/xsconsole/plugins-base/ || printf "Could not link to vSwitch xsconsole plugin.\n"
+fi
+
+# Ensure all required services are set to run
+for s in vswitch vswitch-xapi-update; do
+    # Re-register from scratch if the service is already known.
+    if chkconfig --list $s >/dev/null 2>&1; then
+        chkconfig --del $s || printf "Could not remove $s init script.\n"
+    fi
+    chkconfig --add $s || printf "Could not add $s init script.\n"
+    chkconfig $s on || printf "Could not enable $s init script.\n"
+done
+
+if [ "$1" = "1" ]; then    # $1 = 2 for upgrade
+    printf "\nYou MUST reboot the server NOW to complete the change to the\n"
+    printf "vSwitch.  Attempts to modify networking on the server\n"
+    printf "or any hosted VM will fail until after the reboot and could\n"
+    printf "leave the server in a state requiring manual recovery.\n\n"
+else
+    printf "\nTo use the new vSwitch, you should reboot the server\n"
+    printf "now.  Failure to do so may result in incorrect operation.\n\n"
+fi
+
+%preun
+# On full removal (not upgrade), unregister the init scripts.
+if [ "$1" = "0" ]; then     # $1 = 1 for upgrade
+    for s in vswitch vswitch-xapi-update; do
+        chkconfig --del $s || printf "Could not remove $s init script.\n"
+    done
+fi
+
+
+%postun
+if [ "$1" = "0" ]; then     # $1 = 1 for upgrade
+
+    # Remove the xsconsole plugin link and any compiled copies next to it.
+    rm -f /usr/lib/xsconsole/plugins-base/XSFeatureVSwitch.py \
+        /usr/lib/xsconsole/plugins-base/XSFeatureVSwitch.pyc \
+        /usr/lib/xsconsole/plugins-base/XSFeatureVSwitch.pyo \
+        || printf "Could not remove vSwitch xsconsole plugin.\n"
+
+    # Restore original XenServer scripts
+    for f in \
+        /opt/xensource/libexec/interface-reconfigure \
+        /etc/xensource/scripts/vif
+    do
+        s=$(basename "$f")
+        if [ ! -f "%{_prefix}/xs-original/$s" ]; then
+            printf "Original XenServer $s script not present in %{_prefix}/xs-original\n"
+            printf "Could not restore original XenServer script.\n"
+        else
+            (rm -f "$f" \
+                && mv "%{_prefix}/xs-original/$s" "$f") \
+                || printf "Could not restore original XenServer $s script.\n"
+        fi
+    done
+
+    # Remove the now-empty install tree, children before parents.  The
+    # depth option is placed ahead of the type test because GNU find
+    # warns when it follows a test.
+    find %{_prefix} -depth -type d -exec rmdir \{\} \; \
+        || printf "Could not remove vSwitch install directory.\n"
+
+    # Remove all configuration and log files
+    rm -f /etc/ovs-vswitchd.conf
+    rm -f /etc/sysconfig/vswitch
+    rm -f /var/log/vswitch*
+    rm -f /etc/ovs-vswitchd.cacert
+
+    if [ ! -f /etc/xensource-inventory ]; then
+        printf "XenSource inventory not present in /etc/xensource-inventory\n"
+        printf "Could not remove vSwitchVersion from XAPI database.\n"
+        exit 1
+    else
+        source /etc/xensource-inventory
+        xe host-param-remove \
+            param-name=other-config param-key=vSwitchVersion \
+            uuid="$INSTALLATION_UUID" ||
+            echo "Could not clear vSwitchVersion config parameter."
+    fi
+
+    printf "\nYou MUST reboot the server now to complete the change to\n"
+    printf "standard Xen networking.  Attempts to modify networking on the\n"
+    printf "server or any hosted VM will fail until after the reboot and\n"
+    printf "could leave the server in a state requiring manual recovery.\n\n"
+fi
+
+
+%files
+%defattr(-,root,root)
+/etc/init.d/vswitch
+/etc/init.d/vswitch-xapi-update
+/etc/xapi.d/plugins/vswitch-cfg-update
+/etc/sysconfig/vswitch.example
+/etc/logrotate.d/vswitch
+/etc/profile.d/vswitch.sh
+/root/vswitch/kernel_modules/brcompat_mod.ko
+/root/vswitch/kernel_modules/openvswitch_mod.ko
+/root/vswitch/kernel_modules/veth_mod.ko
+/root/vswitch/scripts/interface-reconfigure
+/root/vswitch/scripts/vif
+/root/vswitch/scripts/XSFeatureVSwitch.py
+# The following two files are generated automatically by rpm.  We don't
+# really need them and they won't be used on the XenServer, but there
+# isn't an obvious place to get rid of them since they are generated
+# after the install script runs.  Since they are small, we just
+# include them.
+/root/vswitch/scripts/XSFeatureVSwitch.pyc
+/root/vswitch/scripts/XSFeatureVSwitch.pyo
+/root/vswitch/sbin/ovs-brcompatd
+/root/vswitch/sbin/ovs-vswitchd
+/root/vswitch/bin/ovs-appctl
+/root/vswitch/bin/ovs-cfg-mod
+/root/vswitch/bin/ovs-dpctl
+/root/vswitch/bin/ovs-ofctl
+/root/vswitch/share/man/man5/ovs-vswitchd.conf.5
+/root/vswitch/share/man/man8/ovs-appctl.8
+/root/vswitch/share/man/man8/ovs-brcompatd.8
+/root/vswitch/share/man/man8/ovs-cfg-mod.8
+/root/vswitch/share/man/man8/ovs-dpctl.8
+/root/vswitch/share/man/man8/ovs-ofctl.8
+/root/vswitch/share/man/man8/ovs-vswitchd.8