summaryrefslogtreecommitdiff
path: root/libnetwork/cmd
diff options
context:
space:
mode:
authorFlavio Crisciani <flavio.crisciani@docker.com>2017-12-08 11:04:44 -0800
committerFlavio Crisciani <flavio.crisciani@docker.com>2018-01-25 16:09:37 -0800
commit2f6921cbba0778d0ea7b73c84fd9ce7b325261e8 (patch)
tree68b45399d6747139d75eef9ac3c936435f62d5c1 /libnetwork/cmd
parenta59ecd9537f9c76a1befae7d241faa4ef7c797d5 (diff)
downloaddocker-2f6921cbba0778d0ea7b73c84fd9ce7b325261e8.tar.gz
Diagnostic client
- the client allows to talk to the diagnostic server and decode the internal values of the overlay and service discovery - the tool also allows to remediate in case of orphans entries - added README Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
Diffstat (limited to 'libnetwork/cmd')
-rw-r--r--libnetwork/cmd/diagnostic/Dockerfile.client4
-rw-r--r--libnetwork/cmd/diagnostic/Dockerfile.dind4
-rw-r--r--libnetwork/cmd/diagnostic/README.md252
-rw-r--r--libnetwork/cmd/diagnostic/daemon.json4
-rw-r--r--libnetwork/cmd/diagnostic/main.go191
5 files changed, 455 insertions, 0 deletions
diff --git a/libnetwork/cmd/diagnostic/Dockerfile.client b/libnetwork/cmd/diagnostic/Dockerfile.client
new file mode 100644
index 0000000000..ee8771517a
--- /dev/null
+++ b/libnetwork/cmd/diagnostic/Dockerfile.client
@@ -0,0 +1,4 @@
+FROM alpine
+RUN apk add --no-cache curl
+COPY diagnosticClient /usr/local/bin/diagnosticClient
+ENTRYPOINT ["/usr/local/bin/diagnosticClient"]
diff --git a/libnetwork/cmd/diagnostic/Dockerfile.dind b/libnetwork/cmd/diagnostic/Dockerfile.dind
new file mode 100644
index 0000000000..fa66272168
--- /dev/null
+++ b/libnetwork/cmd/diagnostic/Dockerfile.dind
@@ -0,0 +1,4 @@
+FROM docker:17.12-dind
+RUN apk add --no-cache curl
+COPY daemon.json /etc/docker/daemon.json
+COPY diagnosticClient /usr/local/bin/diagnosticClient
diff --git a/libnetwork/cmd/diagnostic/README.md b/libnetwork/cmd/diagnostic/README.md
new file mode 100644
index 0000000000..4c6ce6c35f
--- /dev/null
+++ b/libnetwork/cmd/diagnostic/README.md
@@ -0,0 +1,252 @@
+---
+description: Learn to use the built-in network debugger to debug overlay networking problems
+keywords: network, troubleshooting, debug
+title: Debug overlay or swarm networking issues
+---
+
+**WARNING**
+This tool can change the internal state of the libnetwork API. Be really mindful
+of its use and read the following guide carefully. Improper use of it will damage
+or permanently destroy the network configuration.
+
+
+Docker CE 17.12 and higher introduce a network debugging tool designed to help
+debug issues with overlay networks and swarm services running on Linux hosts.
+When enabled, a network diagnostic server listens on the specified port and
+provides diagnostic information. The network debugging tool should only be
+started to debug specific issues, and should not be left running all the time.
+
+Information about networks is stored in the database, which can be examined using
+the API. Currently the database contains information about the overlay network
+as well as the service discovery data.
+
+The Docker API exposes endpoints to query and control the network debugging
+tool. CLI integration is provided as a preview, but the implementation is not
+yet considered stable and commands and options may change without notice.
+
+The tool is available in two forms:
+1) client only: dockereng/network-diagnostic:onlyclient
+2) docker in docker version: dockereng/network-diagnostic:17.12-dind
+The latter allows using the tool with a cluster running an engine older than 17.12
+
+## Enable the diagnostic server
+
+The tool currently only works on Docker hosts running on Linux. To enable it on a node
+follow the step below.
+
+1. Set the `network-diagnostic-port` to a port which is free on the Docker
+ host, in the `/etc/docker/daemon.json` configuration file.
+
+ ```json
+ "network-diagnostic-port": <port>
+ ```
+
+2. Get the process ID (PID) of the `dockerd` process. It is the second field in
+ the output, and is typically a number from 2 to 6 digits long.
+
+ ```bash
+ $ ps aux |grep dockerd | grep -v grep
+ ```
+
+3. Reload the Docker configuration without restarting Docker, by sending the
+ `HUP` signal to the PID you found in the previous step.
+
+ ```bash
+ kill -HUP <pid-of-dockerd>
+ ```
+
+If systemd is used, the command `systemctl reload docker` is enough.
+
+
+A message like the following will appear in the Docker host logs:
+
+```none
+Starting the diagnostic server listening on <port> for commands
+```
+
+## Disable the diagnostic tool
+
+Repeat these steps for each node participating in the swarm.
+
+1. Remove the `network-diagnostic-port` key from the `/etc/docker/daemon.json`
+ configuration file.
+
+2. Get the process ID (PID) of the `dockerd` process. It is the second field in
+ the output, and is typically a number from 2 to 6 digits long.
+
+ ```bash
+ $ ps aux |grep dockerd | grep -v grep
+ ```
+
+3. Reload the Docker configuration without restarting Docker, by sending the
+ `HUP` signal to the PID you found in the previous step.
+
+ ```bash
+ kill -HUP <pid-of-dockerd>
+ ```
+
+A message like the following will appear in the Docker host logs:
+
+```none
+Disabling the diagnostic server
+```
+
+## Access the diagnostic tool's API
+
+The network diagnostic tool exposes its own RESTful API. To access the API,
+send a HTTP request to the port where the tool is listening. The following
+commands assume the tool is listening on port 2000.
+
+Examples are not given for every endpoint.
+
+### Get help
+
+```bash
+$ curl localhost:2000/help
+
+OK
+/updateentry
+/getentry
+/gettable
+/leavenetwork
+/createentry
+/help
+/clusterpeers
+/ready
+/joinnetwork
+/deleteentry
+/networkpeers
+/
+/join
+```
+
+### Join or leave the network database cluster
+
+```bash
+$ curl localhost:2000/join?members=ip1,ip2,...
+```
+
+```bash
+$ curl localhost:2000/leave?members=ip1,ip2,...
+```
+
+`ip1`, `ip2`, ... are the swarm node ips (usually one is enough)
+
+### Join or leave a network
+
+```bash
+$ curl localhost:2000/joinnetwork?nid=<network id>
+```
+
+```bash
+$ curl localhost:2000/leavenetwork?nid=<network id>
+```
+
+`network id` can be retrieved on the manager with `docker network ls --no-trunc` and has
+to be the full length identifier
+
+### List cluster peers
+
+```bash
+$ curl localhost:2000/clusterpeers
+```
+
+### List nodes connected to a given network
+
+```bash
+$ curl localhost:2000/networkpeers?nid=<network id>
+```
+`network id` can be retrieved on the manager with `docker network ls --no-trunc` and has
+to be the full length identifier
+
+### Dump database tables
+
+The tables are called `endpoint_table` and `overlay_peer_table`.
+The `overlay_peer_table` contains all the overlay forwarding information
+The `endpoint_table` contains all the service discovery information
+
+```bash
+$ curl localhost:2000/gettable?nid=<network id>&tname=<table name>
+```
+
+### Interact with a specific database table
+
+The tables are called `endpoint_table` and `overlay_peer_table`.
+
+```bash
+$ curl localhost:2000/<method>?nid=<network id>&tname=<table name>&key=<key>[&value=<value>]
+```
+
+Note:
+operations on tables have node ownership; this means that entries will remain persistent
+as long as the node that inserted them is part of the cluster
+
+## Access the diagnostic tool's CLI
+
+The CLI is provided as a preview and is not yet stable. Commands or options may
+change at any time.
+
+The CLI executable is called `diagnosticClient` and is made available using a
+standalone container.
+
+`docker run --net host dockereng/network-diagnostic:onlyclient -v -net <full network id> -t sd`
+
+The following flags are supported:
+
+| Flag | Description |
+|---------------|-------------------------------------------------|
+| -t <string> | Table one of `sd` or `overlay`. |
+| -ip <string> | The IP address to query. Defaults to 127.0.0.1. |
+| -net <string> | The target network ID. |
+| -port <int> | The target port. (default port is 2000) |
+| -v | Enable verbose output. |
+
+### Container version of the diagnostic tool
+
+The CLI is provided as a container with a 17.12 engine that needs to run using privileged mode.
+*NOTE*
+Remember that table operations have ownership, so any `create entry` will be persistent
+as long as the diagnostic container is part of the swarm.
+
+1. Make sure that the node where the diagnostic client will run is not part of the swarm, if so do `docker swarm leave -f`
+
+2. To run the container, use a command like the following:
+
+ ```bash
+ $ docker container run --name net-diagnostic -d --privileged --network host dockereng/network-diagnostic:17.12-dind
+ ```
+
+3. Connect to the container using `docker exec -it <container-ID> sh`,
+ and start the server using the following command:
+
+ ```bash
+ $ kill -HUP 1
+ ```
+
+4. Join the diagnostic container to the swarm, then run the diagnostic CLI within the container.
+
+ ```bash
+ $ ./diagnosticClient <flags>...
+ ```
+
+5. When finished debugging, leave the swarm and stop the container.
+
+### Examples
+
+The following commands dump the service discovery table and verify node
+ownership.
+
+*NOTE*
+Remember to use the full network ID, you can easily find that with `docker network ls --no-trunc`
+
+**Service discovery and load balancer:**
+
+```bash
+$ diagnosticClient -t sd -v -net n8a8ie6tb3wr2e260vxj8ncy4
+```
+
+**Overlay network:**
+
+```bash
+$ diagnosticClient -port 2001 -t overlay -v -net n8a8ie6tb3wr2e260vxj8ncy4
+```
diff --git a/libnetwork/cmd/diagnostic/daemon.json b/libnetwork/cmd/diagnostic/daemon.json
new file mode 100644
index 0000000000..b5eb9889b8
--- /dev/null
+++ b/libnetwork/cmd/diagnostic/daemon.json
@@ -0,0 +1,4 @@
+{
+ "debug": true,
+ "network-diagnostic-port": 2000
+}
diff --git a/libnetwork/cmd/diagnostic/main.go b/libnetwork/cmd/diagnostic/main.go
new file mode 100644
index 0000000000..0f3f559ec5
--- /dev/null
+++ b/libnetwork/cmd/diagnostic/main.go
@@ -0,0 +1,191 @@
+package main
+
+import (
+ "bufio"
+ "encoding/base64"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "net/http"
+ "os"
+ "strings"
+
+ "github.com/docker/libnetwork"
+ "github.com/docker/libnetwork/diagnostic"
+ "github.com/docker/libnetwork/drivers/overlay"
+ "github.com/sirupsen/logrus"
+)
+
+// URL format strings for the diagnostic server's REST endpoints.
+// Each takes the server ip and port; some additionally take a network id,
+// table name, or entry key. The "json" query parameter asks the server for
+// JSON-encoded output instead of the plain-text form.
+const (
+ readyPath = "http://%s:%d/ready"
+ joinNetwork = "http://%s:%d/joinnetwork?nid=%s"
+ leaveNetwork = "http://%s:%d/leavenetwork?nid=%s"
+ clusterPeers = "http://%s:%d/clusterpeers?json"
+ networkPeers = "http://%s:%d/networkpeers?nid=%s&json"
+ dumpTable = "http://%s:%d/gettable?nid=%s&tname=%s&json"
+ deleteEntry = "http://%s:%d/deleteentry?nid=%s&tname=%s&key=%s&json"
+)
+
+// httpIsOk consumes and closes body, terminating the process (logrus.Fatalf)
+// if the body cannot be read or does not contain the "OK" marker the
+// diagnostic server uses to acknowledge a command.
+// NOTE(review): body.Close() is not deferred, but the Fatalf paths exit the
+// process anyway, so no descriptor outlives the error cases.
+func httpIsOk(body io.ReadCloser) {
+ b, err := ioutil.ReadAll(body)
+ if err != nil {
+ logrus.Fatalf("Failed the body parse %s", err)
+ }
+ if !strings.Contains(string(b), "OK") {
+ logrus.Fatalf("Server not ready %s", b)
+ }
+ body.Close()
+}
+
+// main drives the diagnostic client: parse flags, verify the target
+// diagnostic server is ready, optionally join a target network, dump and
+// check the requested table, then leave the network again.
+func main() {
+ ipPtr := flag.String("ip", "127.0.0.1", "ip address")
+ portPtr := flag.Int("port", 2000, "port")
+ networkPtr := flag.String("net", "", "target network")
+ tablePtr := flag.String("t", "", "table to process <sd/overlay>")
+ remediatePtr := flag.Bool("r", false, "perform remediation deleting orphan entries")
+ verbosePtr := flag.Bool("v", false, "verbose output")
+
+ flag.Parse()
+
+ if *verbosePtr {
+ logrus.SetLevel(logrus.DebugLevel)
+ }
+
+ // Fail fast if the diagnostic server is not reachable or not ready.
+ logrus.Infof("Connecting to %s:%d checking ready", *ipPtr, *portPtr)
+ resp, err := http.Get(fmt.Sprintf(readyPath, *ipPtr, *portPtr))
+ if err != nil {
+ logrus.WithError(err).Fatalf("The connection failed")
+ }
+ httpIsOk(resp.Body)
+
+ // Cluster-wide peer list, used later to detect entries owned by nodes
+ // that left the cluster.
+ clusterPeers := fetchNodePeers(*ipPtr, *portPtr, "")
+ var networkPeers map[string]string
+ var joinedNetwork bool
+ if *networkPtr != "" {
+ // Join the target network so its tables become visible, and
+ // remember to leave it before exiting.
+ logrus.Infof("Joining the network:%s", *networkPtr)
+ resp, err = http.Get(fmt.Sprintf(joinNetwork, *ipPtr, *portPtr, *networkPtr))
+ if err != nil {
+ logrus.WithError(err).Fatalf("Failed joining the network")
+ }
+ httpIsOk(resp.Body)
+ networkPeers = fetchNodePeers(*ipPtr, *portPtr, *networkPtr)
+ joinedNetwork = true
+ }
+
+ // "sd" = service discovery table, "overlay" = overlay forwarding table.
+ // Any other value (including empty) skips the table dump.
+ switch *tablePtr {
+ case "sd":
+ fetchTable(*ipPtr, *portPtr, *networkPtr, "endpoint_table", clusterPeers, networkPeers, *remediatePtr)
+ case "overlay":
+ fetchTable(*ipPtr, *portPtr, *networkPtr, "overlay_peer_table", clusterPeers, networkPeers, *remediatePtr)
+ }
+
+ if joinedNetwork {
+ resp, err = http.Get(fmt.Sprintf(leaveNetwork, *ipPtr, *portPtr, *networkPtr))
+ if err != nil {
+ logrus.WithError(err).Fatalf("Failed leaving the network")
+ }
+ httpIsOk(resp.Body)
+ }
+}
+
+// fetchNodePeers queries the diagnostic server for peers: the cluster-wide
+// peer list when network is empty, or the peers joined to the given network
+// id otherwise. It returns a map of peer name -> peer IP and terminates the
+// process on any HTTP or JSON-decoding error.
+func fetchNodePeers(ip string, port int, network string) map[string]string {
+ logrus.Infof("Fetch peers %s", network)
+ var path string
+ if network != "" {
+ path = fmt.Sprintf(networkPeers, ip, port, network)
+ } else {
+ path = fmt.Sprintf(clusterPeers, ip, port)
+ }
+
+ resp, err := http.Get(path)
+ if err != nil {
+ logrus.WithError(err).Fatalf("Failed fetching path")
+ }
+ defer resp.Body.Close()
+ body, err := ioutil.ReadAll(resp.Body)
+ if err != nil {
+ logrus.WithError(err).Fatalf("Failed the body parse")
+ }
+
+ // Details must be pre-populated with the concrete result type so that
+ // json.Unmarshal decodes into it rather than a generic map.
+ output := diagnostic.HTTPResult{Details: &diagnostic.TablePeersResult{}}
+ err = json.Unmarshal(body, &output)
+ if err != nil {
+ logrus.WithError(err).Fatalf("Failed the json unmarshalling")
+ }
+
+ logrus.Debugf("Parsing JSON response")
+ result := make(map[string]string, output.Details.(*diagnostic.TablePeersResult).Length)
+ for _, v := range output.Details.(*diagnostic.TablePeersResult).Elements {
+ logrus.Debugf("name:%s ip:%s", v.Name, v.IP)
+ result[v.Name] = v.IP
+ }
+ return result
+}
+
+// fetchTable dumps tableName for the given network from the diagnostic
+// server, logs each decoded entry, and flags entries whose Owner is not in
+// networkPeers (orphans) or not in clusterPeers. When remediate is true it
+// interactively prompts ("Yes" on stdin confirms) before deleting the orphan
+// keys through the server's deleteentry endpoint — an irreversible operation.
+func fetchTable(ip string, port int, network, tableName string, clusterPeers, networkPeers map[string]string, remediate bool) {
+ logrus.Infof("Fetch %s table and check owners", tableName)
+ resp, err := http.Get(fmt.Sprintf(dumpTable, ip, port, network, tableName))
+ if err != nil {
+ logrus.WithError(err).Fatalf("Failed fetching endpoint table")
+ }
+ defer resp.Body.Close()
+ body, err := ioutil.ReadAll(resp.Body)
+ if err != nil {
+ logrus.WithError(err).Fatalf("Failed the body parse")
+ }
+
+ output := diagnostic.HTTPResult{Details: &diagnostic.TableEndpointsResult{}}
+ err = json.Unmarshal(body, &output)
+ if err != nil {
+ logrus.WithError(err).Fatalf("Failed the json unmarshalling")
+ }
+
+ logrus.Debug("Parsing data structures")
+ var orphanKeys []string
+ for _, v := range output.Details.(*diagnostic.TableEndpointsResult).Elements {
+ // Table values come back base64-encoded; a decode failure skips the
+ // entry rather than aborting the whole dump.
+ decoded, err := base64.StdEncoding.DecodeString(v.Value)
+ if err != nil {
+ logrus.WithError(err).Errorf("Failed decoding entry")
+ continue
+ }
+ // NOTE(review): Unmarshal errors are ignored here; a corrupt record
+ // is logged with whatever fields were decoded.
+ switch tableName {
+ case "endpoint_table":
+ var elem libnetwork.EndpointRecord
+ elem.Unmarshal(decoded)
+ logrus.Debugf("key:%s value:%+v owner:%s", v.Key, elem, v.Owner)
+ case "overlay_peer_table":
+ var elem overlay.PeerRecord
+ elem.Unmarshal(decoded)
+ logrus.Debugf("key:%s value:%+v owner:%s", v.Key, elem, v.Owner)
+ }
+
+ // Only entries with no owner on the network are remediation
+ // candidates; a missing cluster owner is just warned about.
+ if _, ok := networkPeers[v.Owner]; !ok {
+ logrus.Warnf("The element with key:%s does not belong to any node on this network", v.Key)
+ orphanKeys = append(orphanKeys, v.Key)
+ }
+ if _, ok := clusterPeers[v.Owner]; !ok {
+ logrus.Warnf("The element with key:%s does not belong to any node on this cluster", v.Key)
+ }
+ }
+
+ if len(orphanKeys) > 0 && remediate {
+ logrus.Warnf("The following keys:%v results as orphan, do you want to proceed with the deletion (this operation is irreversible)? [Yes/No]", orphanKeys)
+ reader := bufio.NewReader(os.Stdin)
+ text, _ := reader.ReadString('\n')
+ text = strings.Replace(text, "\n", "", -1)
+ // Deletion requires the exact answer "Yes"; anything else skips.
+ if strings.Compare(text, "Yes") == 0 {
+ for _, k := range orphanKeys {
+ resp, err := http.Get(fmt.Sprintf(deleteEntry, ip, port, network, tableName, k))
+ if err != nil {
+ logrus.WithError(err).Errorf("Failed deleting entry k:%s", k)
+ break
+ }
+ resp.Body.Close()
+ }
+ } else {
+ logrus.Infof("Deletion skipped")
+ }
+ }
+}