// Source: vendor/github.com/moby/swarmkit/v2/ca/server.go

package ca

import (
	"bytes"
	"context"
	"crypto/subtle"
	"crypto/x509"
	"sync"
	"time"

	gogotypes "github.com/gogo/protobuf/types"
	"github.com/moby/swarmkit/v2/api"
	"github.com/moby/swarmkit/v2/api/equality"
	"github.com/moby/swarmkit/v2/identity"
	"github.com/moby/swarmkit/v2/log"
	"github.com/moby/swarmkit/v2/manager/state/store"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

const (
	defaultReconciliationRetryInterval = 10 * time.Second
	defaultRootReconciliationInterval  = 3 * time.Second
)

// Server is the CA and NodeCA API gRPC server.
// TODO(aaronl): At some point we may want to have separate implementations of
// CA, NodeCA, and other hypothetical future CA services. At the moment,
// breaking it apart doesn't seem worth it.
type Server struct {
	mu                          sync.Mutex
	wg                          sync.WaitGroup
	ctx                         context.Context
	cancel                      func()
	store                       *store.MemoryStore
	securityConfig              *SecurityConfig
	clusterID                   string
	localRootCA                 *RootCA
	externalCA                  *ExternalCA
	externalCAPool              *x509.CertPool
	joinTokens                  *api.JoinTokens
	reconciliationRetryInterval time.Duration

	// pending is a map of nodes with pending certificate issuance or
	// renewal, indexed by node ID.
	pending map[string]*api.Node

	// started is a channel which gets closed once the server is running
	// and able to service RPCs.
	started chan struct{}

	// these are cached values to ensure we only update the security config when
	// the cluster root CA and external CAs have changed - the cluster object
	// can change for other reasons, and it would not be necessary to update
	// the security config as a result
	lastSeenClusterRootCA *api.RootCA
	lastSeenExternalCAs   []*api.ExternalCA

	// signingMu protects the components of the CA server used to issue new certificates
	// (and any attributes used to update those components): `lastSeenClusterRootCA` and
	// `lastSeenExternalCAs`, which are used to update `externalCA` and the `rootCA` object
	// of the SecurityConfig
	signingMu sync.Mutex

	// lets us monitor and finish root rotations
	rootReconciler                  *rootRotationReconciler
	rootReconciliationRetryInterval time.Duration
}

// DefaultCAConfig returns the default CA Config, with a default expiration.
func DefaultCAConfig() api.CAConfig {
	return api.CAConfig{
		NodeCertExpiry: gogotypes.DurationProto(DefaultNodeCertExpiration),
	}
}

// NewServer creates a CA API server.
func NewServer(store *store.MemoryStore, securityConfig *SecurityConfig) *Server {
	return &Server{
		store:                           store,
		securityConfig:                  securityConfig,
		localRootCA:                     securityConfig.RootCA(),
		externalCA:                      NewExternalCA(nil, nil),
		pending:                         make(map[string]*api.Node),
		started:                         make(chan struct{}),
		reconciliationRetryInterval:     defaultReconciliationRetryInterval,
		rootReconciliationRetryInterval: defaultRootReconciliationInterval,
		clusterID:                       securityConfig.ClientTLSCreds.Organization(),
	}
}
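
// runCAServerSketch is a hypothetical usage sketch (not part of swarmkit)
// showing the intended lifecycle of the Server: configure intervals before
// Run, start Run on its own goroutine, wait for Ready, and Stop on shutdown.
// The st and sc arguments are assumed to be provided by manager bootstrap code.
func runCAServerSketch(ctx context.Context, st *store.MemoryStore, sc *SecurityConfig) error {
	srv := NewServer(st, sc)
	// Both interval setters must be called before Run.
	srv.SetReconciliationRetryInterval(defaultReconciliationRetryInterval)
	srv.SetRootReconciliationInterval(defaultRootReconciliationInterval)

	errCh := make(chan error, 1)
	go func() { errCh <- srv.Run(ctx) }()

	// Ready() is closed once Run has loaded the cluster object and the
	// server can service RPCs.
	select {
	case <-srv.Ready():
	case err := <-errCh:
		return err // Run exited before the server became ready
	}
	defer srv.Stop()

	// ... register the server with a gRPC server and serve CA/NodeCA RPCs ...
	return nil
}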

// ExternalCA returns the current external CA - this is exposed to support unit testing only, and the external CA
// should really be a private field
func (s *Server) ExternalCA() *ExternalCA {
	s.signingMu.Lock()
	defer s.signingMu.Unlock()
	return s.externalCA
}

// RootCA returns the current local root CA - this is exposed to support unit testing only, and the root CA
// should really be a private field
func (s *Server) RootCA() *RootCA {
	s.signingMu.Lock()
	defer s.signingMu.Unlock()
	return s.localRootCA
}

// SetReconciliationRetryInterval changes the time interval between
// reconciliation attempts. This function must be called before Run.
func (s *Server) SetReconciliationRetryInterval(reconciliationRetryInterval time.Duration) {
	s.reconciliationRetryInterval = reconciliationRetryInterval
}

// SetRootReconciliationInterval changes the time interval between root rotation
// reconciliation attempts.  This function must be called before Run.
func (s *Server) SetRootReconciliationInterval(interval time.Duration) {
	s.rootReconciliationRetryInterval = interval
}

// GetUnlockKey returns the current unlock key used for encrypting TLS private keys and
// other at-rest data.  Access to this RPC call should only be allowed via mutual TLS from managers.
func (s *Server) GetUnlockKey(ctx context.Context, request *api.GetUnlockKeyRequest) (*api.GetUnlockKeyResponse, error) {
	// This directly queries the store, rather than storing the unlock key and version on
	// the `Server` object and updating them when `UpdateRootCA` is called, because we need this
	// API to return the latest version of the key.  Otherwise, there might be a slight delay
	// between when the cluster gets updated, and when this function returns the latest key.
	// This delay is currently unacceptable because this RPC call is the only way, after a
	// cluster update, to get the actual value of the unlock key, and we don't want to return
	// a cached value.
	resp := api.GetUnlockKeyResponse{}
	s.store.View(func(tx store.ReadTx) {
		cluster := store.GetCluster(tx, s.clusterID)
		resp.Version = cluster.Meta.Version
		if cluster.Spec.EncryptionConfig.AutoLockManagers {
			for _, encryptionKey := range cluster.UnlockKeys {
				if encryptionKey.Subsystem == ManagerRole {
					resp.UnlockKey = encryptionKey.Key
					return
				}
			}
		}
	})

	return &resp, nil
}
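
// unlockKeySketch is a hypothetical sketch of interpreting a GetUnlockKey
// response: an empty UnlockKey means manager autolock is disabled (or no
// manager key is stored), and resp.Version identifies the cluster version
// the key was read at, which callers can compare against a known update.
func unlockKeySketch(ctx context.Context, srv *Server) ([]byte, error) {
	resp, err := srv.GetUnlockKey(ctx, &api.GetUnlockKeyRequest{})
	if err != nil {
		return nil, err
	}
	if len(resp.UnlockKey) == 0 {
		// AutoLockManagers is off; managers start without an unlock key.
		return nil, nil
	}
	return resp.UnlockKey, nil
}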

// NodeCertificateStatus returns the current issuance status of the certificate issuance request identified by the node ID
func (s *Server) NodeCertificateStatus(ctx context.Context, request *api.NodeCertificateStatusRequest) (*api.NodeCertificateStatusResponse, error) {
	if request.NodeID == "" {
		return nil, status.Errorf(codes.InvalidArgument, codes.InvalidArgument.String())
	}

	serverCtx, err := s.isRunningLocked()
	if err != nil {
		return nil, err
	}

	var node *api.Node

	event := api.EventUpdateNode{
		Node:   &api.Node{ID: request.NodeID},
		Checks: []api.NodeCheckFunc{api.NodeCheckID},
	}

	// Retrieve the current value of the certificate for this node, and create a watcher
	updates, cancel, err := store.ViewAndWatch(
		s.store,
		func(tx store.ReadTx) error {
			node = store.GetNode(tx, request.NodeID)
			return nil
		},
		event,
	)
	if err != nil {
		return nil, err
	}
	defer cancel()

	// This node ID doesn't exist
	if node == nil {
		return nil, status.Errorf(codes.NotFound, codes.NotFound.String())
	}

	log.G(ctx).WithFields(logrus.Fields{
		"node.id": node.ID,
		"status":  node.Certificate.Status,
		"method":  "NodeCertificateStatus",
	}).Debugf("checking certificate status")

	// If this certificate has a final state, return it immediately (both pending and renew are transition states)
	if isFinalState(node.Certificate.Status) {
		return &api.NodeCertificateStatusResponse{
			Status:      &node.Certificate.Status,
			Certificate: &node.Certificate,
		}, nil
	}

	log.G(ctx).WithFields(logrus.Fields{
		"node.id": node.ID,
		"status":  node.Certificate.Status,
		"method":  "NodeCertificateStatus",
	}).Debugf("started watching for certificate updates")

	// Certificate is Pending or in an Unknown state, let's wait for changes.
	for {
		select {
		case event := <-updates:
			switch v := event.(type) {
			case api.EventUpdateNode:
				// We got an update on the certificate record. If the status is a final state,
				// return the certificate.
				if isFinalState(v.Node.Certificate.Status) {
					cert := v.Node.Certificate.Copy()
					return &api.NodeCertificateStatusResponse{
						Status:      &cert.Status,
						Certificate: cert,
					}, nil
				}
			}
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-serverCtx.Done():
			return nil, serverCtx.Err()
		}
	}
}
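
// certStatusSketch is a hypothetical sketch of using NodeCertificateStatus:
// the RPC blocks until the node's certificate reaches a final state (or a
// context is cancelled), so a single call suffices to wait for issuance.
func certStatusSketch(ctx context.Context, srv *Server, nodeID string) (*api.Certificate, error) {
	resp, err := srv.NodeCertificateStatus(ctx, &api.NodeCertificateStatusRequest{NodeID: nodeID})
	if err != nil {
		return nil, err
	}
	if resp.Status.State != api.IssuanceStateIssued {
		return nil, errors.Errorf("issuance finished in state %v: %s", resp.Status.State, resp.Status.Err)
	}
	return resp.Certificate, nil
}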

// IssueNodeCertificate is responsible for gatekeeping certificate requests from new nodes in the swarm,
// and for authorizing certificate renewals.
// If a node presents a valid certificate, the corresponding certificate entry is set to the RENEW state.
// If a node fails to present a valid certificate, we check for a valid join token and set the
// role accordingly. A new random node ID is generated, and the corresponding node entry is created.
// IssueNodeCertificate is the only place where new node entries should be created in raft.
func (s *Server) IssueNodeCertificate(ctx context.Context, request *api.IssueNodeCertificateRequest) (*api.IssueNodeCertificateResponse, error) {
	// First, let's see if the remote node is presenting a non-empty CSR
	if len(request.CSR) == 0 {
		return nil, status.Errorf(codes.InvalidArgument, codes.InvalidArgument.String())
	}

	if err := s.isReadyLocked(); err != nil {
		return nil, err
	}

	var (
		blacklistedCerts map[string]*api.BlacklistedCertificate
		clusters         []*api.Cluster
		err              error
	)

	s.store.View(func(readTx store.ReadTx) {
		clusters, err = store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
	})

	// Not having a cluster object yet means we can't check
	// the blacklist.
	if err == nil && len(clusters) == 1 {
		blacklistedCerts = clusters[0].BlacklistedCertificates
	}

	// Renewing the cert over a local connection (unix socket) is always valid.
	localNodeInfo := ctx.Value(LocalRequestKey)
	if localNodeInfo != nil {
		nodeInfo, ok := localNodeInfo.(RemoteNodeInfo)
		if ok && nodeInfo.NodeID != "" {
			return s.issueRenewCertificate(ctx, nodeInfo.NodeID, request.CSR)
		}
	}

	// If the remote node is a worker (either forwarded by a manager, or calling directly),
	// issue a renew worker certificate entry with the correct ID
	nodeID, err := AuthorizeForwardedRoleAndOrg(ctx, []string{WorkerRole}, []string{ManagerRole}, s.clusterID, blacklistedCerts)
	if err == nil {
		return s.issueRenewCertificate(ctx, nodeID, request.CSR)
	}

	// If the remote node is a manager (either forwarded by another manager, or calling directly),
	// issue a renew certificate entry with the correct ID
	nodeID, err = AuthorizeForwardedRoleAndOrg(ctx, []string{ManagerRole}, []string{ManagerRole}, s.clusterID, blacklistedCerts)
	if err == nil {
		return s.issueRenewCertificate(ctx, nodeID, request.CSR)
	}

	// The remote node didn't present a valid mTLS certificate, so issue a
	// certificate with a new random node ID
	role := api.NodeRole(-1)

	s.mu.Lock()
	if subtle.ConstantTimeCompare([]byte(s.joinTokens.Manager), []byte(request.Token)) == 1 {
		role = api.NodeRoleManager
	} else if subtle.ConstantTimeCompare([]byte(s.joinTokens.Worker), []byte(request.Token)) == 1 {
		role = api.NodeRoleWorker
	}
	s.mu.Unlock()

	if role < 0 {
		return nil, status.Errorf(codes.InvalidArgument, "A valid join token is necessary to join this cluster")
	}

	// Max number of collisions of ID or CN to tolerate before giving up
	maxRetries := 3
	// Generate a random ID for this new node
	for i := 0; ; i++ {
		nodeID = identity.NewID()

		// Create a new node
		err := s.store.Update(func(tx store.Tx) error {
			node := &api.Node{
				Role: role,
				ID:   nodeID,
				Certificate: api.Certificate{
					CSR:  request.CSR,
					CN:   nodeID,
					Role: role,
					Status: api.IssuanceStatus{
						State: api.IssuanceStatePending,
					},
				},
				Spec: api.NodeSpec{
					DesiredRole:  role,
					Membership:   api.NodeMembershipAccepted,
					Availability: request.Availability,
				},
			}
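			// clusters[0] is assumed to be valid here: nodes can only join once the
			// default cluster object exists, so the lookup above should have found it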
			node.VXLANUDPPort = clusters[0].VXLANUDPPort
			return store.CreateNode(tx, node)
		})
		if err == nil {
			log.G(ctx).WithFields(logrus.Fields{
				"node.id":   nodeID,
				"node.role": role,
				"method":    "IssueNodeCertificate",
			}).Debugf("new certificate entry added")
			break
		}
		if err != store.ErrExist {
			return nil, err
		}
		if i == maxRetries {
			return nil, err
		}
		log.G(ctx).WithFields(logrus.Fields{
			"node.id":   nodeID,
			"node.role": role,
			"method":    "IssueNodeCertificate",
		}).Errorf("randomly generated node ID collided with an existing one - retrying")
	}

	return &api.IssueNodeCertificateResponse{
		NodeID:         nodeID,
		NodeMembership: api.NodeMembershipAccepted,
	}, nil
}
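
// joinFlowSketch is a hypothetical sketch of the join flow from a caller's
// perspective: generate a fresh CSR (GenerateNewCSR is assumed to be the
// swarmkit helper returning a CSR and private key), present it with a join
// token, and receive the randomly generated node ID. The actual signing
// happens asynchronously in the Run loop; NodeCertificateStatus can be used
// to wait for it, as sketched above.
func joinFlowSketch(ctx context.Context, srv *Server, joinToken string) (string, error) {
	csr, _, err := GenerateNewCSR()
	if err != nil {
		return "", err
	}
	resp, err := srv.IssueNodeCertificate(ctx, &api.IssueNodeCertificateRequest{
		CSR:   csr,
		Token: joinToken,
	})
	if err != nil {
		return "", err
	}
	return resp.NodeID, nil
}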

// issueRenewCertificate receives a nodeID and a CSR and modifies the node's certificate entry with the new CSR
// and changes the state to RENEW, so it can be picked up and signed by the signing reconciliation loop
func (s *Server) issueRenewCertificate(ctx context.Context, nodeID string, csr []byte) (*api.IssueNodeCertificateResponse, error) {
	var (
		cert api.Certificate
		node *api.Node
	)
	err := s.store.Update(func(tx store.Tx) error {
		// Attempt to retrieve the node with nodeID
		node = store.GetNode(tx, nodeID)
		if node == nil {
			log.G(ctx).WithFields(logrus.Fields{
				"node.id": nodeID,
				"method":  "issueRenewCertificate",
			}).Warnf("node does not exist")
			// If this node doesn't exist, we shouldn't be renewing a certificate for it
			return status.Errorf(codes.NotFound, "node %s not found when attempting to renew certificate", nodeID)
		}

		// Create a new Certificate entry for this node with the new CSR and a RENEW state
		cert = api.Certificate{
			CSR:  csr,
			CN:   node.ID,
			Role: node.Role,
			Status: api.IssuanceStatus{
				State: api.IssuanceStateRenew,
			},
		}

		node.Certificate = cert
		return store.UpdateNode(tx, node)
	})
	if err != nil {
		return nil, err
	}

	log.G(ctx).WithFields(logrus.Fields{
		"cert.cn":   cert.CN,
		"cert.role": cert.Role,
		"method":    "issueRenewCertificate",
	}).Debugf("node certificate updated")

	return &api.IssueNodeCertificateResponse{
		NodeID:         nodeID,
		NodeMembership: node.Spec.Membership,
	}, nil
}

// GetRootCACertificate returns the certificate of the Root CA. It is used as a convenience for distributing
// the root of trust for the swarm. Clients should use the CA hash to verify that they aren't the
// target of a MITM attack; if they fail to do so, node bootstrap works with TOFU semantics.
func (s *Server) GetRootCACertificate(ctx context.Context, request *api.GetRootCACertificateRequest) (*api.GetRootCACertificateResponse, error) {
	log.G(ctx).WithFields(logrus.Fields{
		"method": "GetRootCACertificate",
	}).Debugf("serving root CA certificate")

	s.signingMu.Lock()
	defer s.signingMu.Unlock()

	return &api.GetRootCACertificateResponse{
		Certificate: s.localRootCA.Certs,
	}, nil
}
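
// pinnedRootCASketch is a hypothetical sketch of the client-side check the
// comment above alludes to: compare the served root CA certificate against
// pinned PEM material before trusting it. (Real clients compare a digest
// derived from the join token rather than the full PEM.)
func pinnedRootCASketch(ctx context.Context, srv *Server, pinnedPEM []byte) ([]byte, error) {
	resp, err := srv.GetRootCACertificate(ctx, &api.GetRootCACertificateRequest{})
	if err != nil {
		return nil, err
	}
	if subtle.ConstantTimeCompare(NormalizePEMs(resp.Certificate), NormalizePEMs(pinnedPEM)) != 1 {
		return nil, errors.New("root CA certificate does not match the pinned certificate")
	}
	return resp.Certificate, nil
}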

// Run runs the CA signer main loop.
// The CA signer can be stopped by cancelling ctx or by calling Stop().
func (s *Server) Run(ctx context.Context) error {
	s.mu.Lock()
	if s.isRunning() {
		s.mu.Unlock()
		return errors.New("CA signer is already running")
	}
	s.wg.Add(1)
	s.ctx, s.cancel = context.WithCancel(log.WithModule(ctx, "ca"))
	ctx = s.ctx
	s.mu.Unlock()
	defer s.wg.Done()
	defer func() {
		s.mu.Lock()
		s.mu.Unlock()
	}()

	// Retrieve the channels to keep track of changes in the cluster
	// Retrieve all the currently registered nodes
	var (
		nodes   []*api.Node
		cluster *api.Cluster
		err     error
	)
	updates, cancel, err := store.ViewAndWatch(
		s.store,
		func(readTx store.ReadTx) error {
			cluster = store.GetCluster(readTx, s.clusterID)
			if cluster == nil {
				return errors.New("could not find cluster object")
			}
			nodes, err = store.FindNodes(readTx, store.All)
			return err
		},
		api.EventCreateNode{},
		api.EventUpdateNode{},
		api.EventDeleteNode{},
		api.EventUpdateCluster{
			Cluster: &api.Cluster{ID: s.clusterID},
			Checks:  []api.ClusterCheckFunc{api.ClusterCheckID},
		},
	)

	// call once to ensure that the join tokens and local/external CA signer are always set
	rootReconciler := &rootRotationReconciler{
		ctx:                 log.WithField(ctx, "method", "(*Server).rootRotationReconciler"),
		clusterID:           s.clusterID,
		store:               s.store,
		batchUpdateInterval: s.rootReconciliationRetryInterval,
	}

	s.UpdateRootCA(ctx, cluster, rootReconciler)

	// Do this after UpdateRootCA has been called, so Ready() and isReadyLocked never report
	// readiness without the join tokens and external CA/security config's root CA being set correctly
	s.mu.Lock()
	close(s.started)
	s.mu.Unlock()

	if err != nil {
		log.G(ctx).WithFields(logrus.Fields{
			"method": "(*Server).Run",
		}).WithError(err).Errorf("snapshot store view failed")
		return err
	}
	defer cancel()

	// We might have missed some updates if there was a leader election,
	// so let's pick up the slack.
	if err := s.reconcileNodeCertificates(ctx, nodes); err != nil {
		// We don't return here because that means the Run loop would
		// never run. Log an error instead.
		log.G(ctx).WithFields(logrus.Fields{
			"method": "(*Server).Run",
		}).WithError(err).Errorf("error attempting to reconcile certificates")
	}

	ticker := time.NewTicker(s.reconciliationRetryInterval)
	defer ticker.Stop()

	externalTLSCredsChange, externalTLSWatchCancel := s.securityConfig.Watch()
	defer externalTLSWatchCancel()

	// Watch for new nodes being created, new nodes being updated, and changes
	// to the cluster
	for {
		select {
		case <-ctx.Done():
			return nil
		default:
		}

		select {
		case event := <-updates:
			switch v := event.(type) {
			case api.EventCreateNode:
				s.evaluateAndSignNodeCert(ctx, v.Node)
				rootReconciler.UpdateNode(v.Node)
			case api.EventUpdateNode:
				// If this certificate is already at a final state
				// no need to evaluate and sign it.
				if !isFinalState(v.Node.Certificate.Status) {
					s.evaluateAndSignNodeCert(ctx, v.Node)
				}
				rootReconciler.UpdateNode(v.Node)
			case api.EventDeleteNode:
				rootReconciler.DeleteNode(v.Node)
			case api.EventUpdateCluster:
				if v.Cluster.ID == s.clusterID {
					s.UpdateRootCA(ctx, v.Cluster, rootReconciler)
				}
			}
		case <-externalTLSCredsChange:
			// The TLS certificates can rotate independently of the root CA (and hence which roots the
			// external CA trusts) and external CA URLs.  It's possible that the root CA update is received
			// before the external TLS cred change notification.  During that period, it is possible that
			// the TLS creds will expire or otherwise fail to authorize against external CAs.  However, in
			// that case signing will just fail with a recoverable connectivity error - the state of the
			// certificate issuance is left as pending, and on the next tick, the server will try to sign
			// all nodes with pending certs again (by which time the TLS cred change will have been
			// received).

			// Note that if the external CA changes, the new external CA *MUST* trust the current server's
			// certificate issuer, and this server's certificates should not be extremely close to expiry,
			// otherwise this server would not be able to get new TLS certificates and will no longer be
			// able to function.
			s.signingMu.Lock()
			s.externalCA.UpdateTLSConfig(NewExternalCATLSConfig(
				s.securityConfig.ClientTLSCreds.Config().Certificates, s.externalCAPool))
			s.signingMu.Unlock()
		case <-ticker.C:
			for _, node := range s.pending {
				if err := s.evaluateAndSignNodeCert(ctx, node); err != nil {
					// If this sign operation did not succeed, the rest are
					// unlikely to. Yield so that we don't hammer an external CA.
					// Since the map iteration order is randomized, there is no
					// risk of getting stuck on a problematic CSR.
					break
				}
			}
		case <-ctx.Done():
			return nil
		}
	}
}

// Stop stops the CA and closes all gRPC streams.
func (s *Server) Stop() error {
	s.mu.Lock()

	if !s.isRunning() {
		s.mu.Unlock()
		return errors.New("CA signer is already stopped")
	}
	s.cancel()
	s.started = make(chan struct{})
	s.joinTokens = nil
	s.mu.Unlock()

	// Wait for Run to complete
	s.wg.Wait()

	return nil
}

// Ready returns a channel that is closed when the server is ready to serve.
func (s *Server) Ready() <-chan struct{} {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.started
}

func (s *Server) isRunningLocked() (context.Context, error) {
	s.mu.Lock()
	if !s.isRunning() {
		s.mu.Unlock()
		return nil, status.Errorf(codes.Aborted, "CA signer is stopped")
	}
	ctx := s.ctx
	s.mu.Unlock()
	return ctx, nil
}

func (s *Server) isReadyLocked() error {
	s.mu.Lock()
	defer s.mu.Unlock()
	if !s.isRunning() {
		return status.Errorf(codes.Aborted, "CA signer is stopped")
	}
	if s.joinTokens == nil {
		return status.Errorf(codes.Aborted, "CA signer is still starting")
	}
	return nil
}

func (s *Server) isRunning() bool {
	if s.ctx == nil {
		return false
	}
	select {
	case <-s.ctx.Done():
		return false
	default:
	}
	return true
}

// filterExternalCAURLS returns a list of external CA URLs filtered by the desired cert.
func filterExternalCAURLS(ctx context.Context, desiredCert, defaultCert []byte, apiExternalCAs []*api.ExternalCA) (urls []string) {
	desiredCert = NormalizePEMs(desiredCert)

	// TODO(aaronl): In the future, this will be abstracted with an ExternalCA interface that has different
	// implementations for different CA types. At the moment, only CFSSL is supported.
	for i, extCA := range apiExternalCAs {
		// We want to support old external CA specifications which did not have a CA cert.  If there is no cert specified,
		// we assume it uses the default cert (the current root CA cert)
		certForExtCA := extCA.CACert
		if len(certForExtCA) == 0 {
			certForExtCA = defaultCert
		}
		certForExtCA = NormalizePEMs(certForExtCA)
		if extCA.Protocol != api.ExternalCA_CAProtocolCFSSL {
			log.G(ctx).Debugf("skipping external CA %d (url: %s) due to unknown protocol type", i, extCA.URL)
			continue
		}
		if !bytes.Equal(certForExtCA, desiredCert) {
			log.G(ctx).Debugf("skipping external CA %d (url: %s) because it has the wrong CA cert", i, extCA.URL)
			continue
		}
		urls = append(urls, extCA.URL)
	}
	return
}
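
// filterSketch is a hypothetical illustration of filterExternalCAURLS: only
// CFSSL entries whose CA cert matches the desired cert survive, and entries
// with no CA cert fall back to defaultCert. The URLs are placeholders.
func filterSketch(ctx context.Context, desired, defaultCert []byte) []string {
	cas := []*api.ExternalCA{
		// kept iff its cert matches desired after PEM normalization
		{Protocol: api.ExternalCA_CAProtocolCFSSL, URL: "https://ca1.example.com/sign", CACert: desired},
		// no CACert: treated as defaultCert, kept iff defaultCert matches desired
		{Protocol: api.ExternalCA_CAProtocolCFSSL, URL: "https://ca2.example.com/sign"},
	}
	return filterExternalCAURLS(ctx, desired, defaultCert, cas)
}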

// UpdateRootCA is called when there are cluster changes, and it ensures that the local RootCA is
// always aware of changes to the cluster-configured certificate expiry and the Root CA key material -
// it can be called by anything that needs to update the root CA material
func (s *Server) UpdateRootCA(ctx context.Context, cluster *api.Cluster, reconciler *rootRotationReconciler) error {
	s.mu.Lock()
	s.joinTokens = cluster.RootCA.JoinTokens.Copy()
	s.mu.Unlock()
	rCA := cluster.RootCA.Copy()
	if reconciler != nil {
		reconciler.UpdateRootCA(rCA)
	}

	s.signingMu.Lock()
	defer s.signingMu.Unlock()
	firstSeenCluster := s.lastSeenClusterRootCA == nil && s.lastSeenExternalCAs == nil
	rootCAChanged := len(rCA.CACert) != 0 && !equality.RootCAEqualStable(s.lastSeenClusterRootCA, rCA)
	externalCAChanged := !equality.ExternalCAsEqualStable(s.lastSeenExternalCAs, cluster.Spec.CAConfig.ExternalCAs)
	ctx = log.WithLogger(ctx, log.G(ctx).WithFields(logrus.Fields{
		"cluster.id": cluster.ID,
		"method":     "(*Server).UpdateRootCA",
	}))

	if rootCAChanged {
		setOrUpdate := "set"
		if !firstSeenCluster {
			log.G(ctx).Debug("Updating signing root CA and external CA due to change in cluster Root CA")
			setOrUpdate = "updated"
		}
		expiry := DefaultNodeCertExpiration
		if cluster.Spec.CAConfig.NodeCertExpiry != nil {
			// NodeCertExpiry exists, let's try to parse the duration out of it
			clusterExpiry, err := gogotypes.DurationFromProto(cluster.Spec.CAConfig.NodeCertExpiry)
			if err != nil {
				log.G(ctx).WithError(err).Warn("failed to parse certificate expiration, using default")
			} else {
				// We were able to successfully parse the expiration out of the cluster.
				expiry = clusterExpiry
			}
		} else {
			// NodeCertExpiry seems to be nil
			log.G(ctx).Warn("no certificate expiration specified, using default")
		}
		// Attempt to update our local RootCA with the new parameters
		updatedRootCA, err := RootCAFromAPI(rCA, expiry)
		if err != nil {
			return errors.Wrap(err, "invalid Root CA object in cluster")
		}

		s.localRootCA = &updatedRootCA
		s.externalCAPool = updatedRootCA.Pool
		externalCACert := rCA.CACert
		if rCA.RootRotation != nil {
			externalCACert = rCA.RootRotation.CACert
			// the external CA has to trust the new CA cert
			s.externalCAPool = x509.NewCertPool()
			s.externalCAPool.AppendCertsFromPEM(rCA.CACert)
			s.externalCAPool.AppendCertsFromPEM(rCA.RootRotation.CACert)
		}
		s.lastSeenExternalCAs = cluster.Spec.CAConfig.Copy().ExternalCAs
		urls := filterExternalCAURLS(ctx, externalCACert, rCA.CACert, s.lastSeenExternalCAs)
		// Replace the external CA with the relevant intermediates, URLS, and TLS config
		s.externalCA = NewExternalCA(updatedRootCA.Intermediates,
			NewExternalCATLSConfig(s.securityConfig.ClientTLSCreds.Config().Certificates, s.externalCAPool), urls...)

		// only update the server cache if we've successfully updated the root CA
		log.G(ctx).Debugf("Root CA %s successfully", setOrUpdate)
		s.lastSeenClusterRootCA = rCA
	} else if externalCAChanged {
		// we want to update only if the external CA URLs have changed, since if the root CA has changed we have
		// already run similar logic
		if !firstSeenCluster {
			log.G(ctx).Debug("Updating security config external CA URLs due to change in cluster spec's list of external CAs")
		}
		wantedExternalCACert := rCA.CACert // we want to only add external CA URLs that use this cert
		if rCA.RootRotation != nil {
			// we're rotating to a new root, so we only want external CAs with the new root cert
			wantedExternalCACert = rCA.RootRotation.CACert
		}
		// Update our external CA with the list of External CA URLs from the new cluster state
		s.lastSeenExternalCAs = cluster.Spec.CAConfig.Copy().ExternalCAs
		urls := filterExternalCAURLS(ctx, wantedExternalCACert, rCA.CACert, s.lastSeenExternalCAs)
		s.externalCA.UpdateURLs(urls...)
	}
	return nil
}

// evaluateAndSignNodeCert implements the logic that decides which certificates to sign
func (s *Server) evaluateAndSignNodeCert(ctx context.Context, node *api.Node) error {
	// If the desired membership and actual state are in sync, there's
	// nothing to do.
	certState := node.Certificate.Status.State
	if node.Spec.Membership == api.NodeMembershipAccepted &&
		(certState == api.IssuanceStateIssued || certState == api.IssuanceStateRotate) {
		return nil
	}

	// If the certificate state is renew, then it is a server-side accepted cert (a cert renewal)
	if certState == api.IssuanceStateRenew {
		return s.signNodeCert(ctx, node)
	}

	// Sign this certificate if a user explicitly changed it to Accepted, and
	// the certificate is in pending state
	if node.Spec.Membership == api.NodeMembershipAccepted && certState == api.IssuanceStatePending {
		return s.signNodeCert(ctx, node)
	}

	return nil
}

// signNodeCert does the bulk of the work for signing a certificate
func (s *Server) signNodeCert(ctx context.Context, node *api.Node) error {
	s.signingMu.Lock()
	rootCA := s.localRootCA
	externalCA := s.externalCA
	s.signingMu.Unlock()

	node = node.Copy()
	nodeID := node.ID
	// Convert the role from proto format
	role, err := ParseRole(node.Certificate.Role)
	if err != nil {
		log.G(ctx).WithFields(logrus.Fields{
			"node.id": node.ID,
			"method":  "(*Server).signNodeCert",
		}).WithError(err).Errorf("failed to parse role")
		return errors.New("failed to parse role")
	}

	s.pending[node.ID] = node

	// Attempt to sign the CSR
	var (
		rawCSR = node.Certificate.CSR
		cn     = node.Certificate.CN
		ou     = role
		org    = s.clusterID
	)

	// Try using the external CA first.
	cert, err := externalCA.Sign(ctx, PrepareCSR(rawCSR, cn, ou, org))
	if err == ErrNoExternalCAURLs {
		// No external CA servers configured. Try using the local CA.
		cert, err = rootCA.ParseValidateAndSignCSR(rawCSR, cn, ou, org)
	}

	if err != nil {
		log.G(ctx).WithFields(logrus.Fields{
			"node.id": node.ID,
			"method":  "(*Server).signNodeCert",
		}).WithError(err).Errorf("failed to sign CSR")

		// If the current state is already Failed, no need to change it
		if node.Certificate.Status.State == api.IssuanceStateFailed {
			delete(s.pending, node.ID)
			return errors.New("failed to sign CSR")
		}

		if _, ok := err.(recoverableErr); ok {
			// Return without changing the state of the certificate. We may
			// retry signing it in the future.
			return errors.New("failed to sign CSR")
		}

		// We failed to sign this CSR, change the state to FAILED
		err = s.store.Update(func(tx store.Tx) error {
			node := store.GetNode(tx, nodeID)
			if node == nil {
				return errors.Errorf("node %s not found", nodeID)
			}

			node.Certificate.Status = api.IssuanceStatus{
				State: api.IssuanceStateFailed,
				Err:   err.Error(),
			}

			return store.UpdateNode(tx, node)
		})
		if err != nil {
			log.G(ctx).WithFields(logrus.Fields{
				"node.id": nodeID,
				"method":  "(*Server).signNodeCert",
			}).WithError(err).Errorf("transaction failed when setting state to FAILED")
		}

		delete(s.pending, node.ID)
		return errors.New("failed to sign CSR")
	}

	// We were able to successfully sign the new CSR. Let's try to update the node store
	for {
		err = s.store.Update(func(tx store.Tx) error {
			node.Certificate.Certificate = cert
			node.Certificate.Status = api.IssuanceStatus{
				State: api.IssuanceStateIssued,
			}

			err := store.UpdateNode(tx, node)
			if err != nil {
				node = store.GetNode(tx, nodeID)
				if node == nil {
					err = errors.Errorf("node %s does not exist", nodeID)
				}
			}
			return err
		})
		if err == nil {
			log.G(ctx).WithFields(logrus.Fields{
				"node.id":   node.ID,
				"node.role": node.Certificate.Role,
				"method":    "(*Server).signNodeCert",
			}).Debugf("certificate issued")
			delete(s.pending, node.ID)
			break
		}
		if err == store.ErrSequenceConflict {
			continue
		}

		log.G(ctx).WithFields(logrus.Fields{
			"node.id": nodeID,
			"method":  "(*Server).signNodeCert",
		}).WithError(err).Errorf("transaction failed")
		return errors.New("transaction failed")
	}
	return nil
}

// reconcileNodeCertificates is a helper method that calls evaluateAndSignNodeCert on all the
// nodes.
func (s *Server) reconcileNodeCertificates(ctx context.Context, nodes []*api.Node) error {
	for _, node := range nodes {
		s.evaluateAndSignNodeCert(ctx, node)
	}

	return nil
}

// Issued, failed, and rotate are the current final certificate states
func isFinalState(status api.IssuanceStatus) bool {
	return status.State == api.IssuanceStateIssued || status.State == api.IssuanceStateFailed ||
		status.State == api.IssuanceStateRotate
}

// RootCAFromAPI creates a RootCA object from an api.RootCA object
func RootCAFromAPI(apiRootCA *api.RootCA, expiry time.Duration) (RootCA, error) {
	var intermediates []byte
	signingCert := apiRootCA.CACert
	signingKey := apiRootCA.CAKey
	if apiRootCA.RootRotation != nil {
		signingCert = apiRootCA.RootRotation.CrossSignedCACert
		signingKey = apiRootCA.RootRotation.CAKey
		intermediates = apiRootCA.RootRotation.CrossSignedCACert
	}
	if signingKey == nil {
		signingCert = nil
	}
	return NewRootCA(apiRootCA.CACert, signingCert, signingKey, expiry, intermediates)
}