supervisor.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. package origin
  2. import (
  3. "context"
  4. "errors"
  5. "net"
  6. "time"
  7. "github.com/google/uuid"
  8. "github.com/cloudflare/cloudflared/buffer"
  9. "github.com/cloudflare/cloudflared/connection"
  10. "github.com/cloudflare/cloudflared/edgediscovery"
  11. "github.com/cloudflare/cloudflared/h2mux"
  12. "github.com/cloudflare/cloudflared/logger"
  13. "github.com/cloudflare/cloudflared/signal"
  14. tunnelpogs "github.com/cloudflare/cloudflared/tunnelrpc/pogs"
  15. )
  16. const (
  17. // Waiting time before retrying a failed tunnel connection
  18. tunnelRetryDuration = time.Second * 10
  19. // SRV record resolution TTL
  20. resolveTTL = time.Hour
  21. // Interval between registering new tunnels
  22. registrationInterval = time.Second
  23. subsystemRefreshAuth = "refresh_auth"
  24. // Maximum exponent for 'Authenticate' exponential backoff
  25. refreshAuthMaxBackoff = 10
  26. // Waiting time before retrying a failed 'Authenticate' connection
  27. refreshAuthRetryDuration = time.Second * 10
  28. // Maximum time to make an Authenticate RPC
  29. authTokenTimeout = time.Second * 30
  30. )
  31. var (
  32. errEventDigestUnset = errors.New("event digest unset")
  33. )
  34. // Supervisor manages non-declarative tunnels. Establishes TCP connections with the edge, and
  35. // reconnects them if they disconnect.
  36. type Supervisor struct {
  37. cloudflaredUUID uuid.UUID
  38. config *TunnelConfig
  39. edgeIPs *edgediscovery.Edge
  40. lastResolve time.Time
  41. resolverC chan resolveResult
  42. tunnelErrors chan tunnelError
  43. tunnelsConnecting map[int]chan struct{}
  44. // nextConnectedIndex and nextConnectedSignal are used to wait for all
  45. // currently-connecting tunnels to finish connecting so we can reset backoff timer
  46. nextConnectedIndex int
  47. nextConnectedSignal chan struct{}
  48. logger logger.Service
  49. reconnectCredentialManager *reconnectCredentialManager
  50. bufferPool *buffer.Pool
  51. }
  52. type resolveResult struct {
  53. err error
  54. }
  55. type tunnelError struct {
  56. index int
  57. addr *net.TCPAddr
  58. err error
  59. }
  60. func NewSupervisor(config *TunnelConfig, cloudflaredUUID uuid.UUID) (*Supervisor, error) {
  61. var (
  62. edgeIPs *edgediscovery.Edge
  63. err error
  64. )
  65. if len(config.EdgeAddrs) > 0 {
  66. edgeIPs, err = edgediscovery.StaticEdge(config.Logger, config.EdgeAddrs)
  67. } else {
  68. edgeIPs, err = edgediscovery.ResolveEdge(config.Logger)
  69. }
  70. if err != nil {
  71. return nil, err
  72. }
  73. return &Supervisor{
  74. cloudflaredUUID: cloudflaredUUID,
  75. config: config,
  76. edgeIPs: edgeIPs,
  77. tunnelErrors: make(chan tunnelError),
  78. tunnelsConnecting: map[int]chan struct{}{},
  79. logger: config.Logger,
  80. reconnectCredentialManager: newReconnectCredentialManager(metricsNamespace, tunnelSubsystem, config.HAConnections),
  81. bufferPool: buffer.NewPool(512 * 1024),
  82. }, nil
  83. }
  84. func (s *Supervisor) Run(ctx context.Context, connectedSignal *signal.Signal, reconnectCh chan ReconnectSignal) error {
  85. logger := s.config.Logger
  86. if err := s.initialize(ctx, connectedSignal, reconnectCh); err != nil {
  87. return err
  88. }
  89. var tunnelsWaiting []int
  90. tunnelsActive := s.config.HAConnections
  91. backoff := BackoffHandler{MaxRetries: s.config.Retries, BaseTime: tunnelRetryDuration, RetryForever: true}
  92. var backoffTimer <-chan time.Time
  93. refreshAuthBackoff := &BackoffHandler{MaxRetries: refreshAuthMaxBackoff, BaseTime: refreshAuthRetryDuration, RetryForever: true}
  94. var refreshAuthBackoffTimer <-chan time.Time
  95. if s.config.UseReconnectToken {
  96. if timer, err := s.reconnectCredentialManager.RefreshAuth(ctx, refreshAuthBackoff, s.authenticate); err == nil {
  97. refreshAuthBackoffTimer = timer
  98. } else {
  99. logger.Errorf("supervisor: initial refreshAuth failed, retrying in %v: %s", refreshAuthRetryDuration, err)
  100. refreshAuthBackoffTimer = time.After(refreshAuthRetryDuration)
  101. }
  102. }
  103. for {
  104. select {
  105. // Context cancelled
  106. case <-ctx.Done():
  107. for tunnelsActive > 0 {
  108. <-s.tunnelErrors
  109. tunnelsActive--
  110. }
  111. return nil
  112. // startTunnel returned with error
  113. // (note that this may also be caused by context cancellation)
  114. case tunnelError := <-s.tunnelErrors:
  115. tunnelsActive--
  116. if tunnelError.err != nil {
  117. logger.Infof("supervisor: Tunnel disconnected due to error: %s", tunnelError.err)
  118. tunnelsWaiting = append(tunnelsWaiting, tunnelError.index)
  119. s.waitForNextTunnel(tunnelError.index)
  120. if backoffTimer == nil {
  121. backoffTimer = backoff.BackoffTimer()
  122. }
  123. // Previously we'd mark the edge address as bad here, but now we'll just silently use
  124. // another.
  125. }
  126. // Backoff was set and its timer expired
  127. case <-backoffTimer:
  128. backoffTimer = nil
  129. for _, index := range tunnelsWaiting {
  130. go s.startTunnel(ctx, index, s.newConnectedTunnelSignal(index), reconnectCh)
  131. }
  132. tunnelsActive += len(tunnelsWaiting)
  133. tunnelsWaiting = nil
  134. // Time to call Authenticate
  135. case <-refreshAuthBackoffTimer:
  136. newTimer, err := s.reconnectCredentialManager.RefreshAuth(ctx, refreshAuthBackoff, s.authenticate)
  137. if err != nil {
  138. logger.Errorf("supervisor: Authentication failed: %s", err)
  139. // Permanent failure. Leave the `select` without setting the
  140. // channel to be non-null, so we'll never hit this case of the `select` again.
  141. continue
  142. }
  143. refreshAuthBackoffTimer = newTimer
  144. // Tunnel successfully connected
  145. case <-s.nextConnectedSignal:
  146. if !s.waitForNextTunnel(s.nextConnectedIndex) && len(tunnelsWaiting) == 0 {
  147. // No more tunnels outstanding, clear backoff timer
  148. backoff.SetGracePeriod()
  149. }
  150. // DNS resolution returned
  151. case result := <-s.resolverC:
  152. s.lastResolve = time.Now()
  153. s.resolverC = nil
  154. if result.err == nil {
  155. logger.Debug("supervisor: Service discovery refresh complete")
  156. } else {
  157. logger.Errorf("supervisor: Service discovery error: %s", result.err)
  158. }
  159. }
  160. }
  161. }
  162. // Returns nil if initialization succeeded, else the initialization error.
  163. func (s *Supervisor) initialize(ctx context.Context, connectedSignal *signal.Signal, reconnectCh chan ReconnectSignal) error {
  164. logger := s.logger
  165. s.lastResolve = time.Now()
  166. availableAddrs := int(s.edgeIPs.AvailableAddrs())
  167. if s.config.HAConnections > availableAddrs {
  168. logger.Infof("You requested %d HA connections but I can give you at most %d.", s.config.HAConnections, availableAddrs)
  169. s.config.HAConnections = availableAddrs
  170. }
  171. go s.startFirstTunnel(ctx, connectedSignal, reconnectCh)
  172. select {
  173. case <-ctx.Done():
  174. <-s.tunnelErrors
  175. return ctx.Err()
  176. case tunnelError := <-s.tunnelErrors:
  177. return tunnelError.err
  178. case <-connectedSignal.Wait():
  179. }
  180. // At least one successful connection, so start the rest
  181. for i := 1; i < s.config.HAConnections; i++ {
  182. ch := signal.New(make(chan struct{}))
  183. go s.startTunnel(ctx, i, ch, reconnectCh)
  184. time.Sleep(registrationInterval)
  185. }
  186. return nil
  187. }
  188. // startTunnel starts the first tunnel connection. The resulting error will be sent on
  189. // s.tunnelErrors. It will send a signal via connectedSignal if registration succeed
  190. func (s *Supervisor) startFirstTunnel(ctx context.Context, connectedSignal *signal.Signal, reconnectCh chan ReconnectSignal) {
  191. var (
  192. addr *net.TCPAddr
  193. err error
  194. )
  195. const firstConnIndex = 0
  196. defer func() {
  197. s.tunnelErrors <- tunnelError{index: firstConnIndex, addr: addr, err: err}
  198. }()
  199. addr, err = s.edgeIPs.GetAddr(firstConnIndex)
  200. if err != nil {
  201. return
  202. }
  203. err = ServeTunnelLoop(ctx, s.reconnectCredentialManager, s.config, addr, firstConnIndex, connectedSignal, s.cloudflaredUUID, s.bufferPool, reconnectCh)
  204. // If the first tunnel disconnects, keep restarting it.
  205. edgeErrors := 0
  206. for s.unusedIPs() {
  207. if ctx.Err() != nil {
  208. return
  209. }
  210. switch err.(type) {
  211. case nil:
  212. return
  213. // try the next address if it was a dialError(network problem) or
  214. // dupConnRegisterTunnelError
  215. case connection.DialError, dupConnRegisterTunnelError:
  216. edgeErrors++
  217. default:
  218. return
  219. }
  220. if edgeErrors >= 2 {
  221. addr, err = s.edgeIPs.GetDifferentAddr(firstConnIndex)
  222. if err != nil {
  223. return
  224. }
  225. }
  226. err = ServeTunnelLoop(ctx, s.reconnectCredentialManager, s.config, addr, firstConnIndex, connectedSignal, s.cloudflaredUUID, s.bufferPool, reconnectCh)
  227. }
  228. }
  229. // startTunnel starts a new tunnel connection. The resulting error will be sent on
  230. // s.tunnelErrors.
  231. func (s *Supervisor) startTunnel(ctx context.Context, index int, connectedSignal *signal.Signal, reconnectCh chan ReconnectSignal) {
  232. var (
  233. addr *net.TCPAddr
  234. err error
  235. )
  236. defer func() {
  237. s.tunnelErrors <- tunnelError{index: index, addr: addr, err: err}
  238. }()
  239. addr, err = s.edgeIPs.GetDifferentAddr(index)
  240. if err != nil {
  241. return
  242. }
  243. err = ServeTunnelLoop(ctx, s.reconnectCredentialManager, s.config, addr, uint8(index), connectedSignal, s.cloudflaredUUID, s.bufferPool, reconnectCh)
  244. }
  245. func (s *Supervisor) newConnectedTunnelSignal(index int) *signal.Signal {
  246. sig := make(chan struct{})
  247. s.tunnelsConnecting[index] = sig
  248. s.nextConnectedSignal = sig
  249. s.nextConnectedIndex = index
  250. return signal.New(sig)
  251. }
  252. func (s *Supervisor) waitForNextTunnel(index int) bool {
  253. delete(s.tunnelsConnecting, index)
  254. s.nextConnectedSignal = nil
  255. for k, v := range s.tunnelsConnecting {
  256. s.nextConnectedIndex = k
  257. s.nextConnectedSignal = v
  258. return true
  259. }
  260. return false
  261. }
  262. func (s *Supervisor) unusedIPs() bool {
  263. return s.edgeIPs.AvailableAddrs() > s.config.HAConnections
  264. }
  265. func (s *Supervisor) authenticate(ctx context.Context, numPreviousAttempts int) (tunnelpogs.AuthOutcome, error) {
  266. arbitraryEdgeIP, err := s.edgeIPs.GetAddrForRPC()
  267. if err != nil {
  268. return nil, err
  269. }
  270. edgeConn, err := connection.DialEdge(ctx, dialTimeout, s.config.TlsConfig, arbitraryEdgeIP)
  271. if err != nil {
  272. return nil, err
  273. }
  274. defer edgeConn.Close()
  275. handler := h2mux.MuxedStreamFunc(func(*h2mux.MuxedStream) error {
  276. // This callback is invoked by h2mux when the edge initiates a stream.
  277. return nil // noop
  278. })
  279. muxerConfig := s.config.muxerConfig(handler)
  280. muxer, err := h2mux.Handshake(edgeConn, edgeConn, muxerConfig, s.config.Metrics.activeStreams)
  281. if err != nil {
  282. return nil, err
  283. }
  284. go muxer.Serve(ctx)
  285. defer func() {
  286. // If we don't wait for the muxer shutdown here, edgeConn.Close() runs before the muxer connections are done,
  287. // and the user sees log noise: "error writing data", "connection closed unexpectedly"
  288. <-muxer.Shutdown()
  289. }()
  290. tunnelServer, err := connection.NewRPCClient(ctx, muxer, s.logger, openStreamTimeout)
  291. if err != nil {
  292. return nil, err
  293. }
  294. defer tunnelServer.Close()
  295. const arbitraryConnectionID = uint8(0)
  296. registrationOptions := s.config.RegistrationOptions(arbitraryConnectionID, edgeConn.LocalAddr().String(), s.cloudflaredUUID)
  297. registrationOptions.NumPreviousAttempts = uint8(numPreviousAttempts)
  298. authResponse, err := tunnelServer.Authenticate(
  299. ctx,
  300. s.config.OriginCert,
  301. s.config.Hostname,
  302. registrationOptions,
  303. )
  304. if err != nil {
  305. return nil, err
  306. }
  307. return authResponse.Outcome(), nil
  308. }