metrics.go 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. package metrics
  2. import (
  3. "context"
  4. "fmt"
  5. "net"
  6. "net/http"
  7. _ "net/http/pprof"
  8. "runtime"
  9. "sync"
  10. "time"
  11. "github.com/facebookgo/grace/gracenet"
  12. "github.com/prometheus/client_golang/prometheus"
  13. "github.com/prometheus/client_golang/prometheus/promhttp"
  14. "github.com/rs/zerolog"
  15. "golang.org/x/net/trace"
  16. "github.com/cloudflare/cloudflared/diagnostic"
  17. )
  // Timing defaults used by the metrics server.
  const (
      // startupTime is how long ServeMetrics sleeps after launching the HTTP
      // server goroutine before it starts waiting for the shutdown signal.
      startupTime = 500 * time.Millisecond
      // defaultShutdownTimeout is the graceful-shutdown budget ServeMetrics
      // applies when Config.ShutdownTimeout is left at zero.
      defaultShutdownTimeout = 15 * time.Second
  )

  // Runtime selects which family of bind addresses the metrics server uses by
  // default: "virtual" maps to 0.0.0.0, any other value maps to localhost.
  // This variable is set at compile time to allow the default local address
  // to change.
  var Runtime = "host"
  // GetMetricsDefaultAddress returns the default listen address of the metrics
  // server for the given runtime type. The port is always 0, i.e. "let the OS
  // pick one"; only the host part depends on runtimeType.
  func GetMetricsDefaultAddress(runtimeType string) string {
      // When issuing the diagnostic command we may have to reach a server that
      // is running in a virtual environment, and in that case we must bind to
      // 0.0.0.0, otherwise the server won't be reachable.
      if runtimeType == "virtual" {
          return "0.0.0.0:0"
      }
      // Every other runtime type keeps the server on the loopback interface.
      return "localhost:0"
  }
  // GetMetricsKnownAddresses returns the addresses the metrics server tries to
  // bind to at startup, in order, which gives a semi-deterministic way to know
  // where the server is listening.
  //
  // The host part depends on runtimeType: "virtual" yields 0.0.0.0, any other
  // value yields localhost. The ports (20241-20245) were selected because, as
  // of 2024, they do not collide with any known/registered port according to
  // https://en.wikipedia.org/wiki/List_of_TCP_and_UDP_port_numbers.
  func GetMetricsKnownAddresses(runtimeType string) [5]string {
      // Switch on the argument, not on the package-level Runtime variable, so
      // the function honours its parameter the same way
      // GetMetricsDefaultAddress does.
      switch runtimeType {
      case "virtual":
          return [5]string{"0.0.0.0:20241", "0.0.0.0:20242", "0.0.0.0:20243", "0.0.0.0:20244", "0.0.0.0:20245"}
      default:
          return [5]string{"localhost:20241", "localhost:20242", "localhost:20243", "localhost:20244", "localhost:20245"}
      }
  }
  47. type Config struct {
  48. ReadyServer *ReadyServer
  49. DiagnosticHandler *diagnostic.Handler
  50. QuickTunnelHostname string
  51. Orchestrator orchestrator
  52. ShutdownTimeout time.Duration
  53. }
  // orchestrator is the small slice of behaviour the metrics server needs in
  // order to serve the /config endpoint.
  type orchestrator interface {
      // GetVersionedConfigJSON returns the bytes written verbatim as the
      // /config response body, or an error that is reported as an HTTP 500.
      GetVersionedConfigJSON() ([]byte, error)
  }
  57. func newMetricsHandler(
  58. config Config,
  59. log *zerolog.Logger,
  60. ) *http.ServeMux {
  61. router := http.NewServeMux()
  62. router.Handle("/debug/", http.DefaultServeMux)
  63. router.Handle("/metrics", promhttp.Handler())
  64. router.HandleFunc("/healthcheck", func(w http.ResponseWriter, r *http.Request) {
  65. _, _ = fmt.Fprintf(w, "OK\n")
  66. })
  67. if config.ReadyServer != nil {
  68. router.Handle("/ready", config.ReadyServer)
  69. }
  70. router.HandleFunc("/quicktunnel", func(w http.ResponseWriter, r *http.Request) {
  71. _, _ = fmt.Fprintf(w, `{"hostname":"%s"}`, config.QuickTunnelHostname)
  72. })
  73. if config.Orchestrator != nil {
  74. router.HandleFunc("/config", func(w http.ResponseWriter, r *http.Request) {
  75. json, err := config.Orchestrator.GetVersionedConfigJSON()
  76. if err != nil {
  77. w.WriteHeader(500)
  78. _, _ = fmt.Fprintf(w, "ERR: %v", err)
  79. log.Err(err).Msg("Failed to serve config")
  80. return
  81. }
  82. _, _ = w.Write(json)
  83. })
  84. }
  85. router.HandleFunc("/diag/configuration", config.DiagnosticHandler.ConfigurationHandler)
  86. router.HandleFunc("/diag/tunnel", config.DiagnosticHandler.TunnelStateHandler)
  87. router.HandleFunc("/diag/system", config.DiagnosticHandler.SystemHandler)
  88. return router
  89. }
  90. // CreateMetricsListener will create a new [net.Listener] by using an
  91. // known set of ports when the default address is passed with the fallback
  92. // of choosing a random port when none is available.
  93. //
  94. // In case the provided address is not the default one then it will be used
  95. // as is.
  96. func CreateMetricsListener(listeners *gracenet.Net, laddr string) (net.Listener, error) {
  97. if laddr == GetMetricsDefaultAddress(Runtime) {
  98. // On the presence of the default address select
  99. // a port from the known set of addresses iteratively.
  100. addresses := GetMetricsKnownAddresses(Runtime)
  101. for _, address := range addresses {
  102. listener, err := listeners.Listen("tcp", address)
  103. if err == nil {
  104. return listener, nil
  105. }
  106. }
  107. // When no port is available then bind to a random one
  108. listener, err := listeners.Listen("tcp", laddr)
  109. if err != nil {
  110. return nil, fmt.Errorf("failed to listen to default metrics address: %w", err)
  111. }
  112. return listener, nil
  113. }
  114. // Explicitly got a local address then bind to it
  115. listener, err := listeners.Listen("tcp", laddr)
  116. if err != nil {
  117. return nil, fmt.Errorf("failed to bind to address (%s): %w", laddr, err)
  118. }
  119. return listener, nil
  120. }
  121. func ServeMetrics(
  122. l net.Listener,
  123. ctx context.Context,
  124. config Config,
  125. log *zerolog.Logger,
  126. ) (err error) {
  127. var wg sync.WaitGroup
  128. // Metrics port is privileged, so no need for further access control
  129. trace.AuthRequest = func(*http.Request) (bool, bool) { return true, true }
  130. // TODO: parameterize ReadTimeout and WriteTimeout. The maximum time we can
  131. // profile CPU usage depends on WriteTimeout
  132. h := newMetricsHandler(config, log)
  133. server := &http.Server{
  134. ReadTimeout: 10 * time.Second,
  135. WriteTimeout: 10 * time.Second,
  136. Handler: h,
  137. }
  138. wg.Add(1)
  139. go func() {
  140. defer wg.Done()
  141. err = server.Serve(l)
  142. }()
  143. log.Info().Msgf("Starting metrics server on %s", fmt.Sprintf("%v/metrics", l.Addr()))
  144. // server.Serve will hang if server.Shutdown is called before the server is
  145. // fully started up. So add artificial delay.
  146. time.Sleep(startupTime)
  147. <-ctx.Done()
  148. shutdownTimeout := config.ShutdownTimeout
  149. if shutdownTimeout == 0 {
  150. shutdownTimeout = defaultShutdownTimeout
  151. }
  152. ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout)
  153. _ = server.Shutdown(ctx)
  154. cancel()
  155. wg.Wait()
  156. if err == http.ErrServerClosed {
  157. log.Info().Msg("Metrics server stopped")
  158. return nil
  159. }
  160. log.Err(err).Msg("Metrics server failed")
  161. return err
  162. }
  163. func RegisterBuildInfo(buildType, buildTime, version string) {
  164. buildInfo := prometheus.NewGaugeVec(
  165. prometheus.GaugeOpts{
  166. // Don't namespace build_info, since we want it to be consistent across all Cloudflare services
  167. Name: "build_info",
  168. Help: "Build and version information",
  169. },
  170. []string{"goversion", "type", "revision", "version"},
  171. )
  172. prometheus.MustRegister(buildInfo)
  173. buildInfo.WithLabelValues(runtime.Version(), buildType, buildTime, version).Set(1)
  174. }