metrics.go 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. package metrics
  2. import (
  3. "context"
  4. "fmt"
  5. "net"
  6. "net/http"
  7. _ "net/http/pprof"
  8. "runtime"
  9. "sync"
  10. "time"
  11. "github.com/facebookgo/grace/gracenet"
  12. "github.com/prometheus/client_golang/prometheus"
  13. "github.com/prometheus/client_golang/prometheus/promhttp"
  14. "github.com/rs/zerolog"
  15. "golang.org/x/net/trace"
  16. "github.com/cloudflare/cloudflared/diagnostic"
  17. )
  const (
  	// startupTime is how long ServeMetrics sleeps after launching the server
  	// goroutine before it begins waiting for the shutdown signal.
  	startupTime = 500 * time.Millisecond
  	// defaultShutdownTimeout bounds graceful shutdown when
  	// Config.ShutdownTimeout is left at zero.
  	defaultShutdownTimeout = 15 * time.Second
  )

  // Runtime names the runtime type used to pick the default and well-known
  // metrics addresses. It is set at compile time so that the default local
  // address can change per build.
  var Runtime = "host"
  // GetMetricsDefaultAddress returns the address the metrics server uses when
  // no explicit one is configured. The port is always 0, so binding to this
  // address yields a random free port.
  //
  // For the "virtual" runtime the host is 0.0.0.0: the diagnostic command may
  // have to reach a server running in a virtual environment, and a
  // loopback-only listener would not be reachable from outside it. Every other
  // runtime type binds to localhost.
  func GetMetricsDefaultAddress(runtimeType string) string {
  	if runtimeType == "virtual" {
  		return "0.0.0.0:0"
  	}
  	return "localhost:0"
  }
  // GetMetricsKnownAddresses returns the addresses the metrics server tries to
  // bind to at startup, in order, giving a semi-deterministic way to know where
  // the server is listening.
  //
  // The host part depends on runtimeType: "virtual" yields 0.0.0.0 and any
  // other value yields localhost. The ports (20241-20245) were selected because,
  // as of 2024, they do not collide with any known/registered port according to
  // https://en.wikipedia.org/wiki/List_of_TCP_and_UDP_port_numbers.
  func GetMetricsKnownAddresses(runtimeType string) [5]string {
  	// Dispatch on the argument rather than on the package-level Runtime so
  	// the result reflects what the caller asked for.
  	switch runtimeType {
  	case "virtual":
  		return [5]string{"0.0.0.0:20241", "0.0.0.0:20242", "0.0.0.0:20243", "0.0.0.0:20244", "0.0.0.0:20245"}
  	default:
  		return [5]string{"localhost:20241", "localhost:20242", "localhost:20243", "localhost:20244", "localhost:20245"}
  	}
  }
  47. type Config struct {
  48. ReadyServer *ReadyServer
  49. DiagnosticHandler *diagnostic.Handler
  50. QuickTunnelHostname string
  51. Orchestrator orchestrator
  52. ShutdownTimeout time.Duration
  53. }
  // orchestrator is the slice of the tunnel orchestrator that the metrics
  // server depends on: a source for the versioned configuration served at
  // /config.
  type orchestrator interface {
  	// GetVersionedConfigJSON returns the configuration as bytes that are
  	// written to the response as-is, or an error if it cannot be produced.
  	GetVersionedConfigJSON() ([]byte, error)
  }
  57. func newMetricsHandler(
  58. config Config,
  59. log *zerolog.Logger,
  60. ) *http.ServeMux {
  61. router := http.NewServeMux()
  62. router.Handle("/debug/", http.DefaultServeMux)
  63. router.Handle("/metrics", promhttp.Handler())
  64. router.HandleFunc("/healthcheck", func(w http.ResponseWriter, r *http.Request) {
  65. _, _ = fmt.Fprintf(w, "OK\n")
  66. })
  67. if config.ReadyServer != nil {
  68. router.Handle("/ready", config.ReadyServer)
  69. }
  70. router.HandleFunc("/quicktunnel", func(w http.ResponseWriter, r *http.Request) {
  71. _, _ = fmt.Fprintf(w, `{"hostname":"%s"}`, config.QuickTunnelHostname)
  72. })
  73. if config.Orchestrator != nil {
  74. router.HandleFunc("/config", func(w http.ResponseWriter, r *http.Request) {
  75. json, err := config.Orchestrator.GetVersionedConfigJSON()
  76. if err != nil {
  77. w.WriteHeader(500)
  78. _, _ = fmt.Fprintf(w, "ERR: %v", err)
  79. log.Err(err).Msg("Failed to serve config")
  80. return
  81. }
  82. _, _ = w.Write(json)
  83. })
  84. }
  85. router.HandleFunc("/diag/system", config.DiagnosticHandler.SystemHandler)
  86. return router
  87. }
  88. // CreateMetricsListener will create a new [net.Listener] by using an
  89. // known set of ports when the default address is passed with the fallback
  90. // of choosing a random port when none is available.
  91. //
  92. // In case the provided address is not the default one then it will be used
  93. // as is.
  94. func CreateMetricsListener(listeners *gracenet.Net, laddr string) (net.Listener, error) {
  95. if laddr == GetMetricsDefaultAddress(Runtime) {
  96. // On the presence of the default address select
  97. // a port from the known set of addresses iteratively.
  98. addresses := GetMetricsKnownAddresses(Runtime)
  99. for _, address := range addresses {
  100. listener, err := listeners.Listen("tcp", address)
  101. if err == nil {
  102. return listener, nil
  103. }
  104. }
  105. // When no port is available then bind to a random one
  106. listener, err := listeners.Listen("tcp", laddr)
  107. if err != nil {
  108. return nil, fmt.Errorf("failed to listen to default metrics address: %w", err)
  109. }
  110. return listener, nil
  111. }
  112. // Explicitly got a local address then bind to it
  113. listener, err := listeners.Listen("tcp", laddr)
  114. if err != nil {
  115. return nil, fmt.Errorf("failed to bind to address (%s): %w", laddr, err)
  116. }
  117. return listener, nil
  118. }
  119. func ServeMetrics(
  120. l net.Listener,
  121. ctx context.Context,
  122. config Config,
  123. log *zerolog.Logger,
  124. ) (err error) {
  125. var wg sync.WaitGroup
  126. // Metrics port is privileged, so no need for further access control
  127. trace.AuthRequest = func(*http.Request) (bool, bool) { return true, true }
  128. // TODO: parameterize ReadTimeout and WriteTimeout. The maximum time we can
  129. // profile CPU usage depends on WriteTimeout
  130. h := newMetricsHandler(config, log)
  131. server := &http.Server{
  132. ReadTimeout: 10 * time.Second,
  133. WriteTimeout: 10 * time.Second,
  134. Handler: h,
  135. }
  136. wg.Add(1)
  137. go func() {
  138. defer wg.Done()
  139. err = server.Serve(l)
  140. }()
  141. log.Info().Msgf("Starting metrics server on %s", fmt.Sprintf("%v/metrics", l.Addr()))
  142. // server.Serve will hang if server.Shutdown is called before the server is
  143. // fully started up. So add artificial delay.
  144. time.Sleep(startupTime)
  145. <-ctx.Done()
  146. shutdownTimeout := config.ShutdownTimeout
  147. if shutdownTimeout == 0 {
  148. shutdownTimeout = defaultShutdownTimeout
  149. }
  150. ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout)
  151. _ = server.Shutdown(ctx)
  152. cancel()
  153. wg.Wait()
  154. if err == http.ErrServerClosed {
  155. log.Info().Msg("Metrics server stopped")
  156. return nil
  157. }
  158. log.Err(err).Msg("Metrics server failed")
  159. return err
  160. }
  161. func RegisterBuildInfo(buildType, buildTime, version string) {
  162. buildInfo := prometheus.NewGaugeVec(
  163. prometheus.GaugeOpts{
  164. // Don't namespace build_info, since we want it to be consistent across all Cloudflare services
  165. Name: "build_info",
  166. Help: "Build and version information",
  167. },
  168. []string{"goversion", "type", "revision", "version"},
  169. )
  170. prometheus.MustRegister(buildInfo)
  171. buildInfo.WithLabelValues(runtime.Version(), buildType, buildTime, version).Set(1)
  172. }