diagnostic.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562
  1. package diagnostic
  2. import (
  3. "context"
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "net/url"
  9. "os"
  10. "path/filepath"
  11. "strings"
  12. "sync"
  13. "time"
  14. "github.com/rs/zerolog"
  15. network "github.com/cloudflare/cloudflared/diagnostic/network"
  16. )
  17. const (
  18. taskSuccess = "success"
  19. taskFailure = "failure"
  20. jobReportName = "job report"
  21. tunnelStateJobName = "tunnel state"
  22. systemInformationJobName = "system information"
  23. goroutineJobName = "goroutine profile"
  24. heapJobName = "heap profile"
  25. metricsJobName = "metrics"
  26. logInformationJobName = "log information"
  27. rawNetworkInformationJobName = "raw network information"
  28. networkInformationJobName = "network information"
  29. cliConfigurationJobName = "cli configuration"
  30. configurationJobName = "configuration"
  31. )
  32. // Struct used to hold the results of different routines executing the network collection.
  33. type taskResult struct {
  34. Result string `json:"result,omitempty"`
  35. Err error `json:"error,omitempty"`
  36. path string
  37. }
  38. func (result taskResult) MarshalJSON() ([]byte, error) {
  39. s := map[string]string{
  40. "result": result.Result,
  41. }
  42. if result.Err != nil {
  43. s["error"] = result.Err.Error()
  44. }
  45. return json.Marshal(s)
  46. }
  47. // Struct used to hold the results of different routines executing the network collection.
  48. type networkCollectionResult struct {
  49. name string
  50. info []*network.Hop
  51. raw string
  52. err error
  53. }
  54. // This type represents the most common functions from the diagnostic http client
  55. // functions.
  56. type collectToWriterFunc func(ctx context.Context, writer io.Writer) error
  57. // This type represents the common denominator among all the collection procedures.
  58. type collectFunc func(ctx context.Context) (string, error)
  59. // collectJob is an internal struct that denotes holds the information necessary
  60. // to run a collection job.
  61. type collectJob struct {
  62. jobName string
  63. fn collectFunc
  64. bypass bool
  65. }
  66. // The Toggles structure denotes the available toggles for the diagnostic procedure.
  67. // Each toggle enables/disables tasks from the diagnostic.
  68. type Toggles struct {
  69. NoDiagLogs bool
  70. NoDiagMetrics bool
  71. NoDiagSystem bool
  72. NoDiagRuntime bool
  73. NoDiagNetwork bool
  74. }
  75. // The Options structure holds every option necessary for
  76. // the diagnostic procedure to work.
  77. type Options struct {
  78. KnownAddresses []string
  79. Address string
  80. ContainerID string
  81. PodID string
  82. Toggles Toggles
  83. }
  84. func collectLogs(
  85. ctx context.Context,
  86. client HTTPClient,
  87. diagContainer, diagPod string,
  88. ) (string, error) {
  89. var collector LogCollector
  90. if diagPod != "" {
  91. collector = NewKubernetesLogCollector(diagContainer, diagPod)
  92. } else if diagContainer != "" {
  93. collector = NewDockerLogCollector(diagContainer)
  94. } else {
  95. collector = NewHostLogCollector(client)
  96. }
  97. logInformation, err := collector.Collect(ctx)
  98. if err != nil {
  99. return "", fmt.Errorf("error collecting logs: %w", err)
  100. }
  101. if logInformation.isDirectory {
  102. return CopyFilesFromDirectory(logInformation.path)
  103. }
  104. if logInformation.wasCreated {
  105. return logInformation.path, nil
  106. }
  107. logHandle, err := os.Open(logInformation.path)
  108. if err != nil {
  109. return "", fmt.Errorf("error opening log file while collecting logs: %w", err)
  110. }
  111. defer logHandle.Close()
  112. outputLogHandle, err := os.Create(filepath.Join(os.TempDir(), logFilename))
  113. if err != nil {
  114. return "", ErrCreatingTemporaryFile
  115. }
  116. defer outputLogHandle.Close()
  117. _, err = io.Copy(outputLogHandle, logHandle)
  118. if err != nil {
  119. return "", fmt.Errorf("error copying logs while collecting logs: %w", err)
  120. }
  121. return outputLogHandle.Name(), err
  122. }
  123. func collectNetworkResultRoutine(
  124. ctx context.Context,
  125. collector network.NetworkCollector,
  126. hostname string,
  127. useIPv4 bool,
  128. results chan networkCollectionResult,
  129. ) {
  130. const (
  131. hopsNo = 5
  132. timeout = time.Second * 5
  133. )
  134. name := hostname
  135. if useIPv4 {
  136. name += "-v4"
  137. } else {
  138. name += "-v6"
  139. }
  140. hops, raw, err := collector.Collect(ctx, network.NewTraceOptions(hopsNo, timeout, hostname, useIPv4))
  141. results <- networkCollectionResult{name, hops, raw, err}
  142. }
  143. func gatherNetworkInformation(ctx context.Context) map[string]networkCollectionResult {
  144. networkCollector := network.NetworkCollectorImpl{}
  145. hostAndIPversionPairs := []struct {
  146. host string
  147. useV4 bool
  148. }{
  149. {"region1.v2.argotunnel.com", true},
  150. {"region1.v2.argotunnel.com", false},
  151. {"region2.v2.argotunnel.com", true},
  152. {"region2.v2.argotunnel.com", false},
  153. }
  154. // the number of results is known thus use len to avoid footguns
  155. results := make(chan networkCollectionResult, len(hostAndIPversionPairs))
  156. var wgroup sync.WaitGroup
  157. for _, item := range hostAndIPversionPairs {
  158. wgroup.Add(1)
  159. go func() {
  160. defer wgroup.Done()
  161. collectNetworkResultRoutine(ctx, &networkCollector, item.host, item.useV4, results)
  162. }()
  163. }
  164. // Wait for routines to end.
  165. wgroup.Wait()
  166. resultMap := make(map[string]networkCollectionResult)
  167. for range len(hostAndIPversionPairs) {
  168. result := <-results
  169. resultMap[result.name] = result
  170. }
  171. return resultMap
  172. }
  173. func networkInformationCollectors() (rawNetworkCollector, jsonNetworkCollector collectFunc) {
  174. // The network collector is an operation that takes most of the diagnostic time, thus,
  175. // the sync.Once is used to memoize the result of the collector and then create different
  176. // outputs.
  177. var once sync.Once
  178. var resultMap map[string]networkCollectionResult
  179. rawNetworkCollector = func(ctx context.Context) (string, error) {
  180. once.Do(func() { resultMap = gatherNetworkInformation(ctx) })
  181. return rawNetworkInformationWriter(resultMap)
  182. }
  183. jsonNetworkCollector = func(ctx context.Context) (string, error) {
  184. once.Do(func() { resultMap = gatherNetworkInformation(ctx) })
  185. return jsonNetworkInformationWriter(resultMap)
  186. }
  187. return rawNetworkCollector, jsonNetworkCollector
  188. }
  189. func rawNetworkInformationWriter(resultMap map[string]networkCollectionResult) (string, error) {
  190. networkDumpHandle, err := os.Create(filepath.Join(os.TempDir(), rawNetworkBaseName))
  191. if err != nil {
  192. return "", ErrCreatingTemporaryFile
  193. }
  194. defer networkDumpHandle.Close()
  195. var exitErr error
  196. for k, v := range resultMap {
  197. if v.err != nil {
  198. if exitErr == nil {
  199. exitErr = v.err
  200. }
  201. _, err := networkDumpHandle.WriteString(k + "\nno content\n")
  202. if err != nil {
  203. return networkDumpHandle.Name(), fmt.Errorf("error writing 'no content' to raw network file: %w", err)
  204. }
  205. } else {
  206. _, err := networkDumpHandle.WriteString(k + "\n" + v.raw + "\n")
  207. if err != nil {
  208. return networkDumpHandle.Name(), fmt.Errorf("error writing raw network information: %w", err)
  209. }
  210. }
  211. }
  212. return networkDumpHandle.Name(), exitErr
  213. }
  214. func jsonNetworkInformationWriter(resultMap map[string]networkCollectionResult) (string, error) {
  215. networkDumpHandle, err := os.Create(filepath.Join(os.TempDir(), networkBaseName))
  216. if err != nil {
  217. return "", ErrCreatingTemporaryFile
  218. }
  219. defer networkDumpHandle.Close()
  220. encoder := newFormattedEncoder(networkDumpHandle)
  221. var exitErr error
  222. jsonMap := make(map[string][]*network.Hop, len(resultMap))
  223. for k, v := range resultMap {
  224. jsonMap[k] = v.info
  225. if exitErr == nil && v.err != nil {
  226. exitErr = v.err
  227. }
  228. }
  229. err = encoder.Encode(jsonMap)
  230. if err != nil {
  231. return networkDumpHandle.Name(), fmt.Errorf("error encoding network information results: %w", err)
  232. }
  233. return networkDumpHandle.Name(), exitErr
  234. }
  235. func collectFromEndpointAdapter(collect collectToWriterFunc, fileName string) collectFunc {
  236. return func(ctx context.Context) (string, error) {
  237. dumpHandle, err := os.Create(filepath.Join(os.TempDir(), fileName))
  238. if err != nil {
  239. return "", ErrCreatingTemporaryFile
  240. }
  241. defer dumpHandle.Close()
  242. err = collect(ctx, dumpHandle)
  243. if err != nil {
  244. return dumpHandle.Name(), fmt.Errorf("error running collector: %w", err)
  245. }
  246. return dumpHandle.Name(), nil
  247. }
  248. }
  249. func tunnelStateCollectEndpointAdapter(client HTTPClient, tunnel *TunnelState, fileName string) collectFunc {
  250. endpointFunc := func(ctx context.Context, writer io.Writer) error {
  251. if tunnel == nil {
  252. // When the metrics server is not passed the diagnostic will query all known hosts
  253. // and get the tunnel state, however, when the metrics server is passed that won't
  254. // happen hence the check for nil in this function.
  255. tunnelResponse, err := client.GetTunnelState(ctx)
  256. if err != nil {
  257. return fmt.Errorf("error retrieving tunnel state: %w", err)
  258. }
  259. tunnel = tunnelResponse
  260. }
  261. encoder := newFormattedEncoder(writer)
  262. err := encoder.Encode(tunnel)
  263. if err != nil {
  264. return fmt.Errorf("error encoding tunnel state: %w", err)
  265. }
  266. return nil
  267. }
  268. return collectFromEndpointAdapter(endpointFunc, fileName)
  269. }
  270. // resolveInstanceBaseURL is responsible to
  271. // resolve the base URL of the instance that should be diagnosed.
  272. // To resolve the instance it may be necessary to query the
  273. // /diag/tunnel endpoint of the known instances, thus, if a single
  274. // instance is found its state is also returned; if multiple instances
  275. // are found then their states are returned in an array along with an
  276. // error.
  277. func resolveInstanceBaseURL(
  278. metricsServerAddress string,
  279. log *zerolog.Logger,
  280. client *httpClient,
  281. addresses []string,
  282. ) (*url.URL, *TunnelState, []*AddressableTunnelState, error) {
  283. if metricsServerAddress != "" {
  284. if !strings.HasPrefix(metricsServerAddress, "http://") {
  285. metricsServerAddress = "http://" + metricsServerAddress
  286. }
  287. url, err := url.Parse(metricsServerAddress)
  288. if err != nil {
  289. return nil, nil, nil, fmt.Errorf("provided address is not valid: %w", err)
  290. }
  291. return url, nil, nil, nil
  292. }
  293. tunnelState, foundTunnelStates, err := FindMetricsServer(log, client, addresses)
  294. if err != nil {
  295. return nil, nil, foundTunnelStates, err
  296. }
  297. return tunnelState.URL, tunnelState.TunnelState, nil, nil
  298. }
  299. func createJobs(
  300. client *httpClient,
  301. tunnel *TunnelState,
  302. diagContainer string,
  303. diagPod string,
  304. noDiagSystem bool,
  305. noDiagRuntime bool,
  306. noDiagMetrics bool,
  307. noDiagLogs bool,
  308. noDiagNetwork bool,
  309. ) []collectJob {
  310. rawNetworkCollectorFunc, jsonNetworkCollectorFunc := networkInformationCollectors()
  311. jobs := []collectJob{
  312. {
  313. jobName: tunnelStateJobName,
  314. fn: tunnelStateCollectEndpointAdapter(client, tunnel, tunnelStateBaseName),
  315. bypass: false,
  316. },
  317. {
  318. jobName: systemInformationJobName,
  319. fn: collectFromEndpointAdapter(client.GetSystemInformation, systemInformationBaseName),
  320. bypass: noDiagSystem,
  321. },
  322. {
  323. jobName: goroutineJobName,
  324. fn: collectFromEndpointAdapter(client.GetGoroutineDump, goroutinePprofBaseName),
  325. bypass: noDiagRuntime,
  326. },
  327. {
  328. jobName: heapJobName,
  329. fn: collectFromEndpointAdapter(client.GetMemoryDump, heapPprofBaseName),
  330. bypass: noDiagRuntime,
  331. },
  332. {
  333. jobName: metricsJobName,
  334. fn: collectFromEndpointAdapter(client.GetMetrics, metricsBaseName),
  335. bypass: noDiagMetrics,
  336. },
  337. {
  338. jobName: logInformationJobName,
  339. fn: func(ctx context.Context) (string, error) {
  340. return collectLogs(ctx, client, diagContainer, diagPod)
  341. },
  342. bypass: noDiagLogs,
  343. },
  344. {
  345. jobName: rawNetworkInformationJobName,
  346. fn: rawNetworkCollectorFunc,
  347. bypass: noDiagNetwork,
  348. },
  349. {
  350. jobName: networkInformationJobName,
  351. fn: jsonNetworkCollectorFunc,
  352. bypass: noDiagNetwork,
  353. },
  354. {
  355. jobName: cliConfigurationJobName,
  356. fn: collectFromEndpointAdapter(client.GetCliConfiguration, cliConfigurationBaseName),
  357. bypass: false,
  358. },
  359. {
  360. jobName: configurationJobName,
  361. fn: collectFromEndpointAdapter(client.GetTunnelConfiguration, configurationBaseName),
  362. bypass: false,
  363. },
  364. }
  365. return jobs
  366. }
  367. func createTaskReport(taskReport map[string]taskResult) (string, error) {
  368. dumpHandle, err := os.Create(filepath.Join(os.TempDir(), taskResultBaseName))
  369. if err != nil {
  370. return "", ErrCreatingTemporaryFile
  371. }
  372. defer dumpHandle.Close()
  373. encoder := newFormattedEncoder(dumpHandle)
  374. err = encoder.Encode(taskReport)
  375. if err != nil {
  376. return "", fmt.Errorf("error encoding task results: %w", err)
  377. }
  378. return dumpHandle.Name(), nil
  379. }
  380. func runJobs(ctx context.Context, jobs []collectJob, log *zerolog.Logger) map[string]taskResult {
  381. jobReport := make(map[string]taskResult, len(jobs))
  382. for _, job := range jobs {
  383. if job.bypass {
  384. continue
  385. }
  386. log.Info().Msgf("Collecting %s...", job.jobName)
  387. path, err := job.fn(ctx)
  388. var result taskResult
  389. if err != nil {
  390. result = taskResult{Result: taskFailure, Err: err, path: path}
  391. log.Error().Err(err).Msgf("Job: %s finished with error.", job.jobName)
  392. } else {
  393. result = taskResult{Result: taskSuccess, Err: nil, path: path}
  394. log.Info().Msgf("Collected %s.", job.jobName)
  395. }
  396. jobReport[job.jobName] = result
  397. }
  398. taskReportName, err := createTaskReport(jobReport)
  399. var result taskResult
  400. if err != nil {
  401. result = taskResult{
  402. Result: taskFailure,
  403. path: taskReportName,
  404. Err: err,
  405. }
  406. } else {
  407. result = taskResult{
  408. Result: taskSuccess,
  409. path: taskReportName,
  410. Err: nil,
  411. }
  412. }
  413. jobReport[jobReportName] = result
  414. return jobReport
  415. }
  416. func RunDiagnostic(
  417. log *zerolog.Logger,
  418. options Options,
  419. ) ([]*AddressableTunnelState, error) {
  420. client := NewHTTPClient()
  421. baseURL, tunnel, foundTunnels, err := resolveInstanceBaseURL(options.Address, log, client, options.KnownAddresses)
  422. if err != nil {
  423. return foundTunnels, err
  424. }
  425. log.Info().Msgf("Selected server %s starting diagnostic...", baseURL.String())
  426. client.SetBaseURL(baseURL)
  427. const timeout = 45 * time.Second
  428. ctx, cancel := context.WithTimeout(context.Background(), timeout)
  429. defer cancel()
  430. jobs := createJobs(
  431. client,
  432. tunnel,
  433. options.ContainerID,
  434. options.PodID,
  435. options.Toggles.NoDiagSystem,
  436. options.Toggles.NoDiagRuntime,
  437. options.Toggles.NoDiagMetrics,
  438. options.Toggles.NoDiagLogs,
  439. options.Toggles.NoDiagNetwork,
  440. )
  441. jobsReport := runJobs(ctx, jobs, log)
  442. paths := make([]string, 0)
  443. var gerr error
  444. for _, v := range jobsReport {
  445. paths = append(paths, v.path)
  446. if gerr == nil && v.Err != nil {
  447. gerr = v.Err
  448. }
  449. defer func() {
  450. if !errors.Is(v.Err, ErrCreatingTemporaryFile) {
  451. os.Remove(v.path)
  452. }
  453. }()
  454. }
  455. zipfile, err := CreateDiagnosticZipFile(zipName, paths)
  456. if err != nil {
  457. return nil, err
  458. }
  459. log.Info().Msgf("Diagnostic file written: %v", zipfile)
  460. return nil, gerr
  461. }