numa.c 44 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833
  1. /*
  2. * numa.c
  3. *
  4. * numa: Simulate NUMA-sensitive workload and measure their NUMA performance
  5. */
  6. /* For the CLR_() macros */
  7. #include <pthread.h>
  8. #include "../perf.h"
  9. #include "../builtin.h"
  10. #include "../util/util.h"
  11. #include <subcmd/parse-options.h>
  12. #include "../util/cloexec.h"
  13. #include "bench.h"
  14. #include <errno.h>
  15. #include <sched.h>
  16. #include <stdio.h>
  17. #include <assert.h>
  18. #include <malloc.h>
  19. #include <signal.h>
  20. #include <stdlib.h>
  21. #include <string.h>
  22. #include <unistd.h>
  23. #include <sys/mman.h>
  24. #include <sys/time.h>
  25. #include <sys/resource.h>
  26. #include <sys/wait.h>
  27. #include <sys/prctl.h>
  28. #include <sys/types.h>
  29. #include <linux/time64.h>
  30. #include <numa.h>
  31. #include <numaif.h>
/*
 * Regular printout to the terminal, suppressed if -q is specified
 * (quiet mode presumably drives show_details below 0 -- set outside
 * this macro):
 */
#define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0)

/*
 * Debug printf, only emitted at verbosity level -d (show_details >= 1):
 */
#define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0)
/*
 * Per-task state. Lives in the shared global_info area, so the field
 * layout must stay stable across all worker processes:
 */
struct thread_data {
	int			curr_cpu;	/* CPU this task last ran on; negative until first written (see calc_convergence()) */
	cpu_set_t		bind_cpumask;	/* allowed CPUs, filled by parse_setup_cpu_list() (-C) */
	int			bind_node;	/* memory node to bind to, set by parse_setup_node_list() (-M); -1 = unbound */
	u8			*process_data;	/* per-process working set this task accesses */
	int			process_nr;	/* index of the owning process */
	int			thread_nr;	/* index of this thread within its process */
	int			task_nr;	/* global index: process_nr * nr_threads + thread_nr */
	unsigned int		loops_done;	/* benchmark loop iterations completed so far */
	u64			val;		/* running data-dependency value threaded through access_data() */
	u64			runtime_ns;
	u64			system_time_ns;
	u64			user_time_ns;
	double			speed_gbs;	/* measured throughput in GB/sec */
	pthread_mutex_t		*process_lock;	/* presumably serializes the -L locked portion -- usage not visible here */
};
/* Parameters set by options: */
struct params {
	/* Startup synchronization: */
	bool			serialize_startup;	/* -S */

	/* Task hierarchy: */
	int			nr_proc;		/* -p: number of processes */
	int			nr_threads;		/* -t: threads per process */

	/* Working set sizes (raw option strings plus parsed MB values): */
	const char		*mb_global_str;		/* -G */
	const char		*mb_proc_str;		/* -P */
	const char		*mb_proc_locked_str;	/* -L */
	const char		*mb_thread_str;		/* -T */
	double			mb_global;
	double			mb_proc;
	double			mb_proc_locked;
	double			mb_thread;

	/* Access patterns to the working set: */
	bool			data_reads;		/* -R */
	bool			data_writes;		/* -W */
	bool			data_backwards;		/* -B */
	bool			data_zero_memset;	/* -Z: bzero() instead of element access */
	bool			data_rand_walk;		/* -r: random (32bit LFSR) walk */
	u32			nr_loops;		/* -l: max loops */
	u32			nr_secs;		/* -s: max seconds */
	u32			sleep_usecs;		/* -u: usecs to sleep per loop iteration */

	/* Working set initialization: */
	bool			init_zero;		/* -z */
	bool			init_random;		/* -I */
	bool			init_cpu0;		/* -0: allocate/initialize on CPU#0 / node 0 */

	/* Misc options: */
	int			show_details;		/* -d, incremental; drives tprintf()/dprintf() */
	int			run_all;		/* -a */
	int			thp;			/* -H: <0 MADV_NOHUGEPAGE, >0 MADV_HUGEPAGE */

	/* Byte sizes derived from the mb_* values: */
	long			bytes_global;
	long			bytes_process;
	long			bytes_process_locked;
	long			bytes_thread;

	int			nr_tasks;		/* total tasks -- presumably nr_proc * nr_threads; set elsewhere */
	bool			show_quiet;		/* -q */

	bool			show_convergence;	/* -c */
	bool			measure_convergence;	/* -m */

	int			perturb_secs;		/* -x */
	int			nr_cpus;		/* detected system CPU count */
	int			nr_nodes;		/* detected system node count */

	/* Affinity options -C and -M: */
	char			*cpu_list_str;
	char			*node_list_str;
};
/* Global, read-writable area, accessible to all processes and threads: */
struct global_info {
	u8			*data;			/* the global (-G) shared working set */

	/* Startup/shutdown handshake state -- mutex usage is outside this view: */
	pthread_mutex_t		startup_mutex;
	int			nr_tasks_started;

	pthread_mutex_t		startup_done_mutex;

	pthread_mutex_t		start_work_mutex;
	int			nr_tasks_working;

	pthread_mutex_t		stop_work_mutex;
	u64			bytes_done;

	struct thread_data	*threads;		/* per-task state array, indexed by task_nr */

	/* Convergence latency measurement: */
	bool			all_converged;		/* set by calc_convergence() under -m */
	bool			stop_work;		/* asks workers to stop once converged */

	int			print_once;		/* de-duplicates the THP warnings in alloc_data() */

	struct params		p;			/* the active parameter set */
};
/* The shared global state; mapped so all workers see the same instance: */
static struct global_info	*g = NULL;

static int parse_cpus_opt(const struct option *opt, const char *arg, int unset);
static int parse_nodes_opt(const struct option *opt, const char *arg, int unset);

/* Option values parsed from the command line (later copied into g->p): */
struct params p0;
  125. static const struct option options[] = {
  126. OPT_INTEGER('p', "nr_proc" , &p0.nr_proc, "number of processes"),
  127. OPT_INTEGER('t', "nr_threads" , &p0.nr_threads, "number of threads per process"),
  128. OPT_STRING('G', "mb_global" , &p0.mb_global_str, "MB", "global memory (MBs)"),
  129. OPT_STRING('P', "mb_proc" , &p0.mb_proc_str, "MB", "process memory (MBs)"),
  130. OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"),
  131. OPT_STRING('T', "mb_thread" , &p0.mb_thread_str, "MB", "thread memory (MBs)"),
  132. OPT_UINTEGER('l', "nr_loops" , &p0.nr_loops, "max number of loops to run (default: unlimited)"),
  133. OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run (default: 5 secs)"),
  134. OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"),
  135. OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via writes (can be mixed with -W)"),
  136. OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"),
  137. OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"),
  138. OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"),
  139. OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"),
  140. OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations"),
  141. OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations"),
  142. OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"),
  143. OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"),
  144. OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"),
  145. OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"),
  146. OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
  147. OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
  148. OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
  149. OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode"),
  150. OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
  151. /* Special option string parsing callbacks: */
  152. OPT_CALLBACK('C', "cpus", NULL, "cpu[,cpu2,...cpuN]",
  153. "bind the first N tasks to these specific cpus (the rest is unbound)",
  154. parse_cpus_opt),
  155. OPT_CALLBACK('M', "memnodes", NULL, "node[,node2,...nodeN]",
  156. "bind the first N tasks to these specific memory nodes (the rest is unbound)",
  157. parse_nodes_opt),
  158. OPT_END()
  159. };
/* Usage strings shown by the option parser: */
static const char * const bench_numa_usage[] = {
	"perf bench numa <options>",
	NULL
};

static const char * const numa_usage[] = {
	"perf bench numa mem [<options>]",
	NULL
};
  168. /*
  169. * To get number of numa nodes present.
  170. */
  171. static int nr_numa_nodes(void)
  172. {
  173. int i, nr_nodes = 0;
  174. for (i = 0; i < g->p.nr_nodes; i++) {
  175. if (numa_bitmask_isbitset(numa_nodes_ptr, i))
  176. nr_nodes++;
  177. }
  178. return nr_nodes;
  179. }
  180. /*
  181. * To check if given numa node is present.
  182. */
  183. static int is_node_present(int node)
  184. {
  185. return numa_bitmask_isbitset(numa_nodes_ptr, node);
  186. }
  187. /*
  188. * To check given numa node has cpus.
  189. */
  190. static bool node_has_cpus(int node)
  191. {
  192. struct bitmask *cpu = numa_allocate_cpumask();
  193. unsigned int i;
  194. if (cpu && !numa_node_to_cpus(node, cpu)) {
  195. for (i = 0; i < cpu->size; i++) {
  196. if (numa_bitmask_isbitset(cpu, i))
  197. return true;
  198. }
  199. }
  200. return false; /* lets fall back to nocpus safely */
  201. }
  202. static cpu_set_t bind_to_cpu(int target_cpu)
  203. {
  204. cpu_set_t orig_mask, mask;
  205. int ret;
  206. ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
  207. BUG_ON(ret);
  208. CPU_ZERO(&mask);
  209. if (target_cpu == -1) {
  210. int cpu;
  211. for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
  212. CPU_SET(cpu, &mask);
  213. } else {
  214. BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
  215. CPU_SET(target_cpu, &mask);
  216. }
  217. ret = sched_setaffinity(0, sizeof(mask), &mask);
  218. BUG_ON(ret);
  219. return orig_mask;
  220. }
/*
 * Restrict the calling task to the CPUs of one node (-1 = all CPUs) and
 * return the previous affinity mask.
 *
 * NOTE(review): this assumes CPUs are numbered consecutively per node,
 * i.e. node N owns CPUs [N*cpus_per_node, (N+1)*cpus_per_node), and that
 * every node has the same CPU count. The BUG_ON below enforces the
 * divisibility, but sparse or interleaved CPU numbering would still bind
 * to the wrong CPUs -- confirm on the target topology.
 */
static cpu_set_t bind_to_node(int target_node)
{
	int cpus_per_node = g->p.nr_cpus / nr_numa_nodes();
	cpu_set_t orig_mask, mask;
	int cpu;
	int ret;

	BUG_ON(cpus_per_node * nr_numa_nodes() != g->p.nr_cpus);
	BUG_ON(!cpus_per_node);

	/* Save the current affinity; the caller restores it later: */
	ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
	BUG_ON(ret);

	CPU_ZERO(&mask);

	if (target_node == -1) {
		/* -1 means unbound: allow all CPUs. */
		for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
			CPU_SET(cpu, &mask);
	} else {
		int cpu_start = (target_node + 0) * cpus_per_node;
		int cpu_stop  = (target_node + 1) * cpus_per_node;

		BUG_ON(cpu_stop > g->p.nr_cpus);

		for (cpu = cpu_start; cpu < cpu_stop; cpu++)
			CPU_SET(cpu, &mask);
	}

	ret = sched_setaffinity(0, sizeof(mask), &mask);
	BUG_ON(ret);

	return orig_mask;
}
  246. static void bind_to_cpumask(cpu_set_t mask)
  247. {
  248. int ret;
  249. ret = sched_setaffinity(0, sizeof(mask), &mask);
  250. BUG_ON(ret);
  251. }
  252. static void mempol_restore(void)
  253. {
  254. int ret;
  255. ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1);
  256. BUG_ON(ret);
  257. }
  258. static void bind_to_memnode(int node)
  259. {
  260. unsigned long nodemask;
  261. int ret;
  262. if (node == -1)
  263. return;
  264. BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8);
  265. nodemask = 1L << node;
  266. ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8);
  267. dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret);
  268. BUG_ON(ret);
  269. }
/* Huge page size, used as the alignment/slack unit in alloc_data(): */
#define HPSIZE (2*1024*1024)

/*
 * Set the calling thread's name (as shown by ps/top) from a printf-style
 * format; snprintf bounds it to 20 bytes:
 */
#define set_taskname(fmt...)				\
do {							\
	char name[20];					\
							\
	snprintf(name, 20, fmt);			\
	prctl(PR_SET_NAME, name);			\
} while (0)
/*
 * mmap() a working-set buffer of bytes0 bytes (plus one huge page of
 * slack for alignment), returned 2MB-aligned.
 *
 * init_zero:   bzero() the buffer explicitly
 * init_cpu0:   perform allocation + initialization bound to node 0
 * thp:         >0 MADV_HUGEPAGE, <0 MADV_NOHUGEPAGE, 0 leave default
 * init_random: fill with contents that differ in every word
 *
 * Returns NULL for a zero-sized request.
 */
static u8 *alloc_data(ssize_t bytes0, int map_flags,
		      int init_zero, int init_cpu0, int thp, int init_random)
{
	cpu_set_t orig_mask;
	ssize_t bytes;
	u8 *buf;
	int ret;

	if (!bytes0)
		return NULL;

	/* Allocate and initialize all memory on CPU#0: */
	if (init_cpu0) {
		orig_mask = bind_to_node(0);
		bind_to_memnode(0);
	}

	/* One extra huge page of slack so we can align the result below: */
	bytes = bytes0 + HPSIZE;

	buf = (void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0);
	BUG_ON(buf == (void *)-1);

	/* THP madvise only makes sense for private mappings: */
	if (map_flags == MAP_PRIVATE) {
		if (thp > 0) {
			ret = madvise(buf, bytes, MADV_HUGEPAGE);
			if (ret && !g->print_once) {
				g->print_once = 1;
				printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n");
			}
		}
		if (thp < 0) {
			ret = madvise(buf, bytes, MADV_NOHUGEPAGE);
			if (ret && !g->print_once) {
				g->print_once = 1;
				printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n");
			}
		}
	}

	if (init_zero) {
		bzero(buf, bytes);
	} else {
		/* Initialize random contents, different in each word: */
		if (init_random) {
			u64 *wbuf = (void *)buf;
			long off = rand();
			long i;

			for (i = 0; i < bytes/8; i++)
				wbuf[i] = i + off;
		}
	}

	/*
	 * Align to 2MB boundary. NOTE(review): the pre-alignment pointer is
	 * discarded, so free_data() later munmap()s from the aligned address
	 * -- any partial leading huge page stays mapped for process lifetime.
	 */
	buf = (void *)(((unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1));

	/* Restore affinity: */
	if (init_cpu0) {
		bind_to_cpumask(orig_mask);
		mempol_restore();
	}

	return buf;
}
  332. static void free_data(void *data, ssize_t bytes)
  333. {
  334. int ret;
  335. if (!data)
  336. return;
  337. ret = munmap(data, bytes);
  338. BUG_ON(ret);
  339. }
  340. /*
  341. * Create a shared memory buffer that can be shared between processes, zeroed:
  342. */
  343. static void * zalloc_shared_data(ssize_t bytes)
  344. {
  345. return alloc_data(bytes, MAP_SHARED, 1, g->p.init_cpu0, g->p.thp, g->p.init_random);
  346. }
  347. /*
  348. * Create a shared memory buffer that can be shared between processes:
  349. */
  350. static void * setup_shared_data(ssize_t bytes)
  351. {
  352. return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0, g->p.thp, g->p.init_random);
  353. }
  354. /*
  355. * Allocate process-local memory - this will either be shared between
  356. * threads of this process, or only be accessed by this thread:
  357. */
  358. static void * setup_private_data(ssize_t bytes)
  359. {
  360. return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random);
  361. }
  362. /*
  363. * Return a process-shared (global) mutex:
  364. */
  365. static void init_global_mutex(pthread_mutex_t *mutex)
  366. {
  367. pthread_mutexattr_t attr;
  368. pthread_mutexattr_init(&attr);
  369. pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
  370. pthread_mutex_init(mutex, &attr);
  371. }
  372. static int parse_cpu_list(const char *arg)
  373. {
  374. p0.cpu_list_str = strdup(arg);
  375. dprintf("got CPU list: {%s}\n", p0.cpu_list_str);
  376. return 0;
  377. }
  378. static int parse_setup_cpu_list(void)
  379. {
  380. struct thread_data *td;
  381. char *str0, *str;
  382. int t;
  383. if (!g->p.cpu_list_str)
  384. return 0;
  385. dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
  386. str0 = str = strdup(g->p.cpu_list_str);
  387. t = 0;
  388. BUG_ON(!str);
  389. tprintf("# binding tasks to CPUs:\n");
  390. tprintf("# ");
  391. while (true) {
  392. int bind_cpu, bind_cpu_0, bind_cpu_1;
  393. char *tok, *tok_end, *tok_step, *tok_len, *tok_mul;
  394. int bind_len;
  395. int step;
  396. int mul;
  397. tok = strsep(&str, ",");
  398. if (!tok)
  399. break;
  400. tok_end = strstr(tok, "-");
  401. dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
  402. if (!tok_end) {
  403. /* Single CPU specified: */
  404. bind_cpu_0 = bind_cpu_1 = atol(tok);
  405. } else {
  406. /* CPU range specified (for example: "5-11"): */
  407. bind_cpu_0 = atol(tok);
  408. bind_cpu_1 = atol(tok_end + 1);
  409. }
  410. step = 1;
  411. tok_step = strstr(tok, "#");
  412. if (tok_step) {
  413. step = atol(tok_step + 1);
  414. BUG_ON(step <= 0 || step >= g->p.nr_cpus);
  415. }
  416. /*
  417. * Mask length.
  418. * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4',
  419. * where the _4 means the next 4 CPUs are allowed.
  420. */
  421. bind_len = 1;
  422. tok_len = strstr(tok, "_");
  423. if (tok_len) {
  424. bind_len = atol(tok_len + 1);
  425. BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus);
  426. }
  427. /* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */
  428. mul = 1;
  429. tok_mul = strstr(tok, "x");
  430. if (tok_mul) {
  431. mul = atol(tok_mul + 1);
  432. BUG_ON(mul <= 0);
  433. }
  434. dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul);
  435. if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) {
  436. printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus);
  437. return -1;
  438. }
  439. BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0);
  440. BUG_ON(bind_cpu_0 > bind_cpu_1);
  441. for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
  442. int i;
  443. for (i = 0; i < mul; i++) {
  444. int cpu;
  445. if (t >= g->p.nr_tasks) {
  446. printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu);
  447. goto out;
  448. }
  449. td = g->threads + t;
  450. if (t)
  451. tprintf(",");
  452. if (bind_len > 1) {
  453. tprintf("%2d/%d", bind_cpu, bind_len);
  454. } else {
  455. tprintf("%2d", bind_cpu);
  456. }
  457. CPU_ZERO(&td->bind_cpumask);
  458. for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) {
  459. BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus);
  460. CPU_SET(cpu, &td->bind_cpumask);
  461. }
  462. t++;
  463. }
  464. }
  465. }
  466. out:
  467. tprintf("\n");
  468. if (t < g->p.nr_tasks)
  469. printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
  470. free(str0);
  471. return 0;
  472. }
  473. static int parse_cpus_opt(const struct option *opt __maybe_unused,
  474. const char *arg, int unset __maybe_unused)
  475. {
  476. if (!arg)
  477. return -1;
  478. return parse_cpu_list(arg);
  479. }
  480. static int parse_node_list(const char *arg)
  481. {
  482. p0.node_list_str = strdup(arg);
  483. dprintf("got NODE list: {%s}\n", p0.node_list_str);
  484. return 0;
  485. }
  486. static int parse_setup_node_list(void)
  487. {
  488. struct thread_data *td;
  489. char *str0, *str;
  490. int t;
  491. if (!g->p.node_list_str)
  492. return 0;
  493. dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
  494. str0 = str = strdup(g->p.node_list_str);
  495. t = 0;
  496. BUG_ON(!str);
  497. tprintf("# binding tasks to NODEs:\n");
  498. tprintf("# ");
  499. while (true) {
  500. int bind_node, bind_node_0, bind_node_1;
  501. char *tok, *tok_end, *tok_step, *tok_mul;
  502. int step;
  503. int mul;
  504. tok = strsep(&str, ",");
  505. if (!tok)
  506. break;
  507. tok_end = strstr(tok, "-");
  508. dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
  509. if (!tok_end) {
  510. /* Single NODE specified: */
  511. bind_node_0 = bind_node_1 = atol(tok);
  512. } else {
  513. /* NODE range specified (for example: "5-11"): */
  514. bind_node_0 = atol(tok);
  515. bind_node_1 = atol(tok_end + 1);
  516. }
  517. step = 1;
  518. tok_step = strstr(tok, "#");
  519. if (tok_step) {
  520. step = atol(tok_step + 1);
  521. BUG_ON(step <= 0 || step >= g->p.nr_nodes);
  522. }
  523. /* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */
  524. mul = 1;
  525. tok_mul = strstr(tok, "x");
  526. if (tok_mul) {
  527. mul = atol(tok_mul + 1);
  528. BUG_ON(mul <= 0);
  529. }
  530. dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step);
  531. if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) {
  532. printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes);
  533. return -1;
  534. }
  535. BUG_ON(bind_node_0 < 0 || bind_node_1 < 0);
  536. BUG_ON(bind_node_0 > bind_node_1);
  537. for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) {
  538. int i;
  539. for (i = 0; i < mul; i++) {
  540. if (t >= g->p.nr_tasks || !node_has_cpus(bind_node)) {
  541. printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node);
  542. goto out;
  543. }
  544. td = g->threads + t;
  545. if (!t)
  546. tprintf(" %2d", bind_node);
  547. else
  548. tprintf(",%2d", bind_node);
  549. td->bind_node = bind_node;
  550. t++;
  551. }
  552. }
  553. }
  554. out:
  555. tprintf("\n");
  556. if (t < g->p.nr_tasks)
  557. printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
  558. free(str0);
  559. return 0;
  560. }
  561. static int parse_nodes_opt(const struct option *opt __maybe_unused,
  562. const char *arg, int unset __maybe_unused)
  563. {
  564. if (!arg)
  565. return -1;
  566. return parse_node_list(arg);
  567. return 0;
  568. }
  569. #define BIT(x) (1ul << x)
  570. static inline uint32_t lfsr_32(uint32_t lfsr)
  571. {
  572. const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31);
  573. return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps);
  574. }
  575. /*
  576. * Make sure there's real data dependency to RAM (when read
  577. * accesses are enabled), so the compiler, the CPU and the
  578. * kernel (KSM, zero page, etc.) cannot optimize away RAM
  579. * accesses:
  580. */
  581. static inline u64 access_data(u64 *data __attribute__((unused)), u64 val)
  582. {
  583. if (g->p.data_reads)
  584. val += *data;
  585. if (g->p.data_writes)
  586. *data = val + 1;
  587. return val;
  588. }
/*
 * The worker process does two types of work, a forwards going
 * loop and a backwards going loop.
 *
 * We do this so that on multiprocessor systems we do not create
 * a 'train' of processing, with highly synchronized processes,
 * skewing the whole benchmark.
 */
static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val)
{
	long words = bytes/sizeof(u64);
	u64 *data = (void *)__data;
	long chunk_0, chunk_1;
	u64 *d0, *d, *d1;
	long off;
	long i;

	/* Buffer and size must agree: both present or both absent: */
	BUG_ON(!data && words);
	BUG_ON(data && !words);

	if (!data)
		return val;

	/* Very simple memset() work variant: */
	if (g->p.data_zero_memset && !g->p.data_rand_walk) {
		bzero(data, bytes);
		return val;
	}

	/* Spread out by PID/TID nr and by loop nr: */
	chunk_0 = words/nr_max;
	chunk_1 = words/g->p.nr_loops;
	off = nr*chunk_0 + loop*chunk_1;

	/* Wrap the starting offset back into the buffer: */
	while (off >= words)
		off -= words;

	if (g->p.data_rand_walk) {
		/* Random walk: touch up-to-1024-word runs at LFSR-chosen offsets. */
		u32 lfsr = nr + loop + val;
		int j;

		for (i = 0; i < words/1024; i++) {
			long start, end;

			lfsr = lfsr_32(lfsr);

			start = lfsr % words;
			end = min(start + 1024, words-1);

			if (g->p.data_zero_memset) {
				bzero(data + start, (end-start) * sizeof(u64));
			} else {
				for (j = start; j < end; j++)
					val = access_data(data + j, val);
			}
		}
	} else if (!g->p.data_backwards || (nr + loop) & 1) {
		/* (Without -B, or on odd (task+loop) parity): walk forwards. */

		d0 = data + off;
		d  = data + off + 1;
		d1 = data + words;

		/* Process data forwards: */
		for (;;) {
			if (unlikely(d >= d1))
				d = data;	/* wrap to the start */
			if (unlikely(d == d0))
				break;		/* full circle: every word touched once */

			val = access_data(d, val);

			d++;
		}
	} else {
		/* Process data backwards: */

		d0 = data + off;
		d  = data + off - 1;
		d1 = data + words;

		/* Walk down, wrapping to the top of the buffer: */
		for (;;) {
			if (unlikely(d < data))
				d = data + words-1;	/* wrap to the end */
			if (unlikely(d == d0))
				break;			/* full circle */

			val = access_data(d, val);

			d--;
		}
	}

	return val;
}
  665. static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
  666. {
  667. unsigned int cpu;
  668. cpu = sched_getcpu();
  669. g->threads[task_nr].curr_cpu = cpu;
  670. prctl(0, bytes_worked);
  671. }
  672. #define MAX_NR_NODES 64
  673. /*
  674. * Count the number of nodes a process's threads
  675. * are spread out on.
  676. *
  677. * A count of 1 means that the process is compressed
  678. * to a single node. A count of g->p.nr_nodes means it's
  679. * spread out on the whole system.
  680. */
  681. static int count_process_nodes(int process_nr)
  682. {
  683. char node_present[MAX_NR_NODES] = { 0, };
  684. int nodes;
  685. int n, t;
  686. for (t = 0; t < g->p.nr_threads; t++) {
  687. struct thread_data *td;
  688. int task_nr;
  689. int node;
  690. task_nr = process_nr*g->p.nr_threads + t;
  691. td = g->threads + task_nr;
  692. node = numa_node_of_cpu(td->curr_cpu);
  693. if (node < 0) /* curr_cpu was likely still -1 */
  694. return 0;
  695. node_present[node] = 1;
  696. }
  697. nodes = 0;
  698. for (n = 0; n < MAX_NR_NODES; n++)
  699. nodes += node_present[n];
  700. return nodes;
  701. }
  702. /*
  703. * Count the number of distinct process-threads a node contains.
  704. *
  705. * A count of 1 means that the node contains only a single
  706. * process. If all nodes on the system contain at most one
  707. * process then we are well-converged.
  708. */
  709. static int count_node_processes(int node)
  710. {
  711. int processes = 0;
  712. int t, p;
  713. for (p = 0; p < g->p.nr_proc; p++) {
  714. for (t = 0; t < g->p.nr_threads; t++) {
  715. struct thread_data *td;
  716. int task_nr;
  717. int n;
  718. task_nr = p*g->p.nr_threads + t;
  719. td = g->threads + task_nr;
  720. n = numa_node_of_cpu(td->curr_cpu);
  721. if (n == node) {
  722. processes++;
  723. break;
  724. }
  725. }
  726. }
  727. return processes;
  728. }
  729. static void calc_convergence_compression(int *strong)
  730. {
  731. unsigned int nodes_min, nodes_max;
  732. int p;
  733. nodes_min = -1;
  734. nodes_max = 0;
  735. for (p = 0; p < g->p.nr_proc; p++) {
  736. unsigned int nodes = count_process_nodes(p);
  737. if (!nodes) {
  738. *strong = 0;
  739. return;
  740. }
  741. nodes_min = min(nodes, nodes_min);
  742. nodes_max = max(nodes, nodes_max);
  743. }
  744. /* Strong convergence: all threads compress on a single node: */
  745. if (nodes_min == 1 && nodes_max == 1) {
  746. *strong = 1;
  747. } else {
  748. *strong = 0;
  749. tprintf(" {%d-%d}", nodes_min, nodes_max);
  750. }
  751. }
/*
 * Snapshot the current task->node placement, print per-node occupancy,
 * and record the time of convergence: reached when every process sits on
 * a single node and process_groups has shrunk to g->p.nr_proc.
 */
static void calc_convergence(double runtime_ns_max, double *convergence)
{
	unsigned int loops_done_min, loops_done_max;
	int process_groups;
	int nodes[MAX_NR_NODES];	/* per-node task counts; assumes nr_nodes <= MAX_NR_NODES -- TODO confirm */
	int distance;
	int nr_min;
	int nr_max;
	int strong;
	int sum;
	int nr;
	int node;
	int cpu;
	int t;

	/* Only track convergence when -c or -m asked for it: */
	if (!g->p.show_convergence && !g->p.measure_convergence)
		return;

	for (node = 0; node < g->p.nr_nodes; node++)
		nodes[node] = 0;

	loops_done_min = -1;	/* i.e. UINT_MAX */
	loops_done_max = 0;

	/* Tally tasks per node and track loop-progress spread: */
	for (t = 0; t < g->p.nr_tasks; t++) {
		struct thread_data *td = g->threads + t;
		unsigned int loops_done;

		cpu = td->curr_cpu;

		/* Not all threads have written it yet: */
		if (cpu < 0)
			continue;

		node = numa_node_of_cpu(cpu);

		nodes[node]++;

		loops_done = td->loops_done;
		loops_done_min = min(loops_done, loops_done_min);
		loops_done_max = max(loops_done, loops_done_max);
	}

	nr_max = 0;
	nr_min = g->p.nr_tasks;
	sum = 0;

	for (node = 0; node < g->p.nr_nodes; node++) {
		if (!is_node_present(node))
			continue;
		nr = nodes[node];
		nr_min = min(nr, nr_min);
		nr_max = max(nr, nr_max);
		sum += nr;
	}
	BUG_ON(nr_min > nr_max);

	BUG_ON(sum > g->p.nr_tasks);

	/* Disabled branch ('0 &&') kept as-is -- would skip partial snapshots: */
	if (0 && (sum < g->p.nr_tasks))
		return;

	/*
	 * Count the number of distinct process groups present
	 * on nodes - when we are converged this will decrease
	 * to g->p.nr_proc:
	 */
	process_groups = 0;

	for (node = 0; node < g->p.nr_nodes; node++) {
		int processes;

		if (!is_node_present(node))
			continue;
		processes = count_node_processes(node);
		nr = nodes[node];
		tprintf(" %2d/%-2d", nr, processes);

		process_groups += processes;
	}

	distance = nr_max - nr_min;

	tprintf(" [%2d/%-2d]", distance, process_groups);

	tprintf(" l:%3d-%-3d (%3d)",
		loops_done_min, loops_done_max, loops_done_max-loops_done_min);

	/* Print the relative progress skew between slowest/fastest task: */
	if (loops_done_min && loops_done_max) {
		double skew = 1.0 - (double)loops_done_min/loops_done_max;

		tprintf(" [%4.1f%%]", skew * 100.0);
	}

	calc_convergence_compression(&strong);

	if (strong && process_groups == g->p.nr_proc) {
		/* Record only the FIRST time we converged: */
		if (!*convergence) {
			*convergence = runtime_ns_max;
			tprintf(" (%6.1fs converged)\n", *convergence / NSEC_PER_SEC);
			if (g->p.measure_convergence) {
				g->all_converged = true;
				g->stop_work = true;
			}
		}
	} else {
		if (*convergence) {
			tprintf(" (%6.1fs de-converged)", runtime_ns_max / NSEC_PER_SEC);
			*convergence = 0;
		}
		tprintf("\n");
	}
}
  841. static void show_summary(double runtime_ns_max, int l, double *convergence)
  842. {
  843. tprintf("\r # %5.1f%% [%.1f mins]",
  844. (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max / NSEC_PER_SEC / 60.0);
  845. calc_convergence(runtime_ns_max, convergence);
  846. if (g->p.show_details >= 0)
  847. fflush(stdout);
  848. }
/*
 * Main body of every benchmark thread.
 *
 * Binds itself to its assigned CPUs/node, allocates its thread-private
 * buffer, optionally takes part in the serialized-startup handshake,
 * then runs g->p.nr_loops iterations of do_work() over the global,
 * per-process and per-thread buffers.  On exit it records its runtime
 * and rusage into its thread_data slot and adds its byte count to the
 * global total.
 *
 * @__tdata: struct thread_data * describing this task.
 */
static void *worker_thread(void *__tdata)
{
	struct thread_data *td = __tdata;
	struct timeval start0, start, stop, diff;
	int process_nr = td->process_nr;
	int thread_nr = td->thread_nr;
	unsigned long last_perturbance;
	int task_nr = td->task_nr;
	int details = g->p.show_details;
	int first_task, last_task;
	double convergence = 0;
	u64 val = td->val;
	double runtime_ns_max;
	u8 *global_data;
	u8 *process_data;
	u8 *thread_data;
	u64 bytes_done;
	long work_done;
	u32 l;
	struct rusage rusage;

	bind_to_cpumask(td->bind_cpumask);
	bind_to_memnode(td->bind_node);

	set_taskname("thread %d/%d", process_nr, thread_nr);

	global_data = g->data;
	process_data = td->process_data;
	thread_data = setup_private_data(g->p.bytes_thread);

	bytes_done = 0;

	/* The very last task prints the periodic summary: */
	last_task = 0;
	if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1)
		last_task = 1;

	/* The very first task is the one that gets perturbed: */
	first_task = 0;
	if (process_nr == 0 && thread_nr == 0)
		first_task = 1;

	if (details >= 2) {
		printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
			process_nr, thread_nr, global_data, process_data, thread_data);
	}

	/*
	 * Startup handshake: count ourselves as started, then block on
	 * start_work_mutex (held by the main process) until everybody is
	 * released at once.  NOTE(review): this relies on locking a mutex
	 * in one task and unlocking it in another - works with the
	 * process-shared mutexes set up in init(), but confirm against
	 * init_global_mutex() before touching.
	 */
	if (g->p.serialize_startup) {
		pthread_mutex_lock(&g->startup_mutex);
		g->nr_tasks_started++;
		pthread_mutex_unlock(&g->startup_mutex);

		/* Here we will wait for the main process to start us all at once: */
		pthread_mutex_lock(&g->start_work_mutex);
		g->nr_tasks_working++;

		/* Last one wake the main process: */
		if (g->nr_tasks_working == g->p.nr_tasks)
			pthread_mutex_unlock(&g->startup_done_mutex);

		pthread_mutex_unlock(&g->start_work_mutex);
	}

	gettimeofday(&start0, NULL);
	start = stop = start0;
	last_perturbance = start.tv_sec;

	for (l = 0; l < g->p.nr_loops; l++) {
		start = stop;

		if (g->stop_work)
			break;

		/* The actual memory workload, over all three buffer scopes: */
		val += do_work(global_data, g->p.bytes_global, process_nr, g->p.nr_proc, l, val);
		val += do_work(process_data, g->p.bytes_process, thread_nr, g->p.nr_threads, l, val);
		val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val);

		/* Optional sleep, done under the process lock: */
		if (g->p.sleep_usecs) {
			pthread_mutex_lock(td->process_lock);
			usleep(g->p.sleep_usecs);
			pthread_mutex_unlock(td->process_lock);
		}
		/*
		 * Amount of work to be done under a process-global lock:
		 */
		if (g->p.bytes_process_locked) {
			pthread_mutex_lock(td->process_lock);
			val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val);
			pthread_mutex_unlock(td->process_lock);
		}

		work_done = g->p.bytes_global + g->p.bytes_process +
			    g->p.bytes_process_locked + g->p.bytes_thread;

		update_curr_cpu(task_nr, work_done);
		bytes_done += work_done;

		/* Fast path: skip all bookkeeping below if nothing needs it: */
		if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs)
			continue;

		td->loops_done = l;

		gettimeofday(&stop, NULL);

		/* Check whether our max runtime timed out: */
		if (g->p.nr_secs) {
			timersub(&stop, &start0, &diff);
			if ((u32)diff.tv_sec >= g->p.nr_secs) {
				g->stop_work = true;
				break;
			}
		}

		/* Update the summary at most once per second: */
		if (start.tv_sec == stop.tv_sec)
			continue;

		/*
		 * Perturb the first task's equilibrium every g->p.perturb_secs seconds,
		 * by migrating to CPU#0:
		 */
		if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
			cpu_set_t orig_mask;
			int target_cpu;
			int this_cpu;

			last_perturbance = stop.tv_sec;

			/*
			 * Depending on where we are running, move into
			 * the other half of the system, to create some
			 * real disturbance:
			 */
			this_cpu = g->threads[task_nr].curr_cpu;
			if (this_cpu < g->p.nr_cpus/2)
				target_cpu = g->p.nr_cpus-1;
			else
				target_cpu = 0;

			orig_mask = bind_to_cpu(target_cpu);

			/* Here we are running on the target CPU already */
			if (details >= 1)
				printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);

			/* Restore our original CPU affinity: */
			bind_to_cpumask(orig_mask);
		}

		/* Very verbose mode: per-thread nsecs/op since the last tick: */
		if (details >= 3) {
			timersub(&stop, &start, &diff);
			runtime_ns_max = diff.tv_sec * NSEC_PER_SEC;
			runtime_ns_max += diff.tv_usec * NSEC_PER_USEC;

			if (details >= 0) {
				printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n",
					process_nr, thread_nr, runtime_ns_max / bytes_done, val);
			}
			fflush(stdout);
		}

		/* Only the last task prints the shared summary line: */
		if (!last_task)
			continue;

		timersub(&stop, &start0, &diff);
		runtime_ns_max = diff.tv_sec * NSEC_PER_SEC;
		runtime_ns_max += diff.tv_usec * NSEC_PER_USEC;

		show_summary(runtime_ns_max, l, &convergence);
	}

	/* Record final per-thread statistics: */
	gettimeofday(&stop, NULL);
	timersub(&stop, &start0, &diff);
	td->runtime_ns = diff.tv_sec * NSEC_PER_SEC;
	td->runtime_ns += diff.tv_usec * NSEC_PER_USEC;
	td->speed_gbs = bytes_done / (td->runtime_ns / NSEC_PER_SEC) / 1e9;

	getrusage(RUSAGE_THREAD, &rusage);
	td->system_time_ns = rusage.ru_stime.tv_sec * NSEC_PER_SEC;
	td->system_time_ns += rusage.ru_stime.tv_usec * NSEC_PER_USEC;
	td->user_time_ns = rusage.ru_utime.tv_sec * NSEC_PER_SEC;
	td->user_time_ns += rusage.ru_utime.tv_usec * NSEC_PER_USEC;

	free_data(thread_data, g->p.bytes_thread);

	/* Fold our byte count into the global total: */
	pthread_mutex_lock(&g->stop_work_mutex);
	g->bytes_done += bytes_done;
	pthread_mutex_unlock(&g->stop_work_mutex);

	return NULL;
}
  998. /*
  999. * A worker process starts a couple of threads:
  1000. */
  1001. static void worker_process(int process_nr)
  1002. {
  1003. pthread_mutex_t process_lock;
  1004. struct thread_data *td;
  1005. pthread_t *pthreads;
  1006. u8 *process_data;
  1007. int task_nr;
  1008. int ret;
  1009. int t;
  1010. pthread_mutex_init(&process_lock, NULL);
  1011. set_taskname("process %d", process_nr);
  1012. /*
  1013. * Pick up the memory policy and the CPU binding of our first thread,
  1014. * so that we initialize memory accordingly:
  1015. */
  1016. task_nr = process_nr*g->p.nr_threads;
  1017. td = g->threads + task_nr;
  1018. bind_to_memnode(td->bind_node);
  1019. bind_to_cpumask(td->bind_cpumask);
  1020. pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t));
  1021. process_data = setup_private_data(g->p.bytes_process);
  1022. if (g->p.show_details >= 3) {
  1023. printf(" # process %2d global mem: %p, process mem: %p\n",
  1024. process_nr, g->data, process_data);
  1025. }
  1026. for (t = 0; t < g->p.nr_threads; t++) {
  1027. task_nr = process_nr*g->p.nr_threads + t;
  1028. td = g->threads + task_nr;
  1029. td->process_data = process_data;
  1030. td->process_nr = process_nr;
  1031. td->thread_nr = t;
  1032. td->task_nr = task_nr;
  1033. td->val = rand();
  1034. td->curr_cpu = -1;
  1035. td->process_lock = &process_lock;
  1036. ret = pthread_create(pthreads + t, NULL, worker_thread, td);
  1037. BUG_ON(ret);
  1038. }
  1039. for (t = 0; t < g->p.nr_threads; t++) {
  1040. ret = pthread_join(pthreads[t], NULL);
  1041. BUG_ON(ret);
  1042. }
  1043. free_data(process_data, g->p.bytes_process);
  1044. free(pthreads);
  1045. }
  1046. static void print_summary(void)
  1047. {
  1048. if (g->p.show_details < 0)
  1049. return;
  1050. printf("\n ###\n");
  1051. printf(" # %d %s will execute (on %d nodes, %d CPUs):\n",
  1052. g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", nr_numa_nodes(), g->p.nr_cpus);
  1053. printf(" # %5dx %5ldMB global shared mem operations\n",
  1054. g->p.nr_loops, g->p.bytes_global/1024/1024);
  1055. printf(" # %5dx %5ldMB process shared mem operations\n",
  1056. g->p.nr_loops, g->p.bytes_process/1024/1024);
  1057. printf(" # %5dx %5ldMB thread local mem operations\n",
  1058. g->p.nr_loops, g->p.bytes_thread/1024/1024);
  1059. printf(" ###\n");
  1060. printf("\n ###\n"); fflush(stdout);
  1061. }
  1062. static void init_thread_data(void)
  1063. {
  1064. ssize_t size = sizeof(*g->threads)*g->p.nr_tasks;
  1065. int t;
  1066. g->threads = zalloc_shared_data(size);
  1067. for (t = 0; t < g->p.nr_tasks; t++) {
  1068. struct thread_data *td = g->threads + t;
  1069. int cpu;
  1070. /* Allow all nodes by default: */
  1071. td->bind_node = -1;
  1072. /* Allow all CPUs by default: */
  1073. CPU_ZERO(&td->bind_cpumask);
  1074. for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
  1075. CPU_SET(cpu, &td->bind_cpumask);
  1076. }
  1077. }
  1078. static void deinit_thread_data(void)
  1079. {
  1080. ssize_t size = sizeof(*g->threads)*g->p.nr_tasks;
  1081. free_data(g->threads, size);
  1082. }
/*
 * Global benchmark setup: allocate the shared global state, copy in
 * the parsed options from p0, discover the CPU/node topology, validate
 * and convert the size parameters, allocate the global buffer, set up
 * the startup-serialization mutexes and the per-thread descriptors.
 *
 * Returns 0 on success, -1 on invalid parameters.
 */
static int init(void)
{
	/* Shared (MAP_SHARED) so forked workers see the same state: */
	g = (void *)alloc_data(sizeof(*g), MAP_SHARED, 1, 0, 0 /* THP */, 0);

	/* Copy over options: */
	g->p = p0;

	g->p.nr_cpus = numa_num_configured_cpus();

	g->p.nr_nodes = numa_max_node() + 1;

	/* char array in count_process_nodes(): */
	BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0);

	/* Quiet mode with no explicit detail level means fully silent: */
	if (g->p.show_quiet && !g->p.show_details)
		g->p.show_details = -1;

	/* Some memory should be specified: */
	if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str)
		return -1;

	if (g->p.mb_global_str) {
		g->p.mb_global = atof(g->p.mb_global_str);
		BUG_ON(g->p.mb_global < 0);
	}

	if (g->p.mb_proc_str) {
		g->p.mb_proc = atof(g->p.mb_proc_str);
		BUG_ON(g->p.mb_proc < 0);
	}

	/* The locked portion cannot exceed the per-process size: */
	if (g->p.mb_proc_locked_str) {
		g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str);
		BUG_ON(g->p.mb_proc_locked < 0);
		BUG_ON(g->p.mb_proc_locked > g->p.mb_proc);
	}

	if (g->p.mb_thread_str) {
		g->p.mb_thread = atof(g->p.mb_thread_str);
		BUG_ON(g->p.mb_thread < 0);
	}

	BUG_ON(g->p.nr_threads <= 0);
	BUG_ON(g->p.nr_proc <= 0);

	g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads;

	/* Convert the MB parameters into byte counts: */
	g->p.bytes_global = g->p.mb_global *1024L*1024L;
	g->p.bytes_process = g->p.mb_proc *1024L*1024L;
	g->p.bytes_process_locked = g->p.mb_proc_locked *1024L*1024L;
	g->p.bytes_thread = g->p.mb_thread *1024L*1024L;

	g->data = setup_shared_data(g->p.bytes_global);

	/* Startup serialization: */
	init_global_mutex(&g->start_work_mutex);
	init_global_mutex(&g->startup_mutex);
	init_global_mutex(&g->startup_done_mutex);
	init_global_mutex(&g->stop_work_mutex);

	init_thread_data();

	tprintf("#\n");
	if (parse_setup_cpu_list() || parse_setup_node_list())
		return -1;
	tprintf("#\n");

	print_summary();

	return 0;
}
/*
 * Tear down everything init() set up, in reverse dependency order:
 * the global data buffer first, then the thread_data array (which
 * dereferences g), and finally the shared global state itself.
 */
static void deinit(void)
{
	free_data(g->data, g->p.bytes_global);
	g->data = NULL;

	deinit_thread_data();

	free_data(g, sizeof(*g));
	g = NULL;
}
  1143. /*
  1144. * Print a short or long result, depending on the verbosity setting:
  1145. */
  1146. static void print_res(const char *name, double val,
  1147. const char *txt_unit, const char *txt_short, const char *txt_long)
  1148. {
  1149. if (!name)
  1150. name = "main,";
  1151. if (!g->p.show_quiet)
  1152. printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short);
  1153. else
  1154. printf(" %14.3f %s\n", val, txt_long);
  1155. }
/*
 * Run one complete benchmark: fork the worker processes, optionally
 * serialize their startup, wait for them to finish, then aggregate
 * and print all results.
 *
 * @name: label prefixed to every result line (NULL -> "main,").
 * Returns 0 on success, -1 if init() rejected the parameters.
 */
static int __bench_numa(const char *name)
{
	struct timeval start, stop, diff;
	u64 runtime_ns_min, runtime_ns_sum;
	pid_t *pids, pid, wpid;
	double delta_runtime;
	double runtime_avg;
	double runtime_sec_max;
	double runtime_sec_min;
	int wait_stat;
	double bytes;
	int i, t, p;

	if (init())
		return -1;

	pids = zalloc(g->p.nr_proc * sizeof(*pids));
	pid = -1;

	/* All threads try to acquire it, this way we can wait for them to start up: */
	pthread_mutex_lock(&g->start_work_mutex);

	if (g->p.serialize_startup) {
		tprintf(" #\n");
		tprintf(" # Startup synchronization: ..."); fflush(stdout);
	}

	gettimeofday(&start, NULL);

	/* Fork one worker process per -p; children never return from here: */
	for (i = 0; i < g->p.nr_proc; i++) {
		pid = fork();
		dprintf(" # process %2d: PID %d\n", i, pid);

		BUG_ON(pid < 0);
		if (!pid) {
			/* Child process: */
			worker_process(i);

			exit(0);
		}
		pids[i] = pid;
	}
	/* Wait for all the threads to start up: */
	while (g->nr_tasks_started != g->p.nr_tasks)
		usleep(USEC_PER_MSEC);

	BUG_ON(g->nr_tasks_started != g->p.nr_tasks);

	if (g->p.serialize_startup) {
		double startup_sec;

		/*
		 * Double-lock barrier: the first lock below succeeds; the
		 * second blocks until the last worker thread unlocks
		 * startup_done_mutex in worker_thread().
		 */
		pthread_mutex_lock(&g->startup_done_mutex);

		/* This will start all threads: */
		pthread_mutex_unlock(&g->start_work_mutex);

		/* This mutex is locked - the last started thread will wake us: */
		pthread_mutex_lock(&g->startup_done_mutex);

		gettimeofday(&stop, NULL);

		timersub(&stop, &start, &diff);

		startup_sec = diff.tv_sec * NSEC_PER_SEC;
		startup_sec += diff.tv_usec * NSEC_PER_USEC;
		startup_sec /= NSEC_PER_SEC;

		tprintf(" threads initialized in %.6f seconds.\n", startup_sec);
		tprintf(" #\n");

		/* Measure the benchmark proper from this point on: */
		start = stop;
		pthread_mutex_unlock(&g->startup_done_mutex);
	} else {
		gettimeofday(&start, NULL);
	}

	/* Parent process: */

	for (i = 0; i < g->p.nr_proc; i++) {
		wpid = waitpid(pids[i], &wait_stat, 0);
		BUG_ON(wpid < 0);
		BUG_ON(!WIFEXITED(wait_stat));
	}

	/* Aggregate the per-thread runtimes (written via shared memory): */
	runtime_ns_sum = 0;
	runtime_ns_min = -1LL;	/* ~0: larger-than-anything start value */

	for (t = 0; t < g->p.nr_tasks; t++) {
		u64 thread_runtime_ns = g->threads[t].runtime_ns;

		runtime_ns_sum += thread_runtime_ns;
		runtime_ns_min = min(thread_runtime_ns, runtime_ns_min);
	}

	gettimeofday(&stop, NULL);
	timersub(&stop, &start, &diff);

	BUG_ON(bench_format != BENCH_FORMAT_DEFAULT);

	tprintf("\n ###\n");
	tprintf("\n");

	runtime_sec_max = diff.tv_sec * NSEC_PER_SEC;
	runtime_sec_max += diff.tv_usec * NSEC_PER_USEC;
	runtime_sec_max /= NSEC_PER_SEC;

	runtime_sec_min = runtime_ns_min / NSEC_PER_SEC;

	bytes = g->bytes_done;
	runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / NSEC_PER_SEC;

	if (g->p.measure_convergence) {
		print_res(name, runtime_sec_max,
			"secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge");
	}

	print_res(name, runtime_sec_max,
		"secs,", "runtime-max/thread", "secs slowest (max) thread-runtime");

	print_res(name, runtime_sec_min,
		"secs,", "runtime-min/thread", "secs fastest (min) thread-runtime");

	print_res(name, runtime_avg,
		"secs,", "runtime-avg/thread", "secs average thread-runtime");

	/* (max-min)/2 approximates the max-to-average spread: */
	delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0;
	print_res(name, delta_runtime / runtime_sec_max * 100.0,
		"%,", "spread-runtime/thread", "% difference between max/avg runtime");

	print_res(name, bytes / g->p.nr_tasks / 1e9,
		"GB,", "data/thread", "GB data processed, per thread");

	print_res(name, bytes / 1e9,
		"GB,", "data-total", "GB data processed, total");

	print_res(name, runtime_sec_max * NSEC_PER_SEC / (bytes / g->p.nr_tasks),
		"nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime");

	print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max,
		"GB/sec,", "thread-speed", "GB/sec/thread speed");

	print_res(name, bytes / runtime_sec_max / 1e9,
		"GB/sec,", "total-speed", "GB/sec total speed");

	/* Very verbose mode: per-thread speed and CPU time breakdown: */
	if (g->p.show_details >= 2) {
		char tname[14 + 2 * 10 + 1];	/* "processN:threadM" worst case */
		struct thread_data *td;
		for (p = 0; p < g->p.nr_proc; p++) {
			for (t = 0; t < g->p.nr_threads; t++) {
				memset(tname, 0, sizeof(tname));
				td = g->threads + p*g->p.nr_threads + t;
				snprintf(tname, sizeof(tname), "process%d:thread%d", p, t);
				print_res(tname, td->speed_gbs,
					"GB/sec", "thread-speed", "GB/sec/thread speed");
				print_res(tname, td->system_time_ns / NSEC_PER_SEC,
					"secs", "thread-system-time", "system CPU time/thread");
				print_res(tname, td->user_time_ns / NSEC_PER_SEC,
					"secs", "thread-user-time", "user CPU time/thread");
			}
		}
	}

	free(pids);

	deinit();

	return 0;
}
  1281. #define MAX_ARGS 50
  1282. static int command_size(const char **argv)
  1283. {
  1284. int size = 0;
  1285. while (*argv) {
  1286. size++;
  1287. argv++;
  1288. }
  1289. BUG_ON(size >= MAX_ARGS);
  1290. return size;
  1291. }
  1292. static void init_params(struct params *p, const char *name, int argc, const char **argv)
  1293. {
  1294. int i;
  1295. printf("\n # Running %s \"perf bench numa", name);
  1296. for (i = 0; i < argc; i++)
  1297. printf(" %s", argv[i]);
  1298. printf("\"\n");
  1299. memset(p, 0, sizeof(*p));
  1300. /* Initialize nonzero defaults: */
  1301. p->serialize_startup = 1;
  1302. p->data_reads = true;
  1303. p->data_writes = true;
  1304. p->data_backwards = true;
  1305. p->data_rand_walk = true;
  1306. p->nr_loops = -1;
  1307. p->init_random = true;
  1308. p->mb_global_str = "1";
  1309. p->nr_proc = 1;
  1310. p->nr_threads = 1;
  1311. p->nr_secs = 5;
  1312. p->run_all = argc == 1;
  1313. }
  1314. static int run_bench_numa(const char *name, const char **argv)
  1315. {
  1316. int argc = command_size(argv);
  1317. init_params(&p0, name, argc, argv);
  1318. argc = parse_options(argc, argv, options, bench_numa_usage, 0);
  1319. if (argc)
  1320. goto err;
  1321. if (__bench_numa(name))
  1322. goto err;
  1323. return 0;
  1324. err:
  1325. return -1;
  1326. }
  1327. #define OPT_BW_RAM "-s", "20", "-zZq", "--thp", " 1", "--no-data_rand_walk"
  1328. #define OPT_BW_RAM_NOTHP OPT_BW_RAM, "--thp", "-1"
  1329. #define OPT_CONV "-s", "100", "-zZ0qcm", "--thp", " 1"
  1330. #define OPT_CONV_NOTHP OPT_CONV, "--thp", "-1"
  1331. #define OPT_BW "-s", "20", "-zZ0q", "--thp", " 1"
  1332. #define OPT_BW_NOTHP OPT_BW, "--thp", "-1"
  1333. /*
  1334. * The built-in test-suite executed by "perf bench numa -a".
  1335. *
  1336. * (A minimum of 4 nodes and 16 GB of RAM is recommended.)
  1337. */
/*
 * Each entry: argv[0] is the test's display name (note the trailing
 * comma used by print_res()), the rest is the "perf bench numa mem"
 * argument list for that test, padded with NULLs up to MAX_ARGS.
 */
static const char *tests[][MAX_ARGS] = {
/* Basic single-stream NUMA bandwidth measurements: */
{ "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024",
"-C" , "0", "-M", "0", OPT_BW_RAM },
{ "RAM-bw-local-NOTHP,",
"mem", "-p", "1", "-t", "1", "-P", "1024",
"-C" , "0", "-M", "0", OPT_BW_RAM_NOTHP },
{ "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024",
"-C" , "0", "-M", "1", OPT_BW_RAM },
/* 2-stream NUMA bandwidth measurements: */
{ "RAM-bw-local-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024",
"-C", "0,2", "-M", "0x2", OPT_BW_RAM },
{ "RAM-bw-remote-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024",
"-C", "0,2", "-M", "1x2", OPT_BW_RAM },
/* Cross-stream NUMA bandwidth measurement: */
{ "RAM-bw-cross,", "mem", "-p", "2", "-t", "1", "-P", "1024",
"-C", "0,8", "-M", "1,0", OPT_BW_RAM },
/* Convergence latency measurements: */
{ " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV },
{ " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV },
{ " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV },
{ " 2x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV },
{ " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV },
{ " 4x4-convergence,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV },
{ " 4x4-convergence-NOTHP,",
"mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV_NOTHP },
{ " 4x6-convergence,", "mem", "-p", "4", "-t", "6", "-P", "1020", OPT_CONV },
{ " 4x8-convergence,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_CONV },
{ " 8x4-convergence,", "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV },
{ " 8x4-convergence-NOTHP,",
"mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV_NOTHP },
{ " 3x1-convergence,", "mem", "-p", "3", "-t", "1", "-P", "512", OPT_CONV },
{ " 4x1-convergence,", "mem", "-p", "4", "-t", "1", "-P", "512", OPT_CONV },
{ " 8x1-convergence,", "mem", "-p", "8", "-t", "1", "-P", "512", OPT_CONV },
{ "16x1-convergence,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_CONV },
{ "32x1-convergence,", "mem", "-p", "32", "-t", "1", "-P", "128", OPT_CONV },
/* Various NUMA process/thread layout bandwidth measurements: */
{ " 2x1-bw-process,", "mem", "-p", "2", "-t", "1", "-P", "1024", OPT_BW },
{ " 3x1-bw-process,", "mem", "-p", "3", "-t", "1", "-P", "1024", OPT_BW },
{ " 4x1-bw-process,", "mem", "-p", "4", "-t", "1", "-P", "1024", OPT_BW },
{ " 8x1-bw-process,", "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW },
{ " 8x1-bw-process-NOTHP,",
"mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP },
{ "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW },
{ " 4x1-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW },
{ " 8x1-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW },
{ "16x1-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW },
{ "32x1-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW },
{ " 2x3-bw-thread,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW },
{ " 4x4-bw-thread,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW },
{ " 4x6-bw-thread,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW },
{ " 4x8-bw-thread,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW },
{ " 4x8-bw-thread-NOTHP,",
"mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP },
{ " 3x3-bw-thread,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW },
{ " 5x5-bw-thread,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW },
{ "2x16-bw-thread,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW },
{ "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW },
/* Replications of the classic numa01/numa02 kernel test workloads: */
{ "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW },
{ "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP },
{ "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW },
{ "numa01-bw-thread-NOTHP,",
"mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW_NOTHP },
};
  1402. static int bench_all(void)
  1403. {
  1404. int nr = ARRAY_SIZE(tests);
  1405. int ret;
  1406. int i;
  1407. ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'");
  1408. BUG_ON(ret < 0);
  1409. for (i = 0; i < nr; i++) {
  1410. run_bench_numa(tests[i][0], tests[i] + 1);
  1411. }
  1412. printf("\n");
  1413. return 0;
  1414. }
  1415. int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused)
  1416. {
  1417. init_params(&p0, "main,", argc, argv);
  1418. argc = parse_options(argc, argv, options, bench_numa_usage, 0);
  1419. if (argc)
  1420. goto err;
  1421. if (p0.run_all)
  1422. return bench_all();
  1423. if (__bench_numa(NULL))
  1424. goto err;
  1425. return 0;
  1426. err:
  1427. usage_with_options(numa_usage, options);
  1428. return -1;
  1429. }