cqm.c

/*
 * Intel Cache Quality-of-Service Monitoring (CQM) support.
 *
 * Based very, very heavily on work by Peter Zijlstra.
 */
#include <linux/perf_event.h>
#include <linux/slab.h>
#include <asm/cpu_device_id.h>
#include <asm/intel_rdt_common.h>
#include "../perf_event.h"

#define MSR_IA32_QM_CTR		0x0c8e
#define MSR_IA32_QM_EVTSEL	0x0c8d

#define MBM_CNTR_WIDTH		24
/*
 * Guaranteed time in ms as per SDM where MBM counters will not overflow.
 */
#define MBM_CTR_OVERFLOW_TIME	1000

static u32 cqm_max_rmid = -1;
static unsigned int cqm_l3_scale; /* supposedly cacheline size */
static bool cqm_enabled, mbm_enabled;
unsigned int mbm_socket_max;

/*
 * The cached intel_pqr_state is strictly per CPU and can never be
 * updated from a remote CPU. Both functions which modify the state
 * (intel_cqm_event_start and intel_cqm_event_stop) are called with
 * interrupts disabled, which is sufficient for the protection.
 */
DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
static struct hrtimer *mbm_timers;
/**
 * struct sample - mbm event's (local or total) data
 * @total_bytes	#bytes since we began monitoring
 * @prev_msr	previous value of MSR
 */
struct sample {
	u64	total_bytes;
	u64	prev_msr;
};

/*
 * samples profiled for total memory bandwidth type events
 */
static struct sample *mbm_total;

/*
 * samples profiled for local memory bandwidth type events
 */
static struct sample *mbm_local;

#define pkg_id	topology_physical_package_id(smp_processor_id())
/*
 * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array.
 * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of
 * rmids per socket, an example is given below
 * RMID1 of Socket0:  vrmid = 1
 * RMID1 of Socket1:  vrmid = 1 * (cqm_max_rmid + 1) + 1
 * RMID1 of Socket2:  vrmid = 2 * (cqm_max_rmid + 1) + 1
 */
#define rmid_2_index(rmid)	((pkg_id * (cqm_max_rmid + 1)) + rmid)
/*
 * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
 * Also protects event->hw.cqm_rmid
 *
 * Hold either for stability, both for modification of ->hw.cqm_rmid.
 */
static DEFINE_MUTEX(cache_mutex);
static DEFINE_RAW_SPINLOCK(cache_lock);

/*
 * Groups of events that have the same target(s), one RMID per group.
 */
static LIST_HEAD(cache_groups);

/*
 * Mask of CPUs for reading CQM values. We only need one per-socket.
 */
static cpumask_t cqm_cpumask;

#define RMID_VAL_ERROR		(1ULL << 63)
#define RMID_VAL_UNAVAIL	(1ULL << 62)

/*
 * Event IDs are used to program IA32_QM_EVTSEL before reading event
 * counter from IA32_QM_CTR
 */
#define QOS_L3_OCCUP_EVENT_ID	0x01
#define QOS_MBM_TOTAL_EVENT_ID	0x02
#define QOS_MBM_LOCAL_EVENT_ID	0x03

/*
 * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
 *
 * This rmid is always free and is guaranteed to have an associated
 * near-zero occupancy value, i.e. no cachelines are tagged with this
 * RMID, once __intel_cqm_rmid_rotate() returns.
 */
static u32 intel_cqm_rotation_rmid;

#define INVALID_RMID		(-1)

/*
 * Is @rmid valid for programming the hardware?
 *
 * rmid 0 is reserved by the hardware for all non-monitored tasks, which
 * means that we should never come across an rmid with that value.
 * Likewise, an rmid value of -1 is used to indicate "no rmid currently
 * assigned" and is used as part of the rotation code.
 */
static inline bool __rmid_valid(u32 rmid)
{
	if (!rmid || rmid == INVALID_RMID)
		return false;

	return true;
}
static u64 __rmid_read(u32 rmid)
{
	u64 val;

	/*
	 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
	 * it just says that to increase confusion.
	 */
	wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
	rdmsrl(MSR_IA32_QM_CTR, val);

	/*
	 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
	 * the number of cachelines tagged with @rmid.
	 */
	return val;
}

enum rmid_recycle_state {
	RMID_YOUNG = 0,
	RMID_AVAILABLE,
	RMID_DIRTY,
};

struct cqm_rmid_entry {
	u32 rmid;
	enum rmid_recycle_state state;
	struct list_head list;
	unsigned long queue_time;
};
/*
 * cqm_rmid_free_lru - A least recently used list of RMIDs.
 *
 * Oldest entry at the head, newest (most recently used) entry at the
 * tail. This list is never traversed, it's only used to keep track of
 * the lru order. That is, we only pick entries of the head or insert
 * them on the tail.
 *
 * All entries on the list are 'free', and their RMIDs are not currently
 * in use. To mark an RMID as in use, remove its entry from the lru
 * list.
 *
 *
 * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
 *
 * This list contains RMIDs that no one is currently using but that
 * may have a non-zero occupancy value associated with them. The
 * rotation worker moves RMIDs from the limbo list to the free list once
 * the occupancy value drops below __intel_cqm_threshold.
 *
 * Both lists are protected by cache_mutex.
 */
static LIST_HEAD(cqm_rmid_free_lru);
static LIST_HEAD(cqm_rmid_limbo_lru);
/*
 * We use a simple array of pointers so that we can lookup a struct
 * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
 * and __put_rmid() from having to worry about dealing with struct
 * cqm_rmid_entry - they just deal with rmids, i.e. integers.
 *
 * Once this array is initialized it is read-only. No locks are required
 * to access it.
 *
 * All entries for all RMIDs can be looked up in this array at all
 * times.
 */
static struct cqm_rmid_entry **cqm_rmid_ptrs;

static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
{
	struct cqm_rmid_entry *entry;

	entry = cqm_rmid_ptrs[rmid];
	WARN_ON(entry->rmid != rmid);

	return entry;
}
/*
 * Returns < 0 on fail.
 *
 * We expect to be called with cache_mutex held.
 */
static u32 __get_rmid(void)
{
	struct cqm_rmid_entry *entry;

	lockdep_assert_held(&cache_mutex);

	if (list_empty(&cqm_rmid_free_lru))
		return INVALID_RMID;

	entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
	list_del(&entry->list);

	return entry->rmid;
}

static void __put_rmid(u32 rmid)
{
	struct cqm_rmid_entry *entry;

	lockdep_assert_held(&cache_mutex);

	WARN_ON(!__rmid_valid(rmid));
	entry = __rmid_entry(rmid);

	entry->queue_time = jiffies;
	entry->state = RMID_YOUNG;

	list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
}

static void cqm_cleanup(void)
{
	int i;

	if (!cqm_rmid_ptrs)
		return;

	for (i = 0; i < cqm_max_rmid; i++)
		kfree(cqm_rmid_ptrs[i]);

	kfree(cqm_rmid_ptrs);
	cqm_rmid_ptrs = NULL;
	cqm_enabled = false;
}
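
/*
 * Allocate the RMID lookup table and one cqm_rmid_entry per RMID, put
 * every RMID except the reserved RMID 0 on the free list, and set aside
 * one RMID for rotation (intel_cqm_rotation_rmid).
 */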
static int intel_cqm_setup_rmid_cache(void)
{
	struct cqm_rmid_entry *entry;
	unsigned int nr_rmids;
	int r = 0;

	nr_rmids = cqm_max_rmid + 1;
	cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) *
				nr_rmids, GFP_KERNEL);
	if (!cqm_rmid_ptrs)
		return -ENOMEM;

	for (; r <= cqm_max_rmid; r++) {
		struct cqm_rmid_entry *entry;

		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
		if (!entry)
			goto fail;

		INIT_LIST_HEAD(&entry->list);
		entry->rmid = r;
		cqm_rmid_ptrs[r] = entry;

		list_add_tail(&entry->list, &cqm_rmid_free_lru);
	}

	/*
	 * RMID 0 is special and is always allocated. It's used for all
	 * tasks that are not monitored.
	 */
	entry = __rmid_entry(0);
	list_del(&entry->list);

	mutex_lock(&cache_mutex);
	intel_cqm_rotation_rmid = __get_rmid();
	mutex_unlock(&cache_mutex);

	return 0;

fail:
	cqm_cleanup();
	return -ENOMEM;
}

/*
 * Determine if @a and @b measure the same set of tasks.
 *
 * If @a and @b measure the same set of tasks then we want to share a
 * single RMID.
 */
static bool __match_event(struct perf_event *a, struct perf_event *b)
{
	/* Per-cpu and task events don't mix */
	if ((a->attach_state & PERF_ATTACH_TASK) !=
	    (b->attach_state & PERF_ATTACH_TASK))
		return false;

#ifdef CONFIG_CGROUP_PERF
	if (a->cgrp != b->cgrp)
		return false;
#endif

	/* If not task event, we're machine wide */
	if (!(b->attach_state & PERF_ATTACH_TASK))
		return true;

	/*
	 * Events that target same task are placed into the same cache group.
	 * Mark it as a multi event group, so that we update ->count
	 * for every event rather than just the group leader later.
	 */
	if (a->hw.target == b->hw.target) {
		b->hw.is_group_event = true;
		return true;
	}

	/*
	 * Are we an inherited event?
	 */
	if (b->parent == a)
		return true;

	return false;
}

#ifdef CONFIG_CGROUP_PERF
static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
{
	if (event->attach_state & PERF_ATTACH_TASK)
		return perf_cgroup_from_task(event->hw.target, event->ctx);

	return event->cgrp;
}
#endif

/*
 * Determine if @a's tasks intersect with @b's tasks
 *
 * There are combinations of events that we explicitly prohibit,
 *
 * PROHIBITS
 *   system-wide -> cgroup and task
 *   cgroup      -> system-wide
 *               -> task in cgroup
 *   task        -> system-wide
 *               -> task in cgroup
 *
 * Call this function before allocating an RMID.
 */
static bool __conflict_event(struct perf_event *a, struct perf_event *b)
{
#ifdef CONFIG_CGROUP_PERF
	/*
	 * We can have any number of cgroups but only one system-wide
	 * event at a time.
	 */
	if (a->cgrp && b->cgrp) {
		struct perf_cgroup *ac = a->cgrp;
		struct perf_cgroup *bc = b->cgrp;

		/*
		 * This condition should have been caught in
		 * __match_event() and we should be sharing an RMID.
		 */
		WARN_ON_ONCE(ac == bc);

		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
			return true;

		return false;
	}

	if (a->cgrp || b->cgrp) {
		struct perf_cgroup *ac, *bc;

		/*
		 * cgroup and system-wide events are mutually exclusive
		 */
		if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
		    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
			return true;

		/*
		 * Ensure neither event is part of the other's cgroup
		 */
		ac = event_to_cgroup(a);
		bc = event_to_cgroup(b);
		if (ac == bc)
			return true;

		/*
		 * Must have cgroup and non-intersecting task events.
		 */
		if (!ac || !bc)
			return false;
		/*
		 * We have cgroup and task events, and the task belongs
		 * to a cgroup. Check for overlap.
		 */
		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
			return true;

		return false;
	}
#endif
	/*
	 * If one of them is not a task, same story as above with cgroups.
	 */
	if (!(a->attach_state & PERF_ATTACH_TASK) ||
	    !(b->attach_state & PERF_ATTACH_TASK))
		return true;

	/*
	 * Must be non-overlapping.
	 */
	return false;
}

struct rmid_read {
	u32 rmid;
	u32 evt_type;
	atomic64_t value;
};

static void __intel_cqm_event_count(void *info);
static void init_mbm_sample(u32 rmid, u32 evt_type);
static void __intel_mbm_event_count(void *info);

static bool is_cqm_event(int e)
{
	return (e == QOS_L3_OCCUP_EVENT_ID);
}

static bool is_mbm_event(int e)
{
	return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID);
}
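
/*
 * Read the counter for @rr->rmid on one CPU of each socket (cqm_cpumask),
 * dispatching to the MBM or CQM reader depending on the event type; the
 * per-socket results are accumulated in @rr->value.
 */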
static void cqm_mask_call(struct rmid_read *rr)
{
	if (is_mbm_event(rr->evt_type))
		on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1);
	else
		on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1);
}

/*
 * Exchange the RMID of a group of events.
 */
static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
{
	struct perf_event *event;
	struct list_head *head = &group->hw.cqm_group_entry;
	u32 old_rmid = group->hw.cqm_rmid;

	lockdep_assert_held(&cache_mutex);

	/*
	 * If our RMID is being deallocated, perform a read now.
	 */
	if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
		struct rmid_read rr = {
			.rmid = old_rmid,
			.evt_type = group->attr.config,
			.value = ATOMIC64_INIT(0),
		};

		cqm_mask_call(&rr);
		local64_set(&group->count, atomic64_read(&rr.value));
	}

	raw_spin_lock_irq(&cache_lock);

	group->hw.cqm_rmid = rmid;
	list_for_each_entry(event, head, hw.cqm_group_entry)
		event->hw.cqm_rmid = rmid;

	raw_spin_unlock_irq(&cache_lock);

	/*
	 * If the allocation is for mbm, init the mbm stats.
	 * Need to check whether each event in the group is an mbm event,
	 * because there could be multiple types of events in the same group.
	 */
	if (__rmid_valid(rmid)) {
		event = group;
		if (is_mbm_event(event->attr.config))
			init_mbm_sample(rmid, event->attr.config);

		list_for_each_entry(event, head, hw.cqm_group_entry) {
			if (is_mbm_event(event->attr.config))
				init_mbm_sample(rmid, event->attr.config);
		}
	}

	return old_rmid;
}

/*
 * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
 * cachelines are still tagged with RMIDs in limbo, we progressively
 * increment the threshold until we find an RMID in limbo with <=
 * __intel_cqm_threshold lines tagged. This is designed to mitigate the
 * problem where cachelines tagged with an RMID are not steadily being
 * evicted.
 *
 * On successful rotations we decrease the threshold back towards zero.
 *
 * __intel_cqm_max_threshold provides an upper bound on the threshold,
 * and is measured in bytes because it's exposed to userland.
 */
static unsigned int __intel_cqm_threshold;
static unsigned int __intel_cqm_max_threshold;

/*
 * Test whether an RMID has a zero occupancy value on this cpu.
 */
static void intel_cqm_stable(void *arg)
{
	struct cqm_rmid_entry *entry;

	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
		if (entry->state != RMID_AVAILABLE)
			break;

		if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
			entry->state = RMID_DIRTY;
	}
}

/*
 * If we have group events waiting for an RMID that don't conflict with
 * events already running, assign @rmid.
 */
static bool intel_cqm_sched_in_event(u32 rmid)
{
	struct perf_event *leader, *event;

	lockdep_assert_held(&cache_mutex);

	leader = list_first_entry(&cache_groups, struct perf_event,
				  hw.cqm_groups_entry);
	event = leader;

	list_for_each_entry_continue(event, &cache_groups,
				     hw.cqm_groups_entry) {
		if (__rmid_valid(event->hw.cqm_rmid))
			continue;

		if (__conflict_event(event, leader))
			continue;

		intel_cqm_xchg_rmid(event, rmid);
		return true;
	}

	return false;
}

/*
 * Initially use this constant for both the limbo queue time and the
 * rotation timer interval, pmu::hrtimer_interval_ms.
 *
 * They don't need to be the same, but the two are related since if you
 * rotate faster than you recycle RMIDs, you may run out of available
 * RMIDs.
 */
#define RMID_DEFAULT_QUEUE_TIME 250	/* ms */

static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;

/*
 * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
 * @nr_available: number of freeable RMIDs on the limbo list
 *
 * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
 * cachelines are tagged with those RMIDs. After this we can reuse them
 * and know that the current set of active RMIDs is stable.
 *
 * Return %true or %false depending on whether stabilization needs to be
 * reattempted.
 *
 * If we return %true then @nr_available is updated to indicate the
 * number of RMIDs on the limbo list that have been queued for the
 * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
 * are above __intel_cqm_threshold.
 */
static bool intel_cqm_rmid_stabilize(unsigned int *available)
{
	struct cqm_rmid_entry *entry, *tmp;

	lockdep_assert_held(&cache_mutex);

	*available = 0;
	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
		unsigned long min_queue_time;
		unsigned long now = jiffies;

		/*
		 * We hold RMIDs placed into limbo for a minimum queue
		 * time. Before the minimum queue time has elapsed we do
		 * not recycle RMIDs.
		 *
		 * The reasoning is that until a sufficient time has
		 * passed since we stopped using an RMID, any RMID
		 * placed onto the limbo list will likely still have
		 * data tagged in the cache, which means we'll probably
		 * fail to recycle it anyway.
		 *
		 * We can save ourselves an expensive IPI by skipping
		 * any RMIDs that have not been queued for the minimum
		 * time.
		 */
		min_queue_time = entry->queue_time +
			msecs_to_jiffies(__rmid_queue_time_ms);

		if (time_after(min_queue_time, now))
			break;

		entry->state = RMID_AVAILABLE;
		(*available)++;
	}

	/*
	 * Fast return if none of the RMIDs on the limbo list have been
	 * sitting on the queue for the minimum queue time.
	 */
	if (!*available)
		return false;

	/*
	 * Test whether an RMID is free for each package.
	 */
	on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);

	list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
		/*
		 * Exhausted all RMIDs that have waited min queue time.
		 */
		if (entry->state == RMID_YOUNG)
			break;

		if (entry->state == RMID_DIRTY)
			continue;

		list_del(&entry->list);	/* remove from limbo */

		/*
		 * The rotation RMID gets priority if it's
		 * currently invalid. In which case, skip adding
		 * the RMID to the free lru.
		 */
		if (!__rmid_valid(intel_cqm_rotation_rmid)) {
			intel_cqm_rotation_rmid = entry->rmid;
			continue;
		}

		/*
		 * If we have groups waiting for RMIDs, hand
		 * them one now provided they don't conflict.
		 */
		if (intel_cqm_sched_in_event(entry->rmid))
			continue;

		/*
		 * Otherwise place it onto the free list.
		 */
		list_add_tail(&entry->list, &cqm_rmid_free_lru);
	}

	return __rmid_valid(intel_cqm_rotation_rmid);
}

/*
 * Pick a victim group and move it to the tail of the group list.
 * @next: The first group without an RMID
 */
static void __intel_cqm_pick_and_rotate(struct perf_event *next)
{
	struct perf_event *rotor;
	u32 rmid;

	lockdep_assert_held(&cache_mutex);

	rotor = list_first_entry(&cache_groups, struct perf_event,
				 hw.cqm_groups_entry);

	/*
	 * The group at the front of the list should always have a valid
	 * RMID. If it doesn't then no groups have RMIDs assigned and we
	 * don't need to rotate the list.
	 */
	if (next == rotor)
		return;

	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
	__put_rmid(rmid);

	list_rotate_left(&cache_groups);
}

/*
 * Deallocate the RMIDs from any events that conflict with @event, and
 * place them on the back of the group list.
 */
static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
{
	struct perf_event *group, *g;
	u32 rmid;

	lockdep_assert_held(&cache_mutex);

	list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
		if (group == event)
			continue;

		rmid = group->hw.cqm_rmid;

		/*
		 * Skip events that don't have a valid RMID.
		 */
		if (!__rmid_valid(rmid))
			continue;

		/*
		 * No conflict? No problem! Leave the event alone.
		 */
		if (!__conflict_event(group, event))
			continue;

		intel_cqm_xchg_rmid(group, INVALID_RMID);
		__put_rmid(rmid);
	}
}

/*
 * Attempt to rotate the groups and assign new RMIDs.
 *
 * We rotate for two reasons,
 *   1. To handle the scheduling of conflicting events
 *   2. To recycle RMIDs
 *
 * Rotating RMIDs is complicated because the hardware doesn't give us
 * any clues.
 *
 * There's problems with the hardware interface; when you change the
 * task:RMID map cachelines retain their 'old' tags, giving a skewed
 * picture. In order to work around this, we must always keep one free
 * RMID - intel_cqm_rotation_rmid.
 *
 * Rotation works by taking away an RMID from a group (the old RMID),
 * and assigning the free RMID to another group (the new RMID). We must
 * then wait for the old RMID to not be used (no cachelines tagged).
 * This ensures that all cachelines are tagged with 'active' RMIDs. At
 * this point we can start reading values for the new RMID and treat the
 * old RMID as the free RMID for the next rotation.
 *
 * Return %true or %false depending on whether we did any rotating.
 */
static bool __intel_cqm_rmid_rotate(void)
{
	struct perf_event *group, *start = NULL;
	unsigned int threshold_limit;
	unsigned int nr_needed = 0;
	unsigned int nr_available;
	bool rotated = false;

	mutex_lock(&cache_mutex);

again:
	/*
	 * Fast path through this function if there are no groups and no
	 * RMIDs that need cleaning.
	 */
	if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
		goto out;

	list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
		if (!__rmid_valid(group->hw.cqm_rmid)) {
			if (!start)
				start = group;
			nr_needed++;
		}
	}

	/*
	 * We have some event groups, but they all have RMIDs assigned
	 * and no RMIDs need cleaning.
	 */
	if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
		goto out;

	if (!nr_needed)
		goto stabilize;

	/*
	 * We have more event groups without RMIDs than available RMIDs,
	 * or we have event groups that conflict with the ones currently
	 * scheduled.
	 *
	 * We force deallocate the rmid of the group at the head of
	 * cache_groups. The first event group without an RMID then gets
	 * assigned intel_cqm_rotation_rmid. This ensures we always make
	 * forward progress.
	 *
	 * Rotate the cache_groups list so the previous head is now the
	 * tail.
	 */
	__intel_cqm_pick_and_rotate(start);

	/*
	 * If the rotation is going to succeed, reduce the threshold so
	 * that we don't needlessly reuse dirty RMIDs.
	 */
	if (__rmid_valid(intel_cqm_rotation_rmid)) {
		intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
		intel_cqm_rotation_rmid = __get_rmid();

		intel_cqm_sched_out_conflicting_events(start);

		if (__intel_cqm_threshold)
			__intel_cqm_threshold--;
	}

	rotated = true;

stabilize:
	/*
	 * We now need to stabilize the RMID we freed above (if any) to
	 * ensure that the next time we rotate we have an RMID with zero
	 * occupancy value.
	 *
	 * Alternatively, if we didn't need to perform any rotation,
	 * we'll have a bunch of RMIDs in limbo that need stabilizing.
	 */
	threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;

	while (intel_cqm_rmid_stabilize(&nr_available) &&
	       __intel_cqm_threshold < threshold_limit) {
		unsigned int steal_limit;

		/*
		 * Don't spin if nobody is actively waiting for an RMID,
		 * the rotation worker will be kicked as soon as an
		 * event needs an RMID anyway.
		 */
		if (!nr_needed)
			break;

		/* Allow max 25% of RMIDs to be in limbo. */
		steal_limit = (cqm_max_rmid + 1) / 4;

		/*
		 * We failed to stabilize any RMIDs so our rotation
		 * logic is now stuck. In order to make forward progress
		 * we have a few options:
		 *
		 * 1. rotate ("steal") another RMID
		 * 2. increase the threshold
		 * 3. do nothing
		 *
		 * We do both of 1. and 2. until we hit the steal limit.
		 *
		 * The steal limit prevents all RMIDs ending up on the
		 * limbo list. This can happen if every RMID has a
		 * non-zero occupancy above threshold_limit, and the
		 * occupancy values aren't dropping fast enough.
		 *
		 * Note that there is prioritisation at work here - we'd
		 * rather increase the number of RMIDs on the limbo list
		 * than increase the threshold, because increasing the
		 * threshold skews the event data (because we reuse
		 * dirty RMIDs) - threshold bumps are a last resort.
		 */
		if (nr_available < steal_limit)
			goto again;

		__intel_cqm_threshold++;
	}

out:
	mutex_unlock(&cache_mutex);
	return rotated;
}

static void intel_cqm_rmid_rotate(struct work_struct *work);

static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);

static struct pmu intel_cqm_pmu;

static void intel_cqm_rmid_rotate(struct work_struct *work)
{
	unsigned long delay;

	__intel_cqm_rmid_rotate();

	delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
	schedule_delayed_work(&intel_cqm_rmid_work, delay);
}
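
/*
 * Read the MBM counter for @rmid and fold the delta since the previous
 * read into the cached per-socket sample. The raw counter is only
 * MBM_CNTR_WIDTH (24) bits wide, so the delta is computed modulo 2^24
 * before being scaled to bytes with cqm_l3_scale. When @first is set the
 * sample is (re)initialized instead of accumulated.
 */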
static u64 update_sample(unsigned int rmid, u32 evt_type, int first)
{
	struct sample *mbm_current;
	u32 vrmid = rmid_2_index(rmid);
	u64 val, bytes, shift;
	u32 eventid;

	if (evt_type == QOS_MBM_LOCAL_EVENT_ID) {
		mbm_current = &mbm_local[vrmid];
		eventid = QOS_MBM_LOCAL_EVENT_ID;
	} else {
		mbm_current = &mbm_total[vrmid];
		eventid = QOS_MBM_TOTAL_EVENT_ID;
	}

	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
	rdmsrl(MSR_IA32_QM_CTR, val);
	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		return mbm_current->total_bytes;

	if (first) {
		mbm_current->prev_msr = val;
		mbm_current->total_bytes = 0;
		return mbm_current->total_bytes;
	}

	/*
	 * The h/w guarantees that counters will not overflow
	 * so long as we poll them at least once per second.
	 */
	shift = 64 - MBM_CNTR_WIDTH;
	bytes = (val << shift) - (mbm_current->prev_msr << shift);
	bytes >>= shift;
	bytes *= cqm_l3_scale;

	mbm_current->total_bytes += bytes;
	mbm_current->prev_msr = val;

	return mbm_current->total_bytes;
}

static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type)
{
	return update_sample(rmid, evt_type, 0);
}

static void __intel_mbm_event_init(void *info)
{
	struct rmid_read *rr = info;

	update_sample(rr->rmid, rr->evt_type, 1);
}

static void init_mbm_sample(u32 rmid, u32 evt_type)
{
	struct rmid_read rr = {
		.rmid = rmid,
		.evt_type = evt_type,
		.value = ATOMIC64_INIT(0),
	};

	/* on each socket, init sample */
	on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
}

/*
 * Find a group and setup RMID.
 *
 * If we're part of a group, we use the group's RMID.
 */
static void intel_cqm_setup_event(struct perf_event *event,
				  struct perf_event **group)
{
	struct perf_event *iter;
	bool conflict = false;
	u32 rmid;

	event->hw.is_group_event = false;
	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
		rmid = iter->hw.cqm_rmid;

		if (__match_event(iter, event)) {
			/* All tasks in a group share an RMID */
			event->hw.cqm_rmid = rmid;
			*group = iter;
			if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
				init_mbm_sample(rmid, event->attr.config);
			return;
		}

		/*
		 * We only care about conflicts for events that are
		 * actually scheduled in (and hence have a valid RMID).
		 */
		if (__conflict_event(iter, event) && __rmid_valid(rmid))
			conflict = true;
	}

	if (conflict)
		rmid = INVALID_RMID;
	else
		rmid = __get_rmid();

	if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
		init_mbm_sample(rmid, event->attr.config);

	event->hw.cqm_rmid = rmid;
}

static void intel_cqm_event_read(struct perf_event *event)
{
	unsigned long flags;
	u32 rmid;
	u64 val;

	/*
	 * Task events are handled by intel_cqm_event_count().
	 */
	if (event->cpu == -1)
		return;

	raw_spin_lock_irqsave(&cache_lock, flags);
	rmid = event->hw.cqm_rmid;

	if (!__rmid_valid(rmid))
		goto out;

	if (is_mbm_event(event->attr.config))
		val = rmid_read_mbm(rmid, event->attr.config);
	else
		val = __rmid_read(rmid);

	/*
	 * Ignore this reading on error states and do not update the value.
	 */
	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		goto out;

	local64_set(&event->count, val);
out:
	raw_spin_unlock_irqrestore(&cache_lock, flags);
}

static void __intel_cqm_event_count(void *info)
{
	struct rmid_read *rr = info;
	u64 val;

	val = __rmid_read(rr->rmid);

	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		return;

	atomic64_add(val, &rr->value);
}

static inline bool cqm_group_leader(struct perf_event *event)
{
	return !list_empty(&event->hw.cqm_groups_entry);
}

static void __intel_mbm_event_count(void *info)
{
	struct rmid_read *rr = info;
	u64 val;

	val = rmid_read_mbm(rr->rmid, rr->evt_type);
	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
		return;
	atomic64_add(val, &rr->value);
}
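
/*
 * Per-socket hrtimer callback: walk all cache groups (and their member
 * events) and refresh the MBM samples before the 24-bit counters can
 * wrap. Re-arms itself every MBM_CTR_OVERFLOW_TIME ms while any events
 * exist.
 */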
static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct perf_event *iter, *iter1;
	int ret = HRTIMER_RESTART;
	struct list_head *head;
	unsigned long flags;
	u32 grp_rmid;

	/*
	 * Need to hold the cache_lock as the timer's Event Select MSR reads
	 * can race with the mbm/cqm count() and mbm_init() reads.
	 */
	raw_spin_lock_irqsave(&cache_lock, flags);

	if (list_empty(&cache_groups)) {
		ret = HRTIMER_NORESTART;
		goto out;
	}

	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
		grp_rmid = iter->hw.cqm_rmid;
		if (!__rmid_valid(grp_rmid))
			continue;
		if (is_mbm_event(iter->attr.config))
			update_sample(grp_rmid, iter->attr.config, 0);

		head = &iter->hw.cqm_group_entry;
		if (list_empty(head))
			continue;
		list_for_each_entry(iter1, head, hw.cqm_group_entry) {
			if (!iter1->hw.is_group_event)
				break;
			if (is_mbm_event(iter1->attr.config))
				update_sample(iter1->hw.cqm_rmid,
					      iter1->attr.config, 0);
		}
	}

	hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME));
out:
	raw_spin_unlock_irqrestore(&cache_lock, flags);
	return ret;
}

static void __mbm_start_timer(void *info)
{
	hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME),
		      HRTIMER_MODE_REL_PINNED);
}

static void __mbm_stop_timer(void *info)
{
	hrtimer_cancel(&mbm_timers[pkg_id]);
}

static void mbm_start_timers(void)
{
	on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1);
}

static void mbm_stop_timers(void)
{
	on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1);
}

static void mbm_hrtimer_init(void)
{
	struct hrtimer *hr;
	int i;

	for (i = 0; i < mbm_socket_max; i++) {
		hr = &mbm_timers[i];
		hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		hr->function = mbm_hrtimer_handle;
	}
}

static u64 intel_cqm_event_count(struct perf_event *event)
{
	unsigned long flags;
	struct rmid_read rr = {
		.evt_type = event->attr.config,
		.value = ATOMIC64_INIT(0),
	};

	/*
	 * We only need to worry about task events. System-wide events
	 * are handled like usual, i.e. entirely with
	 * intel_cqm_event_read().
	 */
	if (event->cpu != -1)
		return __perf_event_count(event);

	/*
	 * Only the group leader gets to report values, except in the case
	 * of multiple events in the same group, where we still need to
	 * read the other events. This stops us reporting duplicate values
	 * to userspace, and gives us a clear rule for which task gets to
	 * report the values.
	 *
	 * Note that it is impossible to attribute these values to
	 * specific packages - we forfeit that ability when we create
	 * task events.
	 */
	if (!cqm_group_leader(event) && !event->hw.is_group_event)
		return 0;

	/*
	 * Getting up-to-date values requires an SMP IPI which is not
	 * possible if we're being called in interrupt context. Return
	 * the cached values instead.
	 */
	if (unlikely(in_interrupt()))
		goto out;

	/*
	 * Notice that we don't perform the reading of an RMID
	 * atomically, because we can't hold a spin lock across the
	 * IPIs.
	 *
	 * Speculatively perform the read, since @event might be
	 * assigned a different (possibly invalid) RMID while we're
	 * busy performing the IPI calls. It's therefore necessary to
	 * check @event's RMID afterwards, and if it has changed,
	 * discard the result of the read.
	 */
	rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);

	if (!__rmid_valid(rr.rmid))
		goto out;

	cqm_mask_call(&rr);

	raw_spin_lock_irqsave(&cache_lock, flags);
	if (event->hw.cqm_rmid == rr.rmid)
		local64_set(&event->count, atomic64_read(&rr.value));
	raw_spin_unlock_irqrestore(&cache_lock, flags);
out:
	return __perf_event_count(event);
}
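
/*
 * pmu::start - associate the current CPU with the event's RMID by
 * writing it to MSR_IA32_PQR_ASSOC. The per-CPU rmid_usecnt lets events
 * that share the same RMID start and stop without reprogramming the MSR.
 */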
static void intel_cqm_event_start(struct perf_event *event, int mode)
{
	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
	u32 rmid = event->hw.cqm_rmid;

	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
		return;

	event->hw.cqm_state &= ~PERF_HES_STOPPED;

	if (state->rmid_usecnt++) {
		if (!WARN_ON_ONCE(state->rmid != rmid))
			return;
	} else {
		WARN_ON_ONCE(state->rmid);
	}

	state->rmid = rmid;
	wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
}

static void intel_cqm_event_stop(struct perf_event *event, int mode)
{
	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);

	if (event->hw.cqm_state & PERF_HES_STOPPED)
		return;

	event->hw.cqm_state |= PERF_HES_STOPPED;

	intel_cqm_event_read(event);

	if (!--state->rmid_usecnt) {
		state->rmid = 0;
		wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
	} else {
		WARN_ON_ONCE(!state->rmid);
	}
}
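
/*
 * pmu::add - mark the event stopped and, if it already has a valid RMID
 * and PERF_EF_START is requested, start it immediately. Runs with
 * cache_lock held and interrupts disabled.
 */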
static int intel_cqm_event_add(struct perf_event *event, int mode)
{
	unsigned long flags;
	u32 rmid;

	raw_spin_lock_irqsave(&cache_lock, flags);

	event->hw.cqm_state = PERF_HES_STOPPED;
	rmid = event->hw.cqm_rmid;

	if (__rmid_valid(rmid) && (mode & PERF_EF_START))
		intel_cqm_event_start(event, mode);

	raw_spin_unlock_irqrestore(&cache_lock, flags);

	return 0;
}

static void intel_cqm_event_destroy(struct perf_event *event)
{
	struct perf_event *group_other = NULL;
	unsigned long flags;

	mutex_lock(&cache_mutex);
	/*
	 * Hold the cache_lock as mbm timer handlers could be
	 * scanning the list of events.
	 */
	raw_spin_lock_irqsave(&cache_lock, flags);

	/*
	 * If there's another event in this group...
	 */
	if (!list_empty(&event->hw.cqm_group_entry)) {
		group_other = list_first_entry(&event->hw.cqm_group_entry,
					       struct perf_event,
					       hw.cqm_group_entry);
		list_del(&event->hw.cqm_group_entry);
	}

	/*
	 * And we're the group leader..
	 */
	if (cqm_group_leader(event)) {
		/*
		 * If there was a group_other, make that leader, otherwise
		 * destroy the group and return the RMID.
		 */
		if (group_other) {
			list_replace(&event->hw.cqm_groups_entry,
				     &group_other->hw.cqm_groups_entry);
		} else {
			u32 rmid = event->hw.cqm_rmid;

			if (__rmid_valid(rmid))
				__put_rmid(rmid);
			list_del(&event->hw.cqm_groups_entry);
		}
	}

	raw_spin_unlock_irqrestore(&cache_lock, flags);

	/*
	 * Stop the mbm overflow timers when the last event is destroyed.
	 */
	if (mbm_enabled && list_empty(&cache_groups))
		mbm_stop_timers();

	mutex_unlock(&cache_mutex);
}
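
/*
 * pmu::event_init - validate the requested event, attach it to an
 * existing cache group (or create a new one), and kick the rotation
 * worker if no RMID could be assigned.
 */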
static int intel_cqm_event_init(struct perf_event *event)
{
	struct perf_event *group = NULL;
	bool rotate = false;
	unsigned long flags;

	if (event->attr.type != intel_cqm_pmu.type)
		return -ENOENT;

	if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) ||
	    (event->attr.config > QOS_MBM_LOCAL_EVENT_ID))
		return -EINVAL;

	if ((is_cqm_event(event->attr.config) && !cqm_enabled) ||
	    (is_mbm_event(event->attr.config) && !mbm_enabled))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.exclude_user   ||
	    event->attr.exclude_kernel ||
	    event->attr.exclude_hv     ||
	    event->attr.exclude_idle   ||
	    event->attr.exclude_host   ||
	    event->attr.exclude_guest  ||
	    event->attr.sample_period) /* no sampling */
		return -EINVAL;

	INIT_LIST_HEAD(&event->hw.cqm_group_entry);
	INIT_LIST_HEAD(&event->hw.cqm_groups_entry);

	event->destroy = intel_cqm_event_destroy;

	mutex_lock(&cache_mutex);

	/*
	 * Start the mbm overflow timers when the first event is created.
	 */
	if (mbm_enabled && list_empty(&cache_groups))
		mbm_start_timers();

	/* Will also set rmid */
	intel_cqm_setup_event(event, &group);

	/*
	 * Hold the cache_lock as mbm timer handlers could be
	 * scanning the list of events.
	 */
	raw_spin_lock_irqsave(&cache_lock, flags);

	if (group) {
		list_add_tail(&event->hw.cqm_group_entry,
			      &group->hw.cqm_group_entry);
	} else {
		list_add_tail(&event->hw.cqm_groups_entry,
			      &cache_groups);

		/*
		 * All RMIDs are either in use or have recently been
		 * used. Kick the rotation worker to clean/free some.
		 *
		 * We only do this for the group leader, rather than for
		 * every event in a group to save on needless work.
		 */
		if (!__rmid_valid(event->hw.cqm_rmid))
			rotate = true;
	}

	raw_spin_unlock_irqrestore(&cache_lock, flags);
	mutex_unlock(&cache_mutex);

	if (rotate)
		schedule_delayed_work(&intel_cqm_rmid_work, 0);

	return 0;
}

EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");

EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02");
EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1");
EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB");
EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6");

EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03");
EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1");
EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB");
EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6");

static struct attribute *intel_cqm_events_attr[] = {
	EVENT_PTR(intel_cqm_llc),
	EVENT_PTR(intel_cqm_llc_pkg),
	EVENT_PTR(intel_cqm_llc_unit),
	EVENT_PTR(intel_cqm_llc_scale),
	EVENT_PTR(intel_cqm_llc_snapshot),
	NULL,
};

static struct attribute *intel_mbm_events_attr[] = {
	EVENT_PTR(intel_cqm_total_bytes),
	EVENT_PTR(intel_cqm_local_bytes),
	EVENT_PTR(intel_cqm_total_bytes_pkg),
	EVENT_PTR(intel_cqm_local_bytes_pkg),
	EVENT_PTR(intel_cqm_total_bytes_unit),
	EVENT_PTR(intel_cqm_local_bytes_unit),
	EVENT_PTR(intel_cqm_total_bytes_scale),
	EVENT_PTR(intel_cqm_local_bytes_scale),
	NULL,
};

static struct attribute *intel_cmt_mbm_events_attr[] = {
	EVENT_PTR(intel_cqm_llc),
	EVENT_PTR(intel_cqm_total_bytes),
	EVENT_PTR(intel_cqm_local_bytes),
	EVENT_PTR(intel_cqm_llc_pkg),
	EVENT_PTR(intel_cqm_total_bytes_pkg),
	EVENT_PTR(intel_cqm_local_bytes_pkg),
	EVENT_PTR(intel_cqm_llc_unit),
	EVENT_PTR(intel_cqm_total_bytes_unit),
	EVENT_PTR(intel_cqm_local_bytes_unit),
	EVENT_PTR(intel_cqm_llc_scale),
	EVENT_PTR(intel_cqm_total_bytes_scale),
	EVENT_PTR(intel_cqm_local_bytes_scale),
	EVENT_PTR(intel_cqm_llc_snapshot),
	NULL,
};

static struct attribute_group intel_cqm_events_group = {
	.name = "events",
	.attrs = NULL,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *intel_cqm_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group intel_cqm_format_group = {
	.name = "format",
	.attrs = intel_cqm_formats_attr,
};

static ssize_t
max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
			   char *page)
{
	ssize_t rv;

	mutex_lock(&cache_mutex);
	rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
	mutex_unlock(&cache_mutex);

	return rv;
}

static ssize_t
max_recycle_threshold_store(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf, size_t count)
{
	unsigned int bytes, cachelines;
	int ret;

	ret = kstrtouint(buf, 0, &bytes);
	if (ret)
		return ret;

	mutex_lock(&cache_mutex);

	__intel_cqm_max_threshold = bytes;
	cachelines = bytes / cqm_l3_scale;

	/*
	 * The new maximum takes effect immediately.
	 */
	if (__intel_cqm_threshold > cachelines)
		__intel_cqm_threshold = cachelines;

	mutex_unlock(&cache_mutex);

	return count;
}

static DEVICE_ATTR_RW(max_recycle_threshold);

static struct attribute *intel_cqm_attrs[] = {
	&dev_attr_max_recycle_threshold.attr,
	NULL,
};

static const struct attribute_group intel_cqm_group = {
	.attrs = intel_cqm_attrs,
};

static const struct attribute_group *intel_cqm_attr_groups[] = {
	&intel_cqm_events_group,
	&intel_cqm_format_group,
	&intel_cqm_group,
	NULL,
};

static struct pmu intel_cqm_pmu = {
	.hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
	.attr_groups	     = intel_cqm_attr_groups,
	.task_ctx_nr	     = perf_sw_context,
	.event_init	     = intel_cqm_event_init,
	.add		     = intel_cqm_event_add,
	.del		     = intel_cqm_event_stop,
	.start		     = intel_cqm_event_start,
	.stop		     = intel_cqm_event_stop,
	.read		     = intel_cqm_event_read,
	.count		     = intel_cqm_event_count,
};

static inline void cqm_pick_event_reader(int cpu)
{
	int reader;

	/* First online cpu in package becomes the reader */
	reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu));
	if (reader >= nr_cpu_ids)
		cpumask_set_cpu(cpu, &cqm_cpumask);
}

static int intel_cqm_cpu_starting(unsigned int cpu)
{
	struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
	struct cpuinfo_x86 *c = &cpu_data(cpu);

	state->rmid = 0;
	state->closid = 0;
	state->rmid_usecnt = 0;

	WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
	WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);

	cqm_pick_event_reader(cpu);
	return 0;
}

static int intel_cqm_cpu_exit(unsigned int cpu)
{
	int target;

	/* Is @cpu the current cqm reader for this package ? */
	if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
		return 0;

	/* Find another online reader in this package */
	target = cpumask_any_but(topology_core_cpumask(cpu), cpu);

	if (target < nr_cpu_ids)
		cpumask_set_cpu(target, &cqm_cpumask);

	return 0;
}

static const struct x86_cpu_id intel_cqm_match[] = {
	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
	{}
};

static void mbm_cleanup(void)
{
	if (!mbm_enabled)
		return;

	kfree(mbm_local);
	kfree(mbm_total);
	mbm_enabled = false;
}

static const struct x86_cpu_id intel_mbm_local_match[] = {
	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
	{}
};

static const struct x86_cpu_id intel_mbm_total_match[] = {
	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL },
	{}
};
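
/*
 * Allocate the per-socket MBM sample arrays (one local and one total
 * sample per RMID per socket) and the per-socket overflow hrtimers.
 */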
static int intel_mbm_init(void)
{
	int ret = 0, array_size, maxid = cqm_max_rmid + 1;

	mbm_socket_max = topology_max_packages();
	array_size = sizeof(struct sample) * maxid * mbm_socket_max;
	mbm_local = kmalloc(array_size, GFP_KERNEL);
	if (!mbm_local)
		return -ENOMEM;

	mbm_total = kmalloc(array_size, GFP_KERNEL);
	if (!mbm_total) {
		ret = -ENOMEM;
		goto out;
	}

	array_size = sizeof(struct hrtimer) * mbm_socket_max;
	mbm_timers = kmalloc(array_size, GFP_KERNEL);
	if (!mbm_timers) {
		ret = -ENOMEM;
		goto out;
	}
	mbm_hrtimer_init();

out:
	if (ret)
		mbm_cleanup();

	return ret;
}
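
/*
 * Probe for CQM/MBM support, size the RMID space to the minimum
 * advertised by any online CPU, set up the RMID cache and MBM state,
 * then register the PMU and the CPU hotplug callbacks.
 */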
static int __init intel_cqm_init(void)
{
	char *str = NULL, scale[20];
	int cpu, ret;

	if (x86_match_cpu(intel_cqm_match))
		cqm_enabled = true;

	if (x86_match_cpu(intel_mbm_local_match) &&
	    x86_match_cpu(intel_mbm_total_match))
		mbm_enabled = true;

	if (!cqm_enabled && !mbm_enabled)
		return -ENODEV;

	cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;

	/*
	 * It's possible that not all resources support the same number
	 * of RMIDs. Instead of making scheduling much more complicated
	 * (where we have to match a task's RMID to a cpu that supports
	 * that many RMIDs) just find the minimum RMIDs supported across
	 * all cpus.
	 *
	 * Also, check that the scales match on all cpus.
	 */
	get_online_cpus();
	for_each_online_cpu(cpu) {
		struct cpuinfo_x86 *c = &cpu_data(cpu);

		if (c->x86_cache_max_rmid < cqm_max_rmid)
			cqm_max_rmid = c->x86_cache_max_rmid;

		if (c->x86_cache_occ_scale != cqm_l3_scale) {
			pr_err("Multiple LLC scale values, disabling\n");
			ret = -EINVAL;
			goto out;
		}
	}

	/*
	 * A reasonable upper limit on the max threshold is the number
	 * of lines tagged per RMID if all RMIDs have the same number of
	 * lines tagged in the LLC.
	 *
	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
	 */
	__intel_cqm_max_threshold =
		boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);

	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
	str = kstrdup(scale, GFP_KERNEL);
	if (!str) {
		ret = -ENOMEM;
		goto out;
	}

	event_attr_intel_cqm_llc_scale.event_str = str;

	ret = intel_cqm_setup_rmid_cache();
	if (ret)
		goto out;

	if (mbm_enabled)
		ret = intel_mbm_init();
	if (ret && !cqm_enabled)
		goto out;

	if (cqm_enabled && mbm_enabled)
		intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr;
	else if (!cqm_enabled && mbm_enabled)
		intel_cqm_events_group.attrs = intel_mbm_events_attr;
	else if (cqm_enabled && !mbm_enabled)
		intel_cqm_events_group.attrs = intel_cqm_events_attr;

	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
	if (ret) {
		pr_err("Intel CQM perf registration failed: %d\n", ret);
		goto out;
	}

	if (cqm_enabled)
		pr_info("Intel CQM monitoring enabled\n");
	if (mbm_enabled)
		pr_info("Intel MBM enabled\n");

	/*
	 * Setup the hot cpu notifier once we are sure cqm
	 * is enabled to avoid notifier leak.
	 */
	cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_STARTING,
			  "perf/x86/cqm:starting",
			  intel_cqm_cpu_starting, NULL);
	cpuhp_setup_state(CPUHP_AP_PERF_X86_CQM_ONLINE, "perf/x86/cqm:online",
			  NULL, intel_cqm_cpu_exit);

out:
	put_online_cpus();

	if (ret) {
		kfree(str);
		cqm_cleanup();
		mbm_cleanup();
	}

	return ret;
}

device_initcall(intel_cqm_init);