trace_event_perf.c 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342
  1. /*
  2. * trace event based perf event profiling/tracing
  3. *
  4. * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
  5. * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
  6. */
  7. #include <linux/module.h>
  8. #include <linux/kprobes.h>
  9. #include "trace.h"
  10. static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];
  11. /*
  12. * Force it to be aligned to unsigned long to avoid misaligned accesses
  13. * suprises
  14. */
  15. typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
  16. perf_trace_t;
  17. /* Count the events in use (per event id, not per instance) */
  18. static int total_ref_count;
  19. static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
  20. struct perf_event *p_event)
  21. {
  22. /* The ftrace function trace is allowed only for root. */
  23. if (ftrace_event_is_function(tp_event) &&
  24. perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
  25. return -EPERM;
  26. /* No tracing, just counting, so no obvious leak */
  27. if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
  28. return 0;
  29. /* Some events are ok to be traced by non-root users... */
  30. if (p_event->attach_state == PERF_ATTACH_TASK) {
  31. if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
  32. return 0;
  33. }
  34. /*
  35. * ...otherwise raw tracepoint data can be a severe data leak,
  36. * only allow root to have these.
  37. */
  38. if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
  39. return -EPERM;
  40. return 0;
  41. }
  42. static int perf_trace_event_reg(struct ftrace_event_call *tp_event,
  43. struct perf_event *p_event)
  44. {
  45. struct hlist_head __percpu *list;
  46. int ret = -ENOMEM;
  47. int cpu;
  48. p_event->tp_event = tp_event;
  49. if (tp_event->perf_refcount++ > 0)
  50. return 0;
  51. list = alloc_percpu(struct hlist_head);
  52. if (!list)
  53. goto fail;
  54. for_each_possible_cpu(cpu)
  55. INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
  56. tp_event->perf_events = list;
  57. if (!total_ref_count) {
  58. char __percpu *buf;
  59. int i;
  60. for (i = 0; i < PERF_NR_CONTEXTS; i++) {
  61. buf = (char __percpu *)alloc_percpu(perf_trace_t);
  62. if (!buf)
  63. goto fail;
  64. perf_trace_buf[i] = buf;
  65. }
  66. }
  67. ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
  68. if (ret)
  69. goto fail;
  70. total_ref_count++;
  71. return 0;
  72. fail:
  73. if (!total_ref_count) {
  74. int i;
  75. for (i = 0; i < PERF_NR_CONTEXTS; i++) {
  76. free_percpu(perf_trace_buf[i]);
  77. perf_trace_buf[i] = NULL;
  78. }
  79. }
  80. if (!--tp_event->perf_refcount) {
  81. free_percpu(tp_event->perf_events);
  82. tp_event->perf_events = NULL;
  83. }
  84. return ret;
  85. }
  86. static void perf_trace_event_unreg(struct perf_event *p_event)
  87. {
  88. struct ftrace_event_call *tp_event = p_event->tp_event;
  89. int i;
  90. if (--tp_event->perf_refcount > 0)
  91. goto out;
  92. tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);
  93. /*
  94. * Ensure our callback won't be called anymore. The buffers
  95. * will be freed after that.
  96. */
  97. tracepoint_synchronize_unregister();
  98. free_percpu(tp_event->perf_events);
  99. tp_event->perf_events = NULL;
  100. if (!--total_ref_count) {
  101. for (i = 0; i < PERF_NR_CONTEXTS; i++) {
  102. free_percpu(perf_trace_buf[i]);
  103. perf_trace_buf[i] = NULL;
  104. }
  105. }
  106. out:
  107. module_put(tp_event->mod);
  108. }
  109. static int perf_trace_event_open(struct perf_event *p_event)
  110. {
  111. struct ftrace_event_call *tp_event = p_event->tp_event;
  112. return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
  113. }
  114. static void perf_trace_event_close(struct perf_event *p_event)
  115. {
  116. struct ftrace_event_call *tp_event = p_event->tp_event;
  117. tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
  118. }
  119. static int perf_trace_event_init(struct ftrace_event_call *tp_event,
  120. struct perf_event *p_event)
  121. {
  122. int ret;
  123. ret = perf_trace_event_perm(tp_event, p_event);
  124. if (ret)
  125. return ret;
  126. ret = perf_trace_event_reg(tp_event, p_event);
  127. if (ret)
  128. return ret;
  129. ret = perf_trace_event_open(p_event);
  130. if (ret) {
  131. perf_trace_event_unreg(p_event);
  132. return ret;
  133. }
  134. return 0;
  135. }
  136. int perf_trace_init(struct perf_event *p_event)
  137. {
  138. struct ftrace_event_call *tp_event;
  139. int event_id = p_event->attr.config;
  140. int ret = -EINVAL;
  141. mutex_lock(&event_mutex);
  142. list_for_each_entry(tp_event, &ftrace_events, list) {
  143. if (tp_event->event.type == event_id &&
  144. tp_event->class && tp_event->class->reg &&
  145. try_module_get(tp_event->mod)) {
  146. ret = perf_trace_event_init(tp_event, p_event);
  147. if (ret)
  148. module_put(tp_event->mod);
  149. break;
  150. }
  151. }
  152. mutex_unlock(&event_mutex);
  153. return ret;
  154. }
  155. void perf_trace_destroy(struct perf_event *p_event)
  156. {
  157. mutex_lock(&event_mutex);
  158. perf_trace_event_close(p_event);
  159. perf_trace_event_unreg(p_event);
  160. mutex_unlock(&event_mutex);
  161. }
  162. int perf_trace_add(struct perf_event *p_event, int flags)
  163. {
  164. struct ftrace_event_call *tp_event = p_event->tp_event;
  165. struct hlist_head __percpu *pcpu_list;
  166. struct hlist_head *list;
  167. pcpu_list = tp_event->perf_events;
  168. if (WARN_ON_ONCE(!pcpu_list))
  169. return -EINVAL;
  170. if (!(flags & PERF_EF_START))
  171. p_event->hw.state = PERF_HES_STOPPED;
  172. list = this_cpu_ptr(pcpu_list);
  173. hlist_add_head_rcu(&p_event->hlist_entry, list);
  174. return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
  175. }
  176. void perf_trace_del(struct perf_event *p_event, int flags)
  177. {
  178. struct ftrace_event_call *tp_event = p_event->tp_event;
  179. if (!hlist_unhashed(&p_event->hlist_entry))
  180. hlist_del_rcu(&p_event->hlist_entry);
  181. tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
  182. }
  183. __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
  184. struct pt_regs *regs, int *rctxp)
  185. {
  186. struct trace_entry *entry;
  187. unsigned long flags;
  188. char *raw_data;
  189. int pc;
  190. BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
  191. pc = preempt_count();
  192. *rctxp = perf_swevent_get_recursion_context();
  193. if (*rctxp < 0)
  194. return NULL;
  195. raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
  196. /* zero the dead bytes from align to not leak stack to user */
  197. memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
  198. entry = (struct trace_entry *)raw_data;
  199. local_save_flags(flags);
  200. tracing_generic_entry_update(entry, flags, pc);
  201. entry->type = type;
  202. return raw_data;
  203. }
  204. EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
  205. #ifdef CONFIG_FUNCTION_TRACER
  206. static void
  207. perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip)
  208. {
  209. struct ftrace_entry *entry;
  210. struct hlist_head *head;
  211. struct pt_regs regs;
  212. int rctx;
  213. #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
  214. sizeof(u64)) - sizeof(u32))
  215. BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);
  216. perf_fetch_caller_regs(&regs);
  217. entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx);
  218. if (!entry)
  219. return;
  220. entry->ip = ip;
  221. entry->parent_ip = parent_ip;
  222. head = this_cpu_ptr(event_function.perf_events);
  223. perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0,
  224. 1, &regs, head);
  225. #undef ENTRY_SIZE
  226. }
  227. static int perf_ftrace_function_register(struct perf_event *event)
  228. {
  229. struct ftrace_ops *ops = &event->ftrace_ops;
  230. ops->flags |= FTRACE_OPS_FL_CONTROL;
  231. ops->func = perf_ftrace_function_call;
  232. return register_ftrace_function(ops);
  233. }
  234. static int perf_ftrace_function_unregister(struct perf_event *event)
  235. {
  236. struct ftrace_ops *ops = &event->ftrace_ops;
  237. int ret = unregister_ftrace_function(ops);
  238. ftrace_free_filter(ops);
  239. return ret;
  240. }
  241. static void perf_ftrace_function_enable(struct perf_event *event)
  242. {
  243. ftrace_function_local_enable(&event->ftrace_ops);
  244. }
  245. static void perf_ftrace_function_disable(struct perf_event *event)
  246. {
  247. ftrace_function_local_disable(&event->ftrace_ops);
  248. }
  249. int perf_ftrace_event_register(struct ftrace_event_call *call,
  250. enum trace_reg type, void *data)
  251. {
  252. switch (type) {
  253. case TRACE_REG_REGISTER:
  254. case TRACE_REG_UNREGISTER:
  255. break;
  256. case TRACE_REG_PERF_REGISTER:
  257. case TRACE_REG_PERF_UNREGISTER:
  258. return 0;
  259. case TRACE_REG_PERF_OPEN:
  260. return perf_ftrace_function_register(data);
  261. case TRACE_REG_PERF_CLOSE:
  262. return perf_ftrace_function_unregister(data);
  263. case TRACE_REG_PERF_ADD:
  264. perf_ftrace_function_enable(data);
  265. return 0;
  266. case TRACE_REG_PERF_DEL:
  267. perf_ftrace_function_disable(data);
  268. return 0;
  269. }
  270. return -EINVAL;
  271. }
  272. #endif /* CONFIG_FUNCTION_TRACER */