beignet-1.3.2-in-order-queue.patch 15 KB

  1. Description: Make in-order command queues actually be in-order
  2. When beignet added out-of-order execution support (7fd45f15),
  3. it made *all* queues use it, even ones that are nominally in-order.
  4. While using out-of-order queues is probably a good idea when possible
  5. (for performance), the OpenCL spec does not allow it to be the default.
  6. Author: Rebecca N. Palmer <rebecca_palmer@zoho.com>
  7. Forwarded: https://lists.freedesktop.org/archives/beignet/2018-July/009213.html
  8. --- a/src/cl_api.c
  9. +++ b/src/cl_api.c
  10. @@ -276,7 +276,7 @@ clEnqueueSVMFree (cl_command_queue command_queue,
  11. data->size = num_svm_pointers;
  12. data->ptr = user_data;
  13. - if (e_status == CL_COMPLETE) {
  14. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  15. // Sync mode, no need to queue event.
  16. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  17. if (err != CL_SUCCESS) {
  18. @@ -422,7 +422,7 @@ cl_int clEnqueueSVMMemcpy (cl_command_queue command_queue,
  19. data->const_ptr = src_ptr;
  20. data->size = size;
  21. - if (e_status == CL_COMPLETE) {
  22. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  23. // Sync mode, no need to queue event.
  24. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  25. if (err != CL_SUCCESS) {
  26. @@ -434,6 +434,9 @@ cl_int clEnqueueSVMMemcpy (cl_command_queue command_queue,
  27. break;
  28. }
  29. cl_command_queue_enqueue_event(command_queue, e);
  30. + if (blocking_copy) {
  31. + cl_event_wait_for_events_list(1, &e);
  32. + }
  33. }
  34. } while(0);
  35. @@ -511,7 +514,7 @@ cl_int clEnqueueSVMMemFill (cl_command_queue command_queue,
  36. data->pattern_size = pattern_size;
  37. data->size = size;
  38. - if (e_status == CL_COMPLETE) {
  39. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  40. // Sync mode, no need to queue event.
  41. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  42. if (err != CL_SUCCESS) {
  43. --- a/src/cl_api_kernel.c
  44. +++ b/src/cl_api_kernel.c
  45. @@ -223,6 +223,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
  46. count *= global_wk_sz_rem[2] ? 2 : 1;
  47. const size_t *global_wk_all[2] = {global_wk_sz_div, global_wk_sz_rem};
  48. + cl_bool allow_immediate_submit = cl_command_queue_allow_bypass_submit(command_queue);
  49. /* Go through the at most 8 cases and euque if there is work items left */
  50. for (i = 0; i < 2; i++) {
  51. for (j = 0; j < 2; j++) {
  52. @@ -263,7 +264,7 @@ clEnqueueNDRangeKernel(cl_command_queue command_queue,
  53. break;
  54. }
  55. - err = cl_event_exec(e, (event_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED), CL_FALSE);
  56. + err = cl_event_exec(e, ((allow_immediate_submit && event_status == CL_COMPLETE) ? CL_SUBMITTED : CL_QUEUED), CL_FALSE);
  57. if (err != CL_SUCCESS) {
  58. break;
  59. }
  60. --- a/src/cl_api_mem.c
  61. +++ b/src/cl_api_mem.c
  62. @@ -308,7 +308,7 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
  63. if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
  64. data->write_map = 1;
  65. - if (e_status == CL_COMPLETE) {
  66. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  67. // Sync mode, no need to queue event.
  68. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  69. if (err != CL_SUCCESS) {
  70. @@ -321,6 +321,9 @@ clEnqueueMapBuffer(cl_command_queue command_queue,
  71. }
  72. cl_command_queue_enqueue_event(command_queue, e);
  73. + if (blocking_map) {
  74. + cl_event_wait_for_events_list(1, &e);
  75. + }
  76. }
  77. ptr = data->ptr;
  78. @@ -393,7 +396,7 @@ clEnqueueUnmapMemObject(cl_command_queue command_queue,
  79. data->mem_obj = memobj;
  80. data->ptr = mapped_ptr;
  81. - if (e_status == CL_COMPLETE) { // No need to wait
  82. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) { // No need to wait
  83. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  84. if (err != CL_SUCCESS) {
  85. break;
  86. @@ -495,7 +498,7 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
  87. data->offset = offset;
  88. data->size = size;
  89. - if (e_status == CL_COMPLETE) {
  90. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  91. // Sync mode, no need to queue event.
  92. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  93. if (err != CL_SUCCESS) {
  94. @@ -507,6 +510,9 @@ clEnqueueReadBuffer(cl_command_queue command_queue,
  95. break;
  96. }
  97. cl_command_queue_enqueue_event(command_queue, e);
  98. + if (blocking_read) {
  99. + cl_event_wait_for_events_list(1, &e);
  100. + }
  101. }
  102. } while (0);
  103. @@ -598,7 +604,7 @@ clEnqueueWriteBuffer(cl_command_queue command_queue,
  104. data->offset = offset;
  105. data->size = size;
  106. - if (e_status == CL_COMPLETE) {
  107. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  108. // Sync mode, no need to queue event.
  109. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  110. if (err != CL_SUCCESS) {
  111. @@ -610,6 +616,9 @@ clEnqueueWriteBuffer(cl_command_queue command_queue,
  112. break;
  113. }
  114. cl_command_queue_enqueue_event(command_queue, e);
  115. + if (blocking_write) {
  116. + cl_event_wait_for_events_list(1, &e);
  117. + }
  118. }
  119. } while (0);
  120. @@ -747,7 +756,7 @@ clEnqueueReadBufferRect(cl_command_queue command_queue,
  121. data->host_row_pitch = host_row_pitch;
  122. data->host_slice_pitch = host_slice_pitch;
  123. - if (e_status == CL_COMPLETE) {
  124. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  125. // Sync mode, no need to queue event.
  126. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  127. if (err != CL_SUCCESS) {
  128. @@ -759,6 +768,9 @@ clEnqueueReadBufferRect(cl_command_queue command_queue,
  129. break;
  130. }
  131. cl_command_queue_enqueue_event(command_queue, e);
  132. + if (blocking_read) {
  133. + cl_event_wait_for_events_list(1, &e);
  134. + }
  135. }
  136. } while (0);
  137. @@ -898,7 +910,7 @@ clEnqueueWriteBufferRect(cl_command_queue command_queue,
  138. data->host_row_pitch = host_row_pitch;
  139. data->host_slice_pitch = host_slice_pitch;
  140. - if (e_status == CL_COMPLETE) {
  141. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  142. // Sync mode, no need to queue event.
  143. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  144. if (err != CL_SUCCESS) {
  145. @@ -910,6 +922,9 @@ clEnqueueWriteBufferRect(cl_command_queue command_queue,
  146. break;
  147. }
  148. cl_command_queue_enqueue_event(command_queue, e);
  149. + if (blocking_write) {
  150. + cl_event_wait_for_events_list(1, &e);
  151. + }
  152. }
  153. } while (0);
  154. @@ -1017,7 +1032,7 @@ clEnqueueCopyBuffer(cl_command_queue command_queue,
  155. break;
  156. }
  157. - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  158. + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  159. if (err != CL_SUCCESS) {
  160. break;
  161. }
  162. @@ -1207,7 +1222,7 @@ clEnqueueCopyBufferRect(cl_command_queue command_queue,
  163. if (e_status < CL_COMPLETE) { // Error happend, cancel.
  164. err = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
  165. break;
  166. - } else if (e_status == CL_COMPLETE) {
  167. + } else if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  168. err = cl_event_exec(e, CL_SUBMITTED, CL_FALSE);
  169. if (err != CL_SUCCESS) {
  170. break;
  171. @@ -1308,7 +1323,7 @@ clEnqueueFillBuffer(cl_command_queue command_queue,
  172. break;
  173. }
  174. - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  175. + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  176. if (err != CL_SUCCESS) {
  177. break;
  178. }
  179. @@ -1395,7 +1410,7 @@ clEnqueueMigrateMemObjects(cl_command_queue command_queue,
  180. break;
  181. }
  182. - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  183. + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  184. if (err != CL_SUCCESS) {
  185. break;
  186. }
  187. @@ -1574,7 +1589,7 @@ clEnqueueMapImage(cl_command_queue command_queue,
  188. if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
  189. data->write_map = 1;
  190. - if (e_status == CL_COMPLETE) {
  191. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  192. // Sync mode, no need to queue event.
  193. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  194. if (err != CL_SUCCESS) {
  195. @@ -1587,6 +1602,9 @@ clEnqueueMapImage(cl_command_queue command_queue,
  196. }
  197. cl_command_queue_enqueue_event(command_queue, e);
  198. + if (blocking_map) {
  199. + cl_event_wait_for_events_list(1, &e);
  200. + }
  201. }
  202. ptr = data->ptr;
  203. @@ -1764,7 +1782,7 @@ clEnqueueReadImage(cl_command_queue command_queue,
  204. data->row_pitch = row_pitch;
  205. data->slice_pitch = slice_pitch;
  206. - if (e_status == CL_COMPLETE) {
  207. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  208. // Sync mode, no need to queue event.
  209. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  210. if (err != CL_SUCCESS) {
  211. @@ -1776,6 +1794,9 @@ clEnqueueReadImage(cl_command_queue command_queue,
  212. break;
  213. }
  214. cl_command_queue_enqueue_event(command_queue, e);
  215. + if (blocking_read) {
  216. + cl_event_wait_for_events_list(1, &e);
  217. + }
  218. }
  219. } while (0);
  220. @@ -1916,7 +1937,7 @@ clEnqueueWriteImage(cl_command_queue command_queue,
  221. data->row_pitch = row_pitch;
  222. data->slice_pitch = slice_pitch;
  223. - if (e_status == CL_COMPLETE) {
  224. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  225. // Sync mode, no need to queue event.
  226. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  227. if (err != CL_SUCCESS) {
  228. @@ -1928,6 +1949,9 @@ clEnqueueWriteImage(cl_command_queue command_queue,
  229. break;
  230. }
  231. cl_command_queue_enqueue_event(command_queue, e);
  232. + if (blocking_write) {
  233. + cl_event_wait_for_events_list(1, &e);
  234. + }
  235. }
  236. } while (0);
  237. @@ -2062,7 +2086,7 @@ clEnqueueCopyImage(cl_command_queue command_queue,
  238. break;
  239. }
  240. - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  241. + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  242. if (err != CL_SUCCESS) {
  243. break;
  244. }
  245. @@ -2173,7 +2197,7 @@ clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
  246. break;
  247. }
  248. - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  249. + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  250. if (err != CL_SUCCESS) {
  251. break;
  252. }
  253. @@ -2285,7 +2309,7 @@ clEnqueueCopyBufferToImage(cl_command_queue command_queue,
  254. break;
  255. }
  256. - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  257. + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  258. if (err != CL_SUCCESS) {
  259. break;
  260. }
  261. @@ -2395,7 +2419,7 @@ clEnqueueFillImage(cl_command_queue command_queue,
  262. break;
  263. }
  264. - err = cl_event_exec(e, e_status == CL_COMPLETE ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  265. + err = cl_event_exec(e, (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) ? CL_SUBMITTED : CL_QUEUED, CL_FALSE);
  266. if (err != CL_SUCCESS) {
  267. break;
  268. }
  269. --- a/src/cl_command_queue.h
  270. +++ b/src/cl_command_queue.h
  271. @@ -103,6 +103,11 @@ extern cl_int cl_command_queue_wait_finish(cl_command_queue queue);
  272. extern cl_int cl_command_queue_wait_flush(cl_command_queue queue);
  273. /* Note: Must call this function with queue's lock. */
  274. extern cl_event *cl_command_queue_record_in_queue_events(cl_command_queue queue, cl_uint *list_num);
  275. +/* Whether it is valid to call cl_event_exec directly, instead of cl_command_queue_enqueue_event */
  276. +static inline cl_bool cl_command_queue_allow_bypass_submit(cl_command_queue queue){
  277. + return (queue->props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)/* if out-of-order, always */
  278. + || list_empty(&queue->worker.enqueued_events);/* if in-order, only if empty */
  279. +}
  280. #endif /* __CL_COMMAND_QUEUE_H__ */
  281. --- a/src/cl_command_queue_enqueue.c
  282. +++ b/src/cl_command_queue_enqueue.c
  283. @@ -65,6 +65,8 @@ worker_thread_function(void *Arg)
  284. if (cl_event_is_ready(e) <= CL_COMPLETE) {
  285. list_node_del(&e->enqueue_node);
  286. list_add_tail(&ready_list, &e->enqueue_node);
  287. + } else if(!(queue->props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)){
  288. + break; /* in in-order mode, can't skip over non-ready events */
  289. }
  290. }
  291. @@ -80,18 +82,20 @@ worker_thread_function(void *Arg)
  292. CL_OBJECT_UNLOCK(queue);
  293. /* Do the really job without lock.*/
  294. - exec_status = CL_SUBMITTED;
  295. - list_for_each_safe(pos, n, &ready_list)
  296. - {
  297. - e = list_entry(pos, _cl_event, enqueue_node);
  298. - cl_event_exec(e, exec_status, CL_FALSE);
  299. - }
  300. + if (queue->props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) { /* out-of-order mode: submit every ready event at once; in-order queues skip this so each event reaches CL_COMPLETE before the next one starts */
  301. + exec_status = CL_SUBMITTED;
  302. + list_for_each_safe(pos, n, &ready_list)
  303. + {
  304. + e = list_entry(pos, _cl_event, enqueue_node);
  305. + cl_event_exec(e, exec_status, CL_FALSE);
  306. + }
  307. - /* Notify all waiting for flush. */
  308. - CL_OBJECT_LOCK(queue);
  309. - worker->in_exec_status = CL_SUBMITTED;
  310. - CL_OBJECT_NOTIFY_COND(queue);
  311. - CL_OBJECT_UNLOCK(queue);
  312. + /* Notify all waiting for flush. */
  313. + CL_OBJECT_LOCK(queue);
  314. + worker->in_exec_status = CL_SUBMITTED;
  315. + CL_OBJECT_NOTIFY_COND(queue);
  316. + CL_OBJECT_UNLOCK(queue);
  317. + }
  318. list_for_each_safe(pos, n, &ready_list)
  319. {
  320. --- a/src/cl_gl_api.c
  321. +++ b/src/cl_gl_api.c
  322. @@ -188,7 +188,7 @@ cl_int clEnqueueAcquireGLObjects (cl_command_queue command_queue,
  323. data = &e->exec_data;
  324. data->type = EnqueueReturnSuccesss;
  325. - if (e_status == CL_COMPLETE) {
  326. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  327. // Sync mode, no need to queue event.
  328. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  329. if (err != CL_SUCCESS) {
  330. @@ -274,7 +274,7 @@ cl_int clEnqueueReleaseGLObjects (cl_command_queue command_queue,
  331. data = &e->exec_data;
  332. data->type = EnqueueReturnSuccesss;
  333. - if (e_status == CL_COMPLETE) {
  334. + if (cl_command_queue_allow_bypass_submit(command_queue) && (e_status == CL_COMPLETE)) {
  335. // Sync mode, no need to queue event.
  336. err = cl_event_exec(e, CL_COMPLETE, CL_FALSE);
  337. if (err != CL_SUCCESS) {