parfor.c 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. // Copyright 2012 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. // Parallel for algorithm.
  5. #include "runtime.h"
  6. #include "arch.h"
  7. struct ParForThread
  8. {
  9. // the thread's iteration space [32lsb, 32msb)
  10. uint64 pos;
  11. // stats
  12. uint64 nsteal;
  13. uint64 nstealcnt;
  14. uint64 nprocyield;
  15. uint64 nosyield;
  16. uint64 nsleep;
  17. byte pad[CacheLineSize];
  18. };
  19. ParFor*
  20. runtime_parforalloc(uint32 nthrmax)
  21. {
  22. ParFor *desc;
  23. // The ParFor object is followed by CacheLineSize padding
  24. // and then nthrmax ParForThread.
  25. desc = (ParFor*)runtime_malloc(sizeof(ParFor) + CacheLineSize + nthrmax * sizeof(ParForThread));
  26. desc->thr = (ParForThread*)((byte*)(desc+1) + CacheLineSize);
  27. desc->nthrmax = nthrmax;
  28. return desc;
  29. }
  30. void
  31. runtime_parforsetup(ParFor *desc, uint32 nthr, uint32 n, void *ctx, bool wait, void (*body)(ParFor*, uint32))
  32. {
  33. uint32 i, begin, end;
  34. uint64 *pos;
  35. if(desc == nil || nthr == 0 || nthr > desc->nthrmax || body == nil) {
  36. runtime_printf("desc=%p nthr=%d count=%d body=%p\n", desc, nthr, n, body);
  37. runtime_throw("parfor: invalid args");
  38. }
  39. desc->body = body;
  40. desc->done = 0;
  41. desc->nthr = nthr;
  42. desc->thrseq = 0;
  43. desc->cnt = n;
  44. desc->ctx = ctx;
  45. desc->wait = wait;
  46. desc->nsteal = 0;
  47. desc->nstealcnt = 0;
  48. desc->nprocyield = 0;
  49. desc->nosyield = 0;
  50. desc->nsleep = 0;
  51. for(i=0; i<nthr; i++) {
  52. begin = (uint64)n*i / nthr;
  53. end = (uint64)n*(i+1) / nthr;
  54. pos = &desc->thr[i].pos;
  55. if(((uintptr)pos & 7) != 0)
  56. runtime_throw("parforsetup: pos is not aligned");
  57. *pos = (uint64)begin | (((uint64)end)<<32);
  58. }
  59. }
  60. void
  61. runtime_parfordo(ParFor *desc)
  62. {
  63. ParForThread *me;
  64. uint32 tid, begin, end, begin2, try, victim, i;
  65. uint64 *mypos, *victimpos, pos, newpos;
  66. void (*body)(ParFor*, uint32);
  67. bool idle;
  68. // Obtain 0-based thread index.
  69. tid = runtime_xadd(&desc->thrseq, 1) - 1;
  70. if(tid >= desc->nthr) {
  71. runtime_printf("tid=%d nthr=%d\n", tid, desc->nthr);
  72. runtime_throw("parfor: invalid tid");
  73. }
  74. // If single-threaded, just execute the for serially.
  75. if(desc->nthr==1) {
  76. for(i=0; i<desc->cnt; i++)
  77. desc->body(desc, i);
  78. return;
  79. }
  80. body = desc->body;
  81. me = &desc->thr[tid];
  82. mypos = &me->pos;
  83. for(;;) {
  84. for(;;) {
  85. // While there is local work,
  86. // bump low index and execute the iteration.
  87. pos = runtime_xadd64(mypos, 1);
  88. begin = (uint32)pos-1;
  89. end = (uint32)(pos>>32);
  90. if(begin < end) {
  91. body(desc, begin);
  92. continue;
  93. }
  94. break;
  95. }
  96. // Out of work, need to steal something.
  97. idle = false;
  98. for(try=0;; try++) {
  99. // If we don't see any work for long enough,
  100. // increment the done counter...
  101. if(try > desc->nthr*4 && !idle) {
  102. idle = true;
  103. runtime_xadd(&desc->done, 1);
  104. }
  105. // ...if all threads have incremented the counter,
  106. // we are done.
  107. if(desc->done + !idle == desc->nthr) {
  108. if(!idle)
  109. runtime_xadd(&desc->done, 1);
  110. goto exit;
  111. }
  112. // Choose a random victim for stealing.
  113. victim = runtime_fastrand1() % (desc->nthr-1);
  114. if(victim >= tid)
  115. victim++;
  116. victimpos = &desc->thr[victim].pos;
  117. for(;;) {
  118. // See if it has any work.
  119. pos = runtime_atomicload64(victimpos);
  120. begin = (uint32)pos;
  121. end = (uint32)(pos>>32);
  122. if(begin+1 >= end) {
  123. begin = end = 0;
  124. break;
  125. }
  126. if(idle) {
  127. runtime_xadd(&desc->done, -1);
  128. idle = false;
  129. }
  130. begin2 = begin + (end-begin)/2;
  131. newpos = (uint64)begin | (uint64)begin2<<32;
  132. if(runtime_cas64(victimpos, pos, newpos)) {
  133. begin = begin2;
  134. break;
  135. }
  136. }
  137. if(begin < end) {
  138. // Has successfully stolen some work.
  139. if(idle)
  140. runtime_throw("parfor: should not be idle");
  141. runtime_atomicstore64(mypos, (uint64)begin | (uint64)end<<32);
  142. me->nsteal++;
  143. me->nstealcnt += end-begin;
  144. break;
  145. }
  146. // Backoff.
  147. if(try < desc->nthr) {
  148. // nothing
  149. } else if (try < 4*desc->nthr) {
  150. me->nprocyield++;
  151. runtime_procyield(20);
  152. // If a caller asked not to wait for the others, exit now
  153. // (assume that most work is already done at this point).
  154. } else if (!desc->wait) {
  155. if(!idle)
  156. runtime_xadd(&desc->done, 1);
  157. goto exit;
  158. } else if (try < 6*desc->nthr) {
  159. me->nosyield++;
  160. runtime_osyield();
  161. } else {
  162. me->nsleep++;
  163. runtime_usleep(1);
  164. }
  165. }
  166. }
  167. exit:
  168. runtime_xadd64(&desc->nsteal, me->nsteal);
  169. runtime_xadd64(&desc->nstealcnt, me->nstealcnt);
  170. runtime_xadd64(&desc->nprocyield, me->nprocyield);
  171. runtime_xadd64(&desc->nosyield, me->nosyield);
  172. runtime_xadd64(&desc->nsleep, me->nsleep);
  173. me->nsteal = 0;
  174. me->nstealcnt = 0;
  175. me->nprocyield = 0;
  176. me->nosyield = 0;
  177. me->nsleep = 0;
  178. }
  179. // For testing from Go.
  180. void
  181. runtime_parforiters(ParFor *desc, uintptr tid, uintptr *start, uintptr *end)
  182. {
  183. *start = (uint32)desc->thr[tid].pos;
  184. *end = (uint32)(desc->thr[tid].pos>>32);
  185. }