barrier_sync_server.cc 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. #include "barrier_sync_client.h"
  2. #include "barrier_sync_server.h"
  3. #include "simulator.h"
  4. #include "core_manager.h"
  5. #include "core.h"
  6. #include "thread.h"
  7. #include "performance_model.h"
  8. #include "hooks_manager.h"
  9. #include "syscall_server.h"
  10. #include "config.h"
  11. #include "log.h"
  12. #include "stats.h"
  13. #include "config.hpp"
  14. #include "circular_log.h"
  15. #include <algorithm>
  16. BarrierSyncServer::BarrierSyncServer()
  17. : m_local_clock_list(Sim()->getConfig()->getApplicationCores(), SubsecondTime::Zero())
  18. , m_barrier_acquire_list(Sim()->getConfig()->getApplicationCores(), false)
  19. , m_core_cond(Sim()->getConfig()->getApplicationCores(), NULL)
  20. , m_core_group(Sim()->getConfig()->getApplicationCores(), INVALID_CORE_ID)
  21. , m_core_thread(Sim()->getConfig()->getApplicationCores(), INVALID_THREAD_ID)
  22. , m_global_time(SubsecondTime::Zero())
  23. , m_fastforward(false)
  24. , m_disable(false)
  25. {
  26. try
  27. {
  28. m_barrier_interval = SubsecondTime::NS() * (UInt64) Sim()->getCfg()->getInt("clock_skew_minimization/barrier/quantum");
  29. }
  30. catch(...)
  31. {
  32. LOG_PRINT_ERROR("Error Reading 'clock_skew_minimization/barrier/quantum' from the config file");
  33. }
  34. for(core_id_t core_id = 0; core_id < (core_id_t)Sim()->getConfig()->getApplicationCores(); ++core_id)
  35. m_core_cond[core_id] = new ConditionVariable();
  36. m_next_barrier_time = m_barrier_interval;
  37. // Order our hooks to occur after possible reschedulings (which are done with ORDER_ACTION)
  38. Sim()->getHooksManager()->registerHook(HookType::HOOK_THREAD_EXIT, BarrierSyncServer::hookThreadExit, (UInt64)this, HooksManager::ORDER_NOTIFY_POST);
  39. Sim()->getHooksManager()->registerHook(HookType::HOOK_THREAD_STALL, BarrierSyncServer::hookThreadStall, (UInt64)this, HooksManager::ORDER_NOTIFY_POST);
  40. Sim()->getHooksManager()->registerHook(HookType::HOOK_THREAD_MIGRATE, BarrierSyncServer::hookThreadMigrate, (UInt64)this, HooksManager::ORDER_NOTIFY_POST);
  41. registerStatsMetric("barrier", 0, "global_time", &m_global_time);
  42. }
  43. BarrierSyncServer::~BarrierSyncServer()
  44. {
  45. for(core_id_t core_id = 0; core_id < (core_id_t)Sim()->getConfig()->getApplicationCores(); ++core_id)
  46. delete m_core_cond[core_id];
  47. }
  48. void
  49. BarrierSyncServer::synchronize(core_id_t core_id, SubsecondTime time)
  50. {
  51. ScopedLock sl(Sim()->getThreadManager()->getLock());
  52. if (m_disable)
  53. return;
  54. Core *core = Sim()->getCoreManager()->getCoreFromID(core_id);
  55. core_id_t master_core_id;
  56. if (m_fastforward)
  57. master_core_id = core_id; // In fast-forward, the SMT performance model in not active so every core (HW context) calls into the barrier
  58. else
  59. master_core_id = m_core_group[core_id] == INVALID_CORE_ID ? core_id : m_core_group[core_id];
  60. Core *master_core = Sim()->getCoreManager()->getCoreFromID(core_id);
  61. thread_id_t thread_me = core->getThread()->getId();
  62. CLOG("barrier", "Core %d entry (master core %d, thread %d, ffwd %d)", core_id, master_core_id, thread_me, m_fastforward);
  63. LOG_PRINT("Received 'SIM_BARRIER_WAIT' from Core(%i), Time(%s)", core_id, itostr(time).c_str());
  64. LOG_ASSERT_ERROR(core->getState() == Core::RUNNING || core->getState() == Core::INITIALIZING, "Core(%i) is not running or initializing at time(%s)", core_id, itostr(time).c_str());
  65. LOG_ASSERT_ERROR(m_barrier_acquire_list[master_core_id] == false, "Core(%i) or its sibling is already in the barrier (this is thread %d, we have thread %d)", master_core_id, thread_me, m_core_thread[master_core_id]);
  66. if (time < m_next_barrier_time && !m_fastforward)
  67. {
  68. LOG_PRINT("Sent 'SIM_BARRIER_RELEASE' immediately time(%s), m_next_barrier_time(%s)", itostr(time).c_str(), itostr(m_next_barrier_time).c_str());
  69. // LOG_PRINT_WARNING("core_id(%i), local_clock(%llu), m_next_barrier_time(%llu), m_barrier_interval(%llu)", core_id, time, m_next_barrier_time, m_barrier_interval);
  70. CLOG("barrier", "Core %d immediate exit", core_id);
  71. return;
  72. }
  73. // One thread entered the barrier, another one can resume
  74. doRelease(1);
  75. master_core->getPerformanceModel()->barrierEnter();
  76. m_local_clock_list[master_core_id] = time;
  77. m_barrier_acquire_list[master_core_id] = true;
  78. m_core_thread[master_core_id] = thread_me;
  79. bool mustWait = true;
  80. if (isBarrierReached())
  81. mustWait = barrierRelease(thread_me);
  82. if (mustWait)
  83. m_core_cond[master_core_id]->wait(Sim()->getThreadManager()->getLock());
  84. else
  85. master_core->getPerformanceModel()->barrierExit();
  86. CLOG("barrier", "Core %d exit (master core %d, thread %d)", core_id, master_core_id, thread_me);
  87. }
  88. void
  89. BarrierSyncServer::threadExit(HooksManager::ThreadTime *argument)
  90. {
  91. // Release thread from the barrier
  92. releaseThread(argument->thread_id);
  93. // Check to see if we were waiting for this thread
  94. signal();
  95. }
  96. void
  97. BarrierSyncServer::threadStall(HooksManager::ThreadStall *argument)
  98. {
  99. // Release thread from the barrier
  100. releaseThread(argument->thread_id);
  101. // Check to see if we were waiting for this thread
  102. signal();
  103. }
  104. void
  105. BarrierSyncServer::threadMigrate(HooksManager::ThreadMigrate *argument)
  106. {
  107. // Update the migrating thread's time so we'll be sure to release it
  108. releaseThread(argument->thread_id);
  109. // Migration due to thread stall/exit will generate another event later, we'll do a signal() then
  110. // Migration because of pre-emption is done only inside periodic(), we'll return into barrierRelease()
  111. }
  112. void
  113. BarrierSyncServer::releaseThread(thread_id_t thread_id)
  114. {
  115. for(core_id_t core_id = 0; core_id < (core_id_t) Sim()->getConfig()->getApplicationCores(); core_id++)
  116. {
  117. if (m_barrier_acquire_list[core_id] && m_core_thread[core_id] == thread_id)
  118. {
  119. // Make sure thread is released on next barrierRelease()
  120. m_local_clock_list[core_id] = SubsecondTime::Zero();
  121. }
  122. }
  123. // One thread stopped running, release another one now
  124. doRelease(1);
  125. }
  126. void
  127. BarrierSyncServer::signal()
  128. {
  129. if (m_disable)
  130. return;
  131. if (isBarrierReached())
  132. barrierRelease(INVALID_THREAD_ID);
  133. }
  134. bool
  135. BarrierSyncServer::isCoreRunning(core_id_t core_id, bool siblings)
  136. {
  137. Core *core = Sim()->getCoreManager()->getCoreFromID(core_id);
  138. if (core->getState() == Core::RUNNING)
  139. {
  140. LOG_ASSERT_ERROR(core->getThread(), "Core (%d) is running but has no thread", core_id);
  141. if (Sim()->getThreadManager()->isThreadRunning(core->getThread()->getId()))
  142. return true;
  143. }
  144. if (siblings && !m_fastforward)
  145. {
  146. for (core_id_t sibling_core_id = 0; sibling_core_id < (core_id_t) Sim()->getConfig()->getApplicationCores(); sibling_core_id++)
  147. {
  148. if (m_core_group[sibling_core_id] == core_id)
  149. {
  150. if (isCoreRunning(sibling_core_id, false))
  151. return true;
  152. }
  153. }
  154. }
  155. return false;
  156. }
  157. void
  158. BarrierSyncServer::advance()
  159. {
  160. barrierRelease(INVALID_THREAD_ID, true);
  161. }
  162. bool
  163. BarrierSyncServer::isBarrierReached()
  164. {
  165. bool single_core_barrier_reached = false;
  166. // Check if all cores have reached the barrier
  167. // All least one core must have (sync_time > m_next_barrier_time)
  168. for (core_id_t core_id = 0; core_id < (core_id_t) Sim()->getConfig()->getApplicationCores(); core_id++)
  169. {
  170. // In fastforward mode, it's enough that a core is waiting. In detailed mode, it needs to have advanced up to the predefined barrier time
  171. if (m_fastforward)
  172. {
  173. if (m_barrier_acquire_list[core_id])
  174. {
  175. // At least one core has reached the barrier
  176. single_core_barrier_reached = true;
  177. }
  178. else if (isCoreRunning(core_id))
  179. {
  180. // Core is running but hasn't checked in yet. Wait for it to sync.
  181. return false;
  182. }
  183. }
  184. else if (m_core_group[core_id] != INVALID_CORE_ID)
  185. {
  186. // Only consider group masters
  187. continue;
  188. }
  189. else if (isCoreRunning(core_id))
  190. {
  191. if (m_local_clock_list[core_id] < m_next_barrier_time)
  192. {
  193. // Core running on this core has not reached the barrier
  194. // Wait for it to sync
  195. return false;
  196. }
  197. else
  198. {
  199. // At least one core has reached the barrier
  200. single_core_barrier_reached = true;
  201. }
  202. }
  203. }
  204. return single_core_barrier_reached;
  205. }
  206. bool
  207. BarrierSyncServer::barrierRelease(thread_id_t caller_id, bool continue_until_release)
  208. {
  209. CLOG("barrier", "Release (caller thread %d)", caller_id);
  210. LOG_PRINT("Sending 'BARRIER_RELEASE'");
  211. // All cores have reached the barrier
  212. // Advance m_next_barrier_time
  213. // Release the Barrier
  214. LOG_ASSERT_ERROR(m_to_release.size() == 0, "Reached the barrier while some threads haven't even restarted?");
  215. if (m_fastforward)
  216. {
  217. for (core_id_t core_id = 0; core_id < (core_id_t) Sim()->getConfig()->getApplicationCores(); core_id++)
  218. {
  219. // In fast-forward mode, skip over (potentially very many) timeslots
  220. if (m_local_clock_list[core_id] > m_next_barrier_time)
  221. m_next_barrier_time = m_local_clock_list[core_id];
  222. }
  223. }
  224. // If a core cannot be resumed, we have to advance the sync
  225. // time till a core can be resumed. Then only, will we have
  226. // forward progress
  227. bool core_resumed = false;
  228. bool must_wait = true;
  229. while (!core_resumed)
  230. {
  231. m_global_time = m_next_barrier_time;
  232. CLOG("barrier", "Barrier %" PRId64 "ns", m_next_barrier_time.getNS());
  233. Sim()->getHooksManager()->callHooks(HookType::HOOK_PERIODIC, static_cast<subsecond_time_t>(m_next_barrier_time).m_time);
  234. if (continue_until_release)
  235. {
  236. // If HOOK_PERIODIC woke someone up, this thread can safely go to sleep
  237. if (Sim()->getThreadManager()->anyThreadRunning())
  238. return false;
  239. else
  240. LOG_ASSERT_ERROR(Sim()->getSyscallServer()->getNextTimeout(m_global_time) < SubsecondTime::MaxTime(), "No threads running, no timeout. Application has deadlocked...");
  241. }
  242. // If the barrier was disabled from HOOK_PERIODIC (for instance, if roi-end was triggered from a script), break
  243. if (m_disable)
  244. return false;
  245. m_next_barrier_time += m_barrier_interval;
  246. LOG_PRINT("m_next_barrier_time updated to (%s)", itostr(m_next_barrier_time).c_str());
  247. for (core_id_t core_id = 0; core_id < (core_id_t) Sim()->getConfig()->getApplicationCores(); core_id++)
  248. {
  249. if (m_local_clock_list[core_id] < m_next_barrier_time)
  250. {
  251. // Check if this core was running. If yes, release that core
  252. if (m_barrier_acquire_list[core_id] == true)
  253. {
  254. //Core *core = Sim()->getCoreManager()->getCoreFromID(core_id);
  255. //LOG_ASSERT_ERROR(core->getState() == Core::RUNNING || core->getState() == Core::INITIALIZING, "(%i) has acquired barrier, local_clock(%s), m_next_barrier_time(%s), but not initializing or running", core_id, itostr(m_local_clock_list[core_id]).c_str(), itostr(m_next_barrier_time).c_str());
  256. m_barrier_acquire_list[core_id] = false;
  257. core_resumed = true;
  258. if (m_core_thread[core_id] == caller_id)
  259. must_wait = false;
  260. else
  261. {
  262. Core *core = Sim()->getCoreManager()->getCoreFromID(core_id);
  263. core->getPerformanceModel()->barrierExit();
  264. m_to_release.push_back(core_id);
  265. }
  266. }
  267. }
  268. }
  269. }
  270. // To avoid overwhelming the OS scheduler, we only release N threads at a time (N ~= host cores).
  271. // Once a thread is done (stops executing because it completed the next barrier quantum, or due to thread stall),
  272. // one more thread is released so we always have at most N running threads.
  273. std::random_shuffle(m_to_release.begin(), m_to_release.end());
  274. doRelease(m_fastforward ? -1 : Sim()->getConfig()->getNumHostCores());
  275. return must_wait;
  276. }
  277. void
  278. BarrierSyncServer::doRelease(int n)
  279. {
  280. // Release up to n threads from the list.
  281. // When n == -1, all threads are released
  282. while(m_to_release.size() && n--)
  283. {
  284. core_id_t core_id = m_to_release.back();
  285. m_to_release.pop_back();
  286. m_core_cond[core_id]->signal();
  287. }
  288. }
  289. void
  290. BarrierSyncServer::abortBarrier()
  291. {
  292. CLOG("barrier", "Abort");
  293. for(core_id_t core_id = 0; core_id < (core_id_t) Sim()->getConfig()->getApplicationCores(); core_id++)
  294. {
  295. // Check if this core was running. If yes, release that core
  296. if (m_barrier_acquire_list[core_id] == true)
  297. {
  298. m_barrier_acquire_list[core_id] = false;
  299. Core *core = Sim()->getCoreManager()->getCoreFromID(core_id);
  300. core->getPerformanceModel()->barrierExit();
  301. m_core_cond[core_id]->signal();
  302. }
  303. }
  304. }
  305. void
  306. BarrierSyncServer::setDisable(bool disable)
  307. {
  308. this->m_disable = disable;
  309. if (disable)
  310. abortBarrier();
  311. }
  312. void
  313. BarrierSyncServer::setGroup(core_id_t core_id, core_id_t master_core_id)
  314. {
  315. if (master_core_id != INVALID_CORE_ID)
  316. LOG_ASSERT_ERROR(m_barrier_acquire_list[core_id] == false, "Core(%d) is in the barrier, cannot set participate to false", core_id);
  317. m_core_group[core_id] = master_core_id;
  318. }
  319. void
  320. BarrierSyncServer::setFastForward(bool fastforward, SubsecondTime next_barrier_time)
  321. {
  322. if (m_fastforward != fastforward)
  323. CLOG("barrier", "FastForward %d > %d", m_fastforward, fastforward);
  324. m_fastforward = fastforward;
  325. if (next_barrier_time != SubsecondTime::MaxTime())
  326. {
  327. m_next_barrier_time = std::max(m_next_barrier_time, next_barrier_time);
  328. }
  329. }
  330. void
  331. BarrierSyncServer::printState(void)
  332. {
  333. printf("Barrier state:");
  334. for(core_id_t core_id = 0; core_id < (core_id_t) Sim()->getConfig()->getApplicationCores(); core_id++)
  335. {
  336. if (m_core_group[core_id] != INVALID_CORE_ID)
  337. printf(" .");
  338. else if (m_barrier_acquire_list[core_id] == true)
  339. {
  340. if (m_local_clock_list[core_id] >= m_next_barrier_time)
  341. printf(" ^");
  342. else
  343. printf(" A");
  344. }
  345. else if (isCoreRunning(core_id))
  346. printf(" R");
  347. else
  348. printf(" _");
  349. }
  350. printf("\n");
  351. }