- #include "core.h"
- #include "network.h"
- #include "syscall_model.h"
- #include "branch_predictor.h"
- #include "memory_manager_base.h"
- #include "performance_model.h"
- #include "instruction.h"
- #include "clock_skew_minimization_object.h"
- #include "core_manager.h"
- #include "dvfs_manager.h"
- #include "hooks_manager.h"
- #include "trace_manager.h"
- #include "simulator.h"
- #include "log.h"
- #include "config.hpp"
- #include "stats.h"
- #include "topology_info.h"
- #include "cheetah_manager.h"
- #include <cstring>
#if 0
extern Lock iolock;
# define MYLOG(...) { ScopedLock l(iolock); fflush(stderr); fprintf(stderr, "[%8lu] %dcor %-25s@%03u: ", getShmemPerfModel()->getCycleCount(ShmemPerfModel::_USER_THREAD), m_core_id, __FUNCTION__, __LINE__); fprintf(stderr, __VA_ARGS__); fprintf(stderr, "\n"); fflush(stderr); }
#else
# define MYLOG(...) {}
#endif

#define VERBOSE 0

const char * ModeledString(Core::MemModeled modeled) {
   switch(modeled)
   {
      case Core::MEM_MODELED_NONE:          return "none";
      case Core::MEM_MODELED_COUNT:         return "count";
      case Core::MEM_MODELED_COUNT_TLBTIME: return "count/tlb";
      case Core::MEM_MODELED_TIME:          return "time";
      case Core::MEM_MODELED_FENCED:        return "fenced";
      case Core::MEM_MODELED_RETURN:        return "return";
   }
   return "?";
}

const char * core_state_names[] = {
   "running",
   "initializing",
   "stalled",
   "sleeping",
   "waking_up",
   "idle",
   "broken",
};
static_assert(Core::NUM_STATES == sizeof(core_state_names) / sizeof(core_state_names[0]),
              "core_state_names must have one entry per Core::State");

const char * Core::CoreStateString(Core::State state)
{
   LOG_ASSERT_ERROR(state < Core::NUM_STATES, "Invalid core state %d", state);
   return core_state_names[state];
}

Lock Core::m_global_core_lock;
UInt64 Core::g_instructions_hpi_global = 0;
UInt64 Core::g_instructions_hpi_global_callback = 0;
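
// Construct a core: bind it to its DVFS domain, register its statistics
// counters, and instantiate its on-chip network interface, clock-skew
// minimization client, memory hierarchy (as selected by the
// caching_protocol/type configuration option) and performance model.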
Core::Core(SInt32 id)
   : m_core_id(id)
   , m_dvfs_domain(Sim()->getDvfsManager()->getCoreDomain(id))
   , m_thread(NULL)
   , m_bbv(id)
   , m_topology_info(new TopologyInfo(id))
   , m_cheetah_manager(Sim()->getCfg()->getBool("core/cheetah/enabled") ? new CheetahManager(id) : NULL)
   , m_core_state(Core::IDLE)
   , m_icache_last_block(-1)
   , m_spin_loops(0)
   , m_spin_instructions(0)
   , m_spin_elapsed_time(SubsecondTime::Zero())
   , m_instructions(0)
   , m_instructions_callback(UINT64_MAX)
   , m_instructions_hpi_callback(0)
   , m_instructions_hpi_last(0)
{
   LOG_PRINT("Core ctor for: %d", id);

   registerStatsMetric("core", id, "instructions", &m_instructions);
   registerStatsMetric("core", id, "spin_loops", &m_spin_loops);
   registerStatsMetric("core", id, "spin_instructions", &m_spin_instructions);
   registerStatsMetric("core", id, "spin_elapsed_time", &m_spin_elapsed_time);

   Sim()->getStatsManager()->logTopology("hwcontext", id, id);

   m_network = new Network(this);

   m_clock_skew_minimization_client = ClockSkewMinimizationClient::create(this);

   m_shmem_perf_model = new ShmemPerfModel();
   LOG_PRINT("instantiated memory manager model");

   m_memory_manager = MemoryManagerBase::createMMU(
         Sim()->getCfg()->getString("caching_protocol/type"),
         this, m_network, m_shmem_perf_model);

   m_performance_model = PerformanceModel::create(this);
}
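
// Tear down the core, deleting the models created in the constructor.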
Core::~Core()
{
   if (m_cheetah_manager)
      delete m_cheetah_manager;
   delete m_topology_info;
   delete m_memory_manager;
   delete m_shmem_perf_model;
   delete m_performance_model;
   if (m_clock_skew_minimization_client)
      delete m_clock_skew_minimization_client;
   delete m_network;
}
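
// Enable or disable all timing models attached to this core (shared-memory
// timing, memory hierarchy, network, and the core performance model) in one call.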
void Core::enablePerformanceModels()
{
   getShmemPerfModel()->enable();
   getMemoryManager()->enableModels();
   getNetwork()->enableModels();
   getPerformanceModel()->enable();
}

void Core::disablePerformanceModels()
{
   getShmemPerfModel()->disable();
   getMemoryManager()->disableModels();
   getNetwork()->disableModels();
   getPerformanceModel()->disable();
}
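
// Account for a block of committed instructions: update the per-core
// instruction counter and BBV sampling, and fire the HOOK_INSTR_COUNT and
// HOOK_PERIODIC_INS callbacks when their thresholds are reached.
// Returns true when the callback may have rescheduled the current thread.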
bool
Core::countInstructions(IntPtr address, UInt32 count)
{
   bool check_rescheduled = false;

   m_instructions += count;
   if (m_bbv.sample())
      m_bbv.count(address, count);
   m_performance_model->countInstructions(address, count);

   if (isEnabledInstructionsCallback())
   {
      if (m_instructions >= m_instructions_callback)
      {
         disableInstructionsCallback();
         Sim()->getHooksManager()->callHooks(HookType::HOOK_INSTR_COUNT, m_core_id);
         // When using the fast-forward performance model, HOOK_INSTR_COUNT may cause
         // the current thread to be rescheduled, so tell the caller to make the
         // appropriate checks
         check_rescheduled = true;
      }
   }

   hookPeriodicInsCheck();

   return check_rescheduled;
}
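
// Cheap, per-core check for the periodic-instructions hook: once this core has
// executed another HPI quantum of instructions, add its contribution to the
// global instruction count and test (unlocked) whether the global callback is due.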
void
Core::hookPeriodicInsCheck()
{
   if (m_instructions > m_instructions_hpi_callback)
   {
      __sync_fetch_and_add(&g_instructions_hpi_global, m_instructions - m_instructions_hpi_last);
      m_instructions_hpi_callback += Sim()->getConfig()->getHPIInstructionsPerCore();
      m_instructions_hpi_last = m_instructions;

      // Quick, unlocked check if we should do the HOOK_PERIODIC_INS callback
      if (g_instructions_hpi_global > g_instructions_hpi_global_callback)
         hookPeriodicInsCall();
   }
}
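
// Slow path of the periodic-instructions hook: re-test the threshold under the
// thread manager's lock so that HOOK_PERIODIC_INS fires only once per quantum
// and is serialized with respect to other global events.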
void
Core::hookPeriodicInsCall()
{
   // Take the Thread lock, to make sure no other core calls us at the same time
   // and that the hook callback is also serialized w.r.t. other global events
   ScopedLock sl(Sim()->getThreadManager()->getLock());

   // Definitive, locked check if we should do the HOOK_PERIODIC_INS callback
   if (g_instructions_hpi_global > g_instructions_hpi_global_callback)
   {
      Sim()->getHooksManager()->callHooks(HookType::HOOK_PERIODIC_INS, g_instructions_hpi_global);
      g_instructions_hpi_global_callback += Sim()->getConfig()->getHPIInstructionsGlobal();
   }
}
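
// Model one branch through the branch predictor, if the current performance
// model has one. Returns true on a mispredict, false on a correct prediction
// or when no predictor is configured.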
bool
Core::accessBranchPredictor(IntPtr eip, bool taken, IntPtr target)
{
   PerformanceModel *prfmdl = getPerformanceModel();
   BranchPredictor *bp = prfmdl->getBranchPredictor();

   if (bp)
   {
      bool prediction = bp->predict(eip, target);
      bp->update(prediction, taken, eip, target);
      return (prediction != taken);
   }
   else
   {
      return false;
   }
}
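
// Helper to build a MemoryResult from a hit location and an access latency.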
MemoryResult
makeMemoryResult(HitWhere::where_t _hit_where, SubsecondTime _latency)
{
   LOG_ASSERT_ERROR(_hit_where < HitWhere::NUM_HITWHERES, "Invalid HitWhere %u", (unsigned int)_hit_where);
   MemoryResult res;
   res.hit_where = _hit_where;
   res.latency = _latency;
   return res;
}
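
// Record a guaranteed L1 hit directly in the cache statistics, without going
// through the timing models. (The address, modeled and eip arguments are
// currently unused.)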
void
Core::logMemoryHit(bool icache, mem_op_t mem_op_type, IntPtr address, MemModeled modeled, IntPtr eip)
{
   getMemoryManager()->addL1Hits(icache, mem_op_type, 1);
}
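
// Model an instruction fetch. Fetches that stay within the most recently
// accessed I-cache line are assumed to hit in a core-internal fetch buffer and
// bypass the L1-I; everything else is sent into the memory hierarchy.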
MemoryResult
Core::readInstructionMemory(IntPtr address, UInt32 instruction_size)
{
   LOG_PRINT("Instruction: Address(0x%x), Size(%u), Start READ",
             address, instruction_size);

   // Widen the block size to 64 bits before inverting so the mask keeps the
   // upper bits of 64-bit addresses
   UInt64 blockmask = ~((UInt64)getMemoryManager()->getCacheBlockSize() - 1);
   bool single_cache_line = ((address & blockmask) == ((address + instruction_size - 1) & blockmask));

   // Assume the core reads full instruction cache lines and caches them internally
   // for subsequent instructions. This reduces L1-I accesses and power to more
   // realistic levels. In reality, Nehalem fetches only 16 bytes at a time, and
   // other architectures (Sandy Bridge) have a micro-op cache, so this is just
   // an approximation.

   // When accessing the same cache line as last time, don't access the L1-I
   if ((address & blockmask) == m_icache_last_block)
   {
      if (single_cache_line)
      {
         return makeMemoryResult(HitWhere::L1I, getMemoryManager()->getL1HitLatency());
      }
      else
      {
         // Instruction spanning cache lines: drop the first line, do access the second one
         address = (address & blockmask) + getMemoryManager()->getCacheBlockSize();
      }
   }

   // Update the most recent cache line accessed
   m_icache_last_block = address & blockmask;

   // Accesses spanning multiple cache lines, or those not guaranteed to hit,
   // call into the memory hierarchy
   return initiateMemoryAccess(MemComponent::L1_ICACHE,
          Core::NONE, Core::READ, address & blockmask, NULL, getMemoryManager()->getCacheBlockSize(), MEM_MODELED_COUNT_TLBTIME, 0, SubsecondTime::MaxTime());
}
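
// Fast memory access path: ask the cache hierarchy for a latency estimate only
// (no data transfer, no per-line splitting) and charge any nonzero latency to
// the performance model.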
void Core::accessMemoryFast(bool icache, mem_op_t mem_op_type, IntPtr address)
{
   if (m_cheetah_manager && icache == false)
      m_cheetah_manager->access(mem_op_type, address);

   SubsecondTime latency = getMemoryManager()->coreInitiateMemoryAccessFast(icache, mem_op_type, address);

   if (latency > SubsecondTime::Zero())
      m_performance_model->handleMemoryLatency(latency, HitWhere::MISS);
}
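
// Central memory access path: walk the access one cache line at a time,
// sending each chunk into the memory hierarchy, then charge the total
// round-trip latency to the performance model as dictated by 'modeled'.
// m_mem_lock is acquired unless this is the UNLOCK half of a locked pair,
// and released unless it is the LOCK half.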
MemoryResult
Core::initiateMemoryAccess(MemComponent::component_t mem_component,
      lock_signal_t lock_signal,
      mem_op_t mem_op_type,
      IntPtr address,
      Byte* data_buf, UInt32 data_size,
      MemModeled modeled,
      IntPtr eip,
      SubsecondTime now)
{
   MYLOG("access %lx+%u %c%c modeled(%s)", address, data_size, mem_op_type == Core::WRITE ? 'W' : 'R', mem_op_type == Core::READ_EX ? 'X' : ' ', ModeledString(modeled));

   if (data_size == 0)
   {
      return makeMemoryResult((HitWhere::where_t)mem_component, SubsecondTime::Zero());
   }

   // Setting the initial time
   SubsecondTime initial_time = (now == SubsecondTime::MaxTime()) ? getPerformanceModel()->getElapsedTime() : now;

   // Protect from concurrent access by user thread (doing rewritten memops) and core thread (doing icache lookups)
   if (lock_signal != Core::UNLOCK)
      m_mem_lock.acquire();

#if 0
   static int i = 0;
   static Lock iolock;
   if ((i++) % 1000 == 0) {
      ScopedLock slio(iolock);
      printf("[TIME],%lu,", (Timer::now() / 100000) % 10000000);
      for(int i = 0; i < Sim()->getConfig()->getApplicationCores(); ++i)
         if (i == m_core_id)
            printf("%lu,%lu,%lu,", initial_time, getShmemPerfModel()->getCycleCount(ShmemPerfModel::_USER_THREAD), getShmemPerfModel()->getCycleCount(ShmemPerfModel::_SIM_THREAD));
         else
            printf(",,,");
      printf("\n");
   }
#endif

   getShmemPerfModel()->setElapsedTime(ShmemPerfModel::_USER_THREAD, initial_time);

   LOG_PRINT("Time(%s), %s - ADDR(0x%x), data_size(%u), START",
             itostr(initial_time).c_str(),
             ((mem_op_type == READ) ? "READ" : "WRITE"),
             address, data_size);

   UInt32 num_misses = 0;
   HitWhere::where_t hit_where = HitWhere::UNKNOWN;
   UInt32 cache_block_size = getMemoryManager()->getCacheBlockSize();

   IntPtr begin_addr = address;
   IntPtr end_addr = address + data_size;
   IntPtr begin_addr_aligned = begin_addr - (begin_addr % cache_block_size);
   IntPtr end_addr_aligned = end_addr - (end_addr % cache_block_size);
   Byte *curr_data_buffer_head = (Byte*) data_buf;

   for (IntPtr curr_addr_aligned = begin_addr_aligned; curr_addr_aligned <= end_addr_aligned; curr_addr_aligned += cache_block_size)
   {
      // Access the cache one line at a time
      UInt32 curr_offset;
      UInt32 curr_size;

      // Determine the offset
      if (curr_addr_aligned == begin_addr_aligned)
      {
         curr_offset = begin_addr % cache_block_size;
      }
      else
      {
         curr_offset = 0;
      }

      // Determine the size
      if (curr_addr_aligned == end_addr_aligned)
      {
         curr_size = (end_addr % cache_block_size) - (curr_offset);
         if (curr_size == 0)
         {
            continue;
         }
      }
      else
      {
         curr_size = cache_block_size - (curr_offset);
      }

      LOG_PRINT("Start InitiateSharedMemReq: ADDR(0x%x), offset(%u), curr_size(%u)", curr_addr_aligned, curr_offset, curr_size);

      if (m_cheetah_manager)
         m_cheetah_manager->access(mem_op_type, curr_addr_aligned);

      HitWhere::where_t this_hit_where = getMemoryManager()->coreInitiateMemoryAccess(
            mem_component,
            lock_signal,
            mem_op_type,
            curr_addr_aligned, curr_offset,
            data_buf ? curr_data_buffer_head : NULL, curr_size,
            modeled);

      // Count a miss when this chunk did not hit in the first-level component
      if (this_hit_where != (HitWhere::where_t)mem_component)
      {
         // If it is a READ or READ_EX operation,
         // 'initiateSharedMemReq' causes curr_data_buffer_head
         // to be automatically filled in
         // If it is a WRITE operation,
         // 'initiateSharedMemReq' reads the data
         // from curr_data_buffer_head
         num_misses++;
      }
      if (hit_where == HitWhere::UNKNOWN || (this_hit_where != HitWhere::UNKNOWN && this_hit_where > hit_where))
         hit_where = this_hit_where;

      LOG_PRINT("End InitiateSharedMemReq: ADDR(0x%x), offset(%u), curr_size(%u)", curr_addr_aligned, curr_offset, curr_size);

      // Increment the buffer head
      curr_data_buffer_head += curr_size;
   }

   // Get the final cycle time
   SubsecondTime final_time = getShmemPerfModel()->getElapsedTime(ShmemPerfModel::_USER_THREAD);
   LOG_ASSERT_ERROR(final_time >= initial_time,
                    "final_time(%s) < initial_time(%s)",
                    itostr(final_time).c_str(),
                    itostr(initial_time).c_str());

   LOG_PRINT("Time(%s), %s - ADDR(0x%x), data_size(%u), END\n",
             itostr(final_time).c_str(),
             ((mem_op_type == READ) ? "READ" : "WRITE"),
             address, data_size);

   if (lock_signal != Core::LOCK)
      m_mem_lock.release();

   // Calculate the round-trip time
   SubsecondTime shmem_time = final_time - initial_time;

   switch(modeled)
   {
#if 0
      case MEM_MODELED_DYNINFO:
      {
         DynamicInstructionInfo info = DynamicInstructionInfo::createMemoryInfo(eip, true, shmem_time, address, data_size, (mem_op_type == WRITE) ? Operand::WRITE : Operand::READ, num_misses, hit_where);
         m_performance_model->pushDynamicInstructionInfo(info);
         break;
      }
#endif
      case MEM_MODELED_TIME:
      case MEM_MODELED_FENCED:
         if (m_performance_model->isEnabled())
         {
            /* queue a fake instruction that will account for the access latency */
            PseudoInstruction *i = new MemAccessInstruction(shmem_time, address, data_size, modeled == MEM_MODELED_FENCED);
            m_performance_model->queuePseudoInstruction(i);
         }
         break;
      case MEM_MODELED_COUNT:
      case MEM_MODELED_COUNT_TLBTIME:
         if (shmem_time > SubsecondTime::Zero())
            m_performance_model->handleMemoryLatency(shmem_time, hit_where);
         break;
      case MEM_MODELED_NONE:
      case MEM_MODELED_RETURN:
         break;
   }

   if (modeled != MEM_MODELED_NONE)
   {
      getShmemPerfModel()->incrTotalMemoryAccessLatency(shmem_time);
   }

   LOG_ASSERT_ERROR(hit_where != HitWhere::UNKNOWN, "HitWhere == UNKNOWN");

   return makeMemoryResult(hit_where, shmem_time);
}

// FIXME: This should actually be 'accessDataMemory()'
/*
 * accessMemory (lock_signal_t lock_signal, mem_op_t mem_op_type, IntPtr d_addr, char* data_buffer, UInt32 data_size)
 *
 * Arguments:
 *   lock_signal :: NONE, LOCK, or UNLOCK
 *   mem_op_type :: READ, READ_EX, or WRITE
 *   d_addr :: address of location we want to access (read or write)
 *   data_buffer :: buffer holding data for WRITE, or buffer into which data is copied on a READ
 *   data_size :: size of data we must read/write
 *
 * Return Value:
 *   A MemoryResult describing where the access hit and its latency
 */
MemoryResult
Core::accessMemory(lock_signal_t lock_signal, mem_op_t mem_op_type, IntPtr d_addr, char* data_buffer, UInt32 data_size, MemModeled modeled, IntPtr eip, SubsecondTime now, bool is_fault_mask)
{
   // In PINTOOL mode, if the data is requested, copy it to/from real memory
   if (data_buffer && !is_fault_mask)
   {
      if (Sim()->getConfig()->getSimulationMode() == Config::PINTOOL)
      {
         nativeMemOp(NONE, mem_op_type, d_addr, data_buffer, data_size);
      }
      else if (Sim()->getConfig()->getSimulationMode() == Config::STANDALONE)
      {
         Sim()->getTraceManager()->accessMemory(m_core_id, lock_signal, mem_op_type, d_addr, data_buffer, data_size);
      }
      data_buffer = NULL; // initiateMemoryAccess's data is not used
   }

   if (modeled == MEM_MODELED_NONE)
      return makeMemoryResult(HitWhere::UNKNOWN, SubsecondTime::Zero());
   else
      return initiateMemoryAccess(MemComponent::L1_DCACHE, lock_signal, mem_op_type, d_addr, (Byte*) data_buffer, data_size, modeled, eip, now);
}
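
// Perform the memory operation on real (host) memory. Atomic read-modify-write
// sequences are emulated by taking the global core lock on the LOCK (READ_EX)
// half and releasing it on the matching UNLOCK (WRITE) half.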
MemoryResult
Core::nativeMemOp(lock_signal_t lock_signal, mem_op_t mem_op_type, IntPtr d_addr, char* data_buffer, UInt32 data_size)
{
   if (data_size == 0)
   {
      return makeMemoryResult(HitWhere::UNKNOWN, SubsecondTime::Zero());
   }

   if (lock_signal == LOCK)
   {
      assert(mem_op_type == READ_EX);
      m_global_core_lock.acquire();
   }

   if ( (mem_op_type == READ) || (mem_op_type == READ_EX) )
   {
      applicationMemCopy((void*) data_buffer, (void*) d_addr, (size_t) data_size);
   }
   else if (mem_op_type == WRITE)
   {
      applicationMemCopy((void*) d_addr, (void*) data_buffer, (size_t) data_size);
   }

   if (lock_signal == UNLOCK)
   {
      assert(mem_op_type == WRITE);
      m_global_core_lock.release();
   }

   return makeMemoryResult(HitWhere::UNKNOWN, SubsecondTime::Zero());
}
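
// Default implementation of the application-memory copy: a plain memcpy.
// Declared weak so that frontends which keep application memory in a separate
// address space (e.g. a Pin-based frontend) can override it.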
__attribute__((weak)) void
applicationMemCopy(void *dest, const void *src, size_t n)
{
   memcpy(dest, src, n);
}
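
// Emulate the CPUID instruction: pass most leaves through to the native
// instruction, but raise the maximum leaf (0x0), patch the APIC id and core
// count into leaf 0x1, and synthesize the extended topology enumeration leaf
// (0xb) to describe the simulated machine.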
void
Core::emulateCpuid(UInt32 eax, UInt32 ecx, cpuid_result_t &res) const
{
   switch(eax)
   {
      case 0x0:
      {
         cpuid(0, 0, res);
         res.eax = std::max(UInt32(0xb), res.eax); // Maximum input eax: make sure 0xb is included
         break;
      }
      case 0x1:
      {
         // Return native results, except for CPU id
         cpuid(eax, ecx, res);
         res.ebx = (m_core_id << 24) | (Sim()->getConfig()->getApplicationCores() << 16) | (res.ebx & 0xffff);
         break;
      }
      case 0xb:
      {
         // Extended Topology Enumeration Leaf
         switch(ecx)
         {
            case 0:
               // Level 0: SMT
               res.eax = TopologyInfo::SMT_SHIFT_BITS;
               res.ebx = m_topology_info->smt_count; // SMT threads / core
               res.ecx = ecx | (1 << 8); // Level type = SMT
               break;
            case 1:
               // Level 1: cores
               res.eax = TopologyInfo::PACKAGE_SHIFT_BITS;
               res.ebx = m_topology_info->smt_count * m_topology_info->core_count; // HW contexts / package
               res.ecx = ecx | (2 << 8); // Level type = Core
               break;
            default:
               // Invalid level
               res.eax = 0;
               res.ebx = 0;
               res.ecx = ecx;
               break;
         }
         res.edx = m_topology_info->apic_id;
         break;
      }
      default:
      {
         // Return native results (original cpuid instruction is deleted)
         cpuid(eax, ecx, res);
         break;
      }
   }

#if VERBOSE
   printf("CPUID[%d]: %08x %08x => ", m_core_id, eax, ecx);
   printf("%08x %08x %08x %08x\n", res.eax, res.ebx, res.ecx, res.edx);
#endif
}