import collections, sniper_lib, sniper_config


class CpiData:

  def __init__(self, jobid = '', resultsdir = '', config = None, stats = None, data = None, partial = None):
    # Use a pre-fetched results structure when one is passed in, otherwise load it
    if data:
      data_raw = data
    else:
      data_raw = sniper_lib.get_results(jobid = jobid, resultsdir = resultsdir, config = config, stats = stats, partial = partial)
    self.stats = data_raw['results']
    self.config = data_raw['config']
    self.parse()
  def parse(self):
    ncores = int(self.config['general/total_cores'])
    instrs = self.stats['performance_model.instruction_count'] if sum(self.stats['performance_model.instruction_count']) else self.stats['core.instructions']
    try:
      times = self.stats['performance_model.elapsed_time']
      cycles_scale = self.stats['fs_to_cycles_cores']
    except KeyError:
      # On error, assume that we are using the pre-DVFS version
      times = self.stats['performance_model.cycle_count']
      cycles_scale = [ 1. for idx in range(ncores) ]
    time0_begin = self.stats['global.time_begin']
    time0_end = self.stats['global.time_end']
    times = [ self.stats['performance_model.elapsed_time_end'][core] - time0_begin for core in range(ncores) ]
    # TODO: The below is needed for sampling. We're currently set up to work properly with the one-IPC model used in combination with --cache-only
    #if self.stats.get('fastforward_performance_model.fastforwarded_time', [0])[0]:
    #  fastforward_scale = times[0] / (times[0] - self.stats['fastforward_performance_model.fastforwarded_time'][0])
    #  fastforward_extrapolate = True
    #  times = [ t-f for t, f in zip(times, self.stats['fastforward_performance_model.fastforwarded_time']) ]
    #else:
    #  fastforward_scale = 1.
    #  fastforward_extrapolate = False
    # Sampling support is disabled for now: drop any fast-forward time component
    # and apply no scaling or extrapolation
    if 'performance_model.cpiFastforwardTime' in self.stats:
      del self.stats['performance_model.cpiFastforwardTime']
    fastforward_scale = 1.
    fastforward_extrapolate = False
    data = collections.defaultdict(lambda: collections.defaultdict(long))
    for key, values in self.stats.items():
      if '.cpi' in key:
        if key.startswith('thread.'):
          # Ignore per-thread statistics
          continue
        if key.startswith('fastforward_timer.') and fastforward_extrapolate:
          continue
        key = key.split('.cpi')[1]
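        # (Illustrative: a statistic named 'interval_timer.cpiSyncFutex' would
        # accumulate into the per-core component 'SyncFutex'; actual key names
        # depend on the core model in use.)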
        for core in range(ncores):
          data[core][key] += values[core] * cycles_scale[core]
    if not data:
      raise ValueError('No .cpi data found, simulation did not use the interval core model')
    # Split up cpiBase into 1/issue and path dependencies
    for core in range(ncores):
      if data[core].get('SyncMemAccess', 0) == data[core].get('SyncPthreadBarrier', 0):
        # Work around a bug in iGraphite where SyncMemAccess was wrongly copied from SyncPthreadBarrier.
        # Since SyncMemAccess usually isn't very big anyway, setting it to zero should be accurate enough.
        # For simulations with a fixed version of iGraphite, the chances of SyncMemAccess being identical to
        # SyncPthreadBarrier, down to the last femtosecond, are slim, so this code shouldn't trigger.
        data[core]['SyncMemAccess'] = 0
      if data[core].get('StartTime') is None and 'performance_model.idle_elapsed_time' in self.stats:
        # Fix a bug whereby the start time was not being reported in the CPI stacks correctly
        data[core]['StartTime'] = cycles_scale[core] * self.stats['performance_model.idle_elapsed_time'][core] - \
                                  data[core]['SyncFutex'] - data[core]['SyncPthreadMutex'] - \
                                  data[core]['SyncPthreadCond'] - data[core]['SyncPthreadBarrier'] - \
                                  data[core]['Recv']
      # Critical path accounting
      cpContrMap = {
        # critical path components
        'interval_timer.cpContr_generic': 'PathInt',
        'interval_timer.cpContr_store': 'PathStore',
        'interval_timer.cpContr_load_other': 'PathLoadX',
        'interval_timer.cpContr_branch': 'PathBranch',
        'interval_timer.cpContr_load_l1': 'DataCacheL1',
        'interval_timer.cpContr_load_l2': 'DataCacheL2',
        'interval_timer.cpContr_load_l3': 'DataCacheL3',
        'interval_timer.cpContr_fp_addsub': 'PathFP',
        'interval_timer.cpContr_fp_muldiv': 'PathFP',
        # issue ports
        'interval_timer.cpContr_port0': 'PathP0',
        'interval_timer.cpContr_port1': 'PathP1',
        'interval_timer.cpContr_port2': 'PathP2',
        'interval_timer.cpContr_port34': 'PathP34',
        'interval_timer.cpContr_port5': 'PathP5',
        'interval_timer.cpContr_port05': 'PathP05',
        'interval_timer.cpContr_port015': 'PathP015',
      }
      for k in self.stats:
        if k.startswith('interval_timer.cpContr_'):
          if k not in cpContrMap:
            print 'Missing in cpContrMap:', k
      # Keep 1/width as base CPI component, break down the remainder according to critical path contributors
      BaseBest = instrs[core] / float(sniper_config.get_config(self.config, 'perf_model/core/interval_timer/dispatch_width', core))
      BaseAct = data[core]['Base']
      BaseCp = BaseAct - BaseBest
      scale = BaseCp / (BaseAct or 1)
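      # (Worked example with made-up numbers: 1000 instructions at dispatch
      # width 4 give BaseBest = 250 cycles; if BaseAct = 400 then BaseCp = 150
      # and scale = 0.375, so 37.5% of each cpContr_* value below moves from
      # 'Base' into its critical-path component.)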
      for cpName, cpiName in cpContrMap.items():
        val = float(self.stats.get(cpName, [0]*ncores)[core]) / 1e6
        data[core]['Base'] -= val * scale
        data[core][cpiName] = data[core].get(cpiName, 0) + val * scale
      # Issue width
      for key, values in self.stats.items():
        if key.startswith('interval_timer.detailed-cpiBase-'):
          if 'DispatchWidth' in key:
            if 'DispatchRate' not in key: # We already accounted for DispatchRate above, don't do it twice
              data[core]['Base'] -= values[core]
              data[core]['Issue'] = data[core].get('Issue', 0) + values[core]
      # Fix up large cpiSync fractions that started before but ended inside our interval
      time0_me = 'performance_model.elapsed_time_begin' in self.stats and self.stats['performance_model.elapsed_time_begin'][core] or 0
      if time0_me < time0_begin:
        time0_extra = time0_begin - time0_me
        # Number of cycles that weren't accounted for when starting this interval
        cycles_extra = time0_extra * cycles_scale[core]
        # Components that could be the cause of cycles_extra. It should be just one, but if there are many, we'll have to guess
        sync_components = dict([ (key, value) for key, value in data[core].items() if (key.startswith('Sync') or key == 'StartTime') and value > cycles_extra ])
        sync_total = sum(sync_components.values())
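        # (Illustrative: with cycles_extra = 100 and two candidate components
        # holding 300 and 100 cycles, they give up 75 and 25 cycles respectively.)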
        for key, value in sync_components.items():
          data[core][key] -= cycles_extra*value/float(sync_total)
      data[core]['Imbalance'] = cycles_scale[core] * max(times) - sum(data[core].values())
    # Expose the parsed results: data maps core id -> component name -> cycle count
    self.data = data
    self.ncores = ncores
    self.cores = range(ncores)
    self.instrs = instrs
    self.times = times
    self.cycles_scale = cycles_scale
    self.fastforward_scale = fastforward_scale
  def get_compfrac(self):
    # Per-core fraction of time spent on useful computation, i.e. excluding
    # startup, imbalance and synchronization components
    max_time = self.cycles_scale[0] * max(self.times)
    return dict([ (
      core,
      1 - (self.data[core].get('StartTime', 0) + self.data[core].get('Imbalance', 0) + self.data[core].get('SyncPthreadCond', 0) + \
           self.data[core].get('SyncPthreadBarrier', 0) + self.data[core].get('SyncJoin', 0) + self.data[core].get('Recv', 0)) / (float(max_time) or 1.)
    ) for core in self.data.keys() ])
  def filter(self, cores_list = None, core_mincomp = 0):
    # Restrict the data set to cores_list, optionally dropping cores whose
    # compute fraction is below core_mincomp
    if not cores_list:
      cores_list = self.cores
    if core_mincomp:
      compfrac = self.get_compfrac()
      cores_list = [ core for core in cores_list if compfrac[core] >= core_mincomp ]
    self.data = dict([ (core, self.data[core]) for core in cores_list ])
    self.instrs = dict([ (core, self.instrs[core]) for core in cores_list ])
    self.ncores = len(cores_list)
    self.cores = cores_list
  def aggregate(self):
    # Collapse all remaining cores into a single, averaged CPI stack on core 0
    allkeys = self.data[self.cores[0]].keys()
    self.data = { 0: dict([ (key, sum([ self.data[core][key] for core in self.cores ]) / len(self.cores)) for key in allkeys ]) }
    self.instrs = { 0: sum(self.instrs[core] for core in self.cores) / len(self.cores) }
    self.ncores = 1
    self.cores = [0]
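

# Example usage (a minimal sketch; 'results/my_run' is a hypothetical results
# directory produced by a Sniper run):
#
#   cpidata = CpiData(resultsdir = 'results/my_run')
#   cpidata.filter(core_mincomp = .5)  # keep cores that are at least 50% busy
#   cpidata.aggregate()                # average the remaining cores into one stack
#   print cpidata.data[0]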