1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
|
#define _GNU_SOURCE
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <sched.h>
#include <sys/resource.h>
#include <math.h>
#include <time.h>
// Performance counter setup
// One hardware perf event: the fd returned by perf_event_open() plus the
// identifying metadata used when results are read back and attributed.
struct perf_counter {
int fd; // perf_event_open() fd, or negative if the event is unavailable
uint64_t value; // last value read by stop_counters()
const char *name; // human-readable event name, used in diagnostics
uint32_t type; // perf_event_attr.type passed at setup (e.g. PERF_TYPE_HARDWARE)
uint64_t config; // perf_event_attr.config; also used to match results to fields
};
// Raw measurements for one benchmark iteration (one run of
// frames_per_run emulated frames). A field is left 0 when its
// corresponding counter could not be opened.
struct bench_run {
uint64_t cycles; // PERF_COUNT_HW_CPU_CYCLES
uint64_t instructions; // PERF_COUNT_HW_INSTRUCTIONS
uint64_t stalled_cycles_frontend; // PERF_COUNT_HW_STALLED_CYCLES_FRONTEND
uint64_t stalled_cycles_backend; // PERF_COUNT_HW_STALLED_CYCLES_BACKEND
uint64_t branches; // PERF_COUNT_HW_BRANCH_INSTRUCTIONS
uint64_t branch_misses; // PERF_COUNT_HW_BRANCH_MISSES
uint64_t time_ns; // wall-clock elapsed time (CLOCK_MONOTONIC)
};
// Summary statistics over a series of per-run values,
// produced by calculate_stats().
struct bench_stats {
double mean; // arithmetic mean
double sd; // sample standard deviation (n-1 denominator)
double rel_sd; // relative SD in percent (100 * sd / mean), 0 if mean == 0
int n; // number of samples
};
// Thin wrapper around the perf_event_open syscall: glibc provides no
// stub for it, so it must be invoked via syscall(2) directly.
// Returns the new event fd, or -1 with errno set on failure.
static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
int cpu, int group_fd, unsigned long flags) {
long ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
return ret;
}
// Open one hardware perf event for the calling process on any CPU and
// record its metadata in *counter.
// The event is created disabled; start_counters() enables it later.
// Returns the fd (>= 0) on success, -1 if the event is unavailable; the
// bookkeeping fields of *counter are filled in either way.
static int setup_counter(struct perf_counter *counter, uint32_t type, uint64_t config, const char *name) {
struct perf_event_attr attr;
memset(&attr, 0, sizeof(attr));
attr.type = type;
attr.size = sizeof(attr);
attr.config = config;
attr.disabled = 1; // enabled explicitly via PERF_EVENT_IOC_ENABLE
attr.exclude_kernel = 0; // count kernel-side work too
attr.exclude_hv = 0;
attr.exclude_idle = 1; // don't count the idle task
counter->name = name;
counter->type = type;
counter->config = config;
counter->value = 0;
counter->fd = perf_event_open(&attr, 0, -1, -1, 0);
return counter->fd;
}
// Zero every successfully-opened counter in the array; counters that
// failed to open (fd < 0) are skipped.
static void reset_counters(struct perf_counter *counters, int n) {
int i = 0;
while(i < n) {
int fd = counters[i].fd;
if(fd >= 0) {
ioctl(fd, PERF_EVENT_IOC_RESET, 0);
}
i++;
}
}
// Enable every successfully-opened counter; events are created disabled
// by setup_counter(), so counting only begins here.
static void start_counters(struct perf_counter *counters, int n) {
int i = 0;
while(i < n) {
int fd = counters[i].fd;
if(fd >= 0) {
ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
}
i++;
}
}
// Disable each open counter and read back its final value into
// counters[i].value.
// BUG FIX: the read(2) result was previously ignored, so a failed or
// short read left a stale value from the previous run in place. Now a
// bad read zeroes the value and emits a warning instead.
static void stop_counters(struct perf_counter *counters, int n) {
for(int i = 0; i < n; i++) {
if(counters[i].fd >= 0) {
ioctl(counters[i].fd, PERF_EVENT_IOC_DISABLE, 0);
ssize_t got = read(counters[i].fd, &counters[i].value, sizeof(uint64_t));
if(got != (ssize_t)sizeof(uint64_t)) {
fprintf(stderr, "Warning: failed to read counter %s\n", counters[i].name);
counters[i].value = 0; // don't report stale data
}
}
}
}
static void close_counters(struct perf_counter *counters, int n) {
for(int i = 0; i < n; i++) {
if(counters[i].fd >= 0) {
close(counters[i].fd);
}
}
}
static struct bench_stats calculate_stats(double *values, int n) {
struct bench_stats stats;
stats.n = n;
// Calculate mean
double sum = 0.0;
for(int i = 0; i < n; i++) {
sum += values[i];
}
stats.mean = sum / n;
// Calculate standard deviation
double sum_sq = 0.0;
for(int i = 0; i < n; i++) {
double diff = values[i] - stats.mean;
sum_sq += diff * diff;
}
stats.sd = sqrt(sum_sq / (n - 1));
// Calculate relative standard deviation
stats.rel_sd = (stats.mean != 0.0) ? (100.0 * stats.sd / stats.mean) : 0.0;
return stats;
}
// Request the highest SCHED_FIFO realtime priority for this process to
// minimize scheduler jitter during measurement. Failure (usually missing
// CAP_SYS_NICE) is reported but non-fatal.
// BUG FIX: the third argument read "¶m" — an HTML-entity-mangled
// "&param" ("&para;m") — which does not compile. Restored to &param.
static void set_realtime_priority(void) {
struct sched_param param;
param.sched_priority = 99;
if(sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
fprintf(stderr, "Warning: Failed to set realtime priority (try running with sudo or CAP_SYS_NICE)\n");
}
}
// Pin the calling process to a single CPU so the per-CPU perf counters
// and wall-clock timings are not perturbed by migrations. Failure is
// reported but non-fatal.
static void set_cpu_affinity(int cpu) {
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(cpu, &mask);
int rc = sched_setaffinity(0, sizeof(mask), &mask);
if(rc == -1) {
fprintf(stderr, "Warning: Failed to set CPU affinity to CPU %d\n", cpu);
}
}
// Static allocation for benchmark runs - no malloc, page-aligned
// run_benchmark() rejects any num_runs above this cap.
#define MAX_BENCH_RUNS 100
// Per-run raw measurements; reused across run_benchmark() invocations.
static struct bench_run runs_storage[MAX_BENCH_RUNS] __attribute__((aligned(4096)));
// Scratch for three num_runs-sized double arrays (cycles/frame,
// insn/frame, time in ms) carved out by run_benchmark().
static double per_frame_storage[MAX_BENCH_RUNS * 3] __attribute__((aligned(4096)));
// Benchmark the emulator core: execute num_runs measured iterations of
// frames_per_run frames each (after a warmup), collecting hardware perf
// counters and CLOCK_MONOTONIC wall time per run, then print perf-stat
// style aggregate totals plus per-run mean/SD statistics.
// Counters that fail to open are warned about and simply excluded; the
// function bails out only if none are available.
static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t frames_per_run) {
if(num_runs > MAX_BENCH_RUNS) {
fprintf(stderr, "Error: num_runs (%u) exceeds MAX_BENCH_RUNS (%u)\n", num_runs, MAX_BENCH_RUNS);
return;
}
struct perf_counter counters[6];
int num_counters = 0;
// Set up performance counters
setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "cycles");
setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "instructions");
setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "stalled-cycles-frontend");
setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, "stalled-cycles-backend");
setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branches");
setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses");
// Check which counters are available
int available_counters = 0;
for(int i = 0; i < num_counters; i++) {
if(counters[i].fd >= 0) {
available_counters++;
} else {
fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name);
}
}
if(available_counters == 0) {
fprintf(stderr, "Error: No performance counters available\n");
close_counters(counters, num_counters);
return;
}
// Use static storage for runs
struct bench_run *runs = runs_storage;
memset(runs, 0, sizeof(struct bench_run) * num_runs);
// Set CPU affinity and realtime priority
set_cpu_affinity(1);
set_realtime_priority();
// Warmup run (not measured) — prime caches/branch predictors before
// the measured iterations. Uses 10% of a run's frame count.
memset(nstate, 0, sizeof(struct nes_state));
ppu_reset(nstate);
ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom));
mapper_setup(nstate);
cpu_reset(nstate);
uint32_t warmup_frames = frames_per_run / 10;
for(uint32_t i = 0; i < warmup_frames; i++) {
while(!nstate->ppu.frame_ready) {
cpu_tick(nstate);
}
nstate->ppu.frame_ready = 0;
}
// Run benchmark iterations
for(uint32_t run = 0; run < num_runs; run++) {
// Reset emulator state so every run starts from the same point
memset(nstate, 0, sizeof(struct nes_state));
ppu_reset(nstate);
ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom));
mapper_setup(nstate);
cpu_reset(nstate);
// Start timing
struct timespec start_time, end_time;
clock_gettime(CLOCK_MONOTONIC, &start_time);
// Reset and start counters (after clock_gettime to exclude its overhead)
reset_counters(counters, num_counters);
start_counters(counters, num_counters);
// Run emulation: cpu_tick() until the PPU signals a complete frame,
// frames_per_run times
for(uint32_t i = 0; i < frames_per_run; i++) {
while(!nstate->ppu.frame_ready) {
cpu_tick(nstate);
}
nstate->ppu.frame_ready = 0;
}
// Stop counters (before clock_gettime to exclude its overhead)
stop_counters(counters, num_counters);
// Stop timing
clock_gettime(CLOCK_MONOTONIC, &end_time);
// Calculate elapsed time in nanoseconds
uint64_t elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL +
(end_time.tv_nsec - start_time.tv_nsec);
// Store results, matching counters back to fields by their config value
runs[run].time_ns = elapsed_ns;
for(int i = 0; i < num_counters; i++) {
if(counters[i].fd < 0) continue;
if(counters[i].config == PERF_COUNT_HW_CPU_CYCLES) {
runs[run].cycles = counters[i].value;
} else if(counters[i].config == PERF_COUNT_HW_INSTRUCTIONS) {
runs[run].instructions = counters[i].value;
} else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_FRONTEND) {
runs[run].stalled_cycles_frontend = counters[i].value;
} else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_BACKEND) {
runs[run].stalled_cycles_backend = counters[i].value;
} else if(counters[i].config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) {
runs[run].branches = counters[i].value;
} else if(counters[i].config == PERF_COUNT_HW_BRANCH_MISSES) {
runs[run].branch_misses = counters[i].value;
}
}
}
// Calculate aggregated totals
uint64_t total_instructions = 0;
uint64_t total_cycles = 0;
uint64_t total_stalled_frontend = 0;
uint64_t total_stalled_backend = 0;
uint64_t total_branches = 0;
uint64_t total_branch_misses = 0;
uint64_t total_time_ns = 0;
for(uint32_t i = 0; i < num_runs; i++) {
total_instructions += runs[i].instructions;
total_cycles += runs[i].cycles;
total_stalled_frontend += runs[i].stalled_cycles_frontend;
total_stalled_backend += runs[i].stalled_cycles_backend;
total_branches += runs[i].branches;
total_branch_misses += runs[i].branch_misses;
total_time_ns += runs[i].time_ns;
}
// Calculate per-frame statistics using static storage
// (per_frame_storage is carved into three num_runs-sized arrays)
double *cycles_per_frame = &per_frame_storage[0];
double *insn_per_frame = &per_frame_storage[num_runs];
double *time_ms = &per_frame_storage[num_runs * 2];
for(uint32_t i = 0; i < num_runs; i++) {
cycles_per_frame[i] = (double)runs[i].cycles / frames_per_run;
insn_per_frame[i] = (double)runs[i].instructions / frames_per_run;
time_ms[i] = (double)runs[i].time_ns / 1000000.0;
}
struct bench_stats cycles_stats = calculate_stats(cycles_per_frame, num_runs);
struct bench_stats insn_stats = calculate_stats(insn_per_frame, num_runs);
struct bench_stats time_stats = calculate_stats(time_ms, num_runs);
// Print results
// NOTE(review): if the cycles/instructions/branches counters were
// unavailable, the totals below are 0 and these divisions yield
// inf/nan in the report — consider guarding.
double total_time_s = (double)total_time_ns / 1000000000.0;
double ipc = (double)total_instructions / total_cycles;
double stalled_per_insn = (double)(total_stalled_frontend + total_stalled_backend) / total_instructions;
double ghz = (double)total_cycles / total_time_s / 1000000000.0;
double branches_per_sec = (double)total_branches / total_time_s;
double branch_miss_rate = (double)total_branch_misses / total_branches * 100.0;
double stalled_frontend_pct = (double)total_stalled_frontend / total_cycles * 100.0;
double stalled_backend_pct = (double)total_stalled_backend / total_cycles * 100.0;
double mips = (double)total_instructions / total_time_s / 1000000.0;
double mcps = (double)total_cycles / total_time_s / 1000000.0;
printf("\n");
printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc);
printf("%67s# %.2f stalled cycles per insn \n", "", stalled_per_insn);
printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz);
printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle \n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct);
printf("%20llu stalled-cycles-backend # %.2f%% backend cycles idle \n", (unsigned long long)total_stalled_backend, stalled_backend_pct);
printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0);
printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate);
printf("\n");
printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps);
printf("\n");
printf("cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n);
printf("insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n);
printf("time (ms) mean=%.3f sd=%.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n);
double fps = (double)frames_per_run / (time_stats.mean / 1000.0);
double ms_per_frame = time_stats.mean / frames_per_run;
printf("FPS (frames/second) = %.2f\n", fps);
printf("ms/frame = %.6f\n", ms_per_frame);
// Cleanup
close_counters(counters, num_counters);
}
|