#define _GNU_SOURCE #include #include #include #include #include #include #include #include // Performance counter setup struct perf_counter { int fd; uint64_t value; const char *name; uint32_t type; uint64_t config; }; struct bench_run { uint64_t cycles; uint64_t instructions; uint64_t stalled_cycles_frontend; uint64_t stalled_cycles_backend; uint64_t branches; uint64_t branch_misses; uint64_t time_ns; }; struct bench_stats { double mean; double sd; double rel_sd; int n; }; static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } static int setup_counter(struct perf_counter *counter, uint32_t type, uint64_t config, const char *name) { struct perf_event_attr pe; memset(&pe, 0, sizeof(struct perf_event_attr)); pe.type = type; pe.size = sizeof(struct perf_event_attr); pe.config = config; pe.disabled = 1; pe.exclude_kernel = 0; pe.exclude_hv = 0; pe.exclude_idle = 1; counter->fd = perf_event_open(&pe, 0, -1, -1, 0); counter->name = name; counter->type = type; counter->config = config; counter->value = 0; return counter->fd; } static void reset_counters(struct perf_counter *counters, int n) { for(int i = 0; i < n; i++) { if(counters[i].fd >= 0) { ioctl(counters[i].fd, PERF_EVENT_IOC_RESET, 0); } } } static void start_counters(struct perf_counter *counters, int n) { for(int i = 0; i < n; i++) { if(counters[i].fd >= 0) { ioctl(counters[i].fd, PERF_EVENT_IOC_ENABLE, 0); } } } static void stop_counters(struct perf_counter *counters, int n) { for(int i = 0; i < n; i++) { if(counters[i].fd >= 0) { ioctl(counters[i].fd, PERF_EVENT_IOC_DISABLE, 0); read(counters[i].fd, &counters[i].value, sizeof(uint64_t)); } } } static void close_counters(struct perf_counter *counters, int n) { for(int i = 0; i < n; i++) { if(counters[i].fd >= 0) { close(counters[i].fd); } } } static struct bench_stats calculate_stats(double *values, int n) { struct bench_stats stats; stats.n = n; // Calculate mean double sum = 0.0; for(int i = 0; i < n; i++) { sum += values[i]; } stats.mean = sum / n; // Calculate standard deviation double sum_sq = 0.0; for(int i = 0; i < n; i++) { double diff = values[i] - stats.mean; sum_sq += diff * diff; } stats.sd = sqrt(sum_sq / (n - 1)); // Calculate relative standard deviation stats.rel_sd = (stats.mean != 0.0) ? (100.0 * stats.sd / stats.mean) : 0.0; return stats; } static void set_realtime_priority(void) { struct sched_param param; param.sched_priority = 99; if(sched_setscheduler(0, SCHED_FIFO, ¶m) == -1) { fprintf(stderr, "Warning: Failed to set realtime priority (try running with sudo or CAP_SYS_NICE)\n"); } } static void set_cpu_affinity(int cpu) { cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(cpu, &cpuset); if(sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) == -1) { fprintf(stderr, "Warning: Failed to set CPU affinity to CPU %d\n", cpu); } } // Static allocation for benchmark runs - no malloc, page-aligned #define MAX_BENCH_RUNS 100 static struct bench_run runs_storage[MAX_BENCH_RUNS] __attribute__((aligned(4096))); static double per_frame_storage[MAX_BENCH_RUNS * 3] __attribute__((aligned(4096))); static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t frames_per_run) { if(num_runs > MAX_BENCH_RUNS) { fprintf(stderr, "Error: num_runs (%u) exceeds MAX_BENCH_RUNS (%u)\n", num_runs, MAX_BENCH_RUNS); return; } struct perf_counter counters[6]; int num_counters = 0; // Set up performance counters setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "cycles"); setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "instructions"); setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "stalled-cycles-frontend"); setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, "stalled-cycles-backend"); setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branches"); setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses"); // Check which counters are available int available_counters = 0; for(int i = 0; i < num_counters; i++) { if(counters[i].fd >= 0) { available_counters++; } else { fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name); } } if(available_counters == 0) { fprintf(stderr, "Error: No performance counters available\n"); close_counters(counters, num_counters); return; } // Use static storage for runs struct bench_run *runs = runs_storage; memset(runs, 0, sizeof(struct bench_run) * num_runs); // Set CPU affinity and realtime priority set_cpu_affinity(1); set_realtime_priority(); // Warmup run (not measured) memset(nstate, 0, sizeof(struct nes_state)); ppu_reset(nstate); ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom)); mapper_setup(nstate); cpu_reset(nstate); uint32_t warmup_frames = frames_per_run / 10; for(uint32_t i = 0; i < warmup_frames; i++) { while(!nstate->ppu.frame_ready) { cpu_tick(nstate); } nstate->ppu.frame_ready = 0; } // Run benchmark iterations for(uint32_t run = 0; run < num_runs; run++) { // Reset emulator state memset(nstate, 0, sizeof(struct nes_state)); ppu_reset(nstate); ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom)); mapper_setup(nstate); cpu_reset(nstate); // Start timing struct timespec start_time, end_time; clock_gettime(CLOCK_MONOTONIC, &start_time); // Reset and start counters (after clock_gettime to exclude its overhead) reset_counters(counters, num_counters); start_counters(counters, num_counters); // Run emulation for(uint32_t i = 0; i < frames_per_run; i++) { while(!nstate->ppu.frame_ready) { cpu_tick(nstate); } nstate->ppu.frame_ready = 0; } // Stop counters (before clock_gettime to exclude its overhead) stop_counters(counters, num_counters); // Stop timing clock_gettime(CLOCK_MONOTONIC, &end_time); // Calculate elapsed time in nanoseconds uint64_t elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL + (end_time.tv_nsec - start_time.tv_nsec); // Store results runs[run].time_ns = elapsed_ns; for(int i = 0; i < num_counters; i++) { if(counters[i].fd < 0) continue; if(counters[i].config == PERF_COUNT_HW_CPU_CYCLES) { runs[run].cycles = counters[i].value; } else if(counters[i].config == PERF_COUNT_HW_INSTRUCTIONS) { runs[run].instructions = counters[i].value; } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_FRONTEND) { runs[run].stalled_cycles_frontend = counters[i].value; } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_BACKEND) { runs[run].stalled_cycles_backend = counters[i].value; } else if(counters[i].config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) { runs[run].branches = counters[i].value; } else if(counters[i].config == PERF_COUNT_HW_BRANCH_MISSES) { runs[run].branch_misses = counters[i].value; } } } // Calculate aggregated totals uint64_t total_instructions = 0; uint64_t total_cycles = 0; uint64_t total_stalled_frontend = 0; uint64_t total_stalled_backend = 0; uint64_t total_branches = 0; uint64_t total_branch_misses = 0; uint64_t total_time_ns = 0; for(uint32_t i = 0; i < num_runs; i++) { total_instructions += runs[i].instructions; total_cycles += runs[i].cycles; total_stalled_frontend += runs[i].stalled_cycles_frontend; total_stalled_backend += runs[i].stalled_cycles_backend; total_branches += runs[i].branches; total_branch_misses += runs[i].branch_misses; total_time_ns += runs[i].time_ns; } // Calculate per-frame statistics using static storage double *cycles_per_frame = &per_frame_storage[0]; double *insn_per_frame = &per_frame_storage[num_runs]; double *time_ms = &per_frame_storage[num_runs * 2]; for(uint32_t i = 0; i < num_runs; i++) { cycles_per_frame[i] = (double)runs[i].cycles / frames_per_run; insn_per_frame[i] = (double)runs[i].instructions / frames_per_run; time_ms[i] = (double)runs[i].time_ns / 1000000.0; } struct bench_stats cycles_stats = calculate_stats(cycles_per_frame, num_runs); struct bench_stats insn_stats = calculate_stats(insn_per_frame, num_runs); struct bench_stats time_stats = calculate_stats(time_ms, num_runs); // Print results double total_time_s = (double)total_time_ns / 1000000000.0; double ipc = (double)total_instructions / total_cycles; double stalled_per_insn = (double)(total_stalled_frontend + total_stalled_backend) / total_instructions; double ghz = (double)total_cycles / total_time_s / 1000000000.0; double branches_per_sec = (double)total_branches / total_time_s; double branch_miss_rate = (double)total_branch_misses / total_branches * 100.0; double stalled_frontend_pct = (double)total_stalled_frontend / total_cycles * 100.0; double stalled_backend_pct = (double)total_stalled_backend / total_cycles * 100.0; double mips = (double)total_instructions / total_time_s / 1000000.0; double mcps = (double)total_cycles / total_time_s / 1000000.0; printf("\n"); printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc); printf("%67s# %.2f stalled cycles per insn \n", "", stalled_per_insn); printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz); printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle \n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct); printf("%20llu stalled-cycles-backend # %.2f%% backend cycles idle \n", (unsigned long long)total_stalled_backend, stalled_backend_pct); printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0); printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate); printf("\n"); printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps); printf("\n"); printf("cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n); printf("insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n); printf("time (ms) mean=%.3f sd=%.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n); double fps = (double)frames_per_run / (time_stats.mean / 1000.0); double ms_per_frame = time_stats.mean / frames_per_run; printf("FPS (frames/second) = %.2f\n", fps); printf("ms/frame = %.6f\n", ms_per_frame); // Cleanup close_counters(counters, num_counters); }