From b2f646d9f99dd272f3b3a9d045b5039e6fc1dc50 Mon Sep 17 00:00:00 2001
From: Peter Fors
Date: Sat, 25 Oct 2025 23:07:35 +0200
Subject: Refactor benchmarking to self-contained C implementation

- Add mknes_bench.c with direct PMC access via perf_event_open()
- Remove dependency on external perf/awk for statistics
- Add RT priority and CPU affinity control in C code
- Use static BSS allocation (page-aligned) instead of malloc
- Add stalled-cycles-backend counter (gracefully handles AMD unavailability)
- Add throughput metrics (MIPS, Mcycles/sec)
- Optimize Bench.sh to only regenerate profile data when needed
- Add -n and -f flags for configurable runs and frames
- Suppress mapper messages during benchmark
- ~6x faster benchmark workflow (20s first run, 16s subsequent)
---
 mknes_bench.c | 323 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 323 insertions(+)
 create mode 100644 mknes_bench.c

diff --git a/mknes_bench.c b/mknes_bench.c
new file mode 100644
index 0000000..fcc7aae
--- /dev/null
+++ b/mknes_bench.c
@@ -0,0 +1,323 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include <sched.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <linux/perf_event.h>
+
+// Performance counter setup
+struct perf_counter {
+    int fd;
+    uint64_t value;
+    const char *name;
+    uint32_t type;
+    uint64_t config;
+};
+
+struct bench_run {
+    uint64_t cycles;
+    uint64_t instructions;
+    uint64_t stalled_cycles_frontend;
+    uint64_t stalled_cycles_backend;
+    uint64_t branches;
+    uint64_t branch_misses;
+    uint64_t time_ns;
+};
+
+struct bench_stats {
+    double mean;
+    double sd;
+    double rel_sd;
+    int n;
+};
+
+static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
+                            int cpu, int group_fd, unsigned long flags) {
+    return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+}
+
+static int setup_counter(struct perf_counter *counter, uint32_t type, uint64_t config, const char *name) {
+    struct perf_event_attr pe;
+    memset(&pe, 0, sizeof(struct perf_event_attr));
+    pe.type = type;
+    pe.size = sizeof(struct perf_event_attr);
+    pe.config = config;
+    pe.disabled = 1;
+    pe.exclude_kernel = 0;
+    pe.exclude_hv = 0;
+    pe.exclude_idle = 1;
+
+    counter->fd = perf_event_open(&pe, 0, -1, -1, 0);
+    counter->name = name;
+    counter->type = type;
+    counter->config = config;
+    counter->value = 0;
+
+    return counter->fd;
+}
+
+static void reset_counters(struct perf_counter *counters, int n) {
+    for(int i = 0; i < n; i++) {
+        if(counters[i].fd >= 0) {
+            ioctl(counters[i].fd, PERF_EVENT_IOC_RESET, 0);
+        }
+    }
+}
+
+static void start_counters(struct perf_counter *counters, int n) {
+    for(int i = 0; i < n; i++) {
+        if(counters[i].fd >= 0) {
+            ioctl(counters[i].fd, PERF_EVENT_IOC_ENABLE, 0);
+        }
+    }
+}
+
+static void stop_counters(struct perf_counter *counters, int n) {
+    for(int i = 0; i < n; i++) {
+        if(counters[i].fd >= 0) {
+            ioctl(counters[i].fd, PERF_EVENT_IOC_DISABLE, 0);
+            read(counters[i].fd, &counters[i].value, sizeof(uint64_t));
+        }
+    }
+}
+
+static void close_counters(struct perf_counter *counters, int n) {
+    for(int i = 0; i < n; i++) {
+        if(counters[i].fd >= 0) {
+            close(counters[i].fd);
+        }
+    }
+}
+
+static struct bench_stats calculate_stats(double *values, int n) {
+    struct bench_stats stats;
+    stats.n = n;
+
+    // Calculate mean
+    double sum = 0.0;
+    for(int i = 0; i < n; i++) {
+        sum += values[i];
+    }
+    stats.mean = sum / n;
+
+    // Calculate standard deviation
+    double sum_sq = 0.0;
+    for(int i = 0; i < n; i++) {
+        double diff = values[i] - stats.mean;
+        sum_sq += diff * diff;
+    }
+    stats.sd = sqrt(sum_sq / (n - 1));
+
+    // Calculate relative standard deviation
+    stats.rel_sd = (stats.mean != 0.0) ? (100.0 * stats.sd / stats.mean) : 0.0;
+
+    return stats;
+}
+
+static void set_realtime_priority(void) {
+    struct sched_param param;
+    param.sched_priority = 99;
+    if(sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
+        fprintf(stderr, "Warning: Failed to set realtime priority (try running with sudo or CAP_SYS_NICE)\n");
+    }
+}
+
+static void set_cpu_affinity(int cpu) {
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    CPU_SET(cpu, &cpuset);
+    if(sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) == -1) {
+        fprintf(stderr, "Warning: Failed to set CPU affinity to CPU %d\n", cpu);
+    }
+}
+
+// Static allocation for benchmark runs - no malloc, page-aligned
+#define MAX_BENCH_RUNS 100
+static struct bench_run runs_storage[MAX_BENCH_RUNS] __attribute__((aligned(4096)));
+static double per_frame_storage[MAX_BENCH_RUNS * 3] __attribute__((aligned(4096)));
+
+static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t frames_per_run) {
+    if(num_runs > MAX_BENCH_RUNS) {
+        fprintf(stderr, "Error: num_runs (%u) exceeds MAX_BENCH_RUNS (%u)\n", num_runs, MAX_BENCH_RUNS);
+        return;
+    }
+
+    struct perf_counter counters[6];
+    int num_counters = 0;
+
+    // Set up performance counters
+    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "cycles");
+    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "instructions");
+    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "stalled-cycles-frontend");
+    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, "stalled-cycles-backend");
+    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branches");
+    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses");
+
+    // Check which counters are available
+    int available_counters = 0;
+    for(int i = 0; i < num_counters; i++) {
+        if(counters[i].fd >= 0) {
+            available_counters++;
+        } else {
+            fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name);
+        }
+    }
+
+    if(available_counters == 0) {
+        fprintf(stderr, "Error: No performance counters available\n");
+        close_counters(counters, num_counters);
+        return;
+    }
+
+    // Use static storage for runs
+    struct bench_run *runs = runs_storage;
+    memset(runs, 0, sizeof(struct bench_run) * num_runs);
+
+    // Set CPU affinity and realtime priority
+    set_cpu_affinity(1);
+    set_realtime_priority();
+
+    // Warmup run (not measured)
+    memset(nstate, 0, sizeof(struct nes_state));
+    ppu_reset(nstate);
+    ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom));
+    mapper_setup(nstate);
+    cpu_reset(nstate);
+
+    uint32_t warmup_frames = frames_per_run / 10;
+    for(uint32_t i = 0; i < warmup_frames; i++) {
+        while(!nstate->ppu.frame_ready) {
+            cpu_tick(nstate);
+        }
+        nstate->ppu.frame_ready = 0;
+    }
+
+    // Run benchmark iterations
+    for(uint32_t run = 0; run < num_runs; run++) {
+
+        // Reset emulator state
+        memset(nstate, 0, sizeof(struct nes_state));
+        ppu_reset(nstate);
+        ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom));
+        mapper_setup(nstate);
+        cpu_reset(nstate);
+
+        // Start timing
+        struct timespec start_time, end_time;
+        clock_gettime(CLOCK_MONOTONIC, &start_time);
+
+        // Reset and start counters (after clock_gettime to exclude its overhead)
+        reset_counters(counters, num_counters);
+        start_counters(counters, num_counters);
+
+        // Run emulation
+        for(uint32_t i = 0; i < frames_per_run; i++) {
+            while(!nstate->ppu.frame_ready) {
+                cpu_tick(nstate);
+            }
+            nstate->ppu.frame_ready = 0;
+        }
+
+        // Stop counters (before clock_gettime to exclude its overhead)
+        stop_counters(counters, num_counters);
+
+        // Stop timing
+        clock_gettime(CLOCK_MONOTONIC, &end_time);
+
+        // Calculate elapsed time in nanoseconds
+        uint64_t elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL +
+                              (end_time.tv_nsec - start_time.tv_nsec);
+
+        // Store results
+        runs[run].time_ns = elapsed_ns;
+        for(int i = 0; i < num_counters; i++) {
+            if(counters[i].fd < 0) continue;
+
+            if(counters[i].config == PERF_COUNT_HW_CPU_CYCLES) {
+                runs[run].cycles = counters[i].value;
+            } else if(counters[i].config == PERF_COUNT_HW_INSTRUCTIONS) {
+                runs[run].instructions = counters[i].value;
+            } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_FRONTEND) {
+                runs[run].stalled_cycles_frontend = counters[i].value;
+            } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_BACKEND) {
+                runs[run].stalled_cycles_backend = counters[i].value;
+            } else if(counters[i].config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) {
+                runs[run].branches = counters[i].value;
+            } else if(counters[i].config == PERF_COUNT_HW_BRANCH_MISSES) {
+                runs[run].branch_misses = counters[i].value;
+            }
+        }
+    }
+
+    // Calculate aggregated totals
+    uint64_t total_instructions = 0;
+    uint64_t total_cycles = 0;
+    uint64_t total_stalled_frontend = 0;
+    uint64_t total_stalled_backend = 0;
+    uint64_t total_branches = 0;
+    uint64_t total_branch_misses = 0;
+    uint64_t total_time_ns = 0;
+
+    for(uint32_t i = 0; i < num_runs; i++) {
+        total_instructions += runs[i].instructions;
+        total_cycles += runs[i].cycles;
+        total_stalled_frontend += runs[i].stalled_cycles_frontend;
+        total_stalled_backend += runs[i].stalled_cycles_backend;
+        total_branches += runs[i].branches;
+        total_branch_misses += runs[i].branch_misses;
+        total_time_ns += runs[i].time_ns;
+    }
+
+    // Calculate per-frame statistics using static storage
+    double *cycles_per_frame = &per_frame_storage[0];
+    double *insn_per_frame = &per_frame_storage[num_runs];
+    double *time_ms = &per_frame_storage[num_runs * 2];
+
+    for(uint32_t i = 0; i < num_runs; i++) {
+        cycles_per_frame[i] = (double)runs[i].cycles / frames_per_run;
+        insn_per_frame[i] = (double)runs[i].instructions / frames_per_run;
+        time_ms[i] = (double)runs[i].time_ns / 1000000.0;
+    }
+
+    struct bench_stats cycles_stats = calculate_stats(cycles_per_frame, num_runs);
+    struct bench_stats insn_stats = calculate_stats(insn_per_frame, num_runs);
+    struct bench_stats time_stats = calculate_stats(time_ms, num_runs);
+
+    // Print results
+    double total_time_s = (double)total_time_ns / 1000000000.0;
+    double ipc = (double)total_instructions / total_cycles;
+    double stalled_per_insn = (double)(total_stalled_frontend + total_stalled_backend) / total_instructions;
+    double ghz = (double)total_cycles / total_time_s / 1000000000.0;
+    double branches_per_sec = (double)total_branches / total_time_s;
+    double branch_miss_rate = (double)total_branch_misses / total_branches * 100.0;
+    double stalled_frontend_pct = (double)total_stalled_frontend / total_cycles * 100.0;
+    double stalled_backend_pct = (double)total_stalled_backend / total_cycles * 100.0;
+    double mips = (double)total_instructions / total_time_s / 1000000.0;
+    double mcps = (double)total_cycles / total_time_s / 1000000.0;
+
+    printf("\n");
printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc); + printf("%67s# %.2f stalled cycles per insn \n", "", stalled_per_insn); + printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz); + printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle \n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct); + printf("%20llu stalled-cycles-backend # %.2f%% backend cycles idle \n", (unsigned long long)total_stalled_backend, stalled_backend_pct); + printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0); + printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate); + printf("\n"); + printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps); + printf("\n"); + printf("cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n); + printf("insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n); + printf("time (ms) mean=%.3f sd=%.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n); + + double fps = (double)frames_per_run / (time_stats.mean / 1000.0); + double ms_per_frame = time_stats.mean / frames_per_run; + printf("FPS (frames/second) = %.2f\n", fps); + printf("ms/frame = %.6f\n", ms_per_frame); + + // Cleanup + close_counters(counters, num_counters); +} -- cgit v1.2.3