diff options
| author | Peter Fors <peter.fors@mindkiller.com> | 2025-10-25 23:07:35 +0200 |
|---|---|---|
| committer | Peter Fors <peter.fors@mindkiller.com> | 2025-10-25 23:28:22 +0200 |
| commit | b2f646d9f99dd272f3b3a9d045b5039e6fc1dc50 (patch) | |
| tree | 9ea8977531306f414d94ceca7dcfa6f17c204687 | |
| parent | 54ca8318923fcf11e1cf507bd516b210ba7cf221 (diff) | |
Refactor benchmarking to self-contained C implementation
- Add mknes_bench.c with direct PMC access via perf_event_open()
- Remove dependency on external perf/awk for statistics
- Add RT priority and CPU affinity control in C code
- Use static BSS allocation (page-aligned) instead of malloc
- Add stalled-cycles-backend counter (gracefully handles AMD unavailability)
- Add throughput metrics (MIPS, Mcycles/sec)
- Optimize Bench.sh to only regenerate profile data when needed
- Add -n and -f flags for configurable runs and frames
- Suppress mapper messages during benchmark
- ~6x faster benchmark workflow (20s first run, 16s subsequent)
| -rwxr-xr-x | Bench.sh | 50 | ||||
| -rw-r--r-- | mknes.c | 55 | ||||
| -rw-r--r-- | mknes_bench.c | 323 | ||||
| -rw-r--r-- | mknes_mapper.c | 2 |
4 files changed, 360 insertions, 70 deletions
@@ -1,45 +1,13 @@ #!/usr/bin/env bash -./build.sh clean -./build.sh profile -./mknes -./build.sh profile_release - -runs=10 -frames=4096 -events="cycles,instructions,task-clock" -tmp=$(mktemp) - -taskset -c 1 ./mknes - -> "$tmp" -for i in $(seq 1 $runs); do - taskset -c 1 chrt -f 99 perf stat -x, -e $events -- ./mknes 2>>"$tmp" -done - -awk -F, -v F="$frames" ' - $3=="cycles" { c[++nc]=$1/F } - $3=="instructions" { i[++ni]=$1/F } -# $3=="task-clock" { t[++nt]=$1 } # milliseconds NOTE(peter): changed to nanoseconds... - $3=="task-clock" { t[++nt]=$1/1000000 } +# Only rebuild profile data if it doesn't exist +if [ ! -f mknes.gcda ]; then + ./build.sh clean + ./build.sh profile + ./mknes -n 1 -f 1024 # Quick single run for profile generation +fi - END { - for(k=1;k<=nc;k++) sumc+=c[k]; mc=sumc/nc - for(k=1;k<=ni;k++) sumi+=i[k]; mi=sumi/ni - for(k=1;k<=nt;k++) sumt+=t[k]; mt=sumt/nt - - for(k=1;k<=nc;k++) sdc+=(c[k]-mc)^2; sdc=sqrt(sdc/(nc-1)) - for(k=1;k<=ni;k++) sdi+=(i[k]-mi)^2; sdi=sqrt(sdi/(ni-1)) - for(k=1;k<=nt;k++) sdt+=(t[k]-mt)^2; sdt=sqrt(sdt/(nt-1)) - - ms_per_frame = mt / F - fps = F / (mt / 1000) - - printf "IPC (insn/cycle) = %.3f\n", mi/mc - printf "cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", mc, sdc, 100*sdc/mc, nc - printf "insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", mi, sdi, 100*sdi/mi, ni - printf "time (ms) mean=%.3f sd=%.3f relSD=%.3f%% n=%d\n", mt, sdt, 100*sdt/mt, nt - printf "FPS (frames/second) = %.2f\n", fps - printf "ms/frame = %.6f\n", ms_per_frame - }' "$tmp" +./build.sh profile_release +# Run full benchmark +./mknes @@ -1,4 +1,5 @@ #define GL_SILENCE_DEPRECATION +#define _GNU_SOURCE #ifdef _WIN32 #define NOMINMAX @@ -57,8 +58,8 @@ static void audio_callback(int16_t *data, size_t frames) { } #ifdef BENCHMARK // Embed the ROM for benchmarking to eliminate file I/O overhead // Uncomment the ROM you want to benchmark: -INCBIN_BYTES(benchmark_rom, "data/Life Force (USA).nes"); -// INCBIN_BYTES(benchmark_rom, "data/0000/Super 
Mario Bros. (World) (HVC-SM).nes"); +// INCBIN_BYTES(benchmark_rom, "data/Life Force (USA).nes"); +INCBIN_BYTES(benchmark_rom, "data/0000/Super Mario Bros. (World) (HVC-SM).nes"); // INCBIN_BYTES(benchmark_rom, "data/0003/Gradius (USA).zip"); #endif @@ -177,6 +178,10 @@ static uint32_t frames; // debug information #include "mknes_ines2.c" #include "mknes_mapper.c" +#ifdef BENCHMARK +#include "mknes_bench.c" +#endif + // struct nes_state nstate; static void framebuffer_callback(struct mkfw_state *mkfw_window, int32_t width, int32_t height, float aspect_ratio) { @@ -240,14 +245,28 @@ int main(int argc, char **argv) { // protect_opcode_lut(); struct nes_state *nstate = aligned_alloc(4096, (sizeof(struct nes_state) + 4095) & ~4095); - memset(nstate, 0, sizeof(struct nes_state)); - ppu_reset(nstate); #ifdef BENCHMARK - // Use embedded ROM for consistent benchmarking without file I/O overhead - ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom)); + // Run benchmark with configurable parameters + uint32_t num_runs = 10; + uint32_t frames_per_run = 0x1000; + + // Parse command line arguments + for(int i = 1; i < argc; i++) { + if(strcmp(argv[i], "-n") == 0 && i + 1 < argc) { + num_runs = atoi(argv[i + 1]); + i++; + } else if(strcmp(argv[i], "-f") == 0 && i + 1 < argc) { + frames_per_run = atoi(argv[i + 1]); + i++; + } + } + run_benchmark(nstate, num_runs, frames_per_run); + return 0; #else + memset(nstate, 0, sizeof(struct nes_state)); + ppu_reset(nstate); // ines2_load(nstate, "data/0000/10-Yard Fight (USA, Europe).nes"); // ines2_load(nstate, "data/0000/Balloon Fight (USA).nes"); // ines2_load(nstate, "data/0000/Excitebike (Japan, USA).nes"); @@ -299,32 +318,10 @@ int main(int argc, char **argv) { // ines2_load(nstate, "data/Blaster Master (USA).zip"); // mapper 1 // ines2_load(nstate, "AccuracyCoin.nes"); // mapper 1 -#endif mapper_setup(nstate); cpu_reset(nstate); -#ifdef BENCHMARK - for(uint32_t i = 0; i < 0x1000; ++i) { - 
while(!nstate->ppu.frame_ready) { - // PROFILE_NAMED("nes emulator"); - cpu_tick(nstate); - } - nstate->ppu.frame_ready = 0; - frames++; - } - - // for(size_t i = 0; i < 9; ++i) { - // printf("count %d: %lld\n", i, sprite_counts[i]); - // } - - // for(size_t i = 0; i < 256; ++i) { - // printf("instr %2.2x: %lld\n", i, instr_count[i]); - // } - - return 0; -#else - // WINDOW SETUP struct mkfw_state *window = mkfw_init(WINDOW_WIDTH, WINDOW_HEIGHT); mkfw_set_window_title(window, "mknes"); @@ -408,7 +405,7 @@ int main(int argc, char **argv) { // free_nes_state(&nstate); timer_shutdown(); mkfw_cleanup(window); -#endif return 0; +#endif } diff --git a/mknes_bench.c b/mknes_bench.c new file mode 100644 index 0000000..fcc7aae --- /dev/null +++ b/mknes_bench.c @@ -0,0 +1,323 @@ +#define _GNU_SOURCE +#include <linux/perf_event.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <unistd.h> +#include <sched.h> +#include <sys/resource.h> +#include <math.h> +#include <time.h> + +// Performance counter setup +struct perf_counter { + int fd; + uint64_t value; + const char *name; + uint32_t type; + uint64_t config; +}; + +struct bench_run { + uint64_t cycles; + uint64_t instructions; + uint64_t stalled_cycles_frontend; + uint64_t stalled_cycles_backend; + uint64_t branches; + uint64_t branch_misses; + uint64_t time_ns; +}; + +struct bench_stats { + double mean; + double sd; + double rel_sd; + int n; +}; + +static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, + int cpu, int group_fd, unsigned long flags) { + return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); +} + +static int setup_counter(struct perf_counter *counter, uint32_t type, uint64_t config, const char *name) { + struct perf_event_attr pe; + memset(&pe, 0, sizeof(struct perf_event_attr)); + pe.type = type; + pe.size = sizeof(struct perf_event_attr); + pe.config = config; + pe.disabled = 1; + pe.exclude_kernel = 0; + pe.exclude_hv = 0; + pe.exclude_idle = 1; + + 
counter->fd = perf_event_open(&pe, 0, -1, -1, 0); + counter->name = name; + counter->type = type; + counter->config = config; + counter->value = 0; + + return counter->fd; +} + +static void reset_counters(struct perf_counter *counters, int n) { + for(int i = 0; i < n; i++) { + if(counters[i].fd >= 0) { + ioctl(counters[i].fd, PERF_EVENT_IOC_RESET, 0); + } + } +} + +static void start_counters(struct perf_counter *counters, int n) { + for(int i = 0; i < n; i++) { + if(counters[i].fd >= 0) { + ioctl(counters[i].fd, PERF_EVENT_IOC_ENABLE, 0); + } + } +} + +static void stop_counters(struct perf_counter *counters, int n) { + for(int i = 0; i < n; i++) { + if(counters[i].fd >= 0) { + ioctl(counters[i].fd, PERF_EVENT_IOC_DISABLE, 0); + read(counters[i].fd, &counters[i].value, sizeof(uint64_t)); + } + } +} + +static void close_counters(struct perf_counter *counters, int n) { + for(int i = 0; i < n; i++) { + if(counters[i].fd >= 0) { + close(counters[i].fd); + } + } +} + +static struct bench_stats calculate_stats(double *values, int n) { + struct bench_stats stats; + stats.n = n; + + // Calculate mean + double sum = 0.0; + for(int i = 0; i < n; i++) { + sum += values[i]; + } + stats.mean = sum / n; + + // Calculate standard deviation + double sum_sq = 0.0; + for(int i = 0; i < n; i++) { + double diff = values[i] - stats.mean; + sum_sq += diff * diff; + } + stats.sd = sqrt(sum_sq / (n - 1)); + + // Calculate relative standard deviation + stats.rel_sd = (stats.mean != 0.0) ? 
(100.0 * stats.sd / stats.mean) : 0.0; + + return stats; +} + +static void set_realtime_priority(void) { + struct sched_param param; + param.sched_priority = 99; + if(sched_setscheduler(0, SCHED_FIFO, &param) == -1) { + fprintf(stderr, "Warning: Failed to set realtime priority (try running with sudo or CAP_SYS_NICE)\n"); + } +} + +static void set_cpu_affinity(int cpu) { + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + if(sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) == -1) { + fprintf(stderr, "Warning: Failed to set CPU affinity to CPU %d\n", cpu); + } +} + +// Static allocation for benchmark runs - no malloc, page-aligned +#define MAX_BENCH_RUNS 100 +static struct bench_run runs_storage[MAX_BENCH_RUNS] __attribute__((aligned(4096))); +static double per_frame_storage[MAX_BENCH_RUNS * 3] __attribute__((aligned(4096))); + +static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t frames_per_run) { + if(num_runs > MAX_BENCH_RUNS) { + fprintf(stderr, "Error: num_runs (%u) exceeds MAX_BENCH_RUNS (%u)\n", num_runs, MAX_BENCH_RUNS); + return; + } + + struct perf_counter counters[6]; + int num_counters = 0; + + // Set up performance counters + setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "instructions"); + setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "stalled-cycles-frontend"); + setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, "stalled-cycles-backend"); + setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branches"); + setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses"); + + // Check which counters are available + int available_counters = 0; + for(int i = 0; i < num_counters; i++) { + 
if(counters[i].fd >= 0) { + available_counters++; + } else { + fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name); + } + } + + if(available_counters == 0) { + fprintf(stderr, "Error: No performance counters available\n"); + close_counters(counters, num_counters); + return; + } + + // Use static storage for runs + struct bench_run *runs = runs_storage; + memset(runs, 0, sizeof(struct bench_run) * num_runs); + + // Set CPU affinity and realtime priority + set_cpu_affinity(1); + set_realtime_priority(); + + // Warmup run (not measured) + memset(nstate, 0, sizeof(struct nes_state)); + ppu_reset(nstate); + ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom)); + mapper_setup(nstate); + cpu_reset(nstate); + + uint32_t warmup_frames = frames_per_run / 10; + for(uint32_t i = 0; i < warmup_frames; i++) { + while(!nstate->ppu.frame_ready) { + cpu_tick(nstate); + } + nstate->ppu.frame_ready = 0; + } + + // Run benchmark iterations + for(uint32_t run = 0; run < num_runs; run++) { + + // Reset emulator state + memset(nstate, 0, sizeof(struct nes_state)); + ppu_reset(nstate); + ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom)); + mapper_setup(nstate); + cpu_reset(nstate); + + // Start timing + struct timespec start_time, end_time; + clock_gettime(CLOCK_MONOTONIC, &start_time); + + // Reset and start counters (after clock_gettime to exclude its overhead) + reset_counters(counters, num_counters); + start_counters(counters, num_counters); + + // Run emulation + for(uint32_t i = 0; i < frames_per_run; i++) { + while(!nstate->ppu.frame_ready) { + cpu_tick(nstate); + } + nstate->ppu.frame_ready = 0; + } + + // Stop counters (before clock_gettime to exclude its overhead) + stop_counters(counters, num_counters); + + // Stop timing + clock_gettime(CLOCK_MONOTONIC, &end_time); + + // Calculate elapsed time in nanoseconds + uint64_t elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL + + (end_time.tv_nsec - 
start_time.tv_nsec); + + // Store results + runs[run].time_ns = elapsed_ns; + for(int i = 0; i < num_counters; i++) { + if(counters[i].fd < 0) continue; + + if(counters[i].config == PERF_COUNT_HW_CPU_CYCLES) { + runs[run].cycles = counters[i].value; + } else if(counters[i].config == PERF_COUNT_HW_INSTRUCTIONS) { + runs[run].instructions = counters[i].value; + } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_FRONTEND) { + runs[run].stalled_cycles_frontend = counters[i].value; + } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_BACKEND) { + runs[run].stalled_cycles_backend = counters[i].value; + } else if(counters[i].config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) { + runs[run].branches = counters[i].value; + } else if(counters[i].config == PERF_COUNT_HW_BRANCH_MISSES) { + runs[run].branch_misses = counters[i].value; + } + } + } + + // Calculate aggregated totals + uint64_t total_instructions = 0; + uint64_t total_cycles = 0; + uint64_t total_stalled_frontend = 0; + uint64_t total_stalled_backend = 0; + uint64_t total_branches = 0; + uint64_t total_branch_misses = 0; + uint64_t total_time_ns = 0; + + for(uint32_t i = 0; i < num_runs; i++) { + total_instructions += runs[i].instructions; + total_cycles += runs[i].cycles; + total_stalled_frontend += runs[i].stalled_cycles_frontend; + total_stalled_backend += runs[i].stalled_cycles_backend; + total_branches += runs[i].branches; + total_branch_misses += runs[i].branch_misses; + total_time_ns += runs[i].time_ns; + } + + // Calculate per-frame statistics using static storage + double *cycles_per_frame = &per_frame_storage[0]; + double *insn_per_frame = &per_frame_storage[num_runs]; + double *time_ms = &per_frame_storage[num_runs * 2]; + + for(uint32_t i = 0; i < num_runs; i++) { + cycles_per_frame[i] = (double)runs[i].cycles / frames_per_run; + insn_per_frame[i] = (double)runs[i].instructions / frames_per_run; + time_ms[i] = (double)runs[i].time_ns / 1000000.0; + } + + struct bench_stats cycles_stats = 
calculate_stats(cycles_per_frame, num_runs); + struct bench_stats insn_stats = calculate_stats(insn_per_frame, num_runs); + struct bench_stats time_stats = calculate_stats(time_ms, num_runs); + + // Print results + double total_time_s = (double)total_time_ns / 1000000000.0; + double ipc = (double)total_instructions / total_cycles; + double stalled_per_insn = (double)(total_stalled_frontend + total_stalled_backend) / total_instructions; + double ghz = (double)total_cycles / total_time_s / 1000000000.0; + double branches_per_sec = (double)total_branches / total_time_s; + double branch_miss_rate = (double)total_branch_misses / total_branches * 100.0; + double stalled_frontend_pct = (double)total_stalled_frontend / total_cycles * 100.0; + double stalled_backend_pct = (double)total_stalled_backend / total_cycles * 100.0; + double mips = (double)total_instructions / total_time_s / 1000000.0; + double mcps = (double)total_cycles / total_time_s / 1000000.0; + + printf("\n"); + printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc); + printf("%67s# %.2f stalled cycles per insn \n", "", stalled_per_insn); + printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz); + printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle \n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct); + printf("%20llu stalled-cycles-backend # %.2f%% backend cycles idle \n", (unsigned long long)total_stalled_backend, stalled_backend_pct); + printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0); + printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate); + printf("\n"); + printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps); + printf("\n"); + printf("cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n); + 
printf("insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n); + printf("time (ms) mean=%.3f sd=%.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n); + + double fps = (double)frames_per_run / (time_stats.mean / 1000.0); + double ms_per_frame = time_stats.mean / frames_per_run; + printf("FPS (frames/second) = %.2f\n", fps); + printf("ms/frame = %.6f\n", ms_per_frame); + + // Cleanup + close_counters(counters, num_counters); +} diff --git a/mknes_mapper.c b/mknes_mapper.c index 32e8017..4202769 100644 --- a/mknes_mapper.c +++ b/mknes_mapper.c @@ -91,7 +91,9 @@ static void mapper_reset(struct nes_state *state) { static void mapper_setup(struct nes_state *state) { uint32_t mapper_id = state->ines.mapper << 4 | state->ines.submapper; +#ifndef BENCHMARK printf("Mapper %d_%x requested.\n", state->ines.mapper, state->ines.submapper); +#endif mapper_reset(state); |
