summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Fors <peter.fors@mindkiller.com>2025-10-25 23:07:35 +0200
committerPeter Fors <peter.fors@mindkiller.com>2025-10-25 23:28:22 +0200
commitb2f646d9f99dd272f3b3a9d045b5039e6fc1dc50 (patch)
tree9ea8977531306f414d94ceca7dcfa6f17c204687
parent54ca8318923fcf11e1cf507bd516b210ba7cf221 (diff)
Refactor benchmarking to self-contained C implementation
- Add mknes_bench.c with direct PMC access via perf_event_open()
- Remove dependency on external perf/awk for statistics
- Add RT priority and CPU affinity control in C code
- Use static BSS allocation (page-aligned) instead of malloc
- Add stalled-cycles-backend counter (gracefully handles AMD unavailability)
- Add throughput metrics (MIPS, Mcycles/sec)
- Optimize Bench.sh to only regenerate profile data when needed
- Add -n and -f flags for configurable runs and frames
- Suppress mapper messages during benchmark
- ~6x faster benchmark workflow (20s first run, 16s subsequent)
-rwxr-xr-xBench.sh50
-rw-r--r--mknes.c55
-rw-r--r--mknes_bench.c323
-rw-r--r--mknes_mapper.c2
4 files changed, 360 insertions, 70 deletions
diff --git a/Bench.sh b/Bench.sh
index 157ad57..8be485a 100755
--- a/Bench.sh
+++ b/Bench.sh
@@ -1,45 +1,13 @@
#!/usr/bin/env bash
-./build.sh clean
-./build.sh profile
-./mknes
-./build.sh profile_release
-
-runs=10
-frames=4096
-events="cycles,instructions,task-clock"
-tmp=$(mktemp)
-
-taskset -c 1 ./mknes
-
-> "$tmp"
-for i in $(seq 1 $runs); do
- taskset -c 1 chrt -f 99 perf stat -x, -e $events -- ./mknes 2>>"$tmp"
-done
-
-awk -F, -v F="$frames" '
- $3=="cycles" { c[++nc]=$1/F }
- $3=="instructions" { i[++ni]=$1/F }
-# $3=="task-clock" { t[++nt]=$1 } # milliseconds NOTE(peter): changed to nanoseconds...
- $3=="task-clock" { t[++nt]=$1/1000000 }
+# Only rebuild profile data if it doesn't exist
+if [ ! -f mknes.gcda ]; then
+ ./build.sh clean
+ ./build.sh profile
+ ./mknes -n 1 -f 1024 # Quick single run for profile generation
+fi
- END {
- for(k=1;k<=nc;k++) sumc+=c[k]; mc=sumc/nc
- for(k=1;k<=ni;k++) sumi+=i[k]; mi=sumi/ni
- for(k=1;k<=nt;k++) sumt+=t[k]; mt=sumt/nt
-
- for(k=1;k<=nc;k++) sdc+=(c[k]-mc)^2; sdc=sqrt(sdc/(nc-1))
- for(k=1;k<=ni;k++) sdi+=(i[k]-mi)^2; sdi=sqrt(sdi/(ni-1))
- for(k=1;k<=nt;k++) sdt+=(t[k]-mt)^2; sdt=sqrt(sdt/(nt-1))
-
- ms_per_frame = mt / F
- fps = F / (mt / 1000)
-
- printf "IPC (insn/cycle) = %.3f\n", mi/mc
- printf "cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", mc, sdc, 100*sdc/mc, nc
- printf "insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", mi, sdi, 100*sdi/mi, ni
- printf "time (ms) mean=%.3f sd=%.3f relSD=%.3f%% n=%d\n", mt, sdt, 100*sdt/mt, nt
- printf "FPS (frames/second) = %.2f\n", fps
- printf "ms/frame = %.6f\n", ms_per_frame
- }' "$tmp"
+./build.sh profile_release
+# Run full benchmark
+./mknes
diff --git a/mknes.c b/mknes.c
index f8850b0..567d000 100644
--- a/mknes.c
+++ b/mknes.c
@@ -1,4 +1,5 @@
#define GL_SILENCE_DEPRECATION
+#define _GNU_SOURCE
#ifdef _WIN32
#define NOMINMAX
@@ -57,8 +58,8 @@ static void audio_callback(int16_t *data, size_t frames) { }
#ifdef BENCHMARK
// Embed the ROM for benchmarking to eliminate file I/O overhead
// Uncomment the ROM you want to benchmark:
-INCBIN_BYTES(benchmark_rom, "data/Life Force (USA).nes");
-// INCBIN_BYTES(benchmark_rom, "data/0000/Super Mario Bros. (World) (HVC-SM).nes");
+// INCBIN_BYTES(benchmark_rom, "data/Life Force (USA).nes");
+INCBIN_BYTES(benchmark_rom, "data/0000/Super Mario Bros. (World) (HVC-SM).nes");
// INCBIN_BYTES(benchmark_rom, "data/0003/Gradius (USA).zip");
#endif
@@ -177,6 +178,10 @@ static uint32_t frames; // debug information
#include "mknes_ines2.c"
#include "mknes_mapper.c"
+#ifdef BENCHMARK
+#include "mknes_bench.c"
+#endif
+
// struct nes_state nstate;
static void framebuffer_callback(struct mkfw_state *mkfw_window, int32_t width, int32_t height, float aspect_ratio) {
@@ -240,14 +245,28 @@ int main(int argc, char **argv) {
// protect_opcode_lut();
struct nes_state *nstate = aligned_alloc(4096, (sizeof(struct nes_state) + 4095) & ~4095);
- memset(nstate, 0, sizeof(struct nes_state));
- ppu_reset(nstate);
#ifdef BENCHMARK
- // Use embedded ROM for consistent benchmarking without file I/O overhead
- ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom));
+ // Run benchmark with configurable parameters
+ uint32_t num_runs = 10;
+ uint32_t frames_per_run = 0x1000;
+
+ // Parse command line arguments
+ for(int i = 1; i < argc; i++) {
+ if(strcmp(argv[i], "-n") == 0 && i + 1 < argc) {
+ num_runs = atoi(argv[i + 1]);
+ i++;
+ } else if(strcmp(argv[i], "-f") == 0 && i + 1 < argc) {
+ frames_per_run = atoi(argv[i + 1]);
+ i++;
+ }
+ }
+ run_benchmark(nstate, num_runs, frames_per_run);
+ return 0;
#else
+ memset(nstate, 0, sizeof(struct nes_state));
+ ppu_reset(nstate);
// ines2_load(nstate, "data/0000/10-Yard Fight (USA, Europe).nes");
// ines2_load(nstate, "data/0000/Balloon Fight (USA).nes");
// ines2_load(nstate, "data/0000/Excitebike (Japan, USA).nes");
@@ -299,32 +318,10 @@ int main(int argc, char **argv) {
// ines2_load(nstate, "data/Blaster Master (USA).zip"); // mapper 1
// ines2_load(nstate, "AccuracyCoin.nes"); // mapper 1
-#endif
mapper_setup(nstate);
cpu_reset(nstate);
-#ifdef BENCHMARK
- for(uint32_t i = 0; i < 0x1000; ++i) {
- while(!nstate->ppu.frame_ready) {
- // PROFILE_NAMED("nes emulator");
- cpu_tick(nstate);
- }
- nstate->ppu.frame_ready = 0;
- frames++;
- }
-
- // for(size_t i = 0; i < 9; ++i) {
- // printf("count %d: %lld\n", i, sprite_counts[i]);
- // }
-
- // for(size_t i = 0; i < 256; ++i) {
- // printf("instr %2.2x: %lld\n", i, instr_count[i]);
- // }
-
- return 0;
-#else
-
// WINDOW SETUP
struct mkfw_state *window = mkfw_init(WINDOW_WIDTH, WINDOW_HEIGHT);
mkfw_set_window_title(window, "mknes");
@@ -408,7 +405,7 @@ int main(int argc, char **argv) {
// free_nes_state(&nstate);
timer_shutdown();
mkfw_cleanup(window);
-#endif
return 0;
+#endif
}
diff --git a/mknes_bench.c b/mknes_bench.c
new file mode 100644
index 0000000..fcc7aae
--- /dev/null
+++ b/mknes_bench.c
@@ -0,0 +1,323 @@
+#define _GNU_SOURCE
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <sched.h>
+#include <sys/resource.h>
+#include <math.h>
+#include <time.h>
+
+// Performance counter setup
+// One counter opened through perf_event_open(): its descriptor, the event it
+// was opened for, and the most recent value read back from it.
+struct perf_counter {
+ // Descriptor returned by perf_event_open(); negative when the open failed.
+ int fd;
+ // Zeroed by setup_counter(), overwritten by the read() in stop_counters().
+ uint64_t value;
+ // Label printed in the "not available" warning.
+ const char *name;
+ // perf_event_attr.type the counter was opened with.
+ uint32_t type;
+ // perf_event_attr.config; run_benchmark() matches on it to file the value.
+ uint64_t config;
+};
+
+// Raw measurements of a single benchmark run. run_benchmark() zeroes the
+// array before measuring, so a counter that could not be opened stays at 0.
+struct bench_run {
+ uint64_t cycles;
+ uint64_t instructions;
+ uint64_t stalled_cycles_frontend;
+ uint64_t stalled_cycles_backend;
+ uint64_t branches;
+ uint64_t branch_misses;
+ // CLOCK_MONOTONIC time elapsed over the run, in nanoseconds.
+ uint64_t time_ns;
+};
+
+// Summary of one series of per-run values, as filled in by calculate_stats().
+struct bench_stats {
+ // Arithmetic mean of the series.
+ double mean;
+ // Sample standard deviation (n - 1 in the denominator).
+ double sd;
+ // sd expressed as a percentage of mean; 0 when mean is 0.
+ double rel_sd;
+ // Number of values the summary was computed from.
+ int n;
+};
+
+// Thin wrapper that issues the perf_event_open system call through syscall(),
+// forwarding every argument unchanged and handing back syscall()'s result.
+static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
+		int cpu, int group_fd, unsigned long flags) {
+	long result = syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+	return result;
+}
+
+// Open one perf counter and record its bookkeeping in *counter.
+//   counter  slot to fill in; its value field is reset to 0
+//   type     perf_event_attr.type for the event
+//   config   perf_event_attr.config for the event
+//   name     label kept for diagnostics (pointer is stored, not copied)
+// Returns the descriptor that was stored in counter->fd; it is negative when
+// the counter could not be opened.
+static int setup_counter(struct perf_counter *counter, uint32_t type, uint64_t config, const char *name) {
+	struct perf_event_attr attr;
+	memset(&attr, 0, sizeof(struct perf_event_attr));
+	attr.size = sizeof(struct perf_event_attr);
+	attr.type = type;
+	attr.config = config;
+	// Created disabled; start_counters() enables it right before a run.
+	attr.disabled = 1;
+	attr.exclude_idle = 1;
+	attr.exclude_hv = 0;
+	attr.exclude_kernel = 0;
+
+	// pid 0, cpu -1, no group leader, no flags.
+	int fd = perf_event_open(&attr, 0, -1, -1, 0);
+
+	counter->value = 0;
+	counter->config = config;
+	counter->type = type;
+	counter->name = name;
+	counter->fd = fd;
+
+	return fd;
+}
+
+// Zero the running count of each of the n counters that was opened
+// successfully; entries with a negative fd are skipped.
+static void reset_counters(struct perf_counter *counters, int n) {
+	for(int idx = 0; idx < n; ++idx) {
+		const int fd = counters[idx].fd;
+		if(fd < 0) {
+			continue;
+		}
+		ioctl(fd, PERF_EVENT_IOC_RESET, 0);
+	}
+}
+
+// Enable each of the n counters that was opened successfully; entries with a
+// negative fd are skipped.
+static void start_counters(struct perf_counter *counters, int n) {
+	for(int idx = 0; idx < n; ++idx) {
+		const int fd = counters[idx].fd;
+		if(fd < 0) {
+			continue;
+		}
+		ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
+	}
+}
+
+// Disable every opened counter, then latch each one's count into .value.
+//
+// All counters are disabled before any of them is read: the counters are
+// opened with exclude_kernel = 0, so a read() issued while later counters are
+// still running would be charged to them.
+//
+// A failed or short read stores 0 instead of silently leaving the value from
+// the previous run in place. Entries with a negative fd are left untouched.
+static void stop_counters(struct perf_counter *counters, int n) {
+	for(int i = 0; i < n; i++) {
+		if(counters[i].fd >= 0) {
+			ioctl(counters[i].fd, PERF_EVENT_IOC_DISABLE, 0);
+		}
+	}
+
+	for(int i = 0; i < n; i++) {
+		if(counters[i].fd < 0) {
+			continue;
+		}
+
+		uint64_t count = 0;
+		ssize_t got = read(counters[i].fd, &count, sizeof(count));
+		counters[i].value = (got == (ssize_t)sizeof(count)) ? count : 0;
+	}
+}
+
+// Close the descriptor of each of the n counters that was opened
+// successfully; entries with a negative fd are skipped.
+static void close_counters(struct perf_counter *counters, int n) {
+	for(int idx = 0; idx < n; ++idx) {
+		const int fd = counters[idx].fd;
+		if(fd < 0) {
+			continue;
+		}
+		close(fd);
+	}
+}
+
+// Summarise a series of n samples.
+//   values  the samples; only read when n > 0
+//   n       number of samples
+// Returns mean, sample standard deviation (n - 1 denominator) and relative
+// standard deviation in percent.
+//
+// Degenerate inputs yield zeros rather than NaN: with n <= 0 every statistic
+// is 0, and with a single sample the spread is reported as 0 because the
+// n - 1 denominator would otherwise be 0.
+static struct bench_stats calculate_stats(double *values, int n) {
+	struct bench_stats stats = { .mean = 0.0, .sd = 0.0, .rel_sd = 0.0, .n = n };
+
+	if(n <= 0) {
+		return stats;
+	}
+
+	// Calculate mean
+	double sum = 0.0;
+	for(int i = 0; i < n; i++) {
+		sum += values[i];
+	}
+	stats.mean = sum / n;
+
+	// Calculate standard deviation (needs at least two samples)
+	if(n > 1) {
+		double sum_sq = 0.0;
+		for(int i = 0; i < n; i++) {
+			double diff = values[i] - stats.mean;
+			sum_sq += diff * diff;
+		}
+		stats.sd = sqrt(sum_sq / (n - 1));
+	}
+
+	// Calculate relative standard deviation
+	stats.rel_sd = (stats.mean != 0.0) ? (100.0 * stats.sd / stats.mean) : 0.0;
+
+	return stats;
+}
+
+// Ask for SCHED_FIFO at priority 99 for the calling process. Failure is not
+// fatal: a warning goes to stderr and the benchmark carries on.
+static void set_realtime_priority(void) {
+	struct sched_param rt_param = { .sched_priority = 99 };
+
+	int rc = sched_setscheduler(0, SCHED_FIFO, &rt_param);
+	if(rc != -1) {
+		return;
+	}
+
+	fprintf(stderr, "Warning: Failed to set realtime priority (try running with sudo or CAP_SYS_NICE)\n");
+}
+
+// Restrict the calling process to the single CPU `cpu`. Failure is not fatal:
+// a warning naming the CPU goes to stderr and the benchmark carries on.
+static void set_cpu_affinity(int cpu) {
+	cpu_set_t mask;
+	CPU_ZERO(&mask);
+	CPU_SET(cpu, &mask);
+
+	int rc = sched_setaffinity(0, sizeof(cpu_set_t), &mask);
+	if(rc != -1) {
+		return;
+	}
+
+	fprintf(stderr, "Warning: Failed to set CPU affinity to CPU %d\n", cpu);
+}
+
+// Static allocation for benchmark runs - no malloc, page-aligned
+// MAX_BENCH_RUNS is the upper bound on num_runs; run_benchmark() rejects
+// anything larger before touching these arrays.
+#define MAX_BENCH_RUNS 100
+static struct bench_run runs_storage[MAX_BENCH_RUNS] __attribute__((aligned(4096)));
+// Holds three consecutive series of num_runs doubles each, carved up by
+// run_benchmark(): cycles per frame, instructions per frame, time in ms.
+static double per_frame_storage[MAX_BENCH_RUNS * 3] __attribute__((aligned(4096)));
+
+// num / den, or 0 when den is 0. Keeps the report free of inf/NaN when a
+// counter could not be opened and its total therefore stayed at 0.
+static double bench_ratio(double num, double den) {
+	return (den != 0.0) ? (num / den) : 0.0;
+}
+
+// Zero the emulator state, reset it and load the embedded benchmark ROM.
+static void bench_reset_emulator(struct nes_state *nstate) {
+	memset(nstate, 0, sizeof(struct nes_state));
+	ppu_reset(nstate);
+	ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom));
+	mapper_setup(nstate);
+	cpu_reset(nstate);
+}
+
+// Tick the CPU until the PPU has flagged `count` frames as ready.
+static void bench_run_frames(struct nes_state *nstate, uint32_t count) {
+	for(uint32_t i = 0; i < count; i++) {
+		while(!nstate->ppu.frame_ready) {
+			cpu_tick(nstate);
+		}
+		nstate->ppu.frame_ready = 0;
+	}
+}
+
+// Run the emulator benchmark and print a perf-stat style report to stdout.
+//   nstate          emulator state buffer; it is wiped and reloaded with the
+//                   embedded ROM before the warmup and before every run
+//   num_runs        number of measured runs, 1..MAX_BENCH_RUNS
+//   frames_per_run  frames emulated per measured run, at least 1
+//
+// Invalid arguments, or a machine on which none of the hardware counters can
+// be opened, produce an error on stderr and an early return without a report.
+// A counter that is individually unavailable only produces a warning; its
+// totals are reported as 0.
+static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t frames_per_run) {
+	if(!nstate) {
+		fprintf(stderr, "Error: emulator state was not allocated\n");
+		return;
+	}
+
+	// Both values feed divisions further down (per-frame figures, n - 1, FPS).
+	if(num_runs == 0 || frames_per_run == 0) {
+		fprintf(stderr, "Error: num_runs and frames_per_run must both be at least 1\n");
+		return;
+	}
+
+	if(num_runs > MAX_BENCH_RUNS) {
+		fprintf(stderr, "Error: num_runs (%u) exceeds MAX_BENCH_RUNS (%u)\n", (unsigned)num_runs, (unsigned)MAX_BENCH_RUNS);
+		return;
+	}
+
+	struct perf_counter counters[6];
+	int num_counters = 0;
+
+	// Set up performance counters
+	setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "cycles");
+	setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "instructions");
+	setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "stalled-cycles-frontend");
+	setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, "stalled-cycles-backend");
+	setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branches");
+	setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses");
+
+	// Check which counters are available
+	int available_counters = 0;
+	for(int i = 0; i < num_counters; i++) {
+		if(counters[i].fd >= 0) {
+			available_counters++;
+		} else {
+			fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name);
+		}
+	}
+
+	if(available_counters == 0) {
+		fprintf(stderr, "Error: No performance counters available\n");
+		close_counters(counters, num_counters);
+		return;
+	}
+
+	// Use static storage for runs
+	struct bench_run *runs = runs_storage;
+	memset(runs, 0, sizeof(struct bench_run) * num_runs);
+
+	// Set CPU affinity and realtime priority
+	set_cpu_affinity(1);
+	set_realtime_priority();
+
+	// Warmup run (not measured): a tenth of a measured run
+	bench_reset_emulator(nstate);
+	bench_run_frames(nstate, frames_per_run / 10);
+
+	// Run benchmark iterations
+	for(uint32_t run = 0; run < num_runs; run++) {
+
+		// Every run starts from the same freshly loaded state
+		bench_reset_emulator(nstate);
+
+		// Start timing
+		struct timespec start_time, end_time;
+		clock_gettime(CLOCK_MONOTONIC, &start_time);
+
+		// Reset and start counters (after clock_gettime to exclude its overhead)
+		reset_counters(counters, num_counters);
+		start_counters(counters, num_counters);
+
+		// Run emulation
+		bench_run_frames(nstate, frames_per_run);
+
+		// Stop counters (before clock_gettime to exclude its overhead)
+		stop_counters(counters, num_counters);
+
+		// Stop timing
+		clock_gettime(CLOCK_MONOTONIC, &end_time);
+
+		// Calculate elapsed time in nanoseconds. The nanosecond difference may
+		// be negative; the unsigned sum still comes out right modulo 2^64.
+		uint64_t elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL +
+			(end_time.tv_nsec - start_time.tv_nsec);
+
+		// Store results, filing each counter value by the event it measures
+		runs[run].time_ns = elapsed_ns;
+		for(int i = 0; i < num_counters; i++) {
+			if(counters[i].fd < 0) {
+				continue;
+			}
+
+			switch(counters[i].config) {
+				case PERF_COUNT_HW_CPU_CYCLES:
+					runs[run].cycles = counters[i].value;
+					break;
+				case PERF_COUNT_HW_INSTRUCTIONS:
+					runs[run].instructions = counters[i].value;
+					break;
+				case PERF_COUNT_HW_STALLED_CYCLES_FRONTEND:
+					runs[run].stalled_cycles_frontend = counters[i].value;
+					break;
+				case PERF_COUNT_HW_STALLED_CYCLES_BACKEND:
+					runs[run].stalled_cycles_backend = counters[i].value;
+					break;
+				case PERF_COUNT_HW_BRANCH_INSTRUCTIONS:
+					runs[run].branches = counters[i].value;
+					break;
+				case PERF_COUNT_HW_BRANCH_MISSES:
+					runs[run].branch_misses = counters[i].value;
+					break;
+				default:
+					break;
+			}
+		}
+	}
+
+	// Calculate aggregated totals
+	uint64_t total_instructions = 0;
+	uint64_t total_cycles = 0;
+	uint64_t total_stalled_frontend = 0;
+	uint64_t total_stalled_backend = 0;
+	uint64_t total_branches = 0;
+	uint64_t total_branch_misses = 0;
+	uint64_t total_time_ns = 0;
+
+	for(uint32_t i = 0; i < num_runs; i++) {
+		total_instructions += runs[i].instructions;
+		total_cycles += runs[i].cycles;
+		total_stalled_frontend += runs[i].stalled_cycles_frontend;
+		total_stalled_backend += runs[i].stalled_cycles_backend;
+		total_branches += runs[i].branches;
+		total_branch_misses += runs[i].branch_misses;
+		total_time_ns += runs[i].time_ns;
+	}
+
+	// Calculate per-frame statistics using static storage
+	// (three consecutive series of num_runs values each)
+	double *cycles_per_frame = &per_frame_storage[0];
+	double *insn_per_frame = &per_frame_storage[num_runs];
+	double *time_ms = &per_frame_storage[num_runs * 2];
+
+	for(uint32_t i = 0; i < num_runs; i++) {
+		cycles_per_frame[i] = (double)runs[i].cycles / frames_per_run;
+		insn_per_frame[i] = (double)runs[i].instructions / frames_per_run;
+		time_ms[i] = (double)runs[i].time_ns / 1000000.0;
+	}
+
+	struct bench_stats cycles_stats = calculate_stats(cycles_per_frame, num_runs);
+	struct bench_stats insn_stats = calculate_stats(insn_per_frame, num_runs);
+	struct bench_stats time_stats = calculate_stats(time_ms, num_runs);
+
+	// Derived figures; every ratio falls back to 0 when its denominator is 0
+	double total_time_s = (double)total_time_ns / 1000000000.0;
+	double ipc = bench_ratio((double)total_instructions, (double)total_cycles);
+	double stalled_per_insn = bench_ratio((double)(total_stalled_frontend + total_stalled_backend), (double)total_instructions);
+	double ghz = bench_ratio((double)total_cycles, total_time_s) / 1000000000.0;
+	double branches_per_sec = bench_ratio((double)total_branches, total_time_s);
+	double branch_miss_rate = bench_ratio((double)total_branch_misses, (double)total_branches) * 100.0;
+	double stalled_frontend_pct = bench_ratio((double)total_stalled_frontend, (double)total_cycles) * 100.0;
+	double stalled_backend_pct = bench_ratio((double)total_stalled_backend, (double)total_cycles) * 100.0;
+	double mips = bench_ratio((double)total_instructions, total_time_s) / 1000000.0;
+	double mcps = bench_ratio((double)total_cycles, total_time_s) / 1000000.0;
+
+	// Print results
+	printf("\n");
+	printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc);
+	printf("%67s# %.2f stalled cycles per insn \n", "", stalled_per_insn);
+	printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz);
+	printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle \n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct);
+	printf("%20llu stalled-cycles-backend # %.2f%% backend cycles idle \n", (unsigned long long)total_stalled_backend, stalled_backend_pct);
+	printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0);
+	printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate);
+	printf("\n");
+	printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps);
+	printf("\n");
+	printf("cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n);
+	printf("insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n);
+	printf("time (ms) mean=%.3f sd=%.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n);
+
+	// time_stats.mean is the mean run time in milliseconds
+	double fps = bench_ratio((double)frames_per_run, time_stats.mean / 1000.0);
+	double ms_per_frame = time_stats.mean / frames_per_run;
+	printf("FPS (frames/second) = %.2f\n", fps);
+	printf("ms/frame = %.6f\n", ms_per_frame);
+
+	// Cleanup
+	close_counters(counters, num_counters);
+}
diff --git a/mknes_mapper.c b/mknes_mapper.c
index 32e8017..4202769 100644
--- a/mknes_mapper.c
+++ b/mknes_mapper.c
@@ -91,7 +91,9 @@ static void mapper_reset(struct nes_state *state) {
static void mapper_setup(struct nes_state *state) {
uint32_t mapper_id = state->ines.mapper << 4 | state->ines.submapper;
+#ifndef BENCHMARK
printf("Mapper %d_%x requested.\n", state->ines.mapper, state->ines.submapper);
+#endif
mapper_reset(state);