From b2f646d9f99dd272f3b3a9d045b5039e6fc1dc50 Mon Sep 17 00:00:00 2001
From: Peter Fors
Date: Sat, 25 Oct 2025 23:07:35 +0200
Subject: Refactor benchmarking to self-contained C implementation

- Add mknes_bench.c with direct PMC access via perf_event_open()
- Remove dependency on external perf/awk for statistics
- Add RT priority and CPU affinity control in C code
- Use static BSS allocation (page-aligned) instead of malloc
- Add stalled-cycles-backend counter (gracefully handles AMD unavailability)
- Add throughput metrics (MIPS, Mcycles/sec)
- Optimize Bench.sh to only regenerate profile data when needed
- Add -n and -f flags for configurable runs and frames
- Suppress mapper messages during benchmark
- ~6x faster benchmark workflow (20s first run, 16s subsequent)
---
 mknes_bench.c | 323 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 323 insertions(+)
 create mode 100644 mknes_bench.c

diff --git a/mknes_bench.c b/mknes_bench.c
new file mode 100644
index 0000000..fcc7aae
--- /dev/null
+++ b/mknes_bench.c
@@ -0,0 +1,323 @@
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include <sched.h>
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <linux/perf_event.h>
+
+// Performance counter setup
+struct perf_counter {
+    int fd;
+    uint64_t value;
+    const char *name;
+    uint32_t type;
+    uint64_t config;
+};
+
+struct bench_run {
+    uint64_t cycles;
+    uint64_t instructions;
+    uint64_t stalled_cycles_frontend;
+    uint64_t stalled_cycles_backend;
+    uint64_t branches;
+    uint64_t branch_misses;
+    uint64_t time_ns;
+};
+
+struct bench_stats {
+    double mean;
+    double sd;
+    double rel_sd;
+    int n;
+};
+
+static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
+                            int cpu, int group_fd, unsigned long flags) {
+    return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+}
+
+static int setup_counter(struct perf_counter *counter, uint32_t type, uint64_t config, const char *name) {
+    struct perf_event_attr pe;
+    memset(&pe, 0, sizeof(struct perf_event_attr));
+    pe.type = type;
+    pe.size = sizeof(struct perf_event_attr);
+    pe.config = config;
+    pe.disabled = 1;
+    pe.exclude_kernel = 0;
+    pe.exclude_hv = 0;
+    pe.exclude_idle = 1;
+
+    counter->fd = perf_event_open(&pe, 0, -1, -1, 0);
+    counter->name = name;
+    counter->type = type;
+    counter->config = config;
+    counter->value = 0;
+
+    return counter->fd;
+}
+
+static void reset_counters(struct perf_counter *counters, int n) {
+    for(int i = 0; i < n; i++) {
+        if(counters[i].fd >= 0) {
+            ioctl(counters[i].fd, PERF_EVENT_IOC_RESET, 0);
+        }
+    }
+}
+
+static void start_counters(struct perf_counter *counters, int n) {
+    for(int i = 0; i < n; i++) {
+        if(counters[i].fd >= 0) {
+            ioctl(counters[i].fd, PERF_EVENT_IOC_ENABLE, 0);
+        }
+    }
+}
+
+static void stop_counters(struct perf_counter *counters, int n) {
+    for(int i = 0; i < n; i++) {
+        if(counters[i].fd >= 0) {
+            ioctl(counters[i].fd, PERF_EVENT_IOC_DISABLE, 0);
+            read(counters[i].fd, &counters[i].value, sizeof(uint64_t));
+        }
+    }
+}
+
+static void close_counters(struct perf_counter *counters, int n) {
+    for(int i = 0; i < n; i++) {
+        if(counters[i].fd >= 0) {
+            close(counters[i].fd);
+        }
+    }
+}
+
+static struct bench_stats calculate_stats(double *values, int n) {
+    struct bench_stats stats;
+    stats.n = n;
+
+    // Calculate mean
+    double sum = 0.0;
+    for(int i = 0; i < n; i++) {
+        sum += values[i];
+    }
+    stats.mean = sum / n;
+
+    // Calculate standard deviation
+    double sum_sq = 0.0;
+    for(int i = 0; i < n; i++) {
+        double diff = values[i] - stats.mean;
+        sum_sq += diff * diff;
+    }
+    stats.sd = sqrt(sum_sq / (n - 1));
+
+    // Calculate relative standard deviation
+    stats.rel_sd = (stats.mean != 0.0) ? (100.0 * stats.sd / stats.mean) : 0.0;
+
+    return stats;
+}
+
+static void set_realtime_priority(void) {
+    struct sched_param param;
+    param.sched_priority = 99;
+    if(sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
+        fprintf(stderr, "Warning: Failed to set realtime priority (try running with sudo or CAP_SYS_NICE)\n");
+    }
+}
+
+static void set_cpu_affinity(int cpu) {
+    cpu_set_t cpuset;
+    CPU_ZERO(&cpuset);
+    CPU_SET(cpu, &cpuset);
+    if(sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) == -1) {
+        fprintf(stderr, "Warning: Failed to set CPU affinity to CPU %d\n", cpu);
+    }
+}
+
+// Static allocation for benchmark runs - no malloc, page-aligned
+#define MAX_BENCH_RUNS 100
+static struct bench_run runs_storage[MAX_BENCH_RUNS] __attribute__((aligned(4096)));
+static double per_frame_storage[MAX_BENCH_RUNS * 3] __attribute__((aligned(4096)));
+
+static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t frames_per_run) {
+    if(num_runs > MAX_BENCH_RUNS) {
+        fprintf(stderr, "Error: num_runs (%u) exceeds MAX_BENCH_RUNS (%u)\n", num_runs, MAX_BENCH_RUNS);
+        return;
+    }
+
+    struct perf_counter counters[6];
+    int num_counters = 0;
+
+    // Set up performance counters
+    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "cycles");
+    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "instructions");
+    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "stalled-cycles-frontend");
+    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, "stalled-cycles-backend");
+    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branches");
+    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses");
+
+    // Check which counters are available
+    int available_counters = 0;
+    for(int i = 0; i < num_counters; i++) {
+        if(counters[i].fd >= 0) {
+            available_counters++;
+        } else {
+            fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name);
+        }
+    }
+
+    if(available_counters == 0) {
+        fprintf(stderr, "Error: No performance counters available\n");
+        close_counters(counters, num_counters);
+        return;
+    }
+
+    // Use static storage for runs
+    struct bench_run *runs = runs_storage;
+    memset(runs, 0, sizeof(struct bench_run) * num_runs);
+
+    // Set CPU affinity and realtime priority
+    set_cpu_affinity(1);
+    set_realtime_priority();
+
+    // Warmup run (not measured)
+    memset(nstate, 0, sizeof(struct nes_state));
+    ppu_reset(nstate);
+    ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom));
+    mapper_setup(nstate);
+    cpu_reset(nstate);
+
+    uint32_t warmup_frames = frames_per_run / 10;
+    for(uint32_t i = 0; i < warmup_frames; i++) {
+        while(!nstate->ppu.frame_ready) {
+            cpu_tick(nstate);
+        }
+        nstate->ppu.frame_ready = 0;
+    }
+
+    // Run benchmark iterations
+    for(uint32_t run = 0; run < num_runs; run++) {
+
+        // Reset emulator state
+        memset(nstate, 0, sizeof(struct nes_state));
+        ppu_reset(nstate);
+        ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom));
+        mapper_setup(nstate);
+        cpu_reset(nstate);
+
+        // Start timing
+        struct timespec start_time, end_time;
+        clock_gettime(CLOCK_MONOTONIC, &start_time);
+
+        // Reset and start counters (after clock_gettime to exclude its overhead)
+        reset_counters(counters, num_counters);
+        start_counters(counters, num_counters);
+
+        // Run emulation
+        for(uint32_t i = 0; i < frames_per_run; i++) {
+            while(!nstate->ppu.frame_ready) {
+                cpu_tick(nstate);
+            }
+            nstate->ppu.frame_ready = 0;
+        }
+
+        // Stop counters (before clock_gettime to exclude its overhead)
+        stop_counters(counters, num_counters);
+
+        // Stop timing
+        clock_gettime(CLOCK_MONOTONIC, &end_time);
+
+        // Calculate elapsed time in nanoseconds
+        uint64_t elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL +
+                              (end_time.tv_nsec - start_time.tv_nsec);
+
+        // Store results
+        runs[run].time_ns = elapsed_ns;
+        for(int i = 0; i < num_counters; i++) {
+            if(counters[i].fd < 0) continue;
+
+            if(counters[i].config == PERF_COUNT_HW_CPU_CYCLES) {
+                runs[run].cycles = counters[i].value;
+            } else if(counters[i].config == PERF_COUNT_HW_INSTRUCTIONS) {
+                runs[run].instructions = counters[i].value;
+            } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_FRONTEND) {
+                runs[run].stalled_cycles_frontend = counters[i].value;
+            } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_BACKEND) {
+                runs[run].stalled_cycles_backend = counters[i].value;
+            } else if(counters[i].config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) {
+                runs[run].branches = counters[i].value;
+            } else if(counters[i].config == PERF_COUNT_HW_BRANCH_MISSES) {
+                runs[run].branch_misses = counters[i].value;
+            }
+        }
+    }
+
+    // Calculate aggregated totals
+    uint64_t total_instructions = 0;
+    uint64_t total_cycles = 0;
+    uint64_t total_stalled_frontend = 0;
+    uint64_t total_stalled_backend = 0;
+    uint64_t total_branches = 0;
+    uint64_t total_branch_misses = 0;
+    uint64_t total_time_ns = 0;
+
+    for(uint32_t i = 0; i < num_runs; i++) {
+        total_instructions += runs[i].instructions;
+        total_cycles += runs[i].cycles;
+        total_stalled_frontend += runs[i].stalled_cycles_frontend;
+        total_stalled_backend += runs[i].stalled_cycles_backend;
+        total_branches += runs[i].branches;
+        total_branch_misses += runs[i].branch_misses;
+        total_time_ns += runs[i].time_ns;
+    }
+
+    // Calculate per-frame statistics using static storage
+    double *cycles_per_frame = &per_frame_storage[0];
+    double *insn_per_frame = &per_frame_storage[num_runs];
+    double *time_ms = &per_frame_storage[num_runs * 2];
+
+    for(uint32_t i = 0; i < num_runs; i++) {
+        cycles_per_frame[i] = (double)runs[i].cycles / frames_per_run;
+        insn_per_frame[i] = (double)runs[i].instructions / frames_per_run;
+        time_ms[i] = (double)runs[i].time_ns / 1000000.0;
+    }
+
+    struct bench_stats cycles_stats = calculate_stats(cycles_per_frame, num_runs);
+    struct bench_stats insn_stats = calculate_stats(insn_per_frame, num_runs);
+    struct bench_stats time_stats = calculate_stats(time_ms, num_runs);
+
+    // Print results
+    double total_time_s = (double)total_time_ns / 1000000000.0;
+    double ipc = (double)total_instructions / total_cycles;
+    double stalled_per_insn = (double)(total_stalled_frontend + total_stalled_backend) / total_instructions;
+    double ghz = (double)total_cycles / total_time_s / 1000000000.0;
+    double branches_per_sec = (double)total_branches / total_time_s;
+    double branch_miss_rate = (double)total_branch_misses / total_branches * 100.0;
+    double stalled_frontend_pct = (double)total_stalled_frontend / total_cycles * 100.0;
+    double stalled_backend_pct = (double)total_stalled_backend / total_cycles * 100.0;
+    double mips = (double)total_instructions / total_time_s / 1000000.0;
+    double mcps = (double)total_cycles / total_time_s / 1000000.0;
+
+    printf("\n");
printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc); + printf("%67s# %.2f stalled cycles per insn \n", "", stalled_per_insn); + printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz); + printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle \n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct); + printf("%20llu stalled-cycles-backend # %.2f%% backend cycles idle \n", (unsigned long long)total_stalled_backend, stalled_backend_pct); + printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0); + printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate); + printf("\n"); + printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps); + printf("\n"); + printf("cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n); + printf("insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n); + printf("time (ms) mean=%.3f sd=%.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n); + + double fps = (double)frames_per_run / (time_stats.mean / 1000.0); + double ms_per_frame = time_stats.mean / frames_per_run; + printf("FPS (frames/second) = %.2f\n", fps); + printf("ms/frame = %.6f\n", ms_per_frame); + + // Cleanup + close_counters(counters, num_counters); +} -- cgit v1.2.3