Diffstat (limited to 'mknes_bench.c')
-rw-r--r--    mknes_bench.c    323
1 file changed, 323 insertions, 0 deletions
diff --git a/mknes_bench.c b/mknes_bench.c
new file mode 100644
index 0000000..fcc7aae
--- /dev/null
+++ b/mknes_bench.c
@@ -0,0 +1,323 @@
+#define _GNU_SOURCE
+#include <linux/perf_event.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <unistd.h>
+#include <sched.h>
+#include <sys/resource.h>
+#include <math.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
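+// Emulator types and helpers (struct nes_state, cpu_tick, ppu_reset, mapper_setup,
+// ines2_load_from_memory) and the embedded benchmark_rom/INCBIN_SIZE are expected to
+// be provided by the surrounding project; they are not declared in this file.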
+// Performance counter setup
+struct perf_counter {
+ int fd;
+ uint64_t value;
+ const char *name;
+ uint32_t type;
+ uint64_t config;
+};
+
+struct bench_run {
+ uint64_t cycles;
+ uint64_t instructions;
+ uint64_t stalled_cycles_frontend;
+ uint64_t stalled_cycles_backend;
+ uint64_t branches;
+ uint64_t branch_misses;
+ uint64_t time_ns;
+};
+
+struct bench_stats {
+ double mean;
+ double sd;
+ double rel_sd;
+ int n;
+};
+
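+// glibc provides no wrapper for perf_event_open(2), so invoke it via syscall().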
+static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid,
+ int cpu, int group_fd, unsigned long flags) {
+ return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
+}
+
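+// Open one hardware event for the calling thread on any CPU (pid=0, cpu=-1), counting
+// user and kernel time but not idle. Returns the fd, or -1 if the event is unsupported
+// or permissions are insufficient (kernel.perf_event_paranoid may need to be lowered).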
+static int setup_counter(struct perf_counter *counter, uint32_t type, uint64_t config, const char *name) {
+ struct perf_event_attr pe;
+ memset(&pe, 0, sizeof(struct perf_event_attr));
+ pe.type = type;
+ pe.size = sizeof(struct perf_event_attr);
+ pe.config = config;
+ pe.disabled = 1;
+ pe.exclude_kernel = 0;
+ pe.exclude_hv = 0;
+ pe.exclude_idle = 1;
+
+ counter->fd = perf_event_open(&pe, 0, -1, -1, 0);
+ counter->name = name;
+ counter->type = type;
+ counter->config = config;
+ counter->value = 0;
+
+ return counter->fd;
+}
+
+static void reset_counters(struct perf_counter *counters, int n) {
+ for(int i = 0; i < n; i++) {
+ if(counters[i].fd >= 0) {
+ ioctl(counters[i].fd, PERF_EVENT_IOC_RESET, 0);
+ }
+ }
+}
+
+static void start_counters(struct perf_counter *counters, int n) {
+ for(int i = 0; i < n; i++) {
+ if(counters[i].fd >= 0) {
+ ioctl(counters[i].fd, PERF_EVENT_IOC_ENABLE, 0);
+ }
+ }
+}
+
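+// Disable each event and read back its raw 64-bit count (no read_format flags are set,
+// so read() returns exactly one uint64_t per counter).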
+static void stop_counters(struct perf_counter *counters, int n) {
+ for(int i = 0; i < n; i++) {
+ if(counters[i].fd >= 0) {
+ ioctl(counters[i].fd, PERF_EVENT_IOC_DISABLE, 0);
+            if(read(counters[i].fd, &counters[i].value, sizeof(uint64_t)) != (ssize_t)sizeof(uint64_t)) {
+                counters[i].value = 0;
+            }
+ }
+ }
+}
+
+static void close_counters(struct perf_counter *counters, int n) {
+ for(int i = 0; i < n; i++) {
+ if(counters[i].fd >= 0) {
+ close(counters[i].fd);
+ }
+ }
+}
+
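+// Sample mean, sample standard deviation (n-1 denominator), and relative SD in percent.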
+static struct bench_stats calculate_stats(double *values, int n) {
+ struct bench_stats stats;
+ stats.n = n;
+
+ // Calculate mean
+ double sum = 0.0;
+ for(int i = 0; i < n; i++) {
+ sum += values[i];
+ }
+ stats.mean = sum / n;
+
+    // Calculate sample standard deviation (n-1 denominator; zero when only one sample)
+    double sum_sq = 0.0;
+    for(int i = 0; i < n; i++) {
+        double diff = values[i] - stats.mean;
+        sum_sq += diff * diff;
+    }
+    stats.sd = (n > 1) ? sqrt(sum_sq / (n - 1)) : 0.0;
+
+ // Calculate relative standard deviation
+ stats.rel_sd = (stats.mean != 0.0) ? (100.0 * stats.sd / stats.mean) : 0.0;
+
+ return stats;
+}
+
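+// Request the highest SCHED_FIFO priority to minimize preemption during timed runs;
+// this needs root or CAP_SYS_NICE, hence only a warning on failure.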
+static void set_realtime_priority(void) {
+ struct sched_param param;
+ param.sched_priority = 99;
+ if(sched_setscheduler(0, SCHED_FIFO, &param) == -1) {
+ fprintf(stderr, "Warning: Failed to set realtime priority (try running with sudo or CAP_SYS_NICE)\n");
+ }
+}
+
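+// Pin the benchmark to a single core so migrations and per-core frequency differences
+// do not add noise to the counters.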
+static void set_cpu_affinity(int cpu) {
+ cpu_set_t cpuset;
+ CPU_ZERO(&cpuset);
+ CPU_SET(cpu, &cpuset);
+ if(sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) == -1) {
+ fprintf(stderr, "Warning: Failed to set CPU affinity to CPU %d\n", cpu);
+ }
+}
+
+// Static allocation for benchmark runs - no malloc, page-aligned
+#define MAX_BENCH_RUNS 100
+static struct bench_run runs_storage[MAX_BENCH_RUNS] __attribute__((aligned(4096)));
+static double per_frame_storage[MAX_BENCH_RUNS * 3] __attribute__((aligned(4096)));
+
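+// Run the benchmark ROM for frames_per_run frames per iteration, num_runs times, and
+// report perf-stat-style totals plus per-frame mean/SD statistics.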
+static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t frames_per_run) {
+ if(num_runs > MAX_BENCH_RUNS) {
+        fprintf(stderr, "Error: num_runs (%u) exceeds MAX_BENCH_RUNS (%d)\n", num_runs, MAX_BENCH_RUNS);
+ return;
+ }
+
+ struct perf_counter counters[6];
+ int num_counters = 0;
+
+ // Set up performance counters
+ setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "cycles");
+ setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "instructions");
+ setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "stalled-cycles-frontend");
+ setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, "stalled-cycles-backend");
+ setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branches");
+ setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses");
+
+ // Check which counters are available
+ int available_counters = 0;
+ for(int i = 0; i < num_counters; i++) {
+ if(counters[i].fd >= 0) {
+ available_counters++;
+ } else {
+ fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name);
+ }
+ }
+
+ if(available_counters == 0) {
+ fprintf(stderr, "Error: No performance counters available\n");
+ close_counters(counters, num_counters);
+ return;
+ }
+
+ // Use static storage for runs
+ struct bench_run *runs = runs_storage;
+ memset(runs, 0, sizeof(struct bench_run) * num_runs);
+
+ // Set CPU affinity and realtime priority
+ set_cpu_affinity(1);
+ set_realtime_priority();
+
+ // Warmup run (not measured)
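+    // (lets caches, branch predictors and the CPU clock settle before measurement)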
+ memset(nstate, 0, sizeof(struct nes_state));
+ ppu_reset(nstate);
+ ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom));
+ mapper_setup(nstate);
+ cpu_reset(nstate);
+
+ uint32_t warmup_frames = frames_per_run / 10;
+ for(uint32_t i = 0; i < warmup_frames; i++) {
+ while(!nstate->ppu.frame_ready) {
+ cpu_tick(nstate);
+ }
+ nstate->ppu.frame_ready = 0;
+ }
+
+ // Run benchmark iterations
+ for(uint32_t run = 0; run < num_runs; run++) {
+
+ // Reset emulator state
+ memset(nstate, 0, sizeof(struct nes_state));
+ ppu_reset(nstate);
+ ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom));
+ mapper_setup(nstate);
+ cpu_reset(nstate);
+
+ // Start timing
+ struct timespec start_time, end_time;
+ clock_gettime(CLOCK_MONOTONIC, &start_time);
+
+ // Reset and start counters (after clock_gettime to exclude its overhead)
+ reset_counters(counters, num_counters);
+ start_counters(counters, num_counters);
+
+ // Run emulation
+ for(uint32_t i = 0; i < frames_per_run; i++) {
+ while(!nstate->ppu.frame_ready) {
+ cpu_tick(nstate);
+ }
+ nstate->ppu.frame_ready = 0;
+ }
+
+ // Stop counters (before clock_gettime to exclude its overhead)
+ stop_counters(counters, num_counters);
+
+ // Stop timing
+ clock_gettime(CLOCK_MONOTONIC, &end_time);
+
+ // Calculate elapsed time in nanoseconds
+ uint64_t elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL +
+ (end_time.tv_nsec - start_time.tv_nsec);
+
+ // Store results
+ runs[run].time_ns = elapsed_ns;
+ for(int i = 0; i < num_counters; i++) {
+ if(counters[i].fd < 0) continue;
+
+ if(counters[i].config == PERF_COUNT_HW_CPU_CYCLES) {
+ runs[run].cycles = counters[i].value;
+ } else if(counters[i].config == PERF_COUNT_HW_INSTRUCTIONS) {
+ runs[run].instructions = counters[i].value;
+ } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_FRONTEND) {
+ runs[run].stalled_cycles_frontend = counters[i].value;
+ } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_BACKEND) {
+ runs[run].stalled_cycles_backend = counters[i].value;
+ } else if(counters[i].config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) {
+ runs[run].branches = counters[i].value;
+ } else if(counters[i].config == PERF_COUNT_HW_BRANCH_MISSES) {
+ runs[run].branch_misses = counters[i].value;
+ }
+ }
+ }
+
+ // Calculate aggregated totals
+ uint64_t total_instructions = 0;
+ uint64_t total_cycles = 0;
+ uint64_t total_stalled_frontend = 0;
+ uint64_t total_stalled_backend = 0;
+ uint64_t total_branches = 0;
+ uint64_t total_branch_misses = 0;
+ uint64_t total_time_ns = 0;
+
+ for(uint32_t i = 0; i < num_runs; i++) {
+ total_instructions += runs[i].instructions;
+ total_cycles += runs[i].cycles;
+ total_stalled_frontend += runs[i].stalled_cycles_frontend;
+ total_stalled_backend += runs[i].stalled_cycles_backend;
+ total_branches += runs[i].branches;
+ total_branch_misses += runs[i].branch_misses;
+ total_time_ns += runs[i].time_ns;
+ }
+
+ // Calculate per-frame statistics using static storage
+ double *cycles_per_frame = &per_frame_storage[0];
+ double *insn_per_frame = &per_frame_storage[num_runs];
+ double *time_ms = &per_frame_storage[num_runs * 2];
+
+ for(uint32_t i = 0; i < num_runs; i++) {
+ cycles_per_frame[i] = (double)runs[i].cycles / frames_per_run;
+ insn_per_frame[i] = (double)runs[i].instructions / frames_per_run;
+ time_ms[i] = (double)runs[i].time_ns / 1000000.0;
+ }
+
+ struct bench_stats cycles_stats = calculate_stats(cycles_per_frame, num_runs);
+ struct bench_stats insn_stats = calculate_stats(insn_per_frame, num_runs);
+ struct bench_stats time_stats = calculate_stats(time_ms, num_runs);
+
+ // Print results
+ double total_time_s = (double)total_time_ns / 1000000000.0;
+ double ipc = (double)total_instructions / total_cycles;
+ double stalled_per_insn = (double)(total_stalled_frontend + total_stalled_backend) / total_instructions;
+ double ghz = (double)total_cycles / total_time_s / 1000000000.0;
+ double branches_per_sec = (double)total_branches / total_time_s;
+ double branch_miss_rate = (double)total_branch_misses / total_branches * 100.0;
+ double stalled_frontend_pct = (double)total_stalled_frontend / total_cycles * 100.0;
+ double stalled_backend_pct = (double)total_stalled_backend / total_cycles * 100.0;
+ double mips = (double)total_instructions / total_time_s / 1000000.0;
+ double mcps = (double)total_cycles / total_time_s / 1000000.0;
+
+ printf("\n");
+ printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc);
+ printf("%67s# %.2f stalled cycles per insn \n", "", stalled_per_insn);
+ printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz);
+ printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle \n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct);
+ printf("%20llu stalled-cycles-backend # %.2f%% backend cycles idle \n", (unsigned long long)total_stalled_backend, stalled_backend_pct);
+ printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0);
+ printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate);
+ printf("\n");
+ printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps);
+ printf("\n");
+ printf("cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n);
+ printf("insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n);
+ printf("time (ms) mean=%.3f sd=%.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n);
+
+ double fps = (double)frames_per_run / (time_stats.mean / 1000.0);
+ double ms_per_frame = time_stats.mean / frames_per_run;
+ printf("FPS (frames/second) = %.2f\n", fps);
+ printf("ms/frame = %.6f\n", ms_per_frame);
+
+ // Cleanup
+ close_counters(counters, num_counters);
+}