#define _GNU_SOURCE #include #include #include #include #include #include #include #include struct perf_counter { int fd; uint64_t value; const char *name; uint32_t type; uint64_t config; }; struct perf_group_read { uint64_t nr; uint64_t time_enabled; uint64_t time_running; struct { uint64_t value; } values[5]; } __attribute__((packed)); struct bench_run { uint64_t cycles; uint64_t instructions; uint64_t stalled_cycles_frontend; uint64_t branches; uint64_t branch_misses; uint64_t time_ns; double multiplexing_coverage; // ratio of time_running/time_enabled }; struct bench_stats { double mean; double sd; double rel_sd; int n; }; static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } static int setup_counter_group(struct perf_counter *counters, int num_counters) { struct perf_event_attr pe; memset(&pe, 0, sizeof(struct perf_event_attr)); pe.type = PERF_TYPE_HARDWARE; pe.size = sizeof(struct perf_event_attr); pe.disabled = 1; pe.exclude_kernel = 1; pe.exclude_hv = 1; pe.exclude_idle = 1; pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; int leader_fd = -1; for(int i = 0; i < num_counters; i++) { pe.config = counters[i].config; if(i == 0) { // First counter is the group leader leader_fd = perf_event_open(&pe, 0, -1, -1, 0); counters[i].fd = leader_fd; } else { // Subsequent counters are group members counters[i].fd = perf_event_open(&pe, 0, -1, leader_fd, 0); } if(counters[i].fd < 0) { fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name); } counters[i].value = 0; } return leader_fd; } static void reset_counters_group(int leader_fd) { if(leader_fd >= 0) { ioctl(leader_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); } } static void start_counters_group(int leader_fd) { if(leader_fd >= 0) { ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); } } static int stop_and_read_counters_group(int leader_fd, struct perf_counter *counters, int num_counters, struct perf_group_read *result) { if(leader_fd < 0) { return -1; } // Stop all counters atomically ioctl(leader_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP); // Read all counter values in one syscall ssize_t bytes_read = read(leader_fd, result, sizeof(struct perf_group_read)); if(bytes_read < 0) { fprintf(stderr, "Error: Failed to read perf group counters\n"); return -1; } // Store values in individual counter structs for compatibility for(int i = 0; i < num_counters && i < result->nr; i++) { counters[i].value = result->values[i].value; } return 0; } static void close_counters(struct perf_counter *counters, int n) { for(int i = 0; i < n; i++) { if(counters[i].fd >= 0) { close(counters[i].fd); } } } static struct bench_stats calculate_stats(double *values, int n) { struct bench_stats stats; stats.n = n; // Calculate mean double sum = 0.0; for(int i = 0; i < n; i++) { sum += values[i]; } stats.mean = sum / n; // Calculate standard deviation double sum_sq = 0.0; for(int i = 0; i < n; i++) { double diff = values[i] - stats.mean; sum_sq += diff * diff; } stats.sd = sqrt(sum_sq / (n - 1)); // Calculate relative standard deviation stats.rel_sd = (stats.mean != 0.0) ? (100.0 * stats.sd / stats.mean) : 0.0; return stats; } static void set_realtime_priority(void) { struct sched_param param; param.sched_priority = 99; if(sched_setscheduler(0, SCHED_FIFO, ¶m) == -1) { fprintf(stderr, "Warning: Failed to set realtime priority (try running with sudo or CAP_SYS_NICE)\n"); } } static void set_cpu_affinity(int cpu) { cpu_set_t cpuset; CPU_ZERO(&cpuset); CPU_SET(cpu, &cpuset); if(sched_setaffinity(0, sizeof(cpu_set_t), &cpuset) == -1) { fprintf(stderr, "Warning: Failed to set CPU affinity to CPU %d\n", cpu); } } #define MAX_BENCH_RUNS 100 static struct bench_run runs_storage[MAX_BENCH_RUNS] __attribute__((aligned(4096))); static double per_frame_storage[MAX_BENCH_RUNS * 3] __attribute__((aligned(4096))); static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t frames_per_run) { if(num_runs > MAX_BENCH_RUNS) { fprintf(stderr, "Error: num_runs (%u) exceeds MAX_BENCH_RUNS (%u)\n", num_runs, MAX_BENCH_RUNS); return; } struct perf_counter counters[5]; int num_counters = 5; // Initialize counter metadata counters[0] = (struct perf_counter){.fd = -1, .value = 0, .name = "cycles", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES}; counters[1] = (struct perf_counter){.fd = -1, .value = 0, .name = "instructions", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS}; counters[2] = (struct perf_counter){.fd = -1, .value = 0, .name = "stalled-cycles-frontend", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND}; counters[3] = (struct perf_counter){.fd = -1, .value = 0, .name = "branches", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS}; counters[4] = (struct perf_counter){.fd = -1, .value = 0, .name = "branch-misses", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES}; // Set up performance counter group int leader_fd = setup_counter_group(counters, num_counters); // Check which counters are available int available_counters = 0; for(int i = 0; i < num_counters; i++) { if(counters[i].fd >= 0) { available_counters++; } } if(available_counters == 0 || leader_fd < 0) { fprintf(stderr, "Error: No performance counters available\n"); close_counters(counters, num_counters); return; } struct bench_run *runs = runs_storage; memset(runs, 0, sizeof(struct bench_run) * num_runs); set_cpu_affinity(1); set_realtime_priority(); // Warmup run (not measured) memset(nstate, 0, sizeof(struct nes_state)); ppu_reset(nstate); ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom)); mapper_setup(nstate); cpu_reset(nstate); uint32_t warmup_frames = frames_per_run / 10; for(uint32_t i = 0; i < warmup_frames; i++) { while(!nstate->ppu.frame_ready) { cpu_tick(nstate); } nstate->ppu.frame_ready = 0; } // Run benchmark for(uint32_t run = 0; run < num_runs; run++) { // Reset emulator state memset(nstate, 0, sizeof(struct nes_state)); ppu_reset(nstate); ines2_load_from_memory(nstate, benchmark_rom, INCBIN_SIZE(benchmark_rom)); mapper_setup(nstate); cpu_reset(nstate); // Start timing struct timespec start_time, end_time; clock_gettime(CLOCK_MONOTONIC, &start_time); // Reset and start counters (after clock_gettime to exclude its overhead) reset_counters_group(leader_fd); start_counters_group(leader_fd); // Run emulation for(uint32_t i = 0; i < frames_per_run; i++) { while(!nstate->ppu.frame_ready) { cpu_tick(nstate); } nstate->ppu.frame_ready = 0; } // Stop and read all counters atomically in one syscall struct perf_group_read group_result; stop_and_read_counters_group(leader_fd, counters, num_counters, &group_result); clock_gettime(CLOCK_MONOTONIC, &end_time); uint64_t elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL + (end_time.tv_nsec - start_time.tv_nsec); // Calculate multiplexing coverage double coverage = (group_result.time_enabled > 0) ? (double)group_result.time_running / (double)group_result.time_enabled : 1.0; // Store results runs[run].time_ns = elapsed_ns; runs[run].multiplexing_coverage = coverage; for(int i = 0; i < num_counters; i++) { if(counters[i].fd < 0) continue; if(counters[i].config == PERF_COUNT_HW_CPU_CYCLES) { runs[run].cycles = counters[i].value; } else if(counters[i].config == PERF_COUNT_HW_INSTRUCTIONS) { runs[run].instructions = counters[i].value; } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_FRONTEND) { runs[run].stalled_cycles_frontend = counters[i].value; } else if(counters[i].config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) { runs[run].branches = counters[i].value; } else if(counters[i].config == PERF_COUNT_HW_BRANCH_MISSES) { runs[run].branch_misses = counters[i].value; } } } // Check for multiplexing and warn user int multiplexing_detected = 0; double min_coverage = 1.0; double max_coverage = 1.0; for(uint32_t i = 0; i < num_runs; i++) { if(runs[i].multiplexing_coverage < 0.9999) { // Allow for tiny floating point error multiplexing_detected = 1; } if(runs[i].multiplexing_coverage < min_coverage) { min_coverage = runs[i].multiplexing_coverage; } if(runs[i].multiplexing_coverage > max_coverage) { max_coverage = runs[i].multiplexing_coverage; } } if(multiplexing_detected) { fprintf(stderr, "\n"); fprintf(stderr, "========================================\n"); fprintf(stderr, "WARNING: COUNTER MULTIPLEXING DETECTED!\n"); fprintf(stderr, "========================================\n"); fprintf(stderr, "The kernel time-sliced your performance counters.\n"); fprintf(stderr, "This means the counters were NOT running 100%% of the time.\n"); fprintf(stderr, "Coverage range: %.2f%% - %.2f%%\n", min_coverage * 100.0, max_coverage * 100.0); fprintf(stderr, "Results may be SCALED and LESS PRECISE.\n"); fprintf(stderr, "Consider reducing the number of counters.\n"); fprintf(stderr, "========================================\n"); fprintf(stderr, "\n"); } else { // All good - counters ran at 100% coverage (no multiplexing) printf("Performance counter coverage: 100%% (no multiplexing - full precision)\n"); } // Calculate aggregated totals uint64_t total_instructions = 0; uint64_t total_cycles = 0; uint64_t total_stalled_frontend = 0; uint64_t total_branches = 0; uint64_t total_branch_misses = 0; uint64_t total_time_ns = 0; for(uint32_t i = 0; i < num_runs; i++) { total_instructions += runs[i].instructions; total_cycles += runs[i].cycles; total_stalled_frontend += runs[i].stalled_cycles_frontend; total_branches += runs[i].branches; total_branch_misses += runs[i].branch_misses; total_time_ns += runs[i].time_ns; } // Calculate per-frame statistics using static storage double *cycles_per_frame = &per_frame_storage[0]; double *ipc_per_run = &per_frame_storage[num_runs]; double *time_ms = &per_frame_storage[num_runs * 2]; for(uint32_t i = 0; i < num_runs; i++) { cycles_per_frame[i] = (double)runs[i].cycles / frames_per_run; ipc_per_run[i] = (double)runs[i].instructions / (double)runs[i].cycles; time_ms[i] = (double)runs[i].time_ns / 1000000.0; } struct bench_stats cycles_stats = calculate_stats(cycles_per_frame, num_runs); struct bench_stats ipc_stats = calculate_stats(ipc_per_run, num_runs); struct bench_stats time_stats = calculate_stats(time_ms, num_runs); // Print results double total_time_s = (double)total_time_ns / 1000000000.0; double ipc = (double)total_instructions / total_cycles; double stalled_per_insn = (double)total_stalled_frontend / total_instructions; double ghz = (double)total_cycles / total_time_s / 1000000000.0; double branches_per_sec = (double)total_branches / total_time_s; double branch_miss_rate = (double)total_branch_misses / total_branches * 100.0; double stalled_frontend_pct = (double)total_stalled_frontend / total_cycles * 100.0; double mips = (double)total_instructions / total_time_s / 1000000.0; double mcps = (double)total_cycles / total_time_s / 1000000.0; printf("\n"); printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc); printf("%56s# %.2f stalled cycles per insn\n", "", stalled_per_insn); printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz); printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle\n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct); printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0); printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate); printf("\n"); printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps); printf("\n"); printf("cycles/frame mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n); printf("IPC mean=%9.3f sd=%6.3f relSD=%.3f%% n=%d\n", ipc_stats.mean, ipc_stats.sd, ipc_stats.rel_sd, ipc_stats.n); printf("time (ms) mean=%9.3f sd=%6.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n); double fps = (double)frames_per_run / (time_stats.mean / 1000.0); double ms_per_frame = time_stats.mean / frames_per_run; printf("FPS (frames/second) = %.2f\n", fps); printf(" ms/frame = %.6f\n", ms_per_frame); close_counters(counters, num_counters); }