diff options
| author | Peter Fors <peter.fors@mindkiller.com> | 2025-10-28 17:44:57 +0100 |
|---|---|---|
| committer | Peter Fors <peter.fors@mindkiller.com> | 2025-10-28 17:44:57 +0100 |
| commit | 9ee76c20c0d093d5adac2dcc3b275b53b879c369 (patch) | |
| tree | 7f21652a42d96f2ede7822950cbbf9c044ca43ca /mknes_bench.c | |
| parent | 3b7621981b56a51756badac70034f68366878df9 (diff) | |
small optimizations of sprite evaluation in ppu_render_pixel
Diffstat (limited to 'mknes_bench.c')
| -rw-r--r-- | mknes_bench.c | 199 |
1 file changed, 129 insertions, 70 deletions
diff --git a/mknes_bench.c b/mknes_bench.c index 12fb642..b064266 100644 --- a/mknes_bench.c +++ b/mknes_bench.c @@ -8,7 +8,6 @@ #include <math.h> #include <time.h> -// Performance counter setup struct perf_counter { int fd; uint64_t value; @@ -17,14 +16,23 @@ struct perf_counter { uint64_t config; }; +struct perf_group_read { + uint64_t nr; + uint64_t time_enabled; + uint64_t time_running; + struct { + uint64_t value; + } values[5]; +} __attribute__((packed)); + struct bench_run { uint64_t cycles; uint64_t instructions; uint64_t stalled_cycles_frontend; - uint64_t stalled_cycles_backend; uint64_t branches; uint64_t branch_misses; uint64_t time_ns; + double multiplexing_coverage; // ratio of time_running/time_enabled }; struct bench_stats { @@ -38,51 +46,76 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags); } -static int setup_counter(struct perf_counter *counter, uint32_t type, uint64_t config, const char *name) { +static int setup_counter_group(struct perf_counter *counters, int num_counters) { struct perf_event_attr pe; memset(&pe, 0, sizeof(struct perf_event_attr)); - pe.type = type; + pe.type = PERF_TYPE_HARDWARE; pe.size = sizeof(struct perf_event_attr); - pe.config = config; pe.disabled = 1; - pe.exclude_kernel = 0; - pe.exclude_hv = 0; + pe.exclude_kernel = 1; + pe.exclude_hv = 1; pe.exclude_idle = 1; + pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; - counter->fd = perf_event_open(&pe, 0, -1, -1, 0); - counter->name = name; - counter->type = type; - counter->config = config; - counter->value = 0; + int leader_fd = -1; - return counter->fd; -} + for(int i = 0; i < num_counters; i++) { + pe.config = counters[i].config; -static void reset_counters(struct perf_counter *counters, int n) { - for(int i = 0; i < n; i++) { - if(counters[i].fd >= 0) { - ioctl(counters[i].fd, PERF_EVENT_IOC_RESET, 0); + if(i 
== 0) { + // First counter is the group leader + leader_fd = perf_event_open(&pe, 0, -1, -1, 0); + counters[i].fd = leader_fd; + } else { + // Subsequent counters are group members + counters[i].fd = perf_event_open(&pe, 0, -1, leader_fd, 0); } + + if(counters[i].fd < 0) { + fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name); + } + + counters[i].value = 0; } + + return leader_fd; } -static void start_counters(struct perf_counter *counters, int n) { - for(int i = 0; i < n; i++) { - if(counters[i].fd >= 0) { - ioctl(counters[i].fd, PERF_EVENT_IOC_ENABLE, 0); - } +static void reset_counters_group(int leader_fd) { + if(leader_fd >= 0) { + ioctl(leader_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); } } -static void stop_counters(struct perf_counter *counters, int n) { - for(int i = 0; i < n; i++) { - if(counters[i].fd >= 0) { - ioctl(counters[i].fd, PERF_EVENT_IOC_DISABLE, 0); - read(counters[i].fd, &counters[i].value, sizeof(uint64_t)); - } +static void start_counters_group(int leader_fd) { + if(leader_fd >= 0) { + ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP); } } +static int stop_and_read_counters_group(int leader_fd, struct perf_counter *counters, int num_counters, struct perf_group_read *result) { + if(leader_fd < 0) { + return -1; + } + + // Stop all counters atomically + ioctl(leader_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP); + + // Read all counter values in one syscall + ssize_t bytes_read = read(leader_fd, result, sizeof(struct perf_group_read)); + if(bytes_read < 0) { + fprintf(stderr, "Error: Failed to read perf group counters\n"); + return -1; + } + + // Store values in individual counter structs for compatibility + for(int i = 0; i < num_counters && i < result->nr; i++) { + counters[i].value = result->values[i].value; + } + + return 0; +} + static void close_counters(struct perf_counter *counters, int n) { for(int i = 0; i < n; i++) { if(counters[i].fd >= 0) { @@ -133,7 +166,6 @@ static void set_cpu_affinity(int 
cpu) { } } -// Static allocation for benchmark runs - no malloc, page-aligned #define MAX_BENCH_RUNS 100 static struct bench_run runs_storage[MAX_BENCH_RUNS] __attribute__((aligned(4096))); static double per_frame_storage[MAX_BENCH_RUNS * 3] __attribute__((aligned(4096))); @@ -144,38 +176,36 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t return; } - struct perf_counter counters[6]; - int num_counters = 0; + struct perf_counter counters[5]; + int num_counters = 5; + + // Initialize counter metadata + counters[0] = (struct perf_counter){.fd = -1, .value = 0, .name = "cycles", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES}; + counters[1] = (struct perf_counter){.fd = -1, .value = 0, .name = "instructions", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS}; + counters[2] = (struct perf_counter){.fd = -1, .value = 0, .name = "stalled-cycles-frontend", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND}; + counters[3] = (struct perf_counter){.fd = -1, .value = 0, .name = "branches", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS}; + counters[4] = (struct perf_counter){.fd = -1, .value = 0, .name = "branch-misses", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES}; - // Set up performance counters - setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "cycles"); - setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "instructions"); - setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "stalled-cycles-frontend"); - setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, "stalled-cycles-backend"); - setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branches"); - setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, 
PERF_COUNT_HW_BRANCH_MISSES, "branch-misses"); + // Set up performance counter group + int leader_fd = setup_counter_group(counters, num_counters); // Check which counters are available int available_counters = 0; for(int i = 0; i < num_counters; i++) { if(counters[i].fd >= 0) { available_counters++; - } else { - fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name); } } - if(available_counters == 0) { + if(available_counters == 0 || leader_fd < 0) { fprintf(stderr, "Error: No performance counters available\n"); close_counters(counters, num_counters); return; } - // Use static storage for runs struct bench_run *runs = runs_storage; memset(runs, 0, sizeof(struct bench_run) * num_runs); - // Set CPU affinity and realtime priority set_cpu_affinity(1); set_realtime_priority(); @@ -194,7 +224,7 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t nstate->ppu.frame_ready = 0; } - // Run benchmark iterations + // Run benchmark for(uint32_t run = 0; run < num_runs; run++) { // Reset emulator state @@ -209,8 +239,8 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t clock_gettime(CLOCK_MONOTONIC, &start_time); // Reset and start counters (after clock_gettime to exclude its overhead) - reset_counters(counters, num_counters); - start_counters(counters, num_counters); + reset_counters_group(leader_fd); + start_counters_group(leader_fd); // Run emulation for(uint32_t i = 0; i < frames_per_run; i++) { @@ -220,17 +250,20 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t nstate->ppu.frame_ready = 0; } - // Stop counters (before clock_gettime to exclude its overhead) - stop_counters(counters, num_counters); - - // Stop timing + // Stop and read all counters atomically in one syscall + struct perf_group_read group_result; + stop_and_read_counters_group(leader_fd, counters, num_counters, &group_result); clock_gettime(CLOCK_MONOTONIC, &end_time); - // Calculate elapsed time 
in nanoseconds uint64_t elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL + (end_time.tv_nsec - start_time.tv_nsec); + // Calculate multiplexing coverage + double coverage = (group_result.time_enabled > 0) ? + (double)group_result.time_running / (double)group_result.time_enabled : 1.0; + // Store results runs[run].time_ns = elapsed_ns; + runs[run].multiplexing_coverage = coverage; for(int i = 0; i < num_counters; i++) { if(counters[i].fd < 0) continue; @@ -240,8 +273,6 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t runs[run].instructions = counters[i].value; } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_FRONTEND) { runs[run].stalled_cycles_frontend = counters[i].value; - } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_BACKEND) { - runs[run].stalled_cycles_backend = counters[i].value; } else if(counters[i].config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) { runs[run].branches = counters[i].value; } else if(counters[i].config == PERF_COUNT_HW_BRANCH_MISSES) { @@ -250,11 +281,43 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t } } + // Check for multiplexing and warn user + int multiplexing_detected = 0; + double min_coverage = 1.0; + double max_coverage = 1.0; + for(uint32_t i = 0; i < num_runs; i++) { + if(runs[i].multiplexing_coverage < 0.9999) { // Allow for tiny floating point error + multiplexing_detected = 1; + } + if(runs[i].multiplexing_coverage < min_coverage) { + min_coverage = runs[i].multiplexing_coverage; + } + if(runs[i].multiplexing_coverage > max_coverage) { + max_coverage = runs[i].multiplexing_coverage; + } + } + + if(multiplexing_detected) { + fprintf(stderr, "\n"); + fprintf(stderr, "========================================\n"); + fprintf(stderr, "WARNING: COUNTER MULTIPLEXING DETECTED!\n"); + fprintf(stderr, "========================================\n"); + fprintf(stderr, "The kernel time-sliced your performance counters.\n"); + 
fprintf(stderr, "This means the counters were NOT running 100%% of the time.\n"); + fprintf(stderr, "Coverage range: %.2f%% - %.2f%%\n", min_coverage * 100.0, max_coverage * 100.0); + fprintf(stderr, "Results may be SCALED and LESS PRECISE.\n"); + fprintf(stderr, "Consider reducing the number of counters.\n"); + fprintf(stderr, "========================================\n"); + fprintf(stderr, "\n"); + } else { + // All good - counters ran at 100% coverage (no multiplexing) + printf("Performance counter coverage: 100%% (no multiplexing - full precision)\n"); + } + // Calculate aggregated totals uint64_t total_instructions = 0; uint64_t total_cycles = 0; uint64_t total_stalled_frontend = 0; - uint64_t total_stalled_backend = 0; uint64_t total_branches = 0; uint64_t total_branch_misses = 0; uint64_t total_time_ns = 0; @@ -263,7 +326,6 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t total_instructions += runs[i].instructions; total_cycles += runs[i].cycles; total_stalled_frontend += runs[i].stalled_cycles_frontend; - total_stalled_backend += runs[i].stalled_cycles_backend; total_branches += runs[i].branches; total_branch_misses += runs[i].branch_misses; total_time_ns += runs[i].time_ns; @@ -287,35 +349,32 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t // Print results double total_time_s = (double)total_time_ns / 1000000000.0; double ipc = (double)total_instructions / total_cycles; - double stalled_per_insn = (double)(total_stalled_frontend + total_stalled_backend) / total_instructions; + double stalled_per_insn = (double)total_stalled_frontend / total_instructions; double ghz = (double)total_cycles / total_time_s / 1000000000.0; double branches_per_sec = (double)total_branches / total_time_s; double branch_miss_rate = (double)total_branch_misses / total_branches * 100.0; double stalled_frontend_pct = (double)total_stalled_frontend / total_cycles * 100.0; - double stalled_backend_pct = 
(double)total_stalled_backend / total_cycles * 100.0; double mips = (double)total_instructions / total_time_s / 1000000.0; double mcps = (double)total_cycles / total_time_s / 1000000.0; printf("\n"); - printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc); - printf("%56s# %.2f stalled cycles per insn \n", "", stalled_per_insn); - printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz); - printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle \n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct); - printf("%20llu stalled-cycles-backend # %.2f%% backend cycles idle \n", (unsigned long long)total_stalled_backend, stalled_backend_pct); - printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0); - printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate); + printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc); + printf("%56s# %.2f stalled cycles per insn\n", "", stalled_per_insn); + printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz); + printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle\n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct); + printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0); + printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate); printf("\n"); printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps); printf("\n"); - printf("cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n); - printf("insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n); - printf("time (ms) mean=%.3f sd=%.3f 
relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n); + printf("cycles/frame mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n); + printf("insn/frame mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n); + printf("time (ms) mean=%9.3f sd=%6.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n); double fps = (double)frames_per_run / (time_stats.mean / 1000.0); double ms_per_frame = time_stats.mean / frames_per_run; printf("FPS (frames/second) = %.2f\n", fps); - printf("ms/frame = %.6f\n", ms_per_frame); + printf(" ms/frame = %.6f\n", ms_per_frame); - // Cleanup close_counters(counters, num_counters); } |
