path: root/mknes_bench.c
Diffstat (limited to 'mknes_bench.c')
-rw-r--r--  mknes_bench.c  199
1 file changed, 129 insertions(+), 70 deletions(-)
diff --git a/mknes_bench.c b/mknes_bench.c
index 12fb642..b064266 100644
--- a/mknes_bench.c
+++ b/mknes_bench.c
@@ -8,7 +8,6 @@
#include <math.h>
#include <time.h>
-// Performance counter setup
struct perf_counter {
int fd;
uint64_t value;
@@ -17,14 +16,23 @@ struct perf_counter {
uint64_t config;
};
+struct perf_group_read {
+ uint64_t nr;
+ uint64_t time_enabled;
+ uint64_t time_running;
+ struct {
+ uint64_t value;
+ } values[5];
+} __attribute__((packed));
+
struct bench_run {
uint64_t cycles;
uint64_t instructions;
uint64_t stalled_cycles_frontend;
- uint64_t stalled_cycles_backend;
uint64_t branches;
uint64_t branch_misses;
uint64_t time_ns;
+ double multiplexing_coverage; // ratio of time_running/time_enabled
};
struct bench_stats {
@@ -38,51 +46,76 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
-static int setup_counter(struct perf_counter *counter, uint32_t type, uint64_t config, const char *name) {
+static int setup_counter_group(struct perf_counter *counters, int num_counters) {
struct perf_event_attr pe;
memset(&pe, 0, sizeof(struct perf_event_attr));
- pe.type = type;
+ pe.type = PERF_TYPE_HARDWARE;
pe.size = sizeof(struct perf_event_attr);
- pe.config = config;
pe.disabled = 1;
- pe.exclude_kernel = 0;
- pe.exclude_hv = 0;
+ pe.exclude_kernel = 1;
+ pe.exclude_hv = 1;
pe.exclude_idle = 1;
+ pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
- counter->fd = perf_event_open(&pe, 0, -1, -1, 0);
- counter->name = name;
- counter->type = type;
- counter->config = config;
- counter->value = 0;
+ int leader_fd = -1;
- return counter->fd;
-}
+ for(int i = 0; i < num_counters; i++) {
+ pe.config = counters[i].config;
-static void reset_counters(struct perf_counter *counters, int n) {
- for(int i = 0; i < n; i++) {
- if(counters[i].fd >= 0) {
- ioctl(counters[i].fd, PERF_EVENT_IOC_RESET, 0);
+ if(i == 0) {
+ // First counter is the group leader
+ leader_fd = perf_event_open(&pe, 0, -1, -1, 0);
+ counters[i].fd = leader_fd;
+ } else {
+ // Subsequent counters are group members
+ counters[i].fd = perf_event_open(&pe, 0, -1, leader_fd, 0);
}
+
+ if(counters[i].fd < 0) {
+ fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name);
+ }
+
+ counters[i].value = 0;
}
+
+ return leader_fd;
}
-static void start_counters(struct perf_counter *counters, int n) {
- for(int i = 0; i < n; i++) {
- if(counters[i].fd >= 0) {
- ioctl(counters[i].fd, PERF_EVENT_IOC_ENABLE, 0);
- }
+static void reset_counters_group(int leader_fd) {
+ if(leader_fd >= 0) {
+ ioctl(leader_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
}
}
-static void stop_counters(struct perf_counter *counters, int n) {
- for(int i = 0; i < n; i++) {
- if(counters[i].fd >= 0) {
- ioctl(counters[i].fd, PERF_EVENT_IOC_DISABLE, 0);
- read(counters[i].fd, &counters[i].value, sizeof(uint64_t));
- }
+static void start_counters_group(int leader_fd) {
+ if(leader_fd >= 0) {
+ ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}
}
+static int stop_and_read_counters_group(int leader_fd, struct perf_counter *counters, int num_counters, struct perf_group_read *result) {
+ if(leader_fd < 0) {
+ return -1;
+ }
+
+ // Stop all counters atomically
+ ioctl(leader_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+ // Read all counter values in one syscall
+ ssize_t bytes_read = read(leader_fd, result, sizeof(struct perf_group_read));
+ if(bytes_read < 0) {
+ fprintf(stderr, "Error: Failed to read perf group counters\n");
+ return -1;
+ }
+
+ // Store values in individual counter structs for compatibility
+ for(int i = 0; i < num_counters && (uint64_t)i < result->nr; i++) {
+ counters[i].value = result->values[i].value;
+ }
+
+ return 0;
+}
+
static void close_counters(struct perf_counter *counters, int n) {
for(int i = 0; i < n; i++) {
if(counters[i].fd >= 0) {
@@ -133,7 +166,6 @@ static void set_cpu_affinity(int cpu) {
}
}
-// Static allocation for benchmark runs - no malloc, page-aligned
#define MAX_BENCH_RUNS 100
static struct bench_run runs_storage[MAX_BENCH_RUNS] __attribute__((aligned(4096)));
static double per_frame_storage[MAX_BENCH_RUNS * 3] __attribute__((aligned(4096)));
@@ -144,38 +176,36 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
return;
}
- struct perf_counter counters[6];
- int num_counters = 0;
+ struct perf_counter counters[5];
+ int num_counters = 5;
+
+ // Initialize counter metadata
+ counters[0] = (struct perf_counter){.fd = -1, .value = 0, .name = "cycles", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES};
+ counters[1] = (struct perf_counter){.fd = -1, .value = 0, .name = "instructions", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS};
+ counters[2] = (struct perf_counter){.fd = -1, .value = 0, .name = "stalled-cycles-frontend", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND};
+ counters[3] = (struct perf_counter){.fd = -1, .value = 0, .name = "branches", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS};
+ counters[4] = (struct perf_counter){.fd = -1, .value = 0, .name = "branch-misses", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES};
- // Set up performance counters
- setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "cycles");
- setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "instructions");
- setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "stalled-cycles-frontend");
- setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, "stalled-cycles-backend");
- setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branches");
- setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses");
+ // Set up performance counter group
+ int leader_fd = setup_counter_group(counters, num_counters);
// Check which counters are available
int available_counters = 0;
for(int i = 0; i < num_counters; i++) {
if(counters[i].fd >= 0) {
available_counters++;
- } else {
- fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name);
}
}
- if(available_counters == 0) {
+ if(available_counters == 0 || leader_fd < 0) {
fprintf(stderr, "Error: No performance counters available\n");
close_counters(counters, num_counters);
return;
}
- // Use static storage for runs
struct bench_run *runs = runs_storage;
memset(runs, 0, sizeof(struct bench_run) * num_runs);
- // Set CPU affinity and realtime priority
set_cpu_affinity(1);
set_realtime_priority();
@@ -194,7 +224,7 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
nstate->ppu.frame_ready = 0;
}
- // Run benchmark iterations
+ // Run benchmark
for(uint32_t run = 0; run < num_runs; run++) {
// Reset emulator state
@@ -209,8 +239,8 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
clock_gettime(CLOCK_MONOTONIC, &start_time);
// Reset and start counters (after clock_gettime to exclude its overhead)
- reset_counters(counters, num_counters);
- start_counters(counters, num_counters);
+ reset_counters_group(leader_fd);
+ start_counters_group(leader_fd);
// Run emulation
for(uint32_t i = 0; i < frames_per_run; i++) {
@@ -220,17 +250,20 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
nstate->ppu.frame_ready = 0;
}
- // Stop counters (before clock_gettime to exclude its overhead)
- stop_counters(counters, num_counters);
-
- // Stop timing
+ // Stop and read all counters atomically in one syscall
+ struct perf_group_read group_result = {0};
+ stop_and_read_counters_group(leader_fd, counters, num_counters, &group_result);
clock_gettime(CLOCK_MONOTONIC, &end_time);
- // Calculate elapsed time in nanoseconds
uint64_t elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL + (end_time.tv_nsec - start_time.tv_nsec);
+ // Calculate multiplexing coverage
+ double coverage = (group_result.time_enabled > 0) ?
+ (double)group_result.time_running / (double)group_result.time_enabled : 1.0;
+
// Store results
runs[run].time_ns = elapsed_ns;
+ runs[run].multiplexing_coverage = coverage;
for(int i = 0; i < num_counters; i++) {
if(counters[i].fd < 0) continue;
@@ -240,8 +273,6 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
runs[run].instructions = counters[i].value;
} else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_FRONTEND) {
runs[run].stalled_cycles_frontend = counters[i].value;
- } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_BACKEND) {
- runs[run].stalled_cycles_backend = counters[i].value;
} else if(counters[i].config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) {
runs[run].branches = counters[i].value;
} else if(counters[i].config == PERF_COUNT_HW_BRANCH_MISSES) {
@@ -250,11 +281,43 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
}
}
+ // Check for multiplexing and warn user
+ int multiplexing_detected = 0;
+ double min_coverage = 1.0;
+ double max_coverage = 1.0;
+ for(uint32_t i = 0; i < num_runs; i++) {
+ if(runs[i].multiplexing_coverage < 0.9999) { // Allow for tiny floating point error
+ multiplexing_detected = 1;
+ }
+ if(runs[i].multiplexing_coverage < min_coverage) {
+ min_coverage = runs[i].multiplexing_coverage;
+ }
+ if(runs[i].multiplexing_coverage > max_coverage) {
+ max_coverage = runs[i].multiplexing_coverage;
+ }
+ }
+
+ if(multiplexing_detected) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "========================================\n");
+ fprintf(stderr, "WARNING: COUNTER MULTIPLEXING DETECTED!\n");
+ fprintf(stderr, "========================================\n");
+ fprintf(stderr, "The kernel time-sliced your performance counters.\n");
+ fprintf(stderr, "This means the counters were NOT running 100%% of the time.\n");
+ fprintf(stderr, "Coverage range: %.2f%% - %.2f%%\n", min_coverage * 100.0, max_coverage * 100.0);
+ fprintf(stderr, "Results may be SCALED and LESS PRECISE.\n");
+ fprintf(stderr, "Consider reducing the number of counters.\n");
+ fprintf(stderr, "========================================\n");
+ fprintf(stderr, "\n");
+ } else {
+ // All good - counters ran at 100% coverage (no multiplexing)
+ printf("Performance counter coverage: 100%% (no multiplexing - full precision)\n");
+ }
+
// Calculate aggregated totals
uint64_t total_instructions = 0;
uint64_t total_cycles = 0;
uint64_t total_stalled_frontend = 0;
- uint64_t total_stalled_backend = 0;
uint64_t total_branches = 0;
uint64_t total_branch_misses = 0;
uint64_t total_time_ns = 0;
@@ -263,7 +326,6 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
total_instructions += runs[i].instructions;
total_cycles += runs[i].cycles;
total_stalled_frontend += runs[i].stalled_cycles_frontend;
- total_stalled_backend += runs[i].stalled_cycles_backend;
total_branches += runs[i].branches;
total_branch_misses += runs[i].branch_misses;
total_time_ns += runs[i].time_ns;
@@ -287,35 +349,32 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
// Print results
double total_time_s = (double)total_time_ns / 1000000000.0;
double ipc = (double)total_instructions / total_cycles;
- double stalled_per_insn = (double)(total_stalled_frontend + total_stalled_backend) / total_instructions;
+ double stalled_per_insn = (double)total_stalled_frontend / total_instructions;
double ghz = (double)total_cycles / total_time_s / 1000000000.0;
double branches_per_sec = (double)total_branches / total_time_s;
double branch_miss_rate = (double)total_branch_misses / total_branches * 100.0;
double stalled_frontend_pct = (double)total_stalled_frontend / total_cycles * 100.0;
- double stalled_backend_pct = (double)total_stalled_backend / total_cycles * 100.0;
double mips = (double)total_instructions / total_time_s / 1000000.0;
double mcps = (double)total_cycles / total_time_s / 1000000.0;
printf("\n");
- printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc);
- printf("%56s# %.2f stalled cycles per insn \n", "", stalled_per_insn);
- printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz);
- printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle \n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct);
- printf("%20llu stalled-cycles-backend # %.2f%% backend cycles idle \n", (unsigned long long)total_stalled_backend, stalled_backend_pct);
- printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0);
- printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate);
+ printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc);
+ printf("%56s# %.2f stalled cycles per insn\n", "", stalled_per_insn);
+ printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz);
+ printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle\n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct);
+ printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0);
+ printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate);
printf("\n");
printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps);
printf("\n");
- printf("cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n);
- printf("insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n);
- printf("time (ms) mean=%.3f sd=%.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n);
+ printf("cycles/frame mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n);
+ printf("insn/frame mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n);
+ printf("time (ms) mean=%9.3f sd=%6.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n);
double fps = (double)frames_per_run / (time_stats.mean / 1000.0);
double ms_per_frame = time_stats.mean / frames_per_run;
printf("FPS (frames/second) = %.2f\n", fps);
- printf("ms/frame = %.6f\n", ms_per_frame);
+ printf(" ms/frame = %.6f\n", ms_per_frame);
- // Cleanup
close_counters(counters, num_counters);
}
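
Note: the grouped-counter pattern this patch switches to can be exercised on its own with the sketch below. It is a minimal, self-contained example and not part of mknes_bench.c: it opens cycles and instructions under one group leader, reads both with a single read() via PERF_FORMAT_GROUP, and derives the same time_running/time_enabled coverage ratio the benchmark now reports. It assumes Linux with perf_event_open(2) permitted for the calling user; demo_payload() and the struct/function names are illustrative placeholders.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <linux/perf_event.h>

/* Layout returned by read() on the leader when read_format is
 * PERF_FORMAT_GROUP | TOTAL_TIME_ENABLED | TOTAL_TIME_RUNNING. */
struct group_read {
    uint64_t nr;
    uint64_t time_enabled;
    uint64_t time_running;
    struct { uint64_t value; } values[2];
};

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
    return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

static int open_counter(uint64_t config, int group_fd)
{
    struct perf_event_attr pe;
    memset(&pe, 0, sizeof(pe));
    pe.type = PERF_TYPE_HARDWARE;
    pe.size = sizeof(pe);
    pe.config = config;
    pe.disabled = (group_fd == -1);   /* only the leader starts disabled */
    pe.exclude_kernel = 1;
    pe.exclude_hv = 1;
    pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED |
                     PERF_FORMAT_TOTAL_TIME_RUNNING;
    return (int)perf_event_open(&pe, 0, -1, group_fd, 0);
}

static void demo_payload(void)        /* stand-in for the emulation loop */
{
    volatile uint64_t x = 0;
    for (uint32_t i = 0; i < 10000000; i++) x += i;
    (void)x;
}

int main(void)
{
    int leader = open_counter(PERF_COUNT_HW_CPU_CYCLES, -1);
    int member = open_counter(PERF_COUNT_HW_INSTRUCTIONS, leader);
    if (leader < 0 || member < 0) {
        fprintf(stderr, "perf_event_open failed (check perf_event_paranoid)\n");
        return 1;
    }

    ioctl(leader, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
    ioctl(leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
    demo_payload();
    ioctl(leader, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);

    struct group_read r = {0};
    if (read(leader, &r, sizeof(r)) < 0) {
        fprintf(stderr, "group read failed\n");
        return 1;
    }

    double coverage = r.time_enabled ?
        (double)r.time_running / (double)r.time_enabled : 1.0;
    printf("cycles=%llu instructions=%llu coverage=%.2f%%\n",
           (unsigned long long)r.values[0].value,
           (unsigned long long)r.values[1].value,
           coverage * 100.0);

    close(member);
    close(leader);
    return 0;
}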
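
The patch only measures and reports coverage; it does not rescale the raw counts when multiplexing occurs. If scaling were wanted, the conventional estimate (the one perf(1) applies) is to extrapolate each count by time_enabled / time_running. The helper below is an illustrative sketch of that correction, not code from the benchmark.

#include <stdint.h>

/* Extrapolate a multiplexed count to an estimate of what it would have
 * been at 100% coverage. Only meaningful when time_running < time_enabled;
 * the result is an estimate, not a measurement. */
static uint64_t scale_counter(uint64_t raw,
                              uint64_t time_enabled,
                              uint64_t time_running)
{
    if (time_running == 0)
        return 0;                     /* counter was never scheduled */
    if (time_running >= time_enabled)
        return raw;                   /* full coverage: no scaling needed */
    return (uint64_t)((double)raw * (double)time_enabled /
                      (double)time_running);
}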