From 2174b3f369e59286ccd10348117a16e08c412508 Mon Sep 17 00:00:00 2001 From: Peter Fors Date: Fri, 14 Nov 2025 22:17:03 +0100 Subject: Rearranged the ppu_state to be even more cacheline aware, gained another 1% performance --- build.sh | 4 +++- mknes.h | 19 ++++++++++--------- mknes_bench.c | 8 ++++---- mknes_ppu.c | 2 -- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/build.sh b/build.sh index a5e2f9d..d2cb7b2 100755 --- a/build.sh +++ b/build.sh @@ -16,6 +16,8 @@ WIN_CC=x86_64-w64-mingw32-gcc # Base configuration common to all builds CFLAGS="-std=gnu11 -mtune=generic " # -fdump-tree-alias " +CFLAGS+="-falign-functions=32 -falign-loops=32 " +CFLAGS+="-finline-limit=800 " CFLAGS+="-mbmi -fno-argument-alias " CFLAGS+="-mfunction-return=keep -mindirect-branch=keep " CFLAGS+="-fwrapv -ffast-math -fno-trapping-math -fvisibility=hidden " @@ -93,7 +95,7 @@ set -e # Build Linux version ( - $CC $CFLAGS ${PROJECT_NAME}_sdl.c -o ${PROJECT_NAME} $INCLUDE_PATHS $LINUX_INCLUDE $LDFLAGS $LINUX_LIBS + $CC $CFLAGS ${PROJECT_NAME}.c -o ${PROJECT_NAME} $INCLUDE_PATHS $LINUX_INCLUDE $LDFLAGS $LINUX_LIBS objdump -d -Mintel mknes > mknes.s ) & diff --git a/mknes.h b/mknes.h index b6739d3..944f762 100644 --- a/mknes.h +++ b/mknes.h @@ -56,7 +56,6 @@ struct ppu_state { uint8_t open_bus; // 29 uint8_t sprite_count; // 30 - Number of sprites in secondary OAM (0-8) - uint8_t palette[32]; // 31 // NOTE(peter): CACHELINE 2 uint8_t secondary_oam[32] __attribute__((aligned(64))); @@ -68,16 +67,18 @@ struct ppu_state { uint8_t position; uint8_t priority; uint8_t palette; - } __attribute__((packed)) sprites[8]; + } __attribute__((packed)) sprites[8]; // 32 -> +40 - uint8_t input[2]; // 40 - Controller 1 & 2 - uint8_t input_latch[2]; // 42 - Latched inputs after strobe - uint8_t input_bit[2]; // 44 - Current bit position being shifted out - uint8_t input_strobe; // 46 - Control bit (0 or 1) - uint8_t frame_ready; // 47 - uint8_t sprite_zero_in_range; // 48 - Boolean: is sprite 0 in 
range (will always be slot 0 if true) + uint8_t input_strobe; // 72 - Control bit (0 or 1) + uint8_t input[2]; // 73 - Controller 1 & 2 + uint8_t input_latch[2]; // 75 - Latched inputs after strobe + uint8_t input_bit[2]; // 77 - Current bit position being shifted out + uint8_t frame_ready; // 79 + uint8_t sprite_zero_in_range; // 80 - Boolean: is sprite 0 in range (will always be slot 0 if true) - // NOTE(peter): CACHELINE 4 + uint8_t palette[32]; // 81 + + // NOTE(peter): CACHELINE 5 uint8_t oam[256] __attribute__((aligned(64))); } __attribute__((packed)); diff --git a/mknes_bench.c b/mknes_bench.c index b064266..e7d53fa 100644 --- a/mknes_bench.c +++ b/mknes_bench.c @@ -333,17 +333,17 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t // Calculate per-frame statistics using static storage double *cycles_per_frame = &per_frame_storage[0]; - double *insn_per_frame = &per_frame_storage[num_runs]; + double *ipc_per_run = &per_frame_storage[num_runs]; double *time_ms = &per_frame_storage[num_runs * 2]; for(uint32_t i = 0; i < num_runs; i++) { cycles_per_frame[i] = (double)runs[i].cycles / frames_per_run; - insn_per_frame[i] = (double)runs[i].instructions / frames_per_run; + ipc_per_run[i] = (double)runs[i].instructions / (double)runs[i].cycles; time_ms[i] = (double)runs[i].time_ns / 1000000.0; } struct bench_stats cycles_stats = calculate_stats(cycles_per_frame, num_runs); - struct bench_stats insn_stats = calculate_stats(insn_per_frame, num_runs); + struct bench_stats ipc_stats = calculate_stats(ipc_per_run, num_runs); struct bench_stats time_stats = calculate_stats(time_ms, num_runs); // Print results @@ -368,7 +368,7 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps); printf("\n"); printf("cycles/frame mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n); - printf("insn/frame 
mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n); + printf("IPC mean=%9.3f sd=%6.3f relSD=%.3f%% n=%d\n", ipc_stats.mean, ipc_stats.sd, ipc_stats.rel_sd, ipc_stats.n); printf("time (ms) mean=%9.3f sd=%6.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n); double fps = (double)frames_per_run / (time_stats.mean / 1000.0); diff --git a/mknes_ppu.c b/mknes_ppu.c index b0f6479..a4f7a2c 100644 --- a/mknes_ppu.c +++ b/mknes_ppu.c @@ -277,8 +277,6 @@ shift_and_fetch: switch(dot % 8) { case 1: { - // uint32_t nt_addr = 0x2000 | (ppu->vram_addr & 0xfff); - // ppu->bg_next_tile_id = state->mapper_function.ciram_read(state, nt_addr); ppu->bg_next_tile_id = state->mapper_function.ciram_read(state, ppu->vram_addr); } break; -- cgit v1.2.3