summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Peter Fors <peter.fors@mindkiller.com> 2025-11-14 22:17:03 +0100
committer Peter Fors <peter.fors@mindkiller.com> 2025-11-14 22:17:03 +0100
commit 2174b3f369e59286ccd10348117a16e08c412508 (patch)
tree 8ff6c7d02da221aad45fcdb03b22f09080a9c9aa
parent 46d0f6aeb1588b85852487e581a8b4c9c2401646 (diff)
Rearranged the ppu_state to be even more cacheline aware, gained another 1% performance
-rwxr-xr-x build.sh 4
-rw-r--r-- mknes.h 19
-rw-r--r-- mknes_bench.c 8
-rw-r--r-- mknes_ppu.c 2
4 files changed, 17 insertions, 16 deletions
diff --git a/build.sh b/build.sh
index a5e2f9d..d2cb7b2 100755
--- a/build.sh
+++ b/build.sh
@@ -16,6 +16,8 @@ WIN_CC=x86_64-w64-mingw32-gcc
# Base configuration common to all builds
CFLAGS="-std=gnu11 -mtune=generic "
# -fdump-tree-alias "
+CFLAGS+="-falign-functions=32 -falign-loops=32 "
+CFLAGS+="-finline-limit=800 "
CFLAGS+="-mbmi -fno-argument-alias "
CFLAGS+="-mfunction-return=keep -mindirect-branch=keep "
CFLAGS+="-fwrapv -ffast-math -fno-trapping-math -fvisibility=hidden "
@@ -93,7 +95,7 @@ set -e
# Build Linux version
(
- $CC $CFLAGS ${PROJECT_NAME}_sdl.c -o ${PROJECT_NAME} $INCLUDE_PATHS $LINUX_INCLUDE $LDFLAGS $LINUX_LIBS
+ $CC $CFLAGS ${PROJECT_NAME}.c -o ${PROJECT_NAME} $INCLUDE_PATHS $LINUX_INCLUDE $LDFLAGS $LINUX_LIBS
objdump -d -Mintel mknes > mknes.s
) &
diff --git a/mknes.h b/mknes.h
index b6739d3..944f762 100644
--- a/mknes.h
+++ b/mknes.h
@@ -56,7 +56,6 @@ struct ppu_state {
uint8_t open_bus; // 29
uint8_t sprite_count; // 30 - Number of sprites in secondary OAM (0-8)
- uint8_t palette[32]; // 31
// NOTE(peter): CACHELINE 2
uint8_t secondary_oam[32] __attribute__((aligned(64)));
@@ -68,16 +67,18 @@ struct ppu_state {
uint8_t position;
uint8_t priority;
uint8_t palette;
- } __attribute__((packed)) sprites[8];
+ } __attribute__((packed)) sprites[8]; // 32 -> +40
- uint8_t input[2]; // 40 - Controller 1 & 2
- uint8_t input_latch[2]; // 42 - Latched inputs after strobe
- uint8_t input_bit[2]; // 44 - Current bit position being shifted out
- uint8_t input_strobe; // 46 - Control bit (0 or 1)
- uint8_t frame_ready; // 47
- uint8_t sprite_zero_in_range; // 48 - Boolean: is sprite 0 in range (will always be slot 0 if true)
+ uint8_t input_strobe; // 76 - Control bit (0 or 1)
+ uint8_t input[2]; // 73 - Controller 1 & 2
+ uint8_t input_latch[2]; // 74 - Latched inputs after strobe
+ uint8_t input_bit[2]; // 75 - Current bit position being shifted out
+ uint8_t frame_ready; // 77
+ uint8_t sprite_zero_in_range; // 78 - Boolean: is sprite 0 in range (will always be slot 0 if true)
- // NOTE(peter): CACHELINE 4
+ uint8_t palette[32]; // 79
+
+ // NOTE(peter): CACHELINE 5
uint8_t oam[256] __attribute__((aligned(64)));
} __attribute__((packed));
diff --git a/mknes_bench.c b/mknes_bench.c
index b064266..e7d53fa 100644
--- a/mknes_bench.c
+++ b/mknes_bench.c
@@ -333,17 +333,17 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
// Calculate per-frame statistics using static storage
double *cycles_per_frame = &per_frame_storage[0];
- double *insn_per_frame = &per_frame_storage[num_runs];
+ double *ipc_per_run = &per_frame_storage[num_runs];
double *time_ms = &per_frame_storage[num_runs * 2];
for(uint32_t i = 0; i < num_runs; i++) {
cycles_per_frame[i] = (double)runs[i].cycles / frames_per_run;
- insn_per_frame[i] = (double)runs[i].instructions / frames_per_run;
+ ipc_per_run[i] = (double)runs[i].instructions / (double)runs[i].cycles;
time_ms[i] = (double)runs[i].time_ns / 1000000.0;
}
struct bench_stats cycles_stats = calculate_stats(cycles_per_frame, num_runs);
- struct bench_stats insn_stats = calculate_stats(insn_per_frame, num_runs);
+ struct bench_stats ipc_stats = calculate_stats(ipc_per_run, num_runs);
struct bench_stats time_stats = calculate_stats(time_ms, num_runs);
// Print results
@@ -368,7 +368,7 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps);
printf("\n");
printf("cycles/frame mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n);
- printf("insn/frame mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n);
+ printf("IPC mean=%9.3f sd=%6.3f relSD=%.3f%% n=%d\n", ipc_stats.mean, ipc_stats.sd, ipc_stats.rel_sd, ipc_stats.n);
printf("time (ms) mean=%9.3f sd=%6.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n);
double fps = (double)frames_per_run / (time_stats.mean / 1000.0);
diff --git a/mknes_ppu.c b/mknes_ppu.c
index b0f6479..a4f7a2c 100644
--- a/mknes_ppu.c
+++ b/mknes_ppu.c
@@ -277,8 +277,6 @@ shift_and_fetch:
switch(dot % 8) {
case 1: {
- // uint32_t nt_addr = 0x2000 | (ppu->vram_addr & 0xfff);
- // ppu->bg_next_tile_id = state->mapper_function.ciram_read(state, nt_addr);
ppu->bg_next_tile_id = state->mapper_function.ciram_read(state, ppu->vram_addr);
} break;