summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Peter Fors <peter.fors@mindkiller.com> 2025-10-29 06:00:20 +0100
committer: Peter Fors <peter.fors@mindkiller.com> 2025-10-29 06:00:20 +0100
commit: 26fb39ae33d1c16ba1d64165fe9ad327a94220cb (patch)
tree: 6927808a2d32cfa0ec703875a693f6b64fe372b2
parent: a3087dd6d0938056f7f0e3d89e60f36e56ac27d2 (diff)
Finally back above 3000fps - this was a lot of work
vital@claybabble:/work/current/mknes(master*|u=)$ ./Bench.sh
Performance counter coverage: 100% (no multiplexing - full precision)
  352509230343 instructions             # 4.81 insn per cycle
                                        # 0.01 stalled cycles per insn
   73277044838 cycles                   # 5.416 GHz
    2957685039 stalled-cycles-frontend  # 4.04% frontend cycles idle
   70065301653 branches                 # 5.179 G/sec
     297927451 branch-misses            # 0.43% of all branches
Throughput: 26056.16 MIPS, 5416.36 Mcycles/sec
cycles/frame mean= 1788990 sd= 2803 relSD=0.157% n=10
insn/frame mean= 8606182 sd= 0 relSD=0.000% n=10
time (ms) mean= 1352.883 sd= 2.140 relSD=0.158% n=10
FPS (frames/second) = 3027.61
ms/frame = 0.330294
-rw-r--r-- mknes.c | 8
-rw-r--r-- mknes_ppu.c | 105
2 files changed, 100 insertions, 13 deletions
diff --git a/mknes.c b/mknes.c
index b82fc55..244a032 100644
--- a/mknes.c
+++ b/mknes.c
@@ -58,8 +58,8 @@ static void audio_callback(int16_t *data, size_t frames) { }
#ifdef BENCHMARK
// Embed the ROM for benchmarking to eliminate file I/O overhead
// Uncomment the ROM you want to benchmark:
-// INCBIN_BYTES(benchmark_rom, "data/Life Force (USA).nes");
-INCBIN_BYTES(benchmark_rom, "data/0000/Super Mario Bros. (World) (HVC-SM).nes");
+INCBIN_BYTES(benchmark_rom, "data/Life Force (USA).nes");
+// INCBIN_BYTES(benchmark_rom, "data/0000/Super Mario Bros. (World) (HVC-SM).nes");
// INCBIN_BYTES(benchmark_rom, "data/0003/Gradius (USA).nes");
#endif
@@ -271,7 +271,7 @@ int main(int argc, char **argv) {
// ines2_load(nstate, "data/0000/Excitebike (Japan, USA).nes");
// ines2_load(nstate, "data/0000/Ice Climber (USA, Europe, Korea).nes");
// ines2_load(nstate, "data/0000/Kung Fu (Japan, USA).nes");
- // ines2_load(nstate, "data/0000/Super Mario Bros. (World) (HVC-SM).nes");
+ ines2_load(nstate, "data/0000/Super Mario Bros. (World) (HVC-SM).nes");
// ines2_load(nstate, "data/Super Mario Bros. (W) (V1.0) [!].nes");
// ines2_load(nstate, "data/Super Mario Bros. (JU) [!].nes");
// ines2_load(nstate, "data/0000/Urban Champion (World).nes");
@@ -293,7 +293,7 @@ int main(int argc, char **argv) {
// ines2_load(nstate, "data/0000/Xevious - The Avenger (USA).zip");
// ines2_load(nstate, "data/tv.nes");
- ines2_load(nstate, "data/Life Force (USA).nes"); // 2002
+ // ines2_load(nstate, "data/Life Force (USA).nes"); // 2002
// ines2_load(nstate, "data/0003/Flipull - An Exciting Cube Game (Japan) (En).zip");
// ines2_load(nstate, "data/0003/Friday the 13th (USA).zip");
diff --git a/mknes_ppu.c b/mknes_ppu.c
index 6e8e3ae..249f9f9 100644
--- a/mknes_ppu.c
+++ b/mknes_ppu.c
@@ -109,6 +109,93 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state,
}
}
+
+#if 0
+// layout reminder
+// struct sprite_data { u8 shift_lo, shift_hi, position, priority, palette; } __attribute__((packed));
+
+__attribute__((always_inline, hot, optimize("no-jump-tables","no-unroll-loops")))
+static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t x, uint32_t y, uint8_t mask_reg) {
+ struct ppu_state *restrict ppu = &state->ppu;
+
+ // 32-bit temps for bg path; keep struct 16-bit/8-bit
+ const uint32_t s = 15u - (uint32_t)ppu->fine_x;
+ const uint32_t show_bg = (mask_reg & PPU_MASK_SHOW_BG) != 0;
+ const uint32_t show_sprites = (mask_reg & PPU_MASK_SHOW_SPRITES) != 0;
+ const uint32_t left_bg = (mask_reg & 0x02) != 0;
+ const uint32_t left_sp = (mask_reg & 0x04) != 0;
+ const uint32_t x_ge_8 = (x & ~7u) != 0;
+
+ const uint32_t bg_on = show_bg & (left_bg | x_ge_8);
+ const uint32_t sp_on = show_sprites & (left_sp | x_ge_8);
+
+ const uint32_t pat_lo = (uint32_t)ppu->bg_shift_pattern_low;
+ const uint32_t pat_hi = (uint32_t)ppu->bg_shift_pattern_high;
+ const uint32_t att_lo = (uint32_t)ppu->bg_shift_attrib_low;
+ const uint32_t att_hi = (uint32_t)ppu->bg_shift_attrib_high;
+
+ const uint32_t p0 = (pat_lo >> s) & 1u;
+ const uint32_t p1 = (pat_hi >> s) & 1u;
+ const uint32_t a0 = (att_lo >> s) & 1u;
+ const uint32_t a1 = (att_hi >> s) & 1u;
+
+ const uint32_t bg_pixel = ((p1 << 1) | p0) & -bg_on;
+ const uint32_t bg_palette = ((a1 << 1) | a0) & -bg_on;
+
+ // Sprite resolve: only load fields in this order:
+ // position (branch filter) -> shift bytes (pixel test) -> meta (on hit)
+ uint32_t sp_pixel = 0, sp_palette = 0, sp_prio = 0, sp_zero = 0;
+
+ if (sp_on) {
+ struct sprite_data * restrict s_ptr = ppu->sprites; // address calc only
+ uint32_t n = ppu->sprite_count; // one load
+ if (n) {
+ uint32_t i = 0;
+ find_sprite:
+ // 1) position
+ uint32_t pos = s_ptr->position; // load 1
+ if (!pos) {
+ // 2) pixel from shift bytes
+ uint32_t sh = s_ptr->shift_hi; // load 2
+ uint32_t sl = s_ptr->shift_lo; // load 3
+ uint32_t pix = ((sh & 0x80u) >> 6) | ((sl & 0x80u) >> 7);
+ if (pix) {
+ // 3) only now fetch metadata
+ sp_pixel = pix;
+ sp_palette = s_ptr->palette; // load 4 (only on hit)
+ sp_prio = s_ptr->priority; // load 5 (only on hit)
+ sp_zero = (ppu->sprite_zero_in_range != 0u) & (i == 0u);
+ goto sprite_done;
+ }
+ }
+ // next sprite
+ ++s_ptr; ++i;
+ if (i < n) goto find_sprite;
+ }
+ }
+sprite_done: ;
+
+ const uint32_t bg_index = (bg_palette << 2) + bg_pixel;
+ const uint32_t sp_index = (sp_palette << 2) + sp_pixel;
+ const uint32_t selector = ((bg_pixel != 0u) << 1) | (sp_pixel != 0u);
+
+ // Two-way combine with minimal control flow
+ uint32_t palette_index = 0;
+ if (selector == 1u) {
+ palette_index = 0x10u | sp_index;
+ } else if (selector == 2u) {
+ palette_index = bg_index;
+ } else if (selector == 3u) {
+ const uint32_t use_bg = (sp_prio != 0u);
+ palette_index = use_bg ? bg_index : (0x10u | sp_index);
+ if ((ppu->sprite_zero_in_range != 0u) & (sp_zero != 0u) & (x <= 254u)) {
+ ppu->reg_status |= PPU_STATUS_SPRITE_ZERO_HIT;
+ }
+ }
+ state->pixels[y * 256 + x] = ppu->palette[palette_index];
+}
+
+#else
__attribute__((always_inline, hot, optimize("no-jump-tables", "no-unroll-loops")))
static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t x, uint32_t y, uint8_t mask_reg) {
struct ppu_state *restrict ppu = &state->ppu;
@@ -159,25 +246,25 @@ sprite_found:
}
no_sprite:
-
// Final pixel composition
uint8_t bg_index = (bg_palette << 2) + bg_pixel;
uint8_t sp_index = (sp_palette << 2) + sp_pixel;
uint8_t selector = (bg_pixel ? 2 : 0) | (sp_pixel ? 1 : 0);
uint8_t palette_index = 0;
- switch(selector) {
- // case 0: { palette_index = 0; } break;
- case 1: { palette_index = 0x10 | sp_index; } break;
- case 2: { palette_index = bg_index; } break;
- case 3: {
- palette_index = (sp_prio) ? bg_index : 0x10 | sp_index;
- ppu->reg_status |= (sp_zero && x < 255) ? PPU_STATUS_SPRITE_ZERO_HIT : 0;
- } break; // NOTE(peter): Sprite zero hit!
+ if(selector == 1) {
+ palette_index = 0x10 | sp_index;
+ } else if(selector == 2) {
+ palette_index = bg_index;
+ } else if(selector == 3) {
+ palette_index = (sp_prio) ? bg_index : (0x10 | sp_index);
+ if (sp_zero && x <= 254) ppu->reg_status |= PPU_STATUS_SPRITE_ZERO_HIT;
}
state->pixels[y * 256 + x] = ppu->palette[palette_index]; // NOTE(peter): Add color_emphasis bits (expand palette to 8x).
}
+#endif
+
__attribute__((noinline, hot, optimize("no-jump-tables", "unroll-loops")))
static void ppu_tick(struct nes_state *state) {