From 26fb39ae33d1c16ba1d64165fe9ad327a94220cb Mon Sep 17 00:00:00 2001
From: Peter Fors
Date: Wed, 29 Oct 2025 06:00:20 +0100
Subject: Finally back above 3000fps - this was a lot of work

vital@claybabble:/work/current/mknes(master*|u=)$ ./Bench.sh
Performance counter coverage: 100% (no multiplexing - full precision)
352509230343 instructions # 4.81 insn per cycle
# 0.01 stalled cycles per insn
73277044838 cycles # 5.416 GHz
2957685039 stalled-cycles-frontend # 4.04% frontend cycles idle
70065301653 branches # 5.179 G/sec
297927451 branch-misses # 0.43% of all branches
Throughput: 26056.16 MIPS, 5416.36 Mcycles/sec
cycles/frame mean= 1788990 sd= 2803 relSD=0.157% n=10
insn/frame mean= 8606182 sd= 0 relSD=0.000% n=10
time (ms) mean= 1352.883 sd= 2.140 relSD=0.158% n=10
FPS (frames/second) = 3027.61
ms/frame = 0.330294
---
mknes_ppu.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 96 insertions(+), 9 deletions(-)
(limited to 'mknes_ppu.c')

diff --git a/mknes_ppu.c b/mknes_ppu.c index 6e8e3ae..249f9f9 100644 --- a/mknes_ppu.c +++ b/mknes_ppu.c @@ -109,6 +109,93 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state, } } + +#if 0 +// layout reminder +// struct sprite_data { u8 shift_lo, shift_hi, position, priority, palette; } __attribute__((packed)); + +__attribute__((always_inline, hot, optimize("no-jump-tables","no-unroll-loops"))) +static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t x, uint32_t y, uint8_t mask_reg) { + struct ppu_state *restrict ppu = &state->ppu; + + // 32-bit temps for bg path; keep struct 16-bit/8-bit + const uint32_t s = 15u - (uint32_t)ppu->fine_x; + const uint32_t show_bg = (mask_reg & PPU_MASK_SHOW_BG) != 0; + const uint32_t show_sprites = (mask_reg & PPU_MASK_SHOW_SPRITES) != 0; + const uint32_t left_bg = (mask_reg & 0x02) != 0; + const uint32_t left_sp = (mask_reg & 0x04) != 0; + const uint32_t x_ge_8 = (x & ~7u) != 0; + + const uint32_t bg_on = 
show_bg & (left_bg | x_ge_8); + const uint32_t sp_on = show_sprites & (left_sp | x_ge_8); + + const uint32_t pat_lo = (uint32_t)ppu->bg_shift_pattern_low; + const uint32_t pat_hi = (uint32_t)ppu->bg_shift_pattern_high; + const uint32_t att_lo = (uint32_t)ppu->bg_shift_attrib_low; + const uint32_t att_hi = (uint32_t)ppu->bg_shift_attrib_high; + + const uint32_t p0 = (pat_lo >> s) & 1u; + const uint32_t p1 = (pat_hi >> s) & 1u; + const uint32_t a0 = (att_lo >> s) & 1u; + const uint32_t a1 = (att_hi >> s) & 1u; + + const uint32_t bg_pixel = ((p1 << 1) | p0) & -bg_on; + const uint32_t bg_palette = ((a1 << 1) | a0) & -bg_on; + + // Sprite resolve: only load fields in this order: + // position (branch filter) -> shift bytes (pixel test) -> meta (on hit) + uint32_t sp_pixel = 0, sp_palette = 0, sp_prio = 0, sp_zero = 0; + + if (sp_on) { + struct sprite_data * restrict s_ptr = ppu->sprites; // address calc only + uint32_t n = ppu->sprite_count; // one load + if (n) { + uint32_t i = 0; + find_sprite: + // 1) position + uint32_t pos = s_ptr->position; // load 1 + if (!pos) { + // 2) pixel from shift bytes + uint32_t sh = s_ptr->shift_hi; // load 2 + uint32_t sl = s_ptr->shift_lo; // load 3 + uint32_t pix = ((sh & 0x80u) >> 6) | ((sl & 0x80u) >> 7); + if (pix) { + // 3) only now fetch metadata + sp_pixel = pix; + sp_palette = s_ptr->palette; // load 4 (only on hit) + sp_prio = s_ptr->priority; // load 5 (only on hit) + sp_zero = (ppu->sprite_zero_in_range != 0u) & (i == 0u); + goto sprite_done; + } + } + // next sprite + ++s_ptr; ++i; + if (i < n) goto find_sprite; + } + } +sprite_done: ; + + const uint32_t bg_index = (bg_palette << 2) + bg_pixel; + const uint32_t sp_index = (sp_palette << 2) + sp_pixel; + const uint32_t selector = ((bg_pixel != 0u) << 1) | (sp_pixel != 0u); + + // Two-way combine with minimal control flow + uint32_t palette_index = 0; + if (selector == 1u) { + palette_index = 0x10u | sp_index; + } else if (selector == 2u) { + palette_index = bg_index; + } 
else if (selector == 3u) { + const uint32_t use_bg = (sp_prio != 0u); + palette_index = use_bg ? bg_index : (0x10u | sp_index); + if ((ppu->sprite_zero_in_range != 0u) & (sp_zero != 0u) & (x <= 254u)) { + ppu->reg_status |= PPU_STATUS_SPRITE_ZERO_HIT; + } + } + state->pixels[y * 256 + x] = ppu->palette[palette_index]; +} + +#else __attribute__((always_inline, hot, optimize("no-jump-tables", "no-unroll-loops"))) static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t x, uint32_t y, uint8_t mask_reg) { struct ppu_state *restrict ppu = &state->ppu; @@ -159,25 +246,25 @@ sprite_found: } no_sprite: - // Final pixel composition uint8_t bg_index = (bg_palette << 2) + bg_pixel; uint8_t sp_index = (sp_palette << 2) + sp_pixel; uint8_t selector = (bg_pixel ? 2 : 0) | (sp_pixel ? 1 : 0); uint8_t palette_index = 0; - switch(selector) { - // case 0: { palette_index = 0; } break; - case 1: { palette_index = 0x10 | sp_index; } break; - case 2: { palette_index = bg_index; } break; - case 3: { - palette_index = (sp_prio) ? bg_index : 0x10 | sp_index; - ppu->reg_status |= (sp_zero && x < 255) ? PPU_STATUS_SPRITE_ZERO_HIT : 0; - } break; // NOTE(peter): Sprite zero hit! + if(selector == 1) { + palette_index = 0x10 | sp_index; + } else if(selector == 2) { + palette_index = bg_index; + } else if(selector == 3) { + palette_index = (sp_prio) ? bg_index : (0x10 | sp_index); + if (sp_zero && x <= 254) ppu->reg_status |= PPU_STATUS_SPRITE_ZERO_HIT; } state->pixels[y * 256 + x] = ppu->palette[palette_index]; // NOTE(peter): Add color_emphasis bits (expand palette to 8x). } +#endif + __attribute__((noinline, hot, optimize("no-jump-tables", "unroll-loops"))) static void ppu_tick(struct nes_state *state) { -- cgit v1.2.3