diff options
| author | Peter Fors <peter.fors@mindkiller.com> | 2025-10-28 17:44:57 +0100 |
|---|---|---|
| committer | Peter Fors <peter.fors@mindkiller.com> | 2025-10-28 17:44:57 +0100 |
| commit | 9ee76c20c0d093d5adac2dcc3b275b53b879c369 (patch) | |
| tree | 7f21652a42d96f2ede7822950cbbf9c044ca43ca /mknes_ppu.c | |
| parent | 3b7621981b56a51756badac70034f68366878df9 (diff) | |
small optimizations of sprite evaluation in ppu_render_pixel
Diffstat (limited to 'mknes_ppu.c')
| -rw-r--r-- | mknes_ppu.c | 121 |
1 file changed, 55 insertions, 66 deletions
diff --git a/mknes_ppu.c b/mknes_ppu.c index fcaf681..7c2b4ac 100644 --- a/mknes_ppu.c +++ b/mknes_ppu.c @@ -24,7 +24,7 @@ static void ppu_reset(struct nes_state *state) { memset(ppu, 0, sizeof(struct ppu_state)); } -__attribute__((hot, flatten)) +__attribute__((hot, flatten, optimize("unroll-loops"))) static inline void ppu_evaluate_sprites(struct nes_state *state, uint32_t scanline) { struct ppu_state *restrict ppu = &state->ppu; uint8_t sprite_height = (ppu->reg_ctrl & PPU_CTRL_SPRITE_HEIGHT) ? 16 : 8; @@ -62,6 +62,7 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state, uint8_t * restrict sec_oam = ppu->secondary_oam; uint8_t ctrl = ppu->reg_ctrl; uint8_t sprite_height = (ctrl & PPU_CTRL_SPRITE_HEIGHT) ? 16 : 8; + uint32_t sprite_pattern_table_base = (ctrl & PPU_CTRL_SPRITE_TILE) << 9; for(uint8_t i = 0; i < ppu->sprite_count; i++, sec_oam += 4) { @@ -76,16 +77,21 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state, uint32_t bank; uint32_t addr; if(sprite_height == 16) { - bank = (tile & 1) << 12; - tile &= 0xfe; - if(row >= 8) { - tile++; - row -= 8; - } - addr = bank + tile * 16 + row; + // For 8x16 sprites: + // - Bank comes from tile bit 0 (bits 1-7 are the tile index) + // - Row 0-7 uses base tile, row 8-15 uses base tile + 1 + // - Row offset wraps to 0-7 within each 8-pixel half + // + // Original logic: + // bank = (tile & 1) << 12; + // tile &= 0xfe; + // if(row >= 8) { tile++; row -= 8; } + // addr = bank + tile * 16 + row; + addr = ((tile & 1) << 12) + ((tile & 0xfe) + (row >> 3)) * 16 + (row & 7); } else { addr = sprite_pattern_table_base + tile * 16 + row; + } uint8_t val_lo = state->mapper_function.chr_read(state, addr); @@ -103,9 +109,8 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state, } } - -__attribute__((always_inline, hot, optimize("no-jump-tables"))) -static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t x, uint32_t y) { 
+__attribute__((always_inline, hot, optimize("no-jump-tables", "no-unroll-loops"))) +static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t x, uint32_t y, uint8_t mask_reg) { struct ppu_state *restrict ppu = &state->ppu; uint16_t bit = 0x8000 >> ppu->fine_x; @@ -115,14 +120,14 @@ static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t uint8_t sp_prio = 0; uint8_t sp_zero = 0; - uint8_t mask_reg = ppu->reg_mask; // Single load + // uint8_t mask_reg = ppu->reg_mask; // Single load uint8_t show_bg = mask_reg & PPU_MASK_SHOW_BG; uint8_t show_sprites = mask_reg & PPU_MASK_SHOW_SPRITES; uint8_t left_bg = mask_reg & 0x02; uint8_t left_sp = mask_reg & 0x04; uint8_t bg_mask = (show_bg && (left_bg || x & ~7)) ? 0xff : 0x00; - uint8_t sp_mask = (show_sprites && (left_sp || x & ~7));// ? 0xff : 0x00; + uint8_t sp_mask = (show_sprites && (left_sp || x & ~7)); // Background uint8_t p0 = !!(ppu->bg_shift_pattern_low & bit); @@ -133,50 +138,34 @@ static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t uint8_t bg_pixel = ((p1 << 1) | p0) & bg_mask; uint8_t bg_palette = ((a1 << 1) | a0) & bg_mask; - // Sprites -#define SPRITE_STEP(N) do { \ - if(!ppu->sprites[(N)].position) { \ - sp_pixel = (((ppu->sprites[(N)].shift_hi & 0x80) >> 6) | ((ppu->sprites[(N)].shift_lo & 0x80) >> 7)); \ - if(sp_pixel) { \ - sp_prio = ppu->sprites[(N)].priority; \ - sp_palette = ppu->sprites[(N)].palette; \ - if((N) == 0) { \ - sp_zero = ppu->sprite_zero_in_range; \ - } \ - goto sprite_done; \ - } \ - } \ -} while (0) - - // sprite_counts[ppu->sprite_count]++; - if(sp_mask && ppu->sprite_count > 0) { - if(ppu->sprite_count == 2) goto sprite_2; - if(ppu->sprite_count == 1) goto sprite_1; - if(ppu->sprite_count == 3) goto sprite_3; - if(ppu->sprite_count == 4) goto sprite_4; - if(ppu->sprite_count == 5) goto sprite_5; - if(ppu->sprite_count == 6) goto sprite_6; - if(ppu->sprite_count == 8) goto sprite_8; - if(ppu->sprite_count == 7) 
goto sprite_7; - - -sprite_8: SPRITE_STEP(7); -sprite_7: SPRITE_STEP(6); -sprite_6: SPRITE_STEP(5); -sprite_5: SPRITE_STEP(4); -sprite_4: SPRITE_STEP(3); -sprite_3: SPRITE_STEP(2); -sprite_2: SPRITE_STEP(1); -sprite_1: SPRITE_STEP(0); + // Sprites - evaluate in forward order (0 has highest priority) + if(sp_mask) { + uint8_t found_sprite = 0xff; + for(uint8_t i = 0; i < ppu->sprite_count; i++) { + if(!ppu->sprites[i].position) { + sp_pixel = (((ppu->sprites[i].shift_hi & 0x80) >> 6) | ((ppu->sprites[i].shift_lo & 0x80) >> 7)); + if(sp_pixel) { + found_sprite = i; + goto sprite_found; + } + } + } + goto no_sprite; + +sprite_found: + sp_prio = ppu->sprites[found_sprite].priority; + sp_palette = ppu->sprites[found_sprite].palette; + sp_zero = ppu->sprite_zero_in_range & !(found_sprite); } -sprite_done: +no_sprite: + // Final pixel composition uint8_t bg_index = (bg_palette << 2) + bg_pixel; uint8_t sp_index = (sp_palette << 2) + sp_pixel; uint8_t selector = (bg_pixel ? 2 : 0) | (sp_pixel ? 1 : 0); - // NOTE(peter): It's actually faster to preset case3 version of palette_index than to start from zero + // NOTE(peter): It's actually faster to preset case 3 version of palette_index than to start from zero uint8_t palette_index = (sp_prio) ? bg_index : 0x10 | sp_index; switch(selector) { @@ -189,20 +178,19 @@ sprite_done: state->pixels[y * 256 + x] = ppu->palette[palette_index]; // NOTE(peter): Add color_emphasis bits (expand palette to 8x). 
} -__attribute__((hot, optimize("no-jump-tables"))) +__attribute__((hot, optimize("no-jump-tables", "unroll-loops"))) static void ppu_tick(struct nes_state *state) { struct ppu_state *restrict ppu = &state->ppu; uint32_t dot = ppu->dot; uint32_t scanline = ppu->scanline; - uint8_t rendering = (ppu->reg_mask & (PPU_MASK_SHOW_SPRITES | PPU_MASK_SHOW_BG)); + uint8_t reg_mask = ppu->reg_mask; + uint8_t rendering = (reg_mask & (PPU_MASK_SHOW_SPRITES | PPU_MASK_SHOW_BG)); for(uint8_t ppu_loops = 0; ppu_loops < 3; ++ppu_loops) { if(rendering) { - if(scanline <= 239) { - if(dot >= 1 && dot <= 256) { if(dot == 256) { if((ppu->vram_addr & 0x7000) != 0x7000) { @@ -223,7 +211,7 @@ static void ppu_tick(struct nes_state *state) { } } - ppu_render_pixel(state, dot - 1, scanline); + ppu_render_pixel(state, dot - 1, scanline, reg_mask); goto stupid; } @@ -233,7 +221,7 @@ static void ppu_tick(struct nes_state *state) { } if(dot >= 321 && dot <= 336) { -stupid: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) { +stupid: if(reg_mask & PPU_MASK_SHOW_SPRITES) { for(uint32_t i = 0; i < ppu->sprite_count; i++) { if(ppu->sprites[i].position > 0) { ppu->sprites[i].position--; @@ -327,16 +315,8 @@ stupid: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) { goto stupid2; } - if(dot == 257) { - ppu->vram_addr = (ppu->vram_addr & ~0x041f) | (ppu->temp_addr & 0x041f); - } - - if(dot >= 280 && dot <= 304) { - ppu->vram_addr = (ppu->vram_addr & ~0x7be0) | (ppu->temp_addr & 0x7be0); - } - if(dot >= 321 && dot <= 336) { -stupid2: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) { +stupid2: if(reg_mask & PPU_MASK_SHOW_SPRITES) { for(uint32_t i = 0; i < ppu->sprite_count; i++) { if(ppu->sprites[i].position > 0) { ppu->sprites[i].position--; @@ -399,6 +379,15 @@ stupid2: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) { } break; } } + + if(dot == 257) { + ppu->vram_addr = (ppu->vram_addr & ~0x041f) | (ppu->temp_addr & 0x041f); + } + + if(dot >= 280 && dot <= 304) { + ppu->vram_addr = (ppu->vram_addr & ~0x7be0) | (ppu->temp_addr & 
0x7be0); + } + } } @@ -436,7 +425,7 @@ stupid2: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) { } if(state->mapper_function.tick) { - state->mapper_function.tick(state); + state->mapper_function.tick(state); // TODO(peter): This signature has to be changed to supply dot and scanline! } } ppu->dot = dot; |
