summary | refs | log | tree | commit | diff
path: root/mknes_ppu.c
diff options
context:
space:
mode:
author: Peter Fors <peter.fors@mindkiller.com> 2025-10-28 17:44:57 +0100
committer: Peter Fors <peter.fors@mindkiller.com> 2025-10-28 17:44:57 +0100
commit: 9ee76c20c0d093d5adac2dcc3b275b53b879c369 (patch)
tree: 7f21652a42d96f2ede7822950cbbf9c044ca43ca /mknes_ppu.c
parent: 3b7621981b56a51756badac70034f68366878df9 (diff)
small optimizations of sprite evaluation in ppu_render_pixel
Diffstat (limited to 'mknes_ppu.c')
-rw-r--r-- mknes_ppu.c 121
1 files changed, 55 insertions, 66 deletions
diff --git a/mknes_ppu.c b/mknes_ppu.c
index fcaf681..7c2b4ac 100644
--- a/mknes_ppu.c
+++ b/mknes_ppu.c
@@ -24,7 +24,7 @@ static void ppu_reset(struct nes_state *state) {
memset(ppu, 0, sizeof(struct ppu_state));
}
-__attribute__((hot, flatten))
+__attribute__((hot, flatten, optimize("unroll-loops")))
static inline void ppu_evaluate_sprites(struct nes_state *state, uint32_t scanline) {
struct ppu_state *restrict ppu = &state->ppu;
uint8_t sprite_height = (ppu->reg_ctrl & PPU_CTRL_SPRITE_HEIGHT) ? 16 : 8;
@@ -62,6 +62,7 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state,
uint8_t * restrict sec_oam = ppu->secondary_oam;
uint8_t ctrl = ppu->reg_ctrl;
uint8_t sprite_height = (ctrl & PPU_CTRL_SPRITE_HEIGHT) ? 16 : 8;
+
uint32_t sprite_pattern_table_base = (ctrl & PPU_CTRL_SPRITE_TILE) << 9;
for(uint8_t i = 0; i < ppu->sprite_count; i++, sec_oam += 4) {
@@ -76,16 +77,21 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state,
uint32_t bank;
uint32_t addr;
if(sprite_height == 16) {
- bank = (tile & 1) << 12;
- tile &= 0xfe;
- if(row >= 8) {
- tile++;
- row -= 8;
- }
- addr = bank + tile * 16 + row;
+ // For 8x16 sprites:
+ // - Bank comes from tile bit 0 (bits 1-7 are the tile index)
+ // - Row 0-7 uses base tile, row 8-15 uses base tile + 1
+ // - Row offset wraps to 0-7 within each 8-pixel half
+ //
+ // Original logic:
+ // bank = (tile & 1) << 12;
+ // tile &= 0xfe;
+ // if(row >= 8) { tile++; row -= 8; }
+ // addr = bank + tile * 16 + row;
+ addr = ((tile & 1) << 12) + ((tile & 0xfe) + (row >> 3)) * 16 + (row & 7);
} else {
addr = sprite_pattern_table_base + tile * 16 + row;
+
}
uint8_t val_lo = state->mapper_function.chr_read(state, addr);
@@ -103,9 +109,8 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state,
}
}
-
-__attribute__((always_inline, hot, optimize("no-jump-tables")))
-static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t x, uint32_t y) {
+__attribute__((always_inline, hot, optimize("no-jump-tables", "no-unroll-loops")))
+static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t x, uint32_t y, uint8_t mask_reg) {
struct ppu_state *restrict ppu = &state->ppu;
uint16_t bit = 0x8000 >> ppu->fine_x;
@@ -115,14 +120,14 @@ static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t
uint8_t sp_prio = 0;
uint8_t sp_zero = 0;
- uint8_t mask_reg = ppu->reg_mask; // Single load
+ // uint8_t mask_reg = ppu->reg_mask; // Single load
uint8_t show_bg = mask_reg & PPU_MASK_SHOW_BG;
uint8_t show_sprites = mask_reg & PPU_MASK_SHOW_SPRITES;
uint8_t left_bg = mask_reg & 0x02;
uint8_t left_sp = mask_reg & 0x04;
uint8_t bg_mask = (show_bg && (left_bg || x & ~7)) ? 0xff : 0x00;
- uint8_t sp_mask = (show_sprites && (left_sp || x & ~7));// ? 0xff : 0x00;
+ uint8_t sp_mask = (show_sprites && (left_sp || x & ~7));
// Background
uint8_t p0 = !!(ppu->bg_shift_pattern_low & bit);
@@ -133,50 +138,34 @@ static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t
uint8_t bg_pixel = ((p1 << 1) | p0) & bg_mask;
uint8_t bg_palette = ((a1 << 1) | a0) & bg_mask;
- // Sprites
-#define SPRITE_STEP(N) do { \
- if(!ppu->sprites[(N)].position) { \
- sp_pixel = (((ppu->sprites[(N)].shift_hi & 0x80) >> 6) | ((ppu->sprites[(N)].shift_lo & 0x80) >> 7)); \
- if(sp_pixel) { \
- sp_prio = ppu->sprites[(N)].priority; \
- sp_palette = ppu->sprites[(N)].palette; \
- if((N) == 0) { \
- sp_zero = ppu->sprite_zero_in_range; \
- } \
- goto sprite_done; \
- } \
- } \
-} while (0)
-
- // sprite_counts[ppu->sprite_count]++;
- if(sp_mask && ppu->sprite_count > 0) {
- if(ppu->sprite_count == 2) goto sprite_2;
- if(ppu->sprite_count == 1) goto sprite_1;
- if(ppu->sprite_count == 3) goto sprite_3;
- if(ppu->sprite_count == 4) goto sprite_4;
- if(ppu->sprite_count == 5) goto sprite_5;
- if(ppu->sprite_count == 6) goto sprite_6;
- if(ppu->sprite_count == 8) goto sprite_8;
- if(ppu->sprite_count == 7) goto sprite_7;
-
-
-sprite_8: SPRITE_STEP(7);
-sprite_7: SPRITE_STEP(6);
-sprite_6: SPRITE_STEP(5);
-sprite_5: SPRITE_STEP(4);
-sprite_4: SPRITE_STEP(3);
-sprite_3: SPRITE_STEP(2);
-sprite_2: SPRITE_STEP(1);
-sprite_1: SPRITE_STEP(0);
+ // Sprites - evaluate in forward order (0 has highest priority)
+ if(sp_mask) {
+ uint8_t found_sprite = 0xff;
+ for(uint8_t i = 0; i < ppu->sprite_count; i++) {
+ if(!ppu->sprites[i].position) {
+ sp_pixel = (((ppu->sprites[i].shift_hi & 0x80) >> 6) | ((ppu->sprites[i].shift_lo & 0x80) >> 7));
+ if(sp_pixel) {
+ found_sprite = i;
+ goto sprite_found;
+ }
+ }
+ }
+ goto no_sprite;
+
+sprite_found:
+ sp_prio = ppu->sprites[found_sprite].priority;
+ sp_palette = ppu->sprites[found_sprite].palette;
+ sp_zero = ppu->sprite_zero_in_range & !(found_sprite);
}
-sprite_done:
+no_sprite:
+
// Final pixel composition
uint8_t bg_index = (bg_palette << 2) + bg_pixel;
uint8_t sp_index = (sp_palette << 2) + sp_pixel;
uint8_t selector = (bg_pixel ? 2 : 0) | (sp_pixel ? 1 : 0);
- // NOTE(peter): It's actually faster to preset case3 version of palette_index than to start from zero
+ // NOTE(peter): It's actually faster to preset case 3 version of palette_index than to start from zero
uint8_t palette_index = (sp_prio) ? bg_index : 0x10 | sp_index;
switch(selector) {
@@ -189,20 +178,19 @@ sprite_done:
state->pixels[y * 256 + x] = ppu->palette[palette_index]; // NOTE(peter): Add color_emphasis bits (expand palette to 8x).
}
-__attribute__((hot, optimize("no-jump-tables")))
+__attribute__((hot, optimize("no-jump-tables", "unroll-loops")))
static void ppu_tick(struct nes_state *state) {
struct ppu_state *restrict ppu = &state->ppu;
uint32_t dot = ppu->dot;
uint32_t scanline = ppu->scanline;
- uint8_t rendering = (ppu->reg_mask & (PPU_MASK_SHOW_SPRITES | PPU_MASK_SHOW_BG));
+ uint8_t reg_mask = ppu->reg_mask;
+ uint8_t rendering = (reg_mask & (PPU_MASK_SHOW_SPRITES | PPU_MASK_SHOW_BG));
for(uint8_t ppu_loops = 0; ppu_loops < 3; ++ppu_loops) {
if(rendering) {
-
if(scanline <= 239) {
-
if(dot >= 1 && dot <= 256) {
if(dot == 256) {
if((ppu->vram_addr & 0x7000) != 0x7000) {
@@ -223,7 +211,7 @@ static void ppu_tick(struct nes_state *state) {
}
}
- ppu_render_pixel(state, dot - 1, scanline);
+ ppu_render_pixel(state, dot - 1, scanline, reg_mask);
goto stupid;
}
@@ -233,7 +221,7 @@ static void ppu_tick(struct nes_state *state) {
}
if(dot >= 321 && dot <= 336) {
-stupid: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
+stupid: if(reg_mask & PPU_MASK_SHOW_SPRITES) {
for(uint32_t i = 0; i < ppu->sprite_count; i++) {
if(ppu->sprites[i].position > 0) {
ppu->sprites[i].position--;
@@ -327,16 +315,8 @@ stupid: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
goto stupid2;
}
- if(dot == 257) {
- ppu->vram_addr = (ppu->vram_addr & ~0x041f) | (ppu->temp_addr & 0x041f);
- }
-
- if(dot >= 280 && dot <= 304) {
- ppu->vram_addr = (ppu->vram_addr & ~0x7be0) | (ppu->temp_addr & 0x7be0);
- }
-
if(dot >= 321 && dot <= 336) {
-stupid2: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
+stupid2: if(reg_mask & PPU_MASK_SHOW_SPRITES) {
for(uint32_t i = 0; i < ppu->sprite_count; i++) {
if(ppu->sprites[i].position > 0) {
ppu->sprites[i].position--;
@@ -399,6 +379,15 @@ stupid2: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
} break;
}
}
+
+ if(dot == 257) {
+ ppu->vram_addr = (ppu->vram_addr & ~0x041f) | (ppu->temp_addr & 0x041f);
+ }
+
+ if(dot >= 280 && dot <= 304) {
+ ppu->vram_addr = (ppu->vram_addr & ~0x7be0) | (ppu->temp_addr & 0x7be0);
+ }
+
}
}
@@ -436,7 +425,7 @@ stupid2: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
}
if(state->mapper_function.tick) {
- state->mapper_function.tick(state);
+ state->mapper_function.tick(state); // TODO(peter): This signature has to be changed to supply dot and scanline!
}
}
ppu->dot = dot;