From dabd7a5848e6aa55e91cf4c804f6236b4f7fe30e Mon Sep 17 00:00:00 2001 From: Peter Fors Date: Sat, 5 Apr 2025 20:26:45 +0200 Subject: everything working, 2285 fps --- build.sh | 9 +--- mknes.c | 112 +++++++++++++++++++++++++++------------ mknes.h | 14 ++--- ppu.c | 180 +++++++++++++++++++++++++++++++++++++-------------------------- 4 files changed, 193 insertions(+), 122 deletions(-) diff --git a/build.sh b/build.sh index 5487439..845da73 100755 --- a/build.sh +++ b/build.sh @@ -8,7 +8,7 @@ CFLAGS="-std=gnu++23 " CFLAGS+="-mavx2 -mbmi2 -mtune=native -mfunction-return=keep -mindirect-branch=keep " CFLAGS+="-fwrapv -ffast-math -fno-trapping-math -fwhole-program " CFLAGS+="-fno-stack-protector -fno-PIE -no-pie -fno-strict-aliasing -ffunction-sections -fdata-sections " -CFLAGS+="-fno-exceptions -fno-rtti -fno-use-cxa-atexit -fno-non-call-exceptions " +CFLAGS+="-fno-exceptions -fno-rtti -fno-use-cxa-atexit -fno-non-call-exceptions -fno-unwind-tables -fno-asynchronous-unwind-tables " CFLAGS+="-Wall -Wextra " CFLAGS+="-Wno-unused-parameter -Wno-sign-compare -Wno-trigraphs -Wno-maybe-uninitialized " CFLAGS+="-Wno-unused-variable -Wno-unused-const-variable -Wno-unused-function -Wno-write-strings -Wno-missing-field-initializers " @@ -51,12 +51,7 @@ case "$BUILD_TYPE" in ;; esac -# Rebuild assets every time we compile -#rm -rf data -#mkdir -p data/p{1,2,3,4,5,6,7,8} -#env -C org_assets ../../bin/mks_time ./process.sh - -# Make sure the shaders are up to date if you are experimenting with them. +# Make sure the shaders are up to date shader2h 330 vertex_shader vertex_shader.glsl shader2h 330 fragment_shader shader.h fragment_shader.glsl diff --git a/mknes.c b/mknes.c index 5fbea56..25e920f 100644 --- a/mknes.c +++ b/mknes.c @@ -100,45 +100,88 @@ struct nes_state nstate; static uint32_t frames; + +#define PRG_ROM_SIZE (512 * 1024) +#define CHR_ROM_SIZE (512 * 1024) +#define PIXELS_SIZE (256 * 240) +#define RAM_SIZE 0x800 +#define SRAM_SIZE 0x2000 +#define CIRAM_SIZE 0x1000 + +static struct nes_state *allocate_nes_state(void) { + struct nes_state *state = (struct nes_state*)calloc(1, sizeof(struct nes_state)); + if(!state) return 0; + + size_t total_size = (PRG_ROM_SIZE + CHR_ROM_SIZE + PIXELS_SIZE + RAM_SIZE + SRAM_SIZE + CIRAM_SIZE + 4095) & ~0xfff; + + uint8_t *m = (uint8_t*)aligned_alloc(4096, total_size); + memset(m, 0, total_size); + + size_t offset = 0; + + state->prg_rom = m + offset; + offset += PRG_ROM_SIZE; + + state->chr_rom = m + offset; + offset += CHR_ROM_SIZE; + + state->pixels = m + offset; + offset += PIXELS_SIZE; + + state->ram = m + offset; + offset += RAM_SIZE; + + state->sram = m + offset; + offset += SRAM_SIZE; + + state->ciram = m + offset; + offset += CIRAM_SIZE; + + return state; +} + + int main(int argc, char **argv) { #ifdef _WIN32 timeBeginPeriod(1); #endif + struct nes_state *nstate = allocate_nes_state(); + state.toggle_crt_emulation = 1; setbuf(stdout, 0); init_opcode_lut(); init_opcode_ud_lut(); // protect_opcode_lut(); - ppu_reset(&nstate); - // ines2_load(&nstate, "data/nrom/10-Yard Fight (USA, Europe).nes"); - // ines2_load(&nstate, "data/nrom/Balloon Fight (USA).nes"); - // ines2_load(&nstate, "data/nrom/Excitebike (Japan, USA).nes"); - // ines2_load(&nstate, "data/nrom/Ice Climber (USA, Europe, Korea).nes"); - // ines2_load(&nstate, "data/nrom/Kung Fu (Japan, USA).nes"); - ines2_load(&nstate, "data/nrom/Super Mario Bros. (World) (HVC-SM).nes"); - // ines2_load(&nstate, "data/nrom/Urban Champion (World).nes"); - // ines2_load(&nstate, "data/nrom/Wrecking Crew (World).nes"); - // ines2_load(&nstate, "data/nrom/scanline.nes"); - // ines2_load(&nstate, "data/nrom/Sayoonara!.NES"); - // ines2_load(&nstate, "data/nrom/raster_demos/RasterChromaLuma.NES"); - // ines2_load(&nstate, "data/nrom/raster_demos/RasterTest1.NES"); - // ines2_load(&nstate, "data/nrom/raster_demos/RasterTest2.NES"); - // ines2_load(&nstate, "data/nrom/raster_demos/RasterTest3.NES"); - // ines2_load(&nstate, "data/nrom/raster_demos/RasterTest3a.NES"); - // ines2_load(&nstate, "data/nrom/raster_demos/RasterTest3b.NES"); - // ines2_load(&nstate, "data/nrom/raster_demos/RasterTest3c.NES"); - // ines2_load(&nstate, "data/nrom/raster_demos/RasterTest3d.NES"); - // ines2_load(&nstate, "data/nrom/raster_demos/RasterTest3e.NES"); - // ines2_load(&nstate, "data/nrom/NEStress.NES"); - // ines2_load(&nstate, "data/tv.nes"); - // ines2_load(&nstate, "data/Super Mario Bros. (World) (HVC-SM).zip"); - // ines2_load(&nstate, "data/Super Mario Bros. + Duck Hunt (USA).zip"); - - mapper_setup(&nstate); - uint32_t lo = nstate.mapper.prg_read(&nstate, 0xfffc); - uint32_t hi = nstate.mapper.prg_read(&nstate, 0xfffd); - nstate.cpu.pc = (hi << 8) | lo; + ppu_reset(nstate); + // ines2_load(nstate, "data/nrom/10-Yard Fight (USA, Europe).nes"); + // ines2_load(nstate, "data/nrom/Balloon Fight (USA).nes"); + // ines2_load(nstate, "data/nrom/Excitebike (Japan, USA).nes"); + // ines2_load(nstate, "data/nrom/Ice Climber (USA, Europe, Korea).nes"); + // ines2_load(nstate, "data/nrom/Kung Fu (Japan, USA).nes"); + ines2_load(nstate, "data/nrom/Super Mario Bros. (World) (HVC-SM).nes"); + // ines2_load(nstate, "data/nrom/Urban Champion (World).nes"); + // ines2_load(nstate, "data/nrom/Wrecking Crew (World).nes"); + // ines2_load(nstate, "data/nrom/scanline.nes"); + // ines2_load(nstate, "data/nrom/Sayoonara!.NES"); + // ines2_load(nstate, "data/nrom/raster_demos/RasterChromaLuma.NES"); + // ines2_load(nstate, "data/nrom/raster_demos/RasterTest1.NES"); + // ines2_load(nstate, "data/nrom/raster_demos/RasterTest2.NES"); + // ines2_load(nstate, "data/nrom/raster_demos/RasterTest3.NES"); + // ines2_load(nstate, "data/nrom/raster_demos/RasterTest3a.NES"); + // ines2_load(nstate, "data/nrom/raster_demos/RasterTest3b.NES"); + // ines2_load(nstate, "data/nrom/raster_demos/RasterTest3c.NES"); + // ines2_load(nstate, "data/nrom/raster_demos/RasterTest3d.NES"); + // ines2_load(nstate, "data/nrom/raster_demos/RasterTest3e.NES"); + // ines2_load(nstate, "data/nrom/NEStress.NES"); + // ines2_load(nstate, "data/tv.nes"); + // ines2_load(nstate, "data/Super Mario Bros. (World) (HVC-SM).zip"); + // ines2_load(nstate, "data/Super Mario Bros. + Duck Hunt (USA).zip"); + + mapper_setup(nstate); + uint32_t lo = nstate->mapper.prg_read(nstate, 0xfffc); + uint32_t hi = nstate->mapper.prg_read(nstate, 0xfffd); + nstate->cpu.pc = (hi << 8) | lo; struct timer_handle *timer = timer_new(FRAME_INTERVAL_NS); if(!timer) { @@ -178,19 +221,20 @@ int main(int argc, char **argv) { timer_start(timer); while(!glfwWindowShouldClose(window)) { + // for(uint32_t i = 0; i < 0x5000; ++ i) { timer_wait(timer); glfwPollEvents(); // // - while(!nstate.ppu.frame_ready) { + while(!nstate->ppu.frame_ready) { // PROFILE_NAMED("nes emulator"); - cpu_tick(&nstate); + cpu_tick(nstate); } - nstate.ppu.frame_ready = 0; + nstate->ppu.frame_ready = 0; frames++; uint32_t * restrict dst = buffer; - uint8_t * restrict src = nstate.ppu.pixels; + uint8_t * restrict src = nstate->pixels; for(uint32_t y = 0; y < 240; ++y) { for(uint32_t x = 0; x < 256; ++x) { uint8_t val = *src++; @@ -203,7 +247,7 @@ int main(int argc, char **argv) { render_frame(); glfwSwapBuffers(window); } -printf("total frames: %6.6d total cycles: %ld\n", frames, nstate.cycles); +printf("total frames: %6.6d total cycles: %ld\n", frames, nstate->cycles); glfwDestroyWindow(window); } else { fprintf(stderr, "Failed to create window\n"); diff --git a/mknes.h b/mknes.h index 2d04add..c1784f6 100644 --- a/mknes.h +++ b/mknes.h @@ -35,7 +35,6 @@ struct ppu_state { uint8_t oam_data; uint8_t even_frame; - uint8_t pixels[256 * 240] __attribute__((aligned(64))); uint8_t oam[256]; uint8_t secondary_oam[32]; uint8_t palette[0x20]; @@ -101,12 +100,13 @@ struct nes_state { struct ppu_state ppu; struct mapper_entry mapper; union mapper_data map; - uint8_t ram[0x800] __attribute__((aligned(64))); - uint8_t sram[0x2000] __attribute__((aligned(64))); - uint8_t ciram[0x1000] __attribute__((aligned(64))); // NOTE(peter): Originally 0x800 bytes, but extended as it should work for up to fourway, this is optimization, reality is 2kb, but there is no side-effects, so this is fine! - uint8_t prg_rom[4 * 1024 * 1024] __attribute__((aligned(64))); - uint8_t chr_rom[4 * 1024 * 1024] __attribute__((aligned(64))); -} __attribute__((aligned(64))); + uint8_t *pixels; + uint8_t *ram; + uint8_t *sram; + uint8_t *ciram; + uint8_t *prg_rom; + uint8_t *chr_rom; +}; __attribute__((aligned(4096))) static uint32_t nes_palette[64] = { diff --git a/ppu.c b/ppu.c index d67cff3..db78b91 100644 --- a/ppu.c +++ b/ppu.c @@ -124,8 +124,8 @@ static inline uint8_t ppu_read(struct nes_state *state, uint32_t offset) { return result; } -__attribute__((always_inline, hot)) -static inline void ppu_evaluate_sprites(struct nes_state *state) { +__attribute__((hot)) +static void ppu_evaluate_sprites(struct nes_state *state) { struct ppu_state *ppu = &state->ppu; uint8_t sprite_height = (ppu->reg_ctrl & 0x20) ? 16 : 8; uint8_t n = 0; @@ -143,7 +143,7 @@ static inline void ppu_evaluate_sprites(struct nes_state *state) { dst[2] = src[2]; dst[3] = src[3]; ppu->sprite_indexes[n] = i; - ppu->sprite_zero_hit_possible |= (i == 0) ? 1 : 0; + ppu->sprite_zero_hit_possible |= (i == 0); dst += 4; n++; @@ -158,30 +158,32 @@ static inline void ppu_evaluate_sprites(struct nes_state *state) { ppu->sprite_count = n; } -__attribute__((always_inline, hot)) -static inline void ppu_fetch_sprite_patterns(struct nes_state *state) { +__attribute__((hot)) +static void ppu_fetch_sprite_patterns(struct nes_state *state) { struct ppu_state *ppu = &state->ppu; + uint32_t addr; + uint32_t bank; + + uint8_t *s = ppu->secondary_oam; + uint8_t height = (ppu->reg_ctrl & 0x20) ? 16 : 8; + for(uint8_t i = 0; i < ppu->sprite_count; i++) { - uint8_t *s = ppu->secondary_oam + i * 4; uint8_t y = s[0], tile = s[1], attr = s[2], x = s[3]; uint8_t row = ppu->scanline - y; - uint8_t height = (ppu->reg_ctrl & 0x20) ? 16 : 8; row = (attr & 0x80) ? height - 1 - row : row; - uint32_t addr; if(height == 16) { - uint32_t bank = (tile & 1) ? 0x1000 : 0x0000; + bank = (tile & 1) << 12; tile &= 0xfe; if(row >= 8) { tile++; row -= 8; } - addr = bank + tile * 16 + row; } else { - uint32_t bank = (ppu->reg_ctrl & 0x08) ? 0x1000 : 0x0000; - addr = bank + tile * 16 + row; + bank = (ppu->reg_ctrl & 0x08) << 9; } + addr = bank + tile * 16 + row; uint8_t lsb = state->mapper.chr_read(state, addr); uint8_t msb = state->mapper.chr_read(state, addr + 8); @@ -194,73 +196,107 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state *state) { ppu->sprite_shift_lo[i] = lsb; ppu->sprite_shift_hi[i] = msb; ppu->sprite_positions[i] = x; - ppu->sprite_priorities[i] = (attr >> 5) & 1; + ppu->sprite_priorities[i] = attr & 0x20; + s += 4; } } -__attribute__((always_inline, hot)) -static inline void ppu_render_pixel(struct nes_state *state) { +__attribute__((hot)) +static void ppu_render_pixel(struct nes_state *state) { + struct ppu_state *ppu = &state->ppu; + + uint32_t x = ppu->dot - 1; + uint32_t y = ppu->scanline; + + // Fine X shift mask + // static const uint16_t fine_shift[8] = { 0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100 }; + uint16_t bit = 0x8000 >> ppu->fine_x;//fine_shift[ppu->fine_x]; + uint8_t bg_pixel = 0; uint8_t bg_palette = 0; uint8_t sp_pixel = 0; uint8_t sp_palette = 0; uint8_t sp_prio = 0; uint8_t sp_zero = 0; - uint8_t final_color = 0; - struct ppu_state *ppu = &state->ppu; +#if 1 // TODO(peter): Decide what I prefer, masking away unlikely path, or LIKELY hint to the compiler + uint8_t bg_mask = (ppu->reg_mask & 0x08) ? 0xff : 0x00; + uint8_t sp_mask = (ppu->reg_mask & 0x10) ? 0xff : 0x00; - uint32_t x = ppu->dot - 1; - uint32_t y = ppu->scanline; + // Background + uint8_t p0 = !!(ppu->bg_shift_pattern_low & bit); + uint8_t p1 = !!(ppu->bg_shift_pattern_high & bit); + uint8_t a0 = !!(ppu->bg_shift_attrib_low & bit); + uint8_t a1 = !!(ppu->bg_shift_attrib_high & bit); + + bg_pixel = ((p1 << 1) | p0) & bg_mask; + bg_palette = ((a1 << 1) | a0) & bg_mask; + + // Sprite + for(uint8_t i = 0; i < ppu->sprite_count; i++) { + if(ppu->sprite_positions[i]) continue; - uint32_t bit = 0x8000 >> ppu->fine_x; + uint8_t lo = ppu->sprite_shift_lo[i]; + uint8_t hi = ppu->sprite_shift_hi[i]; + sp_pixel = (((hi & 0x80) >> 6) | ((lo & 0x80) >> 7)) & sp_mask; - if(ppu->reg_mask & 0x08) { - uint8_t p0 = (ppu->bg_shift_pattern_low & bit) ? 1 : 0; - uint8_t p1 = (ppu->bg_shift_pattern_high & bit) ? 1 : 0; + if(!sp_pixel) continue; + + sp_palette = ppu->secondary_oam[i * 4 + 2] & 3; + sp_prio = ppu->sprite_priorities[i]; + sp_zero = (ppu->sprite_indexes[i] == 0); + break; + } +#else + // Background fetch + if(LIKELY(ppu->reg_mask & 0x08)) { + uint8_t p0 = !!(ppu->bg_shift_pattern_low & bit); + uint8_t p1 = !!(ppu->bg_shift_pattern_high & bit); bg_pixel = (p1 << 1) | p0; - uint8_t a0 = (ppu->bg_shift_attrib_low & bit) ? 1 : 0; - uint8_t a1 = (ppu->bg_shift_attrib_high & bit) ? 1 : 0; + uint8_t a0 = !!(ppu->bg_shift_attrib_low & bit); + uint8_t a1 = !!(ppu->bg_shift_attrib_high & bit); bg_palette = (a1 << 1) | a0; } - if(ppu->reg_mask & 0x10) { + // Sprite fetch + if(LIKELY(ppu->reg_mask & 0x10)) { for(uint8_t i = 0; i < ppu->sprite_count; i++) { - if(ppu->sprite_positions[i] == 0) { - uint8_t p0 = (ppu->sprite_shift_lo[i] & 0x80) ? 1 : 0; - uint8_t p1 = (ppu->sprite_shift_hi[i] & 0x80) ? 1 : 0; - sp_pixel = (p1 << 1) | p0; - - if(sp_pixel) { - sp_palette = ppu->secondary_oam[i * 4 + 2] & 3; - sp_prio = ppu->sprite_priorities[i]; - sp_zero = (ppu->sprite_indexes[i] == 0); - break; - } - } - } - } + if(ppu->sprite_positions[i]) continue; - if(bg_pixel == 0 && sp_pixel == 0) { - final_color = ppu->palette[0]; - } else if(bg_pixel == 0 && sp_pixel != 0) { - final_color = ppu->palette[0x10 | (sp_palette << 2) | sp_pixel]; - } else if(bg_pixel != 0 && sp_pixel == 0) { - final_color = ppu->palette[(bg_palette << 2) | bg_pixel]; - } else { - if(sp_zero && ppu->sprite_zero_hit_possible && x < 255) { - ppu->reg_status |= 0x40; - } - if(sp_prio == 0) { - final_color = ppu->palette[0x10 | (sp_palette << 2) | sp_pixel]; - } else { - final_color = ppu->palette[(bg_palette << 2) | bg_pixel]; + uint8_t lo = ppu->sprite_shift_lo[i]; + uint8_t hi = ppu->sprite_shift_hi[i]; + sp_pixel = ((hi & 0x80) >> 6) | ((lo & 0x80) >> 7); + + if(!sp_pixel) continue; + + sp_palette = ppu->secondary_oam[i * 4 + 2] & 3; + sp_prio = ppu->sprite_priorities[i]; + sp_zero = (ppu->sprite_indexes[i] == 0); + break; } } - ppu->pixels[y * 256 + x] = final_color; -} +#endif + // Final pixel composition + uint8_t palette_index = 0; + uint8_t bg_index = (bg_palette << 2) + bg_pixel; + uint8_t sp_index = (sp_palette << 2) + sp_pixel; + uint8_t selector = (bg_pixel ? 2 : 0) | (sp_pixel ? 1 : 0); + + switch(selector) { + case 0: { palette_index = 0; } break; + case 1: { palette_index = 0x10 | sp_index; } break; + case 2: { palette_index = bg_index; } break; + case 3: { + if(sp_zero && ppu->sprite_zero_hit_possible && x < 255) { + ppu->reg_status |= 0x40; + } + palette_index = (sp_prio) ? bg_index : 0x10 | sp_index; + } break; + } + state->pixels[y * 256 + x] = ppu->palette[palette_index]; // NOTE(peter): Add color_emphasis bits (expand palette to 8x). +} __attribute__((hot, flatten)) static void ppu_tick(struct nes_state *state) { @@ -272,7 +308,6 @@ static void ppu_tick(struct nes_state *state) { for(uint32_t ppu_loops = 0; ppu_loops < 3; ++ppu_loops) { - if(LIKELY(rendering)) { if(ppu->even_frame && dot == 0) { @@ -284,28 +319,25 @@ static void ppu_tick(struct nes_state *state) { ppu_render_pixel(state); } - if((dot >= 1 && dot <= 256) || (dot >= 321 && dot <= 336)) { + if(scanline < 240 || scanline == 261) { + if((dot >= 1 && dot <= 256) || (dot >= 321 && dot <= 336)) { - if(ppu->reg_mask & 0x10) { - for(uint32_t i = 0; i < ppu->sprite_count; i++) { - if(ppu->sprite_positions[i] > 0) { - ppu->sprite_positions[i]--; - } else { - ppu->sprite_shift_lo[i] <<= 1; - ppu->sprite_shift_hi[i] <<= 1; + if(ppu->reg_mask & 0x10) { + for(uint32_t i = 0; i < ppu->sprite_count; i++) { + if(ppu->sprite_positions[i] > 0) { + ppu->sprite_positions[i]--; + } else { + ppu->sprite_shift_lo[i] <<= 1; + ppu->sprite_shift_hi[i] <<= 1; + } } } - } - - ppu->bg_shift_pattern_low <<= 1; - ppu->bg_shift_pattern_high <<= 1; - ppu->bg_shift_attrib_low <<= 1; - ppu->bg_shift_attrib_high <<= 1; - } + ppu->bg_shift_pattern_low <<= 1; + ppu->bg_shift_pattern_high <<= 1; + ppu->bg_shift_attrib_low <<= 1; + ppu->bg_shift_attrib_high <<= 1; - if(scanline < 240 || scanline == 261) { - if((dot >= 1 && dot <= 256) || (dot >= 321 && dot <= 336)) { switch(dot % 8) { case 1: { uint32_t nt_addr = 0x2000 | (ppu->vram_addr & 0x0fff); -- cgit v1.2.3