From c9bd7fecdb5d6c8954cf31efef910ed734386c70 Mon Sep 17 00:00:00 2001 From: Peter Fors Date: Tue, 8 Apr 2025 13:42:13 +0200 Subject: 2690fps --- build.sh | 2 +- mapper.c | 1 + mapper_0003.c | 5 +---- mknes.c | 10 +++++----- ppu.c | 18 ++++++++++-------- render.c | 58 +++++++++++++++++++++++++++++----------------------------- 6 files changed, 47 insertions(+), 47 deletions(-) diff --git a/build.sh b/build.sh index 3e878d3..b8f6a4e 100755 --- a/build.sh +++ b/build.sh @@ -5,7 +5,7 @@ PROJECT_NAME="mknes" # Change this for each new project # Base configuration common to all builds CFLAGS="-std=gnu11 " -CFLAGS+="-mavx2 -mbmi2 -march=native " +CFLAGS+="-mbmi2 " CFLAGS+="-mfunction-return=keep " CFLAGS+="-mindirect-branch=keep " CFLAGS+="-fwrapv -ffast-math -fno-trapping-math -fwhole-program " diff --git a/mapper.c b/mapper.c index 188d88b..e10c97c 100644 --- a/mapper.c +++ b/mapper.c @@ -42,6 +42,7 @@ static struct mapper_entry mapper_table[] = { /* Mapper: b */ { 0x0b, mapper_000b_prg_read, mapper_000b_prg_write, mapper_000b_chr_read, mapper_000b_chr_write, mapper_default_ciram_read, mapper_default_ciram_write, mapper_default_tick, mapper_000b_init }, /* Mapper: 66 */ { 0x42, mapper_0042_prg_read, mapper_0042_prg_write, mapper_0042_chr_read, mapper_0042_chr_write, mapper_default_ciram_read, mapper_default_ciram_write, mapper_default_tick, mapper_0042_init }, /* Mapper: 2002 */ { 0x2002, mapper_2002_prg_read, mapper_2002_prg_write, mapper_2002_chr_read, mapper_2002_chr_write, mapper_default_ciram_read, mapper_default_ciram_write, mapper_default_tick, mapper_2002_init }, +/* Mapper: 2003 */ { 0x2003, mapper_0003_prg_read, mapper_0003_prg_write, mapper_0003_chr_read, mapper_0003_chr_write, mapper_default_ciram_read, mapper_default_ciram_write, mapper_default_tick, mapper_0003_init }, }; static void mapper_setup(struct nes_state *state) { diff --git a/mapper_0003.c b/mapper_0003.c index 743df51..a4ae56c 100644 --- a/mapper_0003.c +++ b/mapper_0003.c @@ -5,10 +5,7 @@ static void mapper_0003_init(struct nes_state *state) { } static uint8_t mapper_0003_prg_read(struct nes_state *state, uint32_t addr) { - if(addr >= 0x8000) { - return state->prg_rom[addr - 0x8000]; - } - return 0; + return state->prg_rom[addr & 0x7fff]; } static void mapper_0003_prg_write(struct nes_state *state, uint32_t addr, uint8_t value) { diff --git a/mknes.c b/mknes.c index ea735f6..09da608 100644 --- a/mknes.c +++ b/mknes.c @@ -173,7 +173,7 @@ int main(int argc, char **argv) { // ines2_load(nstate, "data/0003/Flipull - An Exciting Cube Game (Japan) (En).zip"); // ines2_load(nstate, "data/0003/Friday the 13th (USA).zip"); // ines2_load(nstate, "data/0003/Ghostbusters (Japan).zip"); - + // ines2_load(nstate, "data/0003/Gradius (USA).zip"); // ines2_load(nstate, "data/0007/Battletoads (USA).zip"); // ines2_load(nstate, "data/0007/Beetlejuice (USA).zip"); // ines2_load(nstate, "data/0007/Cabal (USA).zip"); @@ -226,7 +226,7 @@ int main(int argc, char **argv) { } } - set_decay(10); + set_decay(20); timer_start(timer); @@ -241,7 +241,7 @@ int main(int argc, char **argv) { } #else while(!glfwWindowShouldClose(window)) { - timer_wait(timer); + // timer_wait(timer); glfwPollEvents(); while(!nstate->ppu.frame_ready) { @@ -251,7 +251,7 @@ int main(int argc, char **argv) { nstate->ppu.frame_ready = 0; frames++; - uint32_t * restrict dst = buffer; + uint32_t * restrict dst = display_buffer; //buffer; uint8_t * restrict src = nstate->pixels; for(uint32_t y = 0; y < 240; ++y) { for(uint32_t x = 0; x < 256; ++x) { @@ -261,7 +261,7 @@ int main(int argc, char **argv) { } dst += BUFFER_WIDTH; } - apply_phosphor_decay(); + // apply_phosphor_decay(); render_frame(); glfwSwapBuffers(window); } diff --git a/ppu.c b/ppu.c index cef45fa..bab97d5 100644 --- a/ppu.c +++ b/ppu.c @@ -22,8 +22,8 @@ static inline void ppu_evaluate_sprites(struct nes_state *state) { uint8_t sprite_height = (ppu->reg_ctrl & 0x20) ? 16 : 8; uint8_t n = 0; - uint8_t *src = ppu->oam; - uint8_t *dst = ppu->secondary_oam; + uint8_t * restrict src = ppu->oam; + uint8_t * restrict dst = ppu->secondary_oam; for(uint8_t i = 0; i < 64; i++) { uint8_t y = src[0]; int32_t row = (int32_t)ppu->scanline - y; @@ -55,8 +55,10 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state *state) { struct ppu_state *restrict ppu = &state->ppu; uint32_t addr; uint32_t bank; + uint8_t lsb; + uint8_t msb; - uint8_t *s = ppu->secondary_oam; + uint8_t * restrict s = ppu->secondary_oam; uint8_t height = (ppu->reg_ctrl & 0x20) ? 16 : 8; for(uint8_t i = 0; i < ppu->sprite_count; i++) { @@ -77,12 +79,12 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state *state) { } addr = bank + tile * 16 + row; - uint8_t lsb = state->mapper.chr_read(state, addr); - uint8_t msb = state->mapper.chr_read(state, addr + 8); - if(attr & 0x40) { - lsb = ppu_bitreverse_lut[lsb]; - msb = ppu_bitreverse_lut[msb]; + lsb = ppu_bitreverse_lut[state->mapper.chr_read(state, addr)]; + msb = ppu_bitreverse_lut[state->mapper.chr_read(state, addr + 8)]; + } else { + lsb = state->mapper.chr_read(state, addr); + msb = state->mapper.chr_read(state, addr + 8); } ppu->sprite_shift_lo[i] = lsb; diff --git a/render.c b/render.c index 26b3c19..6e399ef 100644 --- a/render.c +++ b/render.c @@ -16,32 +16,32 @@ static void set_decay(uint16_t old_weight) { } /* [=]===^=[ apply_phosphor_decay ]=================================================================^===[=] */ -__attribute__((always_inline, hot)) -static inline void apply_phosphor_decay(void) { - // PROFILE_FUNCTION(); - __m256i old_weight = _mm256_set1_epi16(_old_weight); - __m256i new_weight = _mm256_set1_epi16(_new_weight); - __m128i alpha_mask = _mm_set1_epi32(0x000000ff); - uint32_t * restrict src = buffer; - uint32_t * restrict dst = display_buffer; - - for(uint32_t y = 0; y < BUFFER_HEIGHT; ++y, src += BUFFER_WIDTH, dst += BUFFER_WIDTH) { - for(uint32_t x = 0; x < BUFFER_WIDTH; x += 4) { - _mm_prefetch((char*)&src[x + 2 * BUFFER_WIDTH], _MM_HINT_T0); - _mm_prefetch((char*)&dst[x + 2 * BUFFER_WIDTH], _MM_HINT_T0); - - __m128i new_pixels = _mm_load_si128((__m128i*)&src[x]); - __m128i old_pixels = _mm_load_si128((__m128i*)&dst[x]); - - __m256i old_lo = _mm256_cvtepu8_epi16(old_pixels); - __m256i new_lo = _mm256_cvtepu8_epi16(new_pixels); - - __m256i blended = _mm256_adds_epu16(_mm256_mullo_epi16(old_lo, old_weight), _mm256_mullo_epi16(new_lo, new_weight)); - blended = _mm256_srli_epi16(blended, 8); - - __m128i final_pixels = _mm_packus_epi16(_mm256_castsi256_si128(blended), _mm256_extracti128_si256(blended, 1)); - final_pixels = _mm_or_si128(final_pixels, _mm_and_si128(old_pixels, alpha_mask)); - _mm_store_si128((__m128i*)&dst[x], final_pixels); - } - } -} +// __attribute__((always_inline, hot)) +// static inline void apply_phosphor_decay(void) { +// // PROFILE_FUNCTION(); +// __m256i old_weight = _mm256_set1_epi16(_old_weight); +// __m256i new_weight = _mm256_set1_epi16(_new_weight); +// __m128i alpha_mask = _mm_set1_epi32(0x000000ff); +// uint32_t * restrict src = buffer; +// uint32_t * restrict dst = display_buffer; + +// for(uint32_t y = 0; y < BUFFER_HEIGHT; ++y, src += BUFFER_WIDTH, dst += BUFFER_WIDTH) { +// for(uint32_t x = 0; x < BUFFER_WIDTH; x += 4) { +// _mm_prefetch((char*)&src[x + 2 * BUFFER_WIDTH], _MM_HINT_T0); +// _mm_prefetch((char*)&dst[x + 2 * BUFFER_WIDTH], _MM_HINT_T0); + +// __m128i new_pixels = _mm_load_si128((__m128i*)&src[x]); +// __m128i old_pixels = _mm_load_si128((__m128i*)&dst[x]); + +// __m256i old_lo = _mm256_cvtepu8_epi16(old_pixels); +// __m256i new_lo = _mm256_cvtepu8_epi16(new_pixels); + +// __m256i blended = _mm256_adds_epu16(_mm256_mullo_epi16(old_lo, old_weight), _mm256_mullo_epi16(new_lo, new_weight)); +// blended = _mm256_srli_epi16(blended, 8); + +// __m128i final_pixels = _mm_packus_epi16(_mm256_castsi256_si128(blended), _mm256_extracti128_si256(blended, 1)); +// final_pixels = _mm_or_si128(final_pixels, _mm_and_si128(old_pixels, alpha_mask)); +// _mm_store_si128((__m128i*)&dst[x], final_pixels); +// } +// } +// } -- cgit v1.2.3