| author | Peter Fors <peter.fors@mindkiller.com> | 2025-10-28 17:44:57 +0100 |
|---|---|---|
| committer | Peter Fors <peter.fors@mindkiller.com> | 2025-10-28 17:44:57 +0100 |
| commit | 9ee76c20c0d093d5adac2dcc3b275b53b879c369 (patch) | |
| tree | 7f21652a42d96f2ede7822950cbbf9c044ca43ca | |
| parent | 3b7621981b56a51756badac70034f68366878df9 (diff) | |
small optimizations of sprite evaluation in ppu_render_pixel
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | .gitignore | 2 |
| -rwxr-xr-x | Bench.sh | 1 |
| -rwxr-xr-x | build.sh | 12 |
| -rw-r--r-- | mknes.c | 6 |
| -rw-r--r-- | mknes_apu.c | 31 |
| -rw-r--r-- | mknes_bench.c | 199 |
| -rw-r--r-- | mknes_ppu.c | 121 |
7 files changed, 227 insertions, 145 deletions
```diff
diff --git a/.gitignore b/.gitignore
@@ -18,3 +18,5 @@ mknes_memory
 original.c
 mknes.s
 record.txt
 _Bench.sh
+toolchain
+
diff --git a/Bench.sh b/Bench.sh
@@ -3,7 +3,6 @@
 ./build.sh clean
 ./build.sh profile
 ./mknes -n 1 &> /dev/null
-./build.sh profile_release
 
 # Run full benchmark
diff --git a/build.sh b/build.sh
@@ -1,9 +1,16 @@
 #!/bin/bash
 
+# Use project-local GCC if available, otherwise system GCC
+TOOLCHAIN_GCC="./toolchain/gcc-15.2.0/bin/gcc"
+if [ -f "${TOOLCHAIN_GCC}" ]; then
+    CC="${TOOLCHAIN_GCC}"
+else
+    CC=gcc
+fi
+
 # Set the project name here
 PROJECT_NAME="mknes" # Change this for each new project
-CC=gcc
 WIN_CC=x86_64-w64-mingw32-gcc
 
 # Base configuration common to all builds
@@ -52,8 +59,7 @@ case "$BUILD_TYPE" in
         # -pg # for gprof
         ;;
     "release")
-        # CFLAGS+="-s -Wl,--strip-all -O2 "
-        CFLAGS+=" -O2 "
+        CFLAGS+="-s -Wl,--strip-all -O2 "
         ;;
     "profile")
         CFLAGS+="-O2 -fprofile-generate -ftest-coverage -DBENCHMARK "
diff --git a/mknes.c b/mknes.c
@@ -60,7 +60,7 @@ static void audio_callback(int16_t *data, size_t frames) {
 }
 // Uncomment the ROM you want to benchmark:
 // INCBIN_BYTES(benchmark_rom, "data/Life Force (USA).nes");
 INCBIN_BYTES(benchmark_rom, "data/0000/Super Mario Bros. (World) (HVC-SM).nes");
-// INCBIN_BYTES(benchmark_rom, "data/0003/Gradius (USA).zip");
+// INCBIN_BYTES(benchmark_rom, "data/0003/Gradius (USA).nes");
 #endif
 #include "platform_gl_loader.c"
@@ -271,7 +271,7 @@ int main(int argc, char **argv) {
     // ines2_load(nstate, "data/0000/Excitebike (Japan, USA).nes");
     // ines2_load(nstate, "data/0000/Ice Climber (USA, Europe, Korea).nes");
     // ines2_load(nstate, "data/0000/Kung Fu (Japan, USA).nes");
-    // ines2_load(nstate, "data/0000/Super Mario Bros. (World) (HVC-SM).nes");
+    ines2_load(nstate, "data/0000/Super Mario Bros. (World) (HVC-SM).nes");
     // ines2_load(nstate, "data/Super Mario Bros. (W) (V1.0) [!].nes");
     // ines2_load(nstate, "data/Super Mario Bros. (JU) [!].nes");
     // ines2_load(nstate, "data/0000/Urban Champion (World).nes");
@@ -293,7 +293,7 @@ int main(int argc, char **argv) {
     // ines2_load(nstate, "data/0000/Xevious - The Avenger (USA).zip");
     // ines2_load(nstate, "data/tv.nes");
-    ines2_load(nstate, "data/Life Force (USA).nes"); // 2002
+    // ines2_load(nstate, "data/Life Force (USA).nes"); // 2002
     // ines2_load(nstate, "data/0003/Flipull - An Exciting Cube Game (Japan) (En).zip");
     // ines2_load(nstate, "data/0003/Friday the 13th (USA).zip");
diff --git a/mknes_apu.c b/mknes_apu.c
index 98552ed..027b1e5 100644
--- a/mknes_apu.c
+++ b/mknes_apu.c
@@ -35,12 +35,19 @@ static uint8_t apu_read4015(struct nes_state *state) {
     }
     if(apu->irq_pending) {
         result |= 0x40;
-        apu->irq_pending = 0;
     }
+
+    // Reading $4015 clears the frame IRQ flag
+    apu->irq_pending = 0;
+    // Only clear CPU IRQ if DMC isn't requesting it
+    if(!(apu->dmc_bytes_remaining > 0 && apu->dmc_irq_enable)) {
+        state->cpu.irq_pending = 0;
+    }
+
     return result;
 }
 
-// $4010–$4013, $4015 write
+// $4010–$4013, $4015, $4017 write
 static void apu_write(struct nes_state *state, uint16_t addr, uint8_t val) {
     struct apu_state *apu = &state->apu;
@@ -62,6 +69,26 @@ static void apu_write(struct nes_state *state, uint16_t addr, uint8_t val) {
         case 0x4015: {
             apu_write4015(state, val);
         } break;
+        case 0x4017: {
+            // Frame counter control
+            apu->mode = (val >> 7) & 1;
+            apu->irq_inhibit = (val >> 6) & 1;
+
+            // If IRQ inhibit flag is set, clear the frame IRQ
+            if(apu->irq_inhibit) {
+                apu->irq_pending = 0;
+                // Only clear CPU IRQ if DMC isn't requesting it
+                if(!(apu->dmc_bytes_remaining > 0 && apu->dmc_irq_enable)) {
+                    state->cpu.irq_pending = 0;
+                }
+            }
+
+            // Reset frame counter (with delay, but we'll approximate immediately for now)
+            apu->frame_cycle = 0;
+
+            // If 5-step mode, immediately clock half-frame and quarter-frame
+            // (For timing purposes without audio, we can leave this empty)
+        } break;
     }
 }
diff --git a/mknes_bench.c b/mknes_bench.c
index 12fb642..b064266 100644
--- a/mknes_bench.c
+++ b/mknes_bench.c
@@ -8,7 +8,6 @@
 #include <math.h>
 #include <time.h>
 
-// Performance counter setup
 struct perf_counter {
     int fd;
     uint64_t value;
@@ -17,14 +16,23 @@ struct perf_counter {
     uint64_t config;
 };
 
+struct perf_group_read {
+    uint64_t nr;
+    uint64_t time_enabled;
+    uint64_t time_running;
+    struct {
+        uint64_t value;
+    } values[5];
+} __attribute__((packed));
+
 struct bench_run {
     uint64_t cycles;
     uint64_t instructions;
     uint64_t stalled_cycles_frontend;
-    uint64_t stalled_cycles_backend;
    uint64_t branches;
     uint64_t branch_misses;
     uint64_t time_ns;
+    double multiplexing_coverage; // ratio of time_running/time_enabled
 };
 
 struct bench_stats {
@@ -38,51 +46,76 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu
     return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
 }
 
-static int setup_counter(struct perf_counter *counter, uint32_t type, uint64_t config, const char *name) {
+static int setup_counter_group(struct perf_counter *counters, int num_counters) {
     struct perf_event_attr pe;
     memset(&pe, 0, sizeof(struct perf_event_attr));
-    pe.type = type;
+    pe.type = PERF_TYPE_HARDWARE;
     pe.size = sizeof(struct perf_event_attr);
-    pe.config = config;
     pe.disabled = 1;
-    pe.exclude_kernel = 0;
-    pe.exclude_hv = 0;
+    pe.exclude_kernel = 1;
+    pe.exclude_hv = 1;
     pe.exclude_idle = 1;
+    pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
 
-    counter->fd = perf_event_open(&pe, 0, -1, -1, 0);
-    counter->name = name;
-    counter->type = type;
-    counter->config = config;
-    counter->value = 0;
+    int leader_fd = -1;
 
-    return counter->fd;
-}
+    for(int i = 0; i < num_counters; i++) {
+        pe.config = counters[i].config;
 
-static void reset_counters(struct perf_counter *counters, int n) {
-    for(int i = 0; i < n; i++) {
-        if(counters[i].fd >= 0) {
-            ioctl(counters[i].fd, PERF_EVENT_IOC_RESET, 0);
+        if(i == 0) {
+            // First counter is the group leader
+            leader_fd = perf_event_open(&pe, 0, -1, -1, 0);
+            counters[i].fd = leader_fd;
+        } else {
+            // Subsequent counters are group members
+            counters[i].fd = perf_event_open(&pe, 0, -1, leader_fd, 0);
         }
+
+        if(counters[i].fd < 0) {
+            fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name);
+        }
+
+        counters[i].value = 0;
     }
+
+    return leader_fd;
 }
 
-static void start_counters(struct perf_counter *counters, int n) {
-    for(int i = 0; i < n; i++) {
-        if(counters[i].fd >= 0) {
-            ioctl(counters[i].fd, PERF_EVENT_IOC_ENABLE, 0);
-        }
+static void reset_counters_group(int leader_fd) {
+    if(leader_fd >= 0) {
+        ioctl(leader_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
     }
 }
 
-static void stop_counters(struct perf_counter *counters, int n) {
-    for(int i = 0; i < n; i++) {
-        if(counters[i].fd >= 0) {
-            ioctl(counters[i].fd, PERF_EVENT_IOC_DISABLE, 0);
-            read(counters[i].fd, &counters[i].value, sizeof(uint64_t));
-        }
+static void start_counters_group(int leader_fd) {
+    if(leader_fd >= 0) {
+        ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
     }
 }
 
+static int stop_and_read_counters_group(int leader_fd, struct perf_counter *counters, int num_counters, struct perf_group_read *result) {
+    if(leader_fd < 0) {
+        return -1;
+    }
+
+    // Stop all counters atomically
+    ioctl(leader_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
+
+    // Read all counter values in one syscall
+    ssize_t bytes_read = read(leader_fd, result, sizeof(struct perf_group_read));
+    if(bytes_read < 0) {
+        fprintf(stderr, "Error: Failed to read perf group counters\n");
+        return -1;
+    }
+
+    // Store values in individual counter structs for compatibility
+    for(int i = 0; i < num_counters && i < result->nr; i++) {
+        counters[i].value = result->values[i].value;
+    }
+
+    return 0;
+}
+
 static void close_counters(struct perf_counter *counters, int n) {
     for(int i = 0; i < n; i++) {
         if(counters[i].fd >= 0) {
@@ -133,7 +166,6 @@ static void set_cpu_affinity(int cpu) {
     }
 }
 
-// Static allocation for benchmark runs - no malloc, page-aligned
 #define MAX_BENCH_RUNS 100
 static struct bench_run runs_storage[MAX_BENCH_RUNS] __attribute__((aligned(4096)));
 static double per_frame_storage[MAX_BENCH_RUNS * 3] __attribute__((aligned(4096)));
@@ -144,38 +176,36 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
         return;
     }
 
-    struct perf_counter counters[6];
-    int num_counters = 0;
+    struct perf_counter counters[5];
+    int num_counters = 5;
+
+    // Initialize counter metadata
+    counters[0] = (struct perf_counter){.fd = -1, .value = 0, .name = "cycles", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES};
+    counters[1] = (struct perf_counter){.fd = -1, .value = 0, .name = "instructions", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS};
+    counters[2] = (struct perf_counter){.fd = -1, .value = 0, .name = "stalled-cycles-frontend", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND};
+    counters[3] = (struct perf_counter){.fd = -1, .value = 0, .name = "branches", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS};
+    counters[4] = (struct perf_counter){.fd = -1, .value = 0, .name = "branch-misses", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES};
 
-    // Set up performance counters
-    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "cycles");
-    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "instructions");
-    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "stalled-cycles-frontend");
-    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, "stalled-cycles-backend");
-    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branches");
-    setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses");
+    // Set up performance counter group
+    int leader_fd = setup_counter_group(counters, num_counters);
 
     // Check which counters are available
     int available_counters = 0;
     for(int i = 0; i < num_counters; i++) {
         if(counters[i].fd >= 0) {
             available_counters++;
-        } else {
-            fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name);
         }
     }
 
-    if(available_counters == 0) {
+    if(available_counters == 0 || leader_fd < 0) {
         fprintf(stderr, "Error: No performance counters available\n");
         close_counters(counters, num_counters);
         return;
     }
 
-    // Use static storage for runs
     struct bench_run *runs = runs_storage;
     memset(runs, 0, sizeof(struct bench_run) * num_runs);
 
-    // Set CPU affinity and realtime priority
     set_cpu_affinity(1);
     set_realtime_priority();
@@ -194,7 +224,7 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
         nstate->ppu.frame_ready = 0;
     }
 
-    // Run benchmark iterations
+    // Run benchmark
     for(uint32_t run = 0; run < num_runs; run++) {
 
         // Reset emulator state
@@ -209,8 +239,8 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
         clock_gettime(CLOCK_MONOTONIC, &start_time);
 
         // Reset and start counters (after clock_gettime to exclude its overhead)
-        reset_counters(counters, num_counters);
-        start_counters(counters, num_counters);
+        reset_counters_group(leader_fd);
+        start_counters_group(leader_fd);
 
         // Run emulation
         for(uint32_t i = 0; i < frames_per_run; i++) {
@@ -220,17 +250,20 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
             nstate->ppu.frame_ready = 0;
         }
 
-        // Stop counters (before clock_gettime to exclude its overhead)
-        stop_counters(counters, num_counters);
-
-        // Stop timing
+        // Stop and read all counters atomically in one syscall
+        struct perf_group_read group_result;
+        stop_and_read_counters_group(leader_fd, counters, num_counters, &group_result);
         clock_gettime(CLOCK_MONOTONIC, &end_time);
 
-        // Calculate elapsed time in nanoseconds
         uint64_t elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL + (end_time.tv_nsec - start_time.tv_nsec);
 
+        // Calculate multiplexing coverage
+        double coverage = (group_result.time_enabled > 0) ?
+            (double)group_result.time_running / (double)group_result.time_enabled : 1.0;
+        // Store results
         runs[run].time_ns = elapsed_ns;
+        runs[run].multiplexing_coverage = coverage;
         for(int i = 0; i < num_counters; i++) {
             if(counters[i].fd < 0) continue;
@@ -240,8 +273,6 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
                 runs[run].instructions = counters[i].value;
             } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_FRONTEND) {
                 runs[run].stalled_cycles_frontend = counters[i].value;
-            } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_BACKEND) {
-                runs[run].stalled_cycles_backend = counters[i].value;
             } else if(counters[i].config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) {
                 runs[run].branches = counters[i].value;
             } else if(counters[i].config == PERF_COUNT_HW_BRANCH_MISSES) {
@@ -250,11 +281,43 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
         }
     }
 
+    // Check for multiplexing and warn user
+    int multiplexing_detected = 0;
+    double min_coverage = 1.0;
+    double max_coverage = 1.0;
+    for(uint32_t i = 0; i < num_runs; i++) {
+        if(runs[i].multiplexing_coverage < 0.9999) { // Allow for tiny floating point error
+            multiplexing_detected = 1;
+        }
+        if(runs[i].multiplexing_coverage < min_coverage) {
+            min_coverage = runs[i].multiplexing_coverage;
+        }
+        if(runs[i].multiplexing_coverage > max_coverage) {
+            max_coverage = runs[i].multiplexing_coverage;
+        }
+    }
+
+    if(multiplexing_detected) {
+        fprintf(stderr, "\n");
+        fprintf(stderr, "========================================\n");
+        fprintf(stderr, "WARNING: COUNTER MULTIPLEXING DETECTED!\n");
+        fprintf(stderr, "========================================\n");
+        fprintf(stderr, "The kernel time-sliced your performance counters.\n");
+        fprintf(stderr, "This means the counters were NOT running 100%% of the time.\n");
+        fprintf(stderr, "Coverage range: %.2f%% - %.2f%%\n", min_coverage * 100.0, max_coverage * 100.0);
+        fprintf(stderr, "Results may be SCALED and LESS PRECISE.\n");
+        fprintf(stderr, "Consider reducing the number of counters.\n");
+        fprintf(stderr, "========================================\n");
+        fprintf(stderr, "\n");
+    } else {
+        // All good - counters ran at 100% coverage (no multiplexing)
+        printf("Performance counter coverage: 100%% (no multiplexing - full precision)\n");
+    }
+
     // Calculate aggregated totals
     uint64_t total_instructions = 0;
     uint64_t total_cycles = 0;
     uint64_t total_stalled_frontend = 0;
-    uint64_t total_stalled_backend = 0;
     uint64_t total_branches = 0;
     uint64_t total_branch_misses = 0;
     uint64_t total_time_ns = 0;
@@ -263,7 +326,6 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
         total_instructions += runs[i].instructions;
         total_cycles += runs[i].cycles;
         total_stalled_frontend += runs[i].stalled_cycles_frontend;
-        total_stalled_backend += runs[i].stalled_cycles_backend;
         total_branches += runs[i].branches;
         total_branch_misses += runs[i].branch_misses;
         total_time_ns += runs[i].time_ns;
@@ -287,35 +349,32 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
     // Print results
     double total_time_s = (double)total_time_ns / 1000000000.0;
     double ipc = (double)total_instructions / total_cycles;
-    double stalled_per_insn = (double)(total_stalled_frontend + total_stalled_backend) / total_instructions;
+    double stalled_per_insn = (double)total_stalled_frontend / total_instructions;
     double ghz = (double)total_cycles / total_time_s / 1000000000.0;
     double branches_per_sec = (double)total_branches / total_time_s;
     double branch_miss_rate = (double)total_branch_misses / total_branches * 100.0;
     double stalled_frontend_pct = (double)total_stalled_frontend / total_cycles * 100.0;
-    double stalled_backend_pct = (double)total_stalled_backend / total_cycles * 100.0;
     double mips = (double)total_instructions / total_time_s / 1000000.0;
     double mcps = (double)total_cycles / total_time_s / 1000000.0;
 
     printf("\n");
-    printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc);
-    printf("%56s# %.2f stalled cycles per insn \n", "", stalled_per_insn);
-    printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz);
-    printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle \n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct);
-    printf("%20llu stalled-cycles-backend # %.2f%% backend cycles idle \n", (unsigned long long)total_stalled_backend, stalled_backend_pct);
-    printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0);
-    printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate);
+    printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc);
+    printf("%56s# %.2f stalled cycles per insn\n", "", stalled_per_insn);
+    printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz);
+    printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle\n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct);
+    printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0);
+    printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate);
     printf("\n");
     printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps);
     printf("\n");
-    printf("cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n);
-    printf("insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n);
-    printf("time (ms) mean=%.3f sd=%.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n);
+    printf("cycles/frame mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n);
+    printf("insn/frame   mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n);
+    printf("time (ms)    mean=%9.3f sd=%6.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n);
 
     double fps = (double)frames_per_run / (time_stats.mean / 1000.0);
     double ms_per_frame = time_stats.mean / frames_per_run;
     printf("FPS (frames/second) = %.2f\n", fps);
-    printf("ms/frame = %.6f\n", ms_per_frame);
+    printf(" ms/frame = %.6f\n", ms_per_frame);
 
-    // Cleanup
     close_counters(counters, num_counters);
 }
diff --git a/mknes_ppu.c b/mknes_ppu.c
index fcaf681..7c2b4ac 100644
--- a/mknes_ppu.c
+++ b/mknes_ppu.c
@@ -24,7 +24,7 @@ static void ppu_reset(struct nes_state *state) {
     memset(ppu, 0, sizeof(struct ppu_state));
 }
 
-__attribute__((hot, flatten))
+__attribute__((hot, flatten, optimize("unroll-loops")))
 static inline void ppu_evaluate_sprites(struct nes_state *state, uint32_t scanline) {
     struct ppu_state *restrict ppu = &state->ppu;
     uint8_t sprite_height = (ppu->reg_ctrl & PPU_CTRL_SPRITE_HEIGHT) ? 16 : 8;
@@ -62,6 +62,7 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state,
     uint8_t * restrict sec_oam = ppu->secondary_oam;
     uint8_t ctrl = ppu->reg_ctrl;
     uint8_t sprite_height = (ctrl & PPU_CTRL_SPRITE_HEIGHT) ? 16 : 8;
+    uint32_t sprite_pattern_table_base = (ctrl & PPU_CTRL_SPRITE_TILE) << 9;
 
     for(uint8_t i = 0; i < ppu->sprite_count; i++, sec_oam += 4) {
@@ -76,16 +77,21 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state,
         uint32_t bank;
         uint32_t addr;
         if(sprite_height == 16) {
-            bank = (tile & 1) << 12;
-            tile &= 0xfe;
-            if(row >= 8) {
-                tile++;
-                row -= 8;
-            }
-            addr = bank + tile * 16 + row;
+            // For 8x16 sprites:
+            // - Bank comes from tile bit 0 (bits 1-7 are the tile index)
+            // - Row 0-7 uses base tile, row 8-15 uses base tile + 1
+            // - Row offset wraps to 0-7 within each 8-pixel half
+            //
+            // Original logic:
+            //   bank = (tile & 1) << 12;
+            //   tile &= 0xfe;
+            //   if(row >= 8) { tile++; row -= 8; }
+            //   addr = bank + tile * 16 + row;
+            addr = ((tile & 1) << 12) + ((tile & 0xfe) + (row >> 3)) * 16 + (row & 7);
         } else {
             addr = sprite_pattern_table_base + tile * 16 + row;
+
         }
 
         uint8_t val_lo = state->mapper_function.chr_read(state, addr);
@@ -103,9 +109,8 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state,
     }
 }
 
-
-__attribute__((always_inline, hot, optimize("no-jump-tables")))
-static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t x, uint32_t y) {
+__attribute__((always_inline, hot, optimize("no-jump-tables", "no-unroll-loops")))
+static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t x, uint32_t y, uint8_t mask_reg) {
     struct ppu_state *restrict ppu = &state->ppu;
 
     uint16_t bit = 0x8000 >> ppu->fine_x;
@@ -115,14 +120,14 @@ static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t
     uint8_t sp_prio = 0;
     uint8_t sp_zero = 0;
 
-    uint8_t mask_reg = ppu->reg_mask; // Single load
+    // uint8_t mask_reg = ppu->reg_mask; // Single load
     uint8_t show_bg = mask_reg & PPU_MASK_SHOW_BG;
     uint8_t show_sprites = mask_reg & PPU_MASK_SHOW_SPRITES;
     uint8_t left_bg = mask_reg & 0x02;
     uint8_t left_sp = mask_reg & 0x04;
 
     uint8_t bg_mask = (show_bg && (left_bg || x & ~7)) ? 0xff : 0x00;
-    uint8_t sp_mask = (show_sprites && (left_sp || x & ~7));// ? 0xff : 0x00;
+    uint8_t sp_mask = (show_sprites && (left_sp || x & ~7));
 
     // Background
     uint8_t p0 = !!(ppu->bg_shift_pattern_low & bit);
@@ -133,50 +138,34 @@ static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t
     uint8_t bg_pixel = ((p1 << 1) | p0) & bg_mask;
     uint8_t bg_palette = ((a1 << 1) | a0) & bg_mask;
 
-    // Sprites
-#define SPRITE_STEP(N) do { \
-    if(!ppu->sprites[(N)].position) { \
-        sp_pixel = (((ppu->sprites[(N)].shift_hi & 0x80) >> 6) | ((ppu->sprites[(N)].shift_lo & 0x80) >> 7)); \
-        if(sp_pixel) { \
-            sp_prio = ppu->sprites[(N)].priority; \
-            sp_palette = ppu->sprites[(N)].palette; \
-            if((N) == 0) { \
-                sp_zero = ppu->sprite_zero_in_range; \
-            } \
-            goto sprite_done; \
-        } \
-    } \
-} while (0)
-
-    // sprite_counts[ppu->sprite_count]++;
-    if(sp_mask && ppu->sprite_count > 0) {
-        if(ppu->sprite_count == 2) goto sprite_2;
-        if(ppu->sprite_count == 1) goto sprite_1;
-        if(ppu->sprite_count == 3) goto sprite_3;
-        if(ppu->sprite_count == 4) goto sprite_4;
-        if(ppu->sprite_count == 5) goto sprite_5;
-        if(ppu->sprite_count == 6) goto sprite_6;
-        if(ppu->sprite_count == 8) goto sprite_8;
-        if(ppu->sprite_count == 7) goto sprite_7;
-
-
-sprite_8: SPRITE_STEP(7);
-sprite_7: SPRITE_STEP(6);
-sprite_6: SPRITE_STEP(5);
-sprite_5: SPRITE_STEP(4);
-sprite_4: SPRITE_STEP(3);
-sprite_3: SPRITE_STEP(2);
-sprite_2: SPRITE_STEP(1);
-sprite_1: SPRITE_STEP(0);
+    // Sprites - evaluate in forward order (0 has highest priority)
+    if(sp_mask) {
+        uint8_t found_sprite = 0xff;
+        for(uint8_t i = 0; i < ppu->sprite_count; i++) {
+            if(!ppu->sprites[i].position) {
+                sp_pixel = (((ppu->sprites[i].shift_hi & 0x80) >> 6) | ((ppu->sprites[i].shift_lo & 0x80) >> 7));
+                if(sp_pixel) {
+                    found_sprite = i;
+                    goto sprite_found;
+                }
+            }
+        }
+        goto no_sprite;
+
+sprite_found:
+        sp_prio = ppu->sprites[found_sprite].priority;
+        sp_palette = ppu->sprites[found_sprite].palette;
+        sp_zero = ppu->sprite_zero_in_range & !(found_sprite);
     }
-sprite_done:
+no_sprite:
+    // Final pixel composition
     uint8_t bg_index = (bg_palette << 2) + bg_pixel;
     uint8_t sp_index = (sp_palette << 2) + sp_pixel;
 
     uint8_t selector = (bg_pixel ? 2 : 0) | (sp_pixel ? 1 : 0);
-    // NOTE(peter): It's actually faster to preset case3 version of palette_index than to start from zero
+    // NOTE(peter): It's actually faster to preset case 3 version of palette_index than to start from zero
     uint8_t palette_index = (sp_prio) ? bg_index : 0x10 | sp_index;
 
     switch(selector) {
@@ -189,20 +178,19 @@ sprite_done:
     state->pixels[y * 256 + x] = ppu->palette[palette_index]; // NOTE(peter): Add color_emphasis bits (expand palette to 8x).
 }
 
-__attribute__((hot, optimize("no-jump-tables")))
+__attribute__((hot, optimize("no-jump-tables", "unroll-loops")))
 static void ppu_tick(struct nes_state *state) {
     struct ppu_state *restrict ppu = &state->ppu;
     uint32_t dot = ppu->dot;
     uint32_t scanline = ppu->scanline;
-    uint8_t rendering = (ppu->reg_mask & (PPU_MASK_SHOW_SPRITES | PPU_MASK_SHOW_BG));
+    uint8_t reg_mask = ppu->reg_mask;
+    uint8_t rendering = (reg_mask & (PPU_MASK_SHOW_SPRITES | PPU_MASK_SHOW_BG));
 
     for(uint8_t ppu_loops = 0; ppu_loops < 3; ++ppu_loops) {
         if(rendering) {
-
             if(scanline <= 239) {
-
                 if(dot >= 1 && dot <= 256) {
                     if(dot == 256) {
                         if((ppu->vram_addr & 0x7000) != 0x7000) {
@@ -223,7 +211,7 @@ static void ppu_tick(struct nes_state *state) {
                         }
                     }
 
-                    ppu_render_pixel(state, dot - 1, scanline);
+                    ppu_render_pixel(state, dot - 1, scanline, reg_mask);
                     goto stupid;
                 }
 
@@ -233,7 +221,7 @@ static void ppu_tick(struct nes_state *state) {
                 }
 
                 if(dot >= 321 && dot <= 336) {
-stupid:             if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
+stupid:             if(reg_mask & PPU_MASK_SHOW_SPRITES) {
                         for(uint32_t i = 0; i < ppu->sprite_count; i++) {
                             if(ppu->sprites[i].position > 0) {
                                 ppu->sprites[i].position--;
@@ -327,16 +315,8 @@ stupid:             if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
                     goto stupid2;
                 }
 
-                if(dot == 257) {
-                    ppu->vram_addr = (ppu->vram_addr & ~0x041f) | (ppu->temp_addr & 0x041f);
-                }
-
-                if(dot >= 280 && dot <= 304) {
-                    ppu->vram_addr = (ppu->vram_addr & ~0x7be0) | (ppu->temp_addr & 0x7be0);
-                }
-
                 if(dot >= 321 && dot <= 336) {
-stupid2:            if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
+stupid2:            if(reg_mask & PPU_MASK_SHOW_SPRITES) {
                         for(uint32_t i = 0; i < ppu->sprite_count; i++) {
                             if(ppu->sprites[i].position > 0) {
                                 ppu->sprites[i].position--;
@@ -399,6 +379,15 @@ stupid2:            if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
                 } break;
             }
         }
+
+        if(dot == 257) {
+            ppu->vram_addr = (ppu->vram_addr & ~0x041f) | (ppu->temp_addr & 0x041f);
+        }
+
+        if(dot >= 280 && dot <= 304) {
+            ppu->vram_addr = (ppu->vram_addr & ~0x7be0) | (ppu->temp_addr & 0x7be0);
+        }
+
     }
 }
@@ -436,7 +425,7 @@ stupid2:            if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
         }
 
         if(state->mapper_function.tick) {
-            state->mapper_function.tick(state);
+            state->mapper_function.tick(state); // TODO(peter): This signature has to be changed to supply dot and scanline!
        }
     }
 
     ppu->dot = dot;
```
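The 8x16 sprite change in `ppu_fetch_sprite_patterns` replaces the old bank/tile/row branching with a single branchless expression. A standalone brute-force check of that equivalence, using only the two formulas taken from this diff (helper names `addr_old`/`addr_new` are illustrative, not from the repo):

```c
// Standalone sketch: verify the branchless 8x16 sprite pattern address from
// this commit matches the original branchy computation for every tile/row.
// Not part of mknes; written only to illustrate the equivalence.
#include <stdint.h>
#include <stdio.h>

static uint32_t addr_old(uint8_t tile, uint32_t row) {
    uint32_t bank = (tile & 1) << 12;   // bit 0 selects the pattern table
    uint32_t t = tile & 0xfe;           // bits 1-7 are the tile index
    if(row >= 8) {                      // bottom half uses the next tile
        t++;
        row -= 8;
    }
    return bank + t * 16 + row;
}

static uint32_t addr_new(uint8_t tile, uint32_t row) {
    return ((tile & 1) << 12) + (((uint32_t)(tile & 0xfe) + (row >> 3)) * 16) + (row & 7);
}

int main(void) {
    for(uint32_t tile = 0; tile < 256; tile++) {
        for(uint32_t row = 0; row < 16; row++) {
            if(addr_old((uint8_t)tile, row) != addr_new((uint8_t)tile, row)) {
                printf("mismatch: tile=%u row=%u\n", tile, row);
                return 1;
            }
        }
    }
    printf("old and new 8x16 address formulas agree for all tile/row values\n");
    return 0;
}
```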

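On the mknes_bench.c side, the change relies on the kernel's grouped counter read: with `read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING`, a single `read()` on the group leader returns every member's count plus the enabled/running times, which is what `struct perf_group_read` mirrors and what the multiplexing-coverage warning is computed from. A minimal sketch of that mechanism with two counters (illustrative only, not code from this commit; counter choice and error handling are assumptions):

```c
// Minimal sketch: read two hardware counters as one perf event group and
// compute the time_running/time_enabled coverage ratio.
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

struct group_read {
    uint64_t nr;            // number of counters in the group
    uint64_t time_enabled;  // ns the group was scheduled to count
    uint64_t time_running;  // ns the group was actually on the PMU
    struct { uint64_t value; } values[2];
};

static int open_counter(uint64_t config, int group_fd) {
    struct perf_event_attr pe;
    memset(&pe, 0, sizeof(pe));
    pe.type = PERF_TYPE_HARDWARE;
    pe.size = sizeof(pe);
    pe.config = config;
    pe.disabled = (group_fd == -1); // only the leader starts disabled
    pe.exclude_kernel = 1;
    pe.exclude_hv = 1;
    pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED |
                     PERF_FORMAT_TOTAL_TIME_RUNNING;
    return (int)syscall(__NR_perf_event_open, &pe, 0, -1, group_fd, 0);
}

int main(void) {
    int leader = open_counter(PERF_COUNT_HW_CPU_CYCLES, -1);
    int member = open_counter(PERF_COUNT_HW_INSTRUCTIONS, leader);
    if(leader < 0 || member < 0) {
        fprintf(stderr, "perf_event_open failed (permissions?)\n");
        return 1;
    }

    ioctl(leader, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
    ioctl(leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);

    volatile uint64_t sum = 0;              // some work to measure
    for(uint64_t i = 0; i < 10000000; i++) sum += i;

    ioctl(leader, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);

    struct group_read r;
    if(read(leader, &r, sizeof(r)) < 0) return 1;

    // coverage < 100% means the kernel multiplexed the group
    double coverage = r.time_enabled ? (double)r.time_running / (double)r.time_enabled : 1.0;
    printf("cycles=%llu instructions=%llu coverage=%.2f%%\n",
           (unsigned long long)r.values[0].value,
           (unsigned long long)r.values[1].value,
           coverage * 100.0);
    return 0;
}
```

Reading the whole group in one syscall also keeps all counters start/stop-synchronized, which is why the commit replaces the per-counter `read()` loop with `stop_and_read_counters_group()`.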