summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 2
-rwxr-xr-x Bench.sh 1
-rwxr-xr-x build.sh 12
-rw-r--r-- mknes.c 6
-rw-r--r-- mknes_apu.c 31
-rw-r--r-- mknes_bench.c 199
-rw-r--r-- mknes_ppu.c 121
7 files changed, 227 insertions, 145 deletions
diff --git a/.gitignore b/.gitignore
index 9281757..c588f8e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -18,3 +18,5 @@ mknes_memory original.c
mknes.s
record.txt
_Bench.sh
+toolchain
+
diff --git a/Bench.sh b/Bench.sh
index 5e4e104..768d5b6 100755
--- a/Bench.sh
+++ b/Bench.sh
@@ -3,7 +3,6 @@
./build.sh clean
./build.sh profile
./mknes -n 1 &> /dev/null
-
./build.sh profile_release
# Run full benchmark
diff --git a/build.sh b/build.sh
index a8b26e8..abb6d4f 100755
--- a/build.sh
+++ b/build.sh
@@ -1,9 +1,16 @@
#!/bin/bash
+# Prefer the project-local GCC only when it is a regular, executable file; otherwise fall back to the system gcc
+TOOLCHAIN_GCC="./toolchain/gcc-15.2.0/bin/gcc"
+if [ -f "${TOOLCHAIN_GCC}" ] && [ -x "${TOOLCHAIN_GCC}" ]; then
+	CC="${TOOLCHAIN_GCC}"
+else
+	CC=gcc
+fi
+
# Set the project name here
PROJECT_NAME="mknes" # Change this for each new project
-CC=gcc
WIN_CC=x86_64-w64-mingw32-gcc
# Base configuration common to all builds
@@ -52,8 +59,7 @@ case "$BUILD_TYPE" in
# -pg # for gprof
;;
"release")
- # CFLAGS+="-s -Wl,--strip-all -O2 "
- CFLAGS+=" -O2 "
+ CFLAGS+="-s -Wl,--strip-all -O2 "
;;
"profile")
CFLAGS+="-O2 -fprofile-generate -ftest-coverage -DBENCHMARK "
diff --git a/mknes.c b/mknes.c
index 597768f..e3e9ffd 100644
--- a/mknes.c
+++ b/mknes.c
@@ -60,7 +60,7 @@ static void audio_callback(int16_t *data, size_t frames) { }
// Uncomment the ROM you want to benchmark:
// INCBIN_BYTES(benchmark_rom, "data/Life Force (USA).nes");
INCBIN_BYTES(benchmark_rom, "data/0000/Super Mario Bros. (World) (HVC-SM).nes");
-// INCBIN_BYTES(benchmark_rom, "data/0003/Gradius (USA).zip");
+// INCBIN_BYTES(benchmark_rom, "data/0003/Gradius (USA).nes");
#endif
#include "platform_gl_loader.c"
@@ -271,7 +271,7 @@ int main(int argc, char **argv) {
// ines2_load(nstate, "data/0000/Excitebike (Japan, USA).nes");
// ines2_load(nstate, "data/0000/Ice Climber (USA, Europe, Korea).nes");
// ines2_load(nstate, "data/0000/Kung Fu (Japan, USA).nes");
- // ines2_load(nstate, "data/0000/Super Mario Bros. (World) (HVC-SM).nes");
+ ines2_load(nstate, "data/0000/Super Mario Bros. (World) (HVC-SM).nes");
// ines2_load(nstate, "data/Super Mario Bros. (W) (V1.0) [!].nes");
// ines2_load(nstate, "data/Super Mario Bros. (JU) [!].nes");
// ines2_load(nstate, "data/0000/Urban Champion (World).nes");
@@ -293,7 +293,7 @@ int main(int argc, char **argv) {
// ines2_load(nstate, "data/0000/Xevious - The Avenger (USA).zip");
// ines2_load(nstate, "data/tv.nes");
- ines2_load(nstate, "data/Life Force (USA).nes"); // 2002
+ // ines2_load(nstate, "data/Life Force (USA).nes"); // 2002
// ines2_load(nstate, "data/0003/Flipull - An Exciting Cube Game (Japan) (En).zip");
// ines2_load(nstate, "data/0003/Friday the 13th (USA).zip");
diff --git a/mknes_apu.c b/mknes_apu.c
index 98552ed..027b1e5 100644
--- a/mknes_apu.c
+++ b/mknes_apu.c
@@ -35,12 +35,19 @@ static uint8_t apu_read4015(struct nes_state *state) {
}
if(apu->irq_pending) {
result |= 0x40;
- apu->irq_pending = 0;
}
+
+ // Reading $4015 clears the frame IRQ flag
+ apu->irq_pending = 0;
+ // Only clear CPU IRQ if DMC isn't requesting it
+ if(!(apu->dmc_bytes_remaining > 0 && apu->dmc_irq_enable)) {
+ state->cpu.irq_pending = 0;
+ }
+
return result;
}
-// $4010–$4013, $4015 write
+// $4010–$4013, $4015, $4017 write
static void apu_write(struct nes_state *state, uint16_t addr, uint8_t val) {
struct apu_state *apu = &state->apu;
@@ -62,6 +69,26 @@ static void apu_write(struct nes_state *state, uint16_t addr, uint8_t val) {
case 0x4015: {
apu_write4015(state, val);
} break;
+ case 0x4017: {
+ // Frame counter control
+ apu->mode = (val >> 7) & 1;
+ apu->irq_inhibit = (val >> 6) & 1;
+
+ // If IRQ inhibit flag is set, clear the frame IRQ
+ if(apu->irq_inhibit) {
+ apu->irq_pending = 0;
+ // Only clear CPU IRQ if DMC isn't requesting it
+ if(!(apu->dmc_bytes_remaining > 0 && apu->dmc_irq_enable)) {
+ state->cpu.irq_pending = 0;
+ }
+ }
+
+ // Reset frame counter (with delay, but we'll approximate immediately for now)
+ apu->frame_cycle = 0;
+
+ // If 5-step mode, immediately clock half-frame and quarter-frame
+ // (For timing purposes without audio, we can leave this empty)
+ } break;
}
}
diff --git a/mknes_bench.c b/mknes_bench.c
index 12fb642..b064266 100644
--- a/mknes_bench.c
+++ b/mknes_bench.c
@@ -8,7 +8,6 @@
#include <math.h>
#include <time.h>
-// Performance counter setup
struct perf_counter {
int fd;
uint64_t value;
@@ -17,14 +16,23 @@ struct perf_counter {
uint64_t config;
};
+struct perf_group_read { // Buffer filled by read() on the group leader (read_format is set in setup_counter_group)
+ uint64_t nr; // number of entries the kernel filled in values[]
+ uint64_t time_enabled; // denominator of the multiplexing-coverage ratio
+ uint64_t time_running; // numerator of the multiplexing-coverage ratio
+ struct {
+ uint64_t value; // raw count for one group member
+ } values[5]; // sized for the five counters run_benchmark sets up
+} __attribute__((packed)); // every member is 64-bit
+
struct bench_run {
uint64_t cycles;
uint64_t instructions;
uint64_t stalled_cycles_frontend;
- uint64_t stalled_cycles_backend;
uint64_t branches;
uint64_t branch_misses;
uint64_t time_ns;
+ double multiplexing_coverage; // ratio of time_running/time_enabled
};
struct bench_stats {
@@ -38,51 +46,76 @@ static long perf_event_open(struct perf_event_attr *hw_event, pid_t pid, int cpu
return syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}
-static int setup_counter(struct perf_counter *counter, uint32_t type, uint64_t config, const char *name) {
+static int setup_counter_group(struct perf_counter *counters, int num_counters) {
struct perf_event_attr pe;
memset(&pe, 0, sizeof(struct perf_event_attr));
- pe.type = type;
+ pe.type = PERF_TYPE_HARDWARE;
pe.size = sizeof(struct perf_event_attr);
- pe.config = config;
pe.disabled = 1;
- pe.exclude_kernel = 0;
- pe.exclude_hv = 0;
+ pe.exclude_kernel = 1;
+ pe.exclude_hv = 1;
pe.exclude_idle = 1;
+ pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;
- counter->fd = perf_event_open(&pe, 0, -1, -1, 0);
- counter->name = name;
- counter->type = type;
- counter->config = config;
- counter->value = 0;
+ int leader_fd = -1;
- return counter->fd;
-}
+ for(int i = 0; i < num_counters; i++) {
+ pe.config = counters[i].config;
-static void reset_counters(struct perf_counter *counters, int n) {
- for(int i = 0; i < n; i++) {
- if(counters[i].fd >= 0) {
- ioctl(counters[i].fd, PERF_EVENT_IOC_RESET, 0);
+ if(i == 0) {
+ // First counter is the group leader
+ leader_fd = perf_event_open(&pe, 0, -1, -1, 0);
+ counters[i].fd = leader_fd;
+ } else {
+ // Subsequent counters are group members
+ counters[i].fd = perf_event_open(&pe, 0, -1, leader_fd, 0);
}
+
+ if(counters[i].fd < 0) {
+ fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name);
+ }
+
+ counters[i].value = 0;
}
+
+ return leader_fd;
}
-static void start_counters(struct perf_counter *counters, int n) {
- for(int i = 0; i < n; i++) {
- if(counters[i].fd >= 0) {
- ioctl(counters[i].fd, PERF_EVENT_IOC_ENABLE, 0);
- }
+static void reset_counters_group(int leader_fd) {
+ if(leader_fd >= 0) {
+ ioctl(leader_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
}
}
-static void stop_counters(struct perf_counter *counters, int n) {
- for(int i = 0; i < n; i++) {
- if(counters[i].fd >= 0) {
- ioctl(counters[i].fd, PERF_EVENT_IOC_DISABLE, 0);
- read(counters[i].fd, &counters[i].value, sizeof(uint64_t));
- }
+static void start_counters_group(int leader_fd) {
+ if(leader_fd >= 0) {
+ ioctl(leader_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
}
}
+// Disables the group, reads it once and copies each value to its counter; returns 0 on success, -1 on failure.
+static int stop_and_read_counters_group(int leader_fd, struct perf_counter *counters, int num_counters, struct perf_group_read *result) {
+	if(leader_fd < 0) {
+		return -1;
+	}
+	ioctl(leader_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP); // one call stops every group member
+
+	// A group read yields nr, time_enabled, time_running, then one value per opened member; reject short reads.
+	const size_t header_size = sizeof(*result) - sizeof(result->values);
+	ssize_t bytes_read = read(leader_fd, result, sizeof(*result));
+	if(bytes_read < (ssize_t)header_size || (size_t)bytes_read < header_size + result->nr * sizeof(result->values[0])) {
+		fprintf(stderr, "Error: Failed to read perf group counters\n");
+		return -1;
+	}
+
+	// Only members that were actually opened are reported, so counters with an invalid fd consume no slot.
+	uint64_t slot = 0;
+	for(int i = 0; i < num_counters && slot < result->nr; i++) {
+		if(counters[i].fd >= 0) counters[i].value = result->values[slot++].value;
+	}
+	return 0;
+}
+
static void close_counters(struct perf_counter *counters, int n) {
for(int i = 0; i < n; i++) {
if(counters[i].fd >= 0) {
@@ -133,7 +166,6 @@ static void set_cpu_affinity(int cpu) {
}
}
-// Static allocation for benchmark runs - no malloc, page-aligned
#define MAX_BENCH_RUNS 100
static struct bench_run runs_storage[MAX_BENCH_RUNS] __attribute__((aligned(4096)));
static double per_frame_storage[MAX_BENCH_RUNS * 3] __attribute__((aligned(4096)));
@@ -144,38 +176,36 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
return;
}
- struct perf_counter counters[6];
- int num_counters = 0;
+ struct perf_counter counters[5];
+ int num_counters = 5;
+
+ // Initialize counter metadata
+ counters[0] = (struct perf_counter){.fd = -1, .value = 0, .name = "cycles", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES};
+ counters[1] = (struct perf_counter){.fd = -1, .value = 0, .name = "instructions", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS};
+ counters[2] = (struct perf_counter){.fd = -1, .value = 0, .name = "stalled-cycles-frontend", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND};
+ counters[3] = (struct perf_counter){.fd = -1, .value = 0, .name = "branches", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS};
+ counters[4] = (struct perf_counter){.fd = -1, .value = 0, .name = "branch-misses", .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES};
- // Set up performance counters
- setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, "cycles");
- setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, "instructions");
- setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND, "stalled-cycles-frontend");
- setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_STALLED_CYCLES_BACKEND, "stalled-cycles-backend");
- setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branches");
- setup_counter(&counters[num_counters++], PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES, "branch-misses");
+ // Set up performance counter group
+ int leader_fd = setup_counter_group(counters, num_counters);
// Check which counters are available
int available_counters = 0;
for(int i = 0; i < num_counters; i++) {
if(counters[i].fd >= 0) {
available_counters++;
- } else {
- fprintf(stderr, "Warning: Counter %s not available\n", counters[i].name);
}
}
- if(available_counters == 0) {
+ if(available_counters == 0 || leader_fd < 0) {
fprintf(stderr, "Error: No performance counters available\n");
close_counters(counters, num_counters);
return;
}
- // Use static storage for runs
struct bench_run *runs = runs_storage;
memset(runs, 0, sizeof(struct bench_run) * num_runs);
- // Set CPU affinity and realtime priority
set_cpu_affinity(1);
set_realtime_priority();
@@ -194,7 +224,7 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
nstate->ppu.frame_ready = 0;
}
- // Run benchmark iterations
+ // Run benchmark
for(uint32_t run = 0; run < num_runs; run++) {
// Reset emulator state
@@ -209,8 +239,8 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
clock_gettime(CLOCK_MONOTONIC, &start_time);
// Reset and start counters (after clock_gettime to exclude its overhead)
- reset_counters(counters, num_counters);
- start_counters(counters, num_counters);
+ reset_counters_group(leader_fd);
+ start_counters_group(leader_fd);
// Run emulation
for(uint32_t i = 0; i < frames_per_run; i++) {
@@ -220,17 +250,20 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
nstate->ppu.frame_ready = 0;
}
- // Stop counters (before clock_gettime to exclude its overhead)
- stop_counters(counters, num_counters);
-
- // Stop timing
+		// Stop the group (ioctl), then fetch every counter with a single read(); zero-init so a failed read leaves defined data
+		struct perf_group_read group_result = {0};
+		stop_and_read_counters_group(leader_fd, counters, num_counters, &group_result);
clock_gettime(CLOCK_MONOTONIC, &end_time);
- // Calculate elapsed time in nanoseconds
uint64_t elapsed_ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL + (end_time.tv_nsec - start_time.tv_nsec);
+ // Calculate multiplexing coverage
+ double coverage = (group_result.time_enabled > 0) ?
+ (double)group_result.time_running / (double)group_result.time_enabled : 1.0;
+
// Store results
runs[run].time_ns = elapsed_ns;
+ runs[run].multiplexing_coverage = coverage;
for(int i = 0; i < num_counters; i++) {
if(counters[i].fd < 0) continue;
@@ -240,8 +273,6 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
runs[run].instructions = counters[i].value;
} else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_FRONTEND) {
runs[run].stalled_cycles_frontend = counters[i].value;
- } else if(counters[i].config == PERF_COUNT_HW_STALLED_CYCLES_BACKEND) {
- runs[run].stalled_cycles_backend = counters[i].value;
} else if(counters[i].config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) {
runs[run].branches = counters[i].value;
} else if(counters[i].config == PERF_COUNT_HW_BRANCH_MISSES) {
@@ -250,11 +281,43 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
}
}
+	// Check for multiplexing and record the lowest and highest coverage actually observed across runs
+	int multiplexing_detected = 0;
+	double min_coverage = 1.0;
+	double max_coverage = 0.0; // start below any real ratio so the reported maximum is a measured value
+	for(uint32_t i = 0; i < num_runs; i++) {
+		if(runs[i].multiplexing_coverage < 0.9999) { // Allow for tiny floating point error
+			multiplexing_detected = 1;
+		}
+		if(runs[i].multiplexing_coverage < min_coverage) {
+			min_coverage = runs[i].multiplexing_coverage;
+		}
+		if(runs[i].multiplexing_coverage > max_coverage) {
+			max_coverage = runs[i].multiplexing_coverage;
+		}
+	}
+
+ if(multiplexing_detected) {
+ fprintf(stderr, "\n");
+ fprintf(stderr, "========================================\n");
+ fprintf(stderr, "WARNING: COUNTER MULTIPLEXING DETECTED!\n");
+ fprintf(stderr, "========================================\n");
+ fprintf(stderr, "The kernel time-sliced your performance counters.\n");
+ fprintf(stderr, "This means the counters were NOT running 100%% of the time.\n");
+ fprintf(stderr, "Coverage range: %.2f%% - %.2f%%\n", min_coverage * 100.0, max_coverage * 100.0);
+ fprintf(stderr, "Results may be SCALED and LESS PRECISE.\n");
+ fprintf(stderr, "Consider reducing the number of counters.\n");
+ fprintf(stderr, "========================================\n");
+ fprintf(stderr, "\n");
+ } else {
+ // All good - counters ran at 100% coverage (no multiplexing)
+ printf("Performance counter coverage: 100%% (no multiplexing - full precision)\n");
+ }
+
// Calculate aggregated totals
uint64_t total_instructions = 0;
uint64_t total_cycles = 0;
uint64_t total_stalled_frontend = 0;
- uint64_t total_stalled_backend = 0;
uint64_t total_branches = 0;
uint64_t total_branch_misses = 0;
uint64_t total_time_ns = 0;
@@ -263,7 +326,6 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
total_instructions += runs[i].instructions;
total_cycles += runs[i].cycles;
total_stalled_frontend += runs[i].stalled_cycles_frontend;
- total_stalled_backend += runs[i].stalled_cycles_backend;
total_branches += runs[i].branches;
total_branch_misses += runs[i].branch_misses;
total_time_ns += runs[i].time_ns;
@@ -287,35 +349,32 @@ static void run_benchmark(struct nes_state *nstate, uint32_t num_runs, uint32_t
// Print results
double total_time_s = (double)total_time_ns / 1000000000.0;
double ipc = (double)total_instructions / total_cycles;
- double stalled_per_insn = (double)(total_stalled_frontend + total_stalled_backend) / total_instructions;
+ double stalled_per_insn = (double)total_stalled_frontend / total_instructions;
double ghz = (double)total_cycles / total_time_s / 1000000000.0;
double branches_per_sec = (double)total_branches / total_time_s;
double branch_miss_rate = (double)total_branch_misses / total_branches * 100.0;
double stalled_frontend_pct = (double)total_stalled_frontend / total_cycles * 100.0;
- double stalled_backend_pct = (double)total_stalled_backend / total_cycles * 100.0;
double mips = (double)total_instructions / total_time_s / 1000000.0;
double mcps = (double)total_cycles / total_time_s / 1000000.0;
printf("\n");
- printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc);
- printf("%56s# %.2f stalled cycles per insn \n", "", stalled_per_insn);
- printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz);
- printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle \n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct);
- printf("%20llu stalled-cycles-backend # %.2f%% backend cycles idle \n", (unsigned long long)total_stalled_backend, stalled_backend_pct);
- printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0);
- printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate);
+ printf("%20llu instructions # %.2f insn per cycle \n", (unsigned long long)total_instructions, ipc);
+ printf("%56s# %.2f stalled cycles per insn\n", "", stalled_per_insn);
+ printf("%20llu cycles # %.3f GHz \n", (unsigned long long)total_cycles, ghz);
+ printf("%20llu stalled-cycles-frontend # %.2f%% frontend cycles idle\n", (unsigned long long)total_stalled_frontend, stalled_frontend_pct);
+ printf("%20llu branches # %.3f G/sec \n", (unsigned long long)total_branches, branches_per_sec / 1000000000.0);
+ printf("%20llu branch-misses # %.2f%% of all branches \n", (unsigned long long)total_branch_misses, branch_miss_rate);
printf("\n");
printf("Throughput: %.2f MIPS, %.2f Mcycles/sec\n", mips, mcps);
printf("\n");
- printf("cycles/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n);
- printf("insn/frame mean=%.0f sd=%.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n);
- printf("time (ms) mean=%.3f sd=%.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n);
+ printf("cycles/frame mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", cycles_stats.mean, cycles_stats.sd, cycles_stats.rel_sd, cycles_stats.n);
+ printf("insn/frame mean=%9.0f sd=%6.0f relSD=%.3f%% n=%d\n", insn_stats.mean, insn_stats.sd, insn_stats.rel_sd, insn_stats.n);
+ printf("time (ms) mean=%9.3f sd=%6.3f relSD=%.3f%% n=%d\n", time_stats.mean, time_stats.sd, time_stats.rel_sd, time_stats.n);
double fps = (double)frames_per_run / (time_stats.mean / 1000.0);
double ms_per_frame = time_stats.mean / frames_per_run;
printf("FPS (frames/second) = %.2f\n", fps);
- printf("ms/frame = %.6f\n", ms_per_frame);
+ printf(" ms/frame = %.6f\n", ms_per_frame);
- // Cleanup
close_counters(counters, num_counters);
}
diff --git a/mknes_ppu.c b/mknes_ppu.c
index fcaf681..7c2b4ac 100644
--- a/mknes_ppu.c
+++ b/mknes_ppu.c
@@ -24,7 +24,7 @@ static void ppu_reset(struct nes_state *state) {
memset(ppu, 0, sizeof(struct ppu_state));
}
-__attribute__((hot, flatten))
+__attribute__((hot, flatten, optimize("unroll-loops")))
static inline void ppu_evaluate_sprites(struct nes_state *state, uint32_t scanline) {
struct ppu_state *restrict ppu = &state->ppu;
uint8_t sprite_height = (ppu->reg_ctrl & PPU_CTRL_SPRITE_HEIGHT) ? 16 : 8;
@@ -62,6 +62,7 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state,
uint8_t * restrict sec_oam = ppu->secondary_oam;
uint8_t ctrl = ppu->reg_ctrl;
uint8_t sprite_height = (ctrl & PPU_CTRL_SPRITE_HEIGHT) ? 16 : 8;
+
uint32_t sprite_pattern_table_base = (ctrl & PPU_CTRL_SPRITE_TILE) << 9;
for(uint8_t i = 0; i < ppu->sprite_count; i++, sec_oam += 4) {
@@ -76,16 +77,21 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state,
uint32_t bank;
uint32_t addr;
if(sprite_height == 16) {
- bank = (tile & 1) << 12;
- tile &= 0xfe;
- if(row >= 8) {
- tile++;
- row -= 8;
- }
- addr = bank + tile * 16 + row;
+ // For 8x16 sprites:
+ // - Bank comes from tile bit 0 (bits 1-7 are the tile index)
+ // - Row 0-7 uses base tile, row 8-15 uses base tile + 1
+ // - Row offset wraps to 0-7 within each 8-pixel half
+ //
+ // Original logic:
+ // bank = (tile & 1) << 12;
+ // tile &= 0xfe;
+ // if(row >= 8) { tile++; row -= 8; }
+ // addr = bank + tile * 16 + row;
+ addr = ((tile & 1) << 12) + ((tile & 0xfe) + (row >> 3)) * 16 + (row & 7);
} else {
addr = sprite_pattern_table_base + tile * 16 + row;
+
}
uint8_t val_lo = state->mapper_function.chr_read(state, addr);
@@ -103,9 +109,8 @@ static inline void ppu_fetch_sprite_patterns(struct nes_state * restrict state,
}
}
-
-__attribute__((always_inline, hot, optimize("no-jump-tables")))
-static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t x, uint32_t y) {
+__attribute__((always_inline, hot, optimize("no-jump-tables", "no-unroll-loops")))
+static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t x, uint32_t y, uint8_t mask_reg) {
struct ppu_state *restrict ppu = &state->ppu;
uint16_t bit = 0x8000 >> ppu->fine_x;
@@ -115,14 +120,14 @@ static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t
uint8_t sp_prio = 0;
uint8_t sp_zero = 0;
- uint8_t mask_reg = ppu->reg_mask; // Single load
+ // uint8_t mask_reg = ppu->reg_mask; // Single load
uint8_t show_bg = mask_reg & PPU_MASK_SHOW_BG;
uint8_t show_sprites = mask_reg & PPU_MASK_SHOW_SPRITES;
uint8_t left_bg = mask_reg & 0x02;
uint8_t left_sp = mask_reg & 0x04;
uint8_t bg_mask = (show_bg && (left_bg || x & ~7)) ? 0xff : 0x00;
- uint8_t sp_mask = (show_sprites && (left_sp || x & ~7));// ? 0xff : 0x00;
+ uint8_t sp_mask = (show_sprites && (left_sp || x & ~7));
// Background
uint8_t p0 = !!(ppu->bg_shift_pattern_low & bit);
@@ -133,50 +138,34 @@ static inline void ppu_render_pixel(struct nes_state * restrict state, uint32_t
uint8_t bg_pixel = ((p1 << 1) | p0) & bg_mask;
uint8_t bg_palette = ((a1 << 1) | a0) & bg_mask;
- // Sprites
-#define SPRITE_STEP(N) do { \
- if(!ppu->sprites[(N)].position) { \
- sp_pixel = (((ppu->sprites[(N)].shift_hi & 0x80) >> 6) | ((ppu->sprites[(N)].shift_lo & 0x80) >> 7)); \
- if(sp_pixel) { \
- sp_prio = ppu->sprites[(N)].priority; \
- sp_palette = ppu->sprites[(N)].palette; \
- if((N) == 0) { \
- sp_zero = ppu->sprite_zero_in_range; \
- } \
- goto sprite_done; \
- } \
- } \
-} while (0)
-
- // sprite_counts[ppu->sprite_count]++;
- if(sp_mask && ppu->sprite_count > 0) {
- if(ppu->sprite_count == 2) goto sprite_2;
- if(ppu->sprite_count == 1) goto sprite_1;
- if(ppu->sprite_count == 3) goto sprite_3;
- if(ppu->sprite_count == 4) goto sprite_4;
- if(ppu->sprite_count == 5) goto sprite_5;
- if(ppu->sprite_count == 6) goto sprite_6;
- if(ppu->sprite_count == 8) goto sprite_8;
- if(ppu->sprite_count == 7) goto sprite_7;
-
-
-sprite_8: SPRITE_STEP(7);
-sprite_7: SPRITE_STEP(6);
-sprite_6: SPRITE_STEP(5);
-sprite_5: SPRITE_STEP(4);
-sprite_4: SPRITE_STEP(3);
-sprite_3: SPRITE_STEP(2);
-sprite_2: SPRITE_STEP(1);
-sprite_1: SPRITE_STEP(0);
+ // Sprites - evaluate in forward order (0 has highest priority)
+ if(sp_mask) {
+ uint8_t found_sprite = 0xff;
+ for(uint8_t i = 0; i < ppu->sprite_count; i++) {
+ if(!ppu->sprites[i].position) {
+ sp_pixel = (((ppu->sprites[i].shift_hi & 0x80) >> 6) | ((ppu->sprites[i].shift_lo & 0x80) >> 7));
+ if(sp_pixel) {
+ found_sprite = i;
+ goto sprite_found;
+ }
+ }
+ }
+ goto no_sprite;
+
+sprite_found:
+ sp_prio = ppu->sprites[found_sprite].priority;
+ sp_palette = ppu->sprites[found_sprite].palette;
+ sp_zero = ppu->sprite_zero_in_range & !(found_sprite);
}
-sprite_done:
+no_sprite:
+
// Final pixel composition
uint8_t bg_index = (bg_palette << 2) + bg_pixel;
uint8_t sp_index = (sp_palette << 2) + sp_pixel;
uint8_t selector = (bg_pixel ? 2 : 0) | (sp_pixel ? 1 : 0);
- // NOTE(peter): It's actually faster to preset case3 version of palette_index than to start from zero
+ // NOTE(peter): It's actually faster to preset case 3 version of palette_index than to start from zero
uint8_t palette_index = (sp_prio) ? bg_index : 0x10 | sp_index;
switch(selector) {
@@ -189,20 +178,19 @@ sprite_done:
state->pixels[y * 256 + x] = ppu->palette[palette_index]; // NOTE(peter): Add color_emphasis bits (expand palette to 8x).
}
-__attribute__((hot, optimize("no-jump-tables")))
+__attribute__((hot, optimize("no-jump-tables", "unroll-loops")))
static void ppu_tick(struct nes_state *state) {
struct ppu_state *restrict ppu = &state->ppu;
uint32_t dot = ppu->dot;
uint32_t scanline = ppu->scanline;
- uint8_t rendering = (ppu->reg_mask & (PPU_MASK_SHOW_SPRITES | PPU_MASK_SHOW_BG));
+ uint8_t reg_mask = ppu->reg_mask;
+ uint8_t rendering = (reg_mask & (PPU_MASK_SHOW_SPRITES | PPU_MASK_SHOW_BG));
for(uint8_t ppu_loops = 0; ppu_loops < 3; ++ppu_loops) {
if(rendering) {
-
if(scanline <= 239) {
-
if(dot >= 1 && dot <= 256) {
if(dot == 256) {
if((ppu->vram_addr & 0x7000) != 0x7000) {
@@ -223,7 +211,7 @@ static void ppu_tick(struct nes_state *state) {
}
}
- ppu_render_pixel(state, dot - 1, scanline);
+ ppu_render_pixel(state, dot - 1, scanline, reg_mask);
goto stupid;
}
@@ -233,7 +221,7 @@ static void ppu_tick(struct nes_state *state) {
}
if(dot >= 321 && dot <= 336) {
-stupid: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
+stupid: if(reg_mask & PPU_MASK_SHOW_SPRITES) {
for(uint32_t i = 0; i < ppu->sprite_count; i++) {
if(ppu->sprites[i].position > 0) {
ppu->sprites[i].position--;
@@ -327,16 +315,8 @@ stupid: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
goto stupid2;
}
- if(dot == 257) {
- ppu->vram_addr = (ppu->vram_addr & ~0x041f) | (ppu->temp_addr & 0x041f);
- }
-
- if(dot >= 280 && dot <= 304) {
- ppu->vram_addr = (ppu->vram_addr & ~0x7be0) | (ppu->temp_addr & 0x7be0);
- }
-
if(dot >= 321 && dot <= 336) {
-stupid2: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
+stupid2: if(reg_mask & PPU_MASK_SHOW_SPRITES) {
for(uint32_t i = 0; i < ppu->sprite_count; i++) {
if(ppu->sprites[i].position > 0) {
ppu->sprites[i].position--;
@@ -399,6 +379,15 @@ stupid2: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
} break;
}
}
+
+ if(dot == 257) {
+ ppu->vram_addr = (ppu->vram_addr & ~0x041f) | (ppu->temp_addr & 0x041f);
+ }
+
+ if(dot >= 280 && dot <= 304) {
+ ppu->vram_addr = (ppu->vram_addr & ~0x7be0) | (ppu->temp_addr & 0x7be0);
+ }
+
}
}
@@ -436,7 +425,7 @@ stupid2: if(ppu->reg_mask & PPU_MASK_SHOW_SPRITES) {
}
if(state->mapper_function.tick) {
- state->mapper_function.tick(state);
+ state->mapper_function.tick(state); // TODO(peter): This signature has to be changed to supply dot and scanline!
}
}
ppu->dot = dot;