// Get pointer to where in the buffer to render RENDER_START(0,0) is top left #define RENDER_START(x, y) (state.display_buffer + ((y) << 11) + (x)) // Center X Coordinate for Rendering #define CENTER_X(w) ((state.render_width - (w)) >> 1) /* [=]===^=[ update_render_position ]=================================================================^===[=] */ static void update_render_position(void) { state.render_x = (BUFFER_WIDTH - state.render_width) >> 1; state.render_y = (BUFFER_HEIGHT - state.render_height) >> 1; state.display_buffer = buffer + (state.render_y * BUFFER_WIDTH) + state.render_x; } /* [=]===^=[ change_resolution ]=================================================================^===[=] */ static void change_resolution(uint32_t new_width, uint32_t new_height) { if(new_width != state.render_width || new_height != state.render_height) { state.render_width = new_width; state.render_height = new_height; update_render_position(); setup_render_target(); } } /* [=]===^=[ clear_buffer ]=================================================================^===[=] */ __attribute__((always_inline, hot)) static inline void clear_buffer(void) { PROFILE_FUNCTION(); uint32_t * restrict dst = RENDER_START(0, 0); for(uint32_t i = 0; i < state.render_height; i++) { memset(dst, 0, state.render_width * 4); dst += BUFFER_WIDTH; } } /* [=]===^=[ set_decay ]=================================================================^===[=] */ static uint16_t _old_weight; static uint16_t _new_weight; static void set_decay(uint16_t old_weight) { _old_weight = old_weight ? (old_weight > 256 ? 256 : old_weight) : 0; _new_weight = 256 - old_weight; } /* [=]===^=[ apply_phosphor_decay ]=================================================================^===[=] */ __attribute__((always_inline, hot)) static inline void apply_phosphor_decay(void) { PROFILE_FUNCTION(); __m256i old_weight = _mm256_set1_epi16(_old_weight); __m256i new_weight = _mm256_set1_epi16(_new_weight); __m128i alpha_mask = _mm_set1_epi32(0x000000ff); uint32_t render_width = state.render_width; uint32_t render_height = state.render_height; uint32_t * restrict src = RENDER_START(0, 0); uint32_t * restrict dst = display_buffer; for(uint32_t y = 0; y < render_height; ++y, src += BUFFER_WIDTH, dst += render_width) { for(uint32_t x = 0; x < render_width; x += 4) { _mm_prefetch((char*)&src[x + 2 * BUFFER_WIDTH], _MM_HINT_T0); _mm_prefetch((char*)&dst[x + 2 * render_width], _MM_HINT_T0); __m128i new_pixels = _mm_loadu_si128((__m128i*)&src[x]); __m128i old_pixels = _mm_loadu_si128((__m128i*)&dst[x]); __m256i old_lo = _mm256_cvtepu8_epi16(old_pixels); __m256i new_lo = _mm256_cvtepu8_epi16(new_pixels); __m256i blended = _mm256_adds_epu16(_mm256_mullo_epi16(old_lo, old_weight), _mm256_mullo_epi16(new_lo, new_weight)); blended = _mm256_srli_epi16(blended, 8); __m128i final_pixels = _mm_packus_epi16(_mm256_castsi256_si128(blended), _mm256_extracti128_si256(blended, 1)); final_pixels = _mm_or_si128(final_pixels, _mm_and_si128(old_pixels, alpha_mask)); _mm_storeu_si128((__m128i*)&dst[x], final_pixels); } } }