/* [=]===^=[ clear_buffer ]=================================================================^===[=] */ __attribute__((always_inline, hot)) static inline void clear_buffer(void) { // PROFILE_FUNCTION(); memset(buffer, 0, sizeof(buffer)); } /* [=]===^=[ set_decay ]=================================================================^===[=] */ static uint16_t _old_weight; static uint16_t _new_weight; static void set_decay(uint16_t old_weight) { _old_weight = old_weight ? (old_weight > 256 ? 256 : old_weight) : 0; _new_weight = 256 - old_weight; } /* [=]===^=[ apply_phosphor_decay ]=================================================================^===[=] */ __attribute__((always_inline, hot)) static inline void apply_phosphor_decay(void) { // PROFILE_FUNCTION(); __m256i old_weight = _mm256_set1_epi16(_old_weight); __m256i new_weight = _mm256_set1_epi16(_new_weight); __m128i alpha_mask = _mm_set1_epi32(0x000000ff); uint32_t * restrict src = buffer; uint32_t * restrict dst = display_buffer; for(uint32_t y = 0; y < BUFFER_HEIGHT; ++y, src += BUFFER_WIDTH, dst += BUFFER_WIDTH) { for(uint32_t x = 0; x < BUFFER_WIDTH; x += 4) { _mm_prefetch((char*)&src[x + 2 * BUFFER_WIDTH], _MM_HINT_T0); _mm_prefetch((char*)&dst[x + 2 * BUFFER_WIDTH], _MM_HINT_T0); __m128i new_pixels = _mm_loadu_si128((__m128i*)&src[x]); __m128i old_pixels = _mm_loadu_si128((__m128i*)&dst[x]); __m256i old_lo = _mm256_cvtepu8_epi16(old_pixels); __m256i new_lo = _mm256_cvtepu8_epi16(new_pixels); __m256i blended = _mm256_adds_epu16(_mm256_mullo_epi16(old_lo, old_weight), _mm256_mullo_epi16(new_lo, new_weight)); blended = _mm256_srli_epi16(blended, 8); __m128i final_pixels = _mm_packus_epi16(_mm256_castsi256_si128(blended), _mm256_extracti128_si256(blended, 1)); final_pixels = _mm_or_si128(final_pixels, _mm_and_si128(old_pixels, alpha_mask)); _mm_storeu_si128((__m128i*)&dst[x], final_pixels); } } }