diff options
Diffstat (limited to 'render.c')
| -rw-r--r-- | render.c | 58 |
1 files changed, 29 insertions, 29 deletions
@@ -16,32 +16,32 @@ static void set_decay(uint16_t old_weight) { } /* [=]===^=[ apply_phosphor_decay ]=================================================================^===[=] */ -__attribute__((always_inline, hot)) -static inline void apply_phosphor_decay(void) { - // PROFILE_FUNCTION(); - __m256i old_weight = _mm256_set1_epi16(_old_weight); - __m256i new_weight = _mm256_set1_epi16(_new_weight); - __m128i alpha_mask = _mm_set1_epi32(0x000000ff); - uint32_t * restrict src = buffer; - uint32_t * restrict dst = display_buffer; - - for(uint32_t y = 0; y < BUFFER_HEIGHT; ++y, src += BUFFER_WIDTH, dst += BUFFER_WIDTH) { - for(uint32_t x = 0; x < BUFFER_WIDTH; x += 4) { - _mm_prefetch((char*)&src[x + 2 * BUFFER_WIDTH], _MM_HINT_T0); - _mm_prefetch((char*)&dst[x + 2 * BUFFER_WIDTH], _MM_HINT_T0); - - __m128i new_pixels = _mm_load_si128((__m128i*)&src[x]); - __m128i old_pixels = _mm_load_si128((__m128i*)&dst[x]); - - __m256i old_lo = _mm256_cvtepu8_epi16(old_pixels); - __m256i new_lo = _mm256_cvtepu8_epi16(new_pixels); - - __m256i blended = _mm256_adds_epu16(_mm256_mullo_epi16(old_lo, old_weight), _mm256_mullo_epi16(new_lo, new_weight)); - blended = _mm256_srli_epi16(blended, 8); - - __m128i final_pixels = _mm_packus_epi16(_mm256_castsi256_si128(blended), _mm256_extracti128_si256(blended, 1)); - final_pixels = _mm_or_si128(final_pixels, _mm_and_si128(old_pixels, alpha_mask)); - _mm_store_si128((__m128i*)&dst[x], final_pixels); - } - } -} +// __attribute__((always_inline, hot)) +// static inline void apply_phosphor_decay(void) { +// // PROFILE_FUNCTION(); +// __m256i old_weight = _mm256_set1_epi16(_old_weight); +// __m256i new_weight = _mm256_set1_epi16(_new_weight); +// __m128i alpha_mask = _mm_set1_epi32(0x000000ff); +// uint32_t * restrict src = buffer; +// uint32_t * restrict dst = display_buffer; + +// for(uint32_t y = 0; y < BUFFER_HEIGHT; ++y, src += BUFFER_WIDTH, dst += BUFFER_WIDTH) { +// for(uint32_t x = 0; x < BUFFER_WIDTH; x += 4) { +// _mm_prefetch((char*)&src[x + 2 * BUFFER_WIDTH], _MM_HINT_T0); +// _mm_prefetch((char*)&dst[x + 2 * BUFFER_WIDTH], _MM_HINT_T0); + +// __m128i new_pixels = _mm_load_si128((__m128i*)&src[x]); +// __m128i old_pixels = _mm_load_si128((__m128i*)&dst[x]); + +// __m256i old_lo = _mm256_cvtepu8_epi16(old_pixels); +// __m256i new_lo = _mm256_cvtepu8_epi16(new_pixels); + +// __m256i blended = _mm256_adds_epu16(_mm256_mullo_epi16(old_lo, old_weight), _mm256_mullo_epi16(new_lo, new_weight)); +// blended = _mm256_srli_epi16(blended, 8); + +// __m128i final_pixels = _mm_packus_epi16(_mm256_castsi256_si128(blended), _mm256_extracti128_si256(blended, 1)); +// final_pixels = _mm_or_si128(final_pixels, _mm_and_si128(old_pixels, alpha_mask)); +// _mm_store_si128((__m128i*)&dst[x], final_pixels); +// } +// } +// } |
