1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
// Pointer to pixel (x, y) of the render area; RENDER_START(0, 0) is its top-left pixel.
// Rows are BUFFER_WIDTH pixels apart, the same stride used when state.display_buffer is
// computed and when rows are walked elsewhere in this file.
#define RENDER_START(x, y) (state.display_buffer + ((y) * (BUFFER_WIDTH)) + (x))
// X coordinate that horizontally centers something w pixels wide in the render area:
// half of (state.render_width - w).
// NOTE(review): if state.render_width is unsigned and w is larger than it, the subtraction
// wraps around instead of going negative - confirm callers never pass a wider w.
#define CENTER_X(w) ((state.render_width - (w)) >> 1)
/* [=]===^=[ update_render_position ]=================================================================^===[=] */
// Recomputes where the render area sits inside the backing buffer.
// state.render_x / state.render_y become half of the unused width / height, which centers
// a render_width x render_height area inside BUFFER_WIDTH x BUFFER_HEIGHT, and
// state.display_buffer is pointed at that area's top-left pixel inside `buffer`.
// NOTE(review): nothing here checks that the render size fits inside the buffer
// dimensions - confirm callers guarantee it.
static void update_render_position(void) {
// Half of the leftover space on each axis.
state.render_x = (BUFFER_WIDTH - state.render_width) >> 1;
state.render_y = (BUFFER_HEIGHT - state.render_height) >> 1;
// Skip render_y full rows of BUFFER_WIDTH pixels, then render_x pixels into that row.
state.display_buffer = buffer + (state.render_y * BUFFER_WIDTH) + state.render_x;
}
/* [=]===^=[ change_resolution ]=================================================================^===[=] */
// Switches the render area to new_width x new_height.
// When both dimensions already match the current ones this is a no-op; otherwise the new
// size is stored, the render position is recomputed and setup_render_target() is called.
static void change_resolution(uint32_t new_width, uint32_t new_height) {
	if(new_width == state.render_width && new_height == state.render_height) {
		return;
	}

	state.render_width = new_width;
	state.render_height = new_height;

	update_render_position();
	setup_render_target();
}
/* [=]===^=[ clear_buffer ]=================================================================^===[=] */
// Zeroes the render area one row at a time.
// Only the first render_width pixels of each row are cleared; rows are BUFFER_WIDTH
// pixels apart, so whatever lies beyond the render area in a row is left untouched.
__attribute__((always_inline, hot))
static inline void clear_buffer(void) {
	PROFILE_FUNCTION();
	uint32_t * restrict row = RENDER_START(0, 0);
	uint32_t rows_left = state.render_height;
	while(rows_left--) {
		// 4 bytes per uint32_t pixel.
		memset(row, 0, state.render_width * 4);
		row += BUFFER_WIDTH;
	}
}
/* [=]===^=[ set_decay ]=================================================================^===[=] */
// Blend weights read by apply_phosphor_decay(), in 1/256 units. After set_decay() they
// always sum to 256.
static uint16_t _old_weight;	// share of the previous output that is kept, 0..256
static uint16_t _new_weight;	// share taken from the freshly rendered frame, 256 - _old_weight

// Sets how strongly the previous output persists.
// old_weight: 0 keeps nothing of the previous output, 256 keeps it entirely; values above
// 256 are clamped to 256.
static void set_decay(uint16_t old_weight) {
	_old_weight = old_weight > 256 ? 256 : old_weight;
	// Derive from the clamped value: subtracting the raw argument would wrap to a huge
	// uint16_t whenever it exceeds 256.
	_new_weight = 256 - _old_weight;
}
/* [=]===^=[ apply_phosphor_decay ]=================================================================^===[=] */
// Blends the freshly rendered frame into `display_buffer`, per 8-bit channel:
//   display = (display * _old_weight + rendered * _new_weight) >> 8
// Source rows come from the render area (row stride BUFFER_WIDTH); destination rows are
// packed back to back (row stride render_width). Four pixels are handled per iteration
// using AVX2 intrinsics.
// NOTE(review): the inner loop advances 4 pixels with 16-byte loads/stores, so a
// render_width that is not a multiple of 4 would read and write past the end of the row -
// confirm widths are always multiples of 4.
__attribute__((always_inline, hot))
static inline void apply_phosphor_decay(void) {
PROFILE_FUNCTION();
// Broadcast the two blend weights into every 16-bit lane.
__m256i old_weight = _mm256_set1_epi16(_old_weight);
__m256i new_weight = _mm256_set1_epi16(_new_weight);
// Selects the low byte of each 32-bit pixel.
__m128i alpha_mask = _mm_set1_epi32(0x000000ff);
uint32_t render_width = state.render_width;
uint32_t render_height = state.render_height;
uint32_t * restrict src = RENDER_START(0, 0);
// `display_buffer` is a separate object declared outside this block, not state.display_buffer.
uint32_t * restrict dst = display_buffer;
for(uint32_t y = 0; y < render_height; ++y, src += BUFFER_WIDTH, dst += render_width) {
for(uint32_t x = 0; x < render_width; x += 4) {
// Prefetch the same column two rows ahead in both buffers.
_mm_prefetch((char*)&src[x + 2 * BUFFER_WIDTH], _MM_HINT_T0);
_mm_prefetch((char*)&dst[x + 2 * render_width], _MM_HINT_T0);
// Unaligned 16-byte loads: 4 pixels from each buffer.
__m128i new_pixels = _mm_loadu_si128((__m128i*)&src[x]);
__m128i old_pixels = _mm_loadu_si128((__m128i*)&dst[x]);
// Zero-extend the 16 channel bytes to 16-bit lanes so the weighted products fit.
__m256i old_lo = _mm256_cvtepu8_epi16(old_pixels);
__m256i new_lo = _mm256_cvtepu8_epi16(new_pixels);
// Weighted sum with unsigned saturation; as long as the two weights sum to at most 256
// the total stays <= 255 * 256 and the saturation never triggers.
__m256i blended = _mm256_adds_epu16(_mm256_mullo_epi16(old_lo, old_weight), _mm256_mullo_epi16(new_lo, new_weight));
// Divide by 256 to return to the 0..255 range.
blended = _mm256_srli_epi16(blended, 8);
// Narrow back to bytes: low 128 bits hold pixels 0-1, high 128 bits hold pixels 2-3.
__m128i final_pixels = _mm_packus_epi16(_mm256_castsi256_si128(blended), _mm256_extracti128_si256(blended, 1));
// OR the low byte of each previous destination pixel into the result.
// NOTE(review): going by the mask's name this presumably keeps the old alpha, but it is
// OR-ed with (not substituted for) the blended low byte - confirm that is intended.
final_pixels = _mm_or_si128(final_pixels, _mm_and_si128(old_pixels, alpha_mask));
_mm_storeu_si128((__m128i*)&dst[x], final_pixels);
}
}
}
|