; void fade_image(uint16_t *dest, uint32_t *src, uint16_t fade)
;
; Multiplies every colour channel of a 32bit image by `fade` and writes the
; result as 16bit 565 pixels.
;  - dest points to a 16bit 565 framebuffer
;  - src points to a 32bit RGBX image; in memory each pixel is the bytes
;    B, G, R, X (the X byte is ignored)
;  - fade is 8.8 fixed point in [0, 1], i.e. 0 (black) .. 256 (unchanged)
;
; Exactly FADE_IMAGE_PIXELS (640 * 480) pixels are read and written.
; Each channel is computed as (channel * fade) >> 8, truncated, then reduced
; to 5 (red/blue) or 6 (green) bits.
;
; watcom register calling convention arguments: eax, edx, ebx
;   in:       eax = dest, edx = src, ebx = fade
;   modifies: eax, ebx, edx, mm0, mm1, mm7, flags
;   saved:    ecx, edi
;
; NOTE: all 32 bits of ebx are read, so fade must arrive zero-extended.
; NOTE: fade values above 256 overflow the 16bit per-channel product.
FADE_IMAGE_PIXELS	equ 640 * 480

	global fade_image_
fade_image_:
	push ecx
	push edi

	mov ecx, FADE_IMAGE_PIXELS	; ecx = pixels left to convert
	mov edi, eax			; edi = dest, frees eax for packing

	; take fade and duplicate it across all words of mm1
	movd mm1, ebx		; mm1 [00|00|00|VV]
	punpckldq mm1, mm1	; mm1 [00|VV|00|VV]
	packssdw mm1, mm1	; mm1 [VV|VV|VV|VV]
	; mm7 = 0, used to zero-extend bytes to words; it never changes inside
	; the loop so it is set up once here
	pxor mm7, mm7
.loop:
	; grab RGB32 pixel and unpack it to zero-extended words in mm0
	movd mm0, [edx]		; mm0 [00|00|XR|GB]
	add edx, 4
	punpcklbw mm0, mm7	; mm0 [0X|0R|0G|0B]
	; multiply by fade and divide by 256 to drop the decimal part
	pmullw mm0, mm1
	psrlw mm0, 10	; 8 for the div + 2 to make them 666, easier 565 packing
	; pack result into 565 in ax
	packuswb mm0, mm0	; low dword bytes: [X6|R6|G6|B6], each 0..63
	movd eax, mm0
	mov ebx, eax
	shr al, 1	; al = 5bit blue     eax [X6|00RRRRRR|00GGGGGG|000BBBBB]
	xor bl, bl	; drop blue from the copy
	shr bx, 3	; bx = green << 5    ebx [X6|00RRRRRR|00000GGG|GGG00000]
	xor ah, ah	; drop green from eax, it comes back in from bx
	or ax, bx	; green in position  eax [X6|00RRRRRR|00000GGG|GGGBBBBB]
	shr ebx, 6	; red now occupies bits 10..15 of ebx
	and ebx, 0f800h	; keep the top 5 red bits (11..15), drop everything else
	or eax, ebx	; done, ax = [RRRRRGGG|GGGBBBBB]
	mov [edi], ax
	add edi, 2

	dec ecx
	jnz .loop

	emms		; clear fpu state

	pop edi
	pop ecx
	ret