AxibugEmuOnline_old/References/VirtuaNESex_src_191105/nx_2xSaI.h
2024-08-05 17:58:53 +08:00

1256 lines
27 KiB
C

/*---------------------------------------------------------------------*
* The following (piece of) code, (part of) the 2xSaI engine, *
* copyright (c) 2001 by Derek Liauw Kie Fa. *
* Non-Commercial use of the engine is allowed and is encouraged, *
* provided that appropriate credit be given and that this copyright *
* notice will not be removed under any circumstance. *
* You may freely modify this code, but I request *
* that any improvements to the engine be submitted to me, so *
* that I can implement these improvements in newer versions of *
* the engine. *
* If you need more information, have any comments or suggestions, *
* you can e-mail me. My e-mail: DerekL666@yahoo.com *
*---------------------------------------------------------------------*/
//
// This code was converted into VirtuaNES by Norix.
//
//
// nx_2xSaILine_16bpp_mmx
//
// MMX implementation of the 2xSaI scaler for one source line, 16-bit output.
// Each loop iteration consumes 4 source pixels (8 bytes) and writes 8 output
// pixels (16 bytes) to each of two destination rows: [pDst] and [pDst+dstPitch].
//
// pSrc        : source cursor. eax is initialised to pSrc - srcPitch, and the
//               loads address four consecutive rows from there:
//               [eax], [eax+ebx], [eax+ebx*2] and [esi+ebx*2] (esi = eax+ebx).
// pDlt        : delta buffer. It is compared against the source to detect
//               unchanged pixels and is rewritten with the source colorI
//               quadword on every iteration. Rows are stepped with srcPitch.
// srcPitch    : byte stride between source rows.
// width       : remaining source pixels; decremented by 4 per iteration, the
//               loop continues while the result is greater than zero (jg).
// pDst        : destination cursor for the first output row.
// dstPitch    : byte offset from the first output row to the second.
// bForceWrite : non-zero skips the delta comparison and always renders.
//
// colorA..colorP, Mask1, Mask2, ACPixel, cMask, qMask, lMask and ONE are
// declared outside this function.
// NOTE(review): colorA..colorP appear to be byte displacements selecting the
// neighbours shown in the pixel map further down - confirm at their definition.
//
static void nx_2xSaILine_16bpp_mmx( euI8* pSrc, euI8* pDlt, euI32 srcPitch, euI32 width, euI8* pDst, euI32 dstPitch, euI32 bForceWrite )
{
__asm {
// eax = source cursor (one srcPitch before pSrc), ebx = srcPitch, edx = destination cursor
mov eax, pSrc
mov ebx, srcPitch
mov edx, pDst
sub eax, ebx
nx_2xSaILine_16mmx_loop:
// bForceWrite != 0: store the current colorI quadword into the delta buffer
// and render unconditionally, bypassing the comparison below
mov ecx, bForceWrite
test ecx, ecx
jz nx_2xSaILine_16mmx_normal
mov esi, pDlt
movq mm6, [eax+colorI]
movq [esi+colorI], mm6
jmp nx_2xSaILine_16mmx_forcewrite
nx_2xSaILine_16mmx_normal:
// Check delta
mov ecx, pDlt
// load source img
// two quadwords from each of the four rows (I/J, G/K, H/L, M/P)
lea esi, [eax+ebx]
movq mm0, [eax+colorI]
movq mm1, [eax+colorJ]
movq mm2, [eax+ebx+colorG]
movq mm3, [eax+ebx+colorK]
movq mm4, [eax+ebx*2+colorH]
movq mm5, [eax+ebx*2+colorL]
movq mm6, [esi+ebx*2+colorM]
movq mm7, [esi+ebx*2+colorP]
// compare to delta
// same addressing as above, but relative to the delta buffer (ecx)
lea esi, [ecx+ebx]
pcmpeqw mm0, [ecx+colorI]
pcmpeqw mm1, [ecx+colorJ]
pcmpeqw mm2, [ecx+ebx+colorG]
pcmpeqw mm3, [ecx+ebx+colorK]
pcmpeqw mm4, [ecx+ebx*2+colorH]
pcmpeqw mm5, [ecx+ebx*2+colorL]
pcmpeqw mm6, [esi+ebx*2+colorM]
pcmpeqw mm7, [esi+ebx*2+colorP]
// compose results
// AND of all eight comparisons: a word lane of mm0 stays 0xFFFF only if
// every compared quadword matched the delta buffer in that lane
pand mm0, mm1
pand mm2, mm3
pand mm4, mm5
pand mm6, mm7
pand mm0, mm2
pand mm4, mm6
pxor mm7, mm7
pand mm0, mm4
movq mm6, [eax+colorI]
// mm7 = lanes where something differed (mm0 lane == 0)
pcmpeqw mm7, mm0
// refresh the delta buffer with the current source colorI quadword
movq [ecx+colorI], mm6
// pack the four lane flags into ecx; zero means no lane changed -> skip rendering
packsswb mm7, mm7
movd ecx, mm7
test ecx, ecx
jz nx_2xSaILine_16mmx_skipprocess
// End Delta
nx_2xSaILine_16mmx_forcewrite:
//------------------------------
// 1
// Mask1 = lanes where the first output row's second pixel takes colorA
// if ((colorA == colorD) && (colorB != colorC) && (colorA == colorE) && (colorB == colorL))
movq mm0, [eax+ebx+colorA] // mm0 and mm1 contain colorA
movq mm2, [eax+ebx+colorB] // mm2 and mm3 contain colorB
movq mm1, mm0
movq mm3, mm2
pcmpeqw mm0, [eax+ebx*2+colorD]
pcmpeqw mm1, [eax+colorE]
pcmpeqw mm2, [eax+ebx*2+colorL]
pcmpeqw mm3, [eax+ebx*2+colorC]
pand mm0, mm1
pxor mm1, mm1
pand mm0, mm2
// comparing the equality mask with zero inverts it: mm3 = (colorB != colorC)
pcmpeqw mm3, mm1
pand mm0, mm3 // result in mm0
// if ((colorA == colorC) && (colorB != colorE) && (colorA == colorF) && (colorB == colorJ))
movq mm4, [eax+ebx+colorA] // mm4 and mm5 contain colorA
movq mm6, [eax+ebx+colorB] // mm6 and mm7 contain colorB
movq mm5, mm4
movq mm7, mm6
pcmpeqw mm4, [eax+ebx*2+colorC]
pcmpeqw mm5, [eax+colorF]
pcmpeqw mm6, [eax+colorJ]
pcmpeqw mm7, [eax+colorE]
pand mm4, mm5
pxor mm5, mm5
pand mm4, mm6
pcmpeqw mm7, mm5
pand mm4, mm7 // result in mm4
por mm0, mm4 // combine the masks
movq Mask1, mm0
//------------------------------
// 2
// Mask2 = lanes where the first output row's second pixel takes colorB
// if ((colorB == colorC) && (colorA != colorD) && (colorB == colorF) && (colorA == colorH))
movq mm0, [eax+ebx+colorB] // mm0 and mm1 contain colorB
movq mm2, [eax+ebx+colorA] // mm2 and mm3 contain colorA
movq mm1, mm0
movq mm3, mm2
pcmpeqw mm0, [eax+ebx*2+colorC]
pcmpeqw mm1, [eax+colorF]
pcmpeqw mm2, [eax+ebx*2+colorH]
pcmpeqw mm3, [eax+ebx*2+colorD]
pand mm0, mm1
pxor mm1, mm1
pand mm0, mm2
pcmpeqw mm3, mm1
pand mm0, mm3 // result in mm0
// if ((colorB == colorE) && (colorB == colorD) && (colorA != colorF) && (colorA == colorI))
movq mm4, [eax+ebx+colorB] // mm4 and mm5 contain colorB
movq mm6, [eax+ebx+colorA] // mm6 and mm7 contain colorA
movq mm5, mm4
movq mm7, mm6
pcmpeqw mm4, [eax+ebx*2+colorD]
pcmpeqw mm5, [eax+colorE]
pcmpeqw mm6, [eax+colorI]
pcmpeqw mm7, [eax+colorF]
pand mm4, mm5
pxor mm5, mm5
pand mm4, mm6
pcmpeqw mm7, mm5
pand mm4, mm7 // result in mm4
por mm0, mm4 // combine the masks
movq Mask2, mm0
//------------------------------
// interpolate colorA and colorB
//------------------------------
// mm0 = (A & B) + (((A ^ B) & cMask) >> 1)
movq mm0, [eax+ebx+colorA]
movq mm1, [eax+ebx+colorB]
movq mm2, mm0
movq mm3, mm1
movq mm6, cMask
pxor mm3, mm2
pand mm0, mm1
pand mm3, mm6
psrlw mm3, 1
paddw mm0, mm3 // mm0 contains the interpolated values
// assemble the pixels
// mm0 = (A & Mask1) | (B & Mask2) | (interpolated & lanes where both masks are 0)
// mm5/mm6 keep copies of colorA for the interleave below
movq mm1, [eax+ebx+colorA]
movq mm2, [eax+ebx+colorB]
movq mm3, Mask1
movq mm5, mm1
movq mm4, Mask2
movq mm6, mm1
pand mm1, mm3
por mm3, mm4
pxor mm7, mm7
pand mm2, mm4
pcmpeqw mm3, mm7
por mm1, mm2
pand mm0, mm3
por mm0, mm1
// interleave words: colorA, assembled pixel, colorA, assembled pixel, ...
punpcklwd mm5, mm0
punpckhwd mm6, mm0
//------------------------------
// Write image
//------------------------------
// first output row: 8 pixels (16 bytes)
movq [edx+0], mm5
movq [edx+8], mm6
//------------------------------
// Create the Nextline
//------------------------------
// 3
// Mask1 = lanes where the second output row's first pixel takes colorA
// if ((colorA == colorD) && (colorB != colorC) && (colorA == colorG) && (colorC == colorO))
lea esi, [eax+ebx]
movq mm0, [eax+ebx+colorA] // mm0 and mm1 contain colorA
movq mm2, [eax+ebx*2+colorC] // mm2 and mm3 contain colorC
movq mm1, mm0
movq mm3, mm2
pcmpeqw mm0, [esi+ebx+colorD]
pcmpeqw mm1, [esi+colorG]
pcmpeqw mm2, [esi+ebx*2+colorO]
pcmpeqw mm3, [esi+colorB]
pand mm0, mm1
pxor mm1, mm1
pand mm0, mm2
pcmpeqw mm3, mm1
pand mm0, mm3 // result in mm0
// if ((colorA == colorB) && (colorG != colorC) && (colorA == colorH) && (colorC == colorM))
movq mm4, [eax+ebx+colorA] // mm4 and mm5 contain colorA
movq mm6, [eax+ebx*2+colorC] // mm6 and mm7 contain colorC
movq mm5, mm4
movq mm7, mm6
pcmpeqw mm4, [esi+ebx+colorH]
pcmpeqw mm5, [esi+colorB]
pcmpeqw mm6, [esi+ebx*2+colorM]
pcmpeqw mm7, [esi+colorG]
pand mm4, mm5
pxor mm5, mm5
pand mm4, mm6
pcmpeqw mm7, mm5
pand mm4, mm7 // result in mm4
por mm0, mm4 // combine the masks
movq Mask1, mm0
//------------------------------
// 4
// Mask2 = lanes where the second output row's first pixel takes colorC
// if ((colorB == colorC) && (colorA != colorD) && (colorC == colorH) && (colorA == colorF))
movq mm0, [eax+ebx*2+colorC] // mm0 and mm1 contain colorC
movq mm2, [eax+ebx+colorA] // mm2 and mm3 contain colorA
movq mm1, mm0
movq mm3, mm2
pcmpeqw mm0, [eax+ebx+colorB]
pcmpeqw mm1, [eax+ebx*2+colorH]
pcmpeqw mm2, [eax+colorF]
pcmpeqw mm3, [eax+ebx*2+colorD]
pand mm0, mm1
pxor mm1, mm1
pand mm0, mm2
pcmpeqw mm3, mm1
pand mm0, mm3 // result in mm0
// if ((colorC == colorG) && (colorC == colorD) && (colorA != colorH) && (colorA == colorI))
movq mm4, [eax+ebx*2+colorC] // mm4 and mm5 contain colorC
movq mm6, [eax+ebx+colorA] // mm6 and mm7 contain colorA
movq mm5, mm4
movq mm7, mm6
pcmpeqw mm4, [eax+ebx*2+colorD]
pcmpeqw mm5, [eax+ebx+colorG]
pcmpeqw mm6, [eax+colorI]
pcmpeqw mm7, [eax+ebx*2+colorH]
pand mm4, mm5
pxor mm5, mm5
pand mm4, mm6
pcmpeqw mm7, mm5
pand mm4, mm7 // result in mm4
por mm0, mm4 ;combine the masks
movq Mask2, mm0
//------------------------------
// interpolate colorA and colorC
//------------------------------
// mm0 = (A & C) + (((A ^ C) & cMask) >> 1)
movq mm0, [eax+ebx+colorA]
movq mm1, [eax+ebx*2+colorC]
movq mm2, mm0
movq mm3, mm1
movq mm6, cMask
pxor mm3, mm2
pand mm0, mm1
pand mm3, mm6
psrlw mm3, 1
paddw mm0, mm3 // mm0 contains the interpolated values
// assemble the pixels
// ACPixel = (A & Mask1) | (C & Mask2) | (interpolated & lanes where both masks are 0)
movq mm1, [eax+ebx+colorA]
movq mm2, [eax+ebx*2+colorC]
movq mm3, Mask1
movq mm4, Mask2
pand mm1, mm3
pand mm2, mm4
por mm3, mm4
pxor mm7, mm7
por mm1, mm2
pcmpeqw mm3, mm7
pand mm0, mm3
por mm0, mm1
movq ACPixel, mm0
//------------------------------
// Decide which "branch" to take
//------------------------------
// Builds the masks for the diagonal pixel of the second output row:
//   Mask1 (take colorA) = (A == D && B != C) or (A == D && B == C && A == B)
//   Mask2 (take colorB) = (A != D && B == C)
//   mm0                 = (A == D && B == C && A != B), the lanes that need
//                         the neighbour vote below
movq mm0, [eax+ebx+colorA]
movq mm1, [eax+ebx+colorB]
movq mm6, mm0
movq mm7, mm1
pcmpeqw mm0, [eax+ebx*2+colorD]
pcmpeqw mm1, [eax+ebx*2+colorC]
pcmpeqw mm6, mm7
movq mm2, mm0
movq mm3, mm0
pand mm0, mm1 // colorA == colorD && colorB == colorC
pxor mm7, mm7
pcmpeqw mm2, mm7
pand mm6, mm0
pand mm2, mm1 // colorA != colorD && colorB == colorC
pcmpeqw mm1, mm7
pand mm1, mm3 // colorA == colorD && colorB != colorC
pxor mm0, mm6
por mm1, mm6
movq mm7, mm0
movq Mask2, mm2
packsswb mm7, mm7
movq Mask1, mm1
// no lane needs the vote -> go straight to the four-way interpolation
movd ecx, mm7
test ecx, ecx
jz nx_2xSaILine_16mmx_skipguess
//------------------------------
// Map of the pixels: I|E F|J
// G|A B|K
// H|C D|L
// M|N O|P
//------------------------------
// Neighbour vote for the undecided lanes.
//   mm4 = colorA, mm5 = colorB, mm7 = signed per-lane score (starts at 0)
//   mm6 = ONE in the undecided lanes, 0 elsewhere
// The same pattern runs for the neighbour pairs (E,G), (F,K), (H,N), (L,O):
//   mm0 = sum of mm6 for each neighbour equal to A,
//   mm2 = sum of mm6 for each neighbour equal to B,
//   score += mm6 where mm0 is not greater than mm6,
//   score -= mm6 where mm2 is not greater than mm6.
movq mm6, mm0
movq mm4, [eax+ebx+colorA]
movq mm5, [eax+ebx+colorB]
pxor mm7, mm7
pand mm6, ONE
// pair (E, G)
movq mm0, [eax+colorE]
movq mm1, [eax+ebx+colorG]
movq mm2, mm0
movq mm3, mm1
pcmpeqw mm0, mm4
pcmpeqw mm1, mm4
pcmpeqw mm2, mm5
pcmpeqw mm3, mm5
pand mm0, mm6
pand mm1, mm6
pand mm2, mm6
pand mm3, mm6
paddw mm0, mm1
paddw mm2, mm3
pxor mm3, mm3
pcmpgtw mm0, mm6
pcmpgtw mm2, mm6
pcmpeqw mm0, mm3
pcmpeqw mm2, mm3
pand mm0, mm6
pand mm2, mm6
paddw mm7, mm0
psubw mm7, mm2
// pair (F, K)
movq mm0, [eax+colorF]
movq mm1, [eax+ebx+colorK]
movq mm2, mm0
movq mm3, mm1
pcmpeqw mm0, mm4
pcmpeqw mm1, mm4
pcmpeqw mm2, mm5
pcmpeqw mm3, mm5
pand mm0, mm6
pand mm1, mm6
pand mm2, mm6
pand mm3, mm6
paddw mm0, mm1
paddw mm2, mm3
// esi = eax + srcPitch, used to reach the lower two rows in the next two pairs
lea esi, [eax+ebx]
pxor mm3, mm3
pcmpgtw mm0, mm6
pcmpgtw mm2, mm6
pcmpeqw mm0, mm3
pcmpeqw mm2, mm3
pand mm0, mm6
pand mm2, mm6
paddw mm7, mm0
psubw mm7, mm2
// pair (H, N)
movq mm0, [esi+ebx+colorH]
movq mm1, [esi+ebx*2+colorN]
movq mm2, mm0
movq mm3, mm1
pcmpeqw mm0, mm4
pcmpeqw mm1, mm4
pcmpeqw mm2, mm5
pcmpeqw mm3, mm5
pand mm0, mm6
pand mm1, mm6
pand mm2, mm6
pand mm3, mm6
paddw mm0, mm1
paddw mm2, mm3
pxor mm3, mm3
pcmpgtw mm0, mm6
pcmpgtw mm2, mm6
pcmpeqw mm0, mm3
pcmpeqw mm2, mm3
pand mm0, mm6
pand mm2, mm6
paddw mm7, mm0
psubw mm7, mm2
// pair (L, O)
movq mm0, [esi+ebx+colorL]
movq mm1, [esi+ebx*2+colorO]
movq mm2, mm0
movq mm3, mm1
pcmpeqw mm0, mm4
pcmpeqw mm1, mm4
pcmpeqw mm2, mm5
pcmpeqw mm3, mm5
pand mm0, mm6
pand mm1, mm6
pand mm2, mm6
pand mm3, mm6
paddw mm0, mm1
paddw mm2, mm3
pxor mm3, mm3
pcmpgtw mm0, mm6
pcmpgtw mm2, mm6
pcmpeqw mm0, mm3
pcmpeqw mm2, mm3
pand mm0, mm6
pand mm2, mm6
paddw mm7, mm0
psubw mm7, mm2
// score > 0 adds the lane to Mask1, score < 0 adds it to Mask2;
// a score of 0 leaves the lane to the four-way interpolation
movq mm1, mm7
pxor mm0, mm0
pcmpgtw mm7, mm0
pcmpgtw mm0, mm1
por mm7, Mask1
por mm0, Mask2
movq Mask1, mm7
movq Mask2, mm0
nx_2xSaILine_16mmx_skipguess:
//------------------------------
// interpolate A, B, C and D
//------------------------------
// mm0 = sum over A,B,C,D of ((x & qMask) >> 2)
//     + (((sum over A,B,C,D of (x & lMask)) >> 2) & lMask)
movq mm0, [eax+ebx+colorA]
movq mm1, [eax+ebx+colorB]
movq mm4, mm0
movq mm2, [eax+ebx*2+colorC]
movq mm5, mm1
movq mm3, qMask // qcolorMask
movq mm6, mm2
movq mm7, lMask // qlowcolorMask
pand mm0, mm3
pand mm1, mm3
pand mm2, mm3
pand mm3, [eax+ebx*2+colorD]
psrlw mm0, 2
pand mm4, mm7
psrlw mm1, 2
pand mm5, mm7
psrlw mm2, 2
pand mm6, mm7
psrlw mm3, 2
pand mm7, [eax+ebx*2+colorD]
paddw mm0, mm1
paddw mm2, mm3
paddw mm4, mm5
paddw mm6, mm7
paddw mm4, mm6
movq mm7, lMask // qlowcolorMask
paddw mm0, mm2
psrlw mm4, 2
pand mm4, mm7
paddw mm0, mm4 // mm0 contains the interpolated value of A, B, C and D
// assemble the pixels
// mm4 = (A & Mask1) | (B & Mask2) | (interpolated & lanes where both masks are 0)
movq mm1, Mask1
movq mm2, Mask2
movq mm4, [eax+ebx+colorA]
movq mm5, [eax+ebx+colorB]
pand mm4, mm1
pand mm5, mm2
pxor mm7, mm7
por mm1, mm2
por mm4, mm5
pcmpeqw mm1, mm7
pand mm0, mm1
por mm4, mm0 // mm4 contains the diagonal pixels
// interleave words: ACPixel, diagonal, ACPixel, diagonal, ...
movq mm0, ACPixel
mov edi, dstPitch
movq mm1, mm0
punpcklwd mm0, mm4
punpckhwd mm1, mm4
//------------------------------
// Write image
//------------------------------
// second output row (dstPitch bytes after the first): 8 pixels (16 bytes)
movq [edx+edi+0], mm0
movq [edx+edi+8], mm1
nx_2xSaILine_16mmx_skipprocess:
// advance all cursors; the destination is advanced even when rendering was skipped
add pDlt, 8 // 4 pixels
lea eax, [eax+ 8] // 4 pixels
lea edx, [edx+16] // 8 pixels
sub width, 4 // 4 pixels
jg nx_2xSaILine_16mmx_loop
// leave MMX state so later x87 floating-point code works
emms
}
}
//
// nx_2xSaILine_32bpp_mmx
//
// MMX implementation of the 2xSaI scaler for one source line, 32-bit output.
// The filtering is done on 16-bit word lanes; each resulting 16-bit pixel is
// then expanded to a 32-bit pixel when it is stored. Each loop iteration
// consumes 4 source pixels (8 bytes) and writes 8 output pixels (32 bytes) to
// each of two destination rows: [pDst] and [pDst+dstPitch].
//
// pSrc        : source cursor. eax is initialised to pSrc - srcPitch, and the
//               loads address four consecutive rows from there:
//               [eax], [eax+ebx], [eax+ebx*2] and [esi+ebx*2] (esi = eax+ebx).
// pDlt        : delta buffer. It is compared against the source to detect
//               unchanged pixels and is rewritten with the source colorI
//               quadword on every iteration. Rows are stepped with srcPitch.
// srcPitch    : byte stride between source rows.
// width       : remaining source pixels; decremented by 4 per iteration, the
//               loop continues while the result is greater than zero (jg).
// pDst        : destination cursor for the first output row.
// dstPitch    : byte offset from the first output row to the second.
// bForceWrite : non-zero skips the delta comparison and always renders.
//
// colorA..colorP, Mask1, Mask2, ACPixel, cMask, qMask, lMask and ONE are
// declared outside this function.
// NOTE(review): colorA..colorP appear to be byte displacements selecting the
// neighbours shown in the pixel map further down - confirm at their definition.
//
// Pixel expansion used by both "Write image" sections, per 16-bit pixel:
//   source bits  0-4  -> result bits  3-7  (al after shl eax,3)
//   source bits  5-9  -> result bits 11-15 (ah after the extra shl ah,3)
//   source bits 10-14 -> result bits 19-23 (ebx after shl 9 and the 0x00F8F8F8 mask)
//   every other result bit is 0
//
static void nx_2xSaILine_32bpp_mmx( euI8* pSrc, euI8* pDlt, euI32 srcPitch, euI32 width, euI8* pDst, euI32 dstPitch, euI32 bForceWrite )
{
__asm {
// eax = source cursor (one srcPitch before pSrc), ebx = srcPitch, edx = destination cursor
mov eax, pSrc
mov ebx, srcPitch
mov edx, pDst
sub eax, ebx
nx_2xSaILine_32mmx_loop:
// bForceWrite != 0: store the current colorI quadword into the delta buffer
// and render unconditionally, bypassing the comparison below
mov ecx, bForceWrite
test ecx, ecx
jz nx_2xSaILine_32mmx_normal
mov esi, pDlt
movq mm6, [eax+colorI]
movq [esi+colorI], mm6
jmp nx_2xSaILine_32mmx_forcewrite
nx_2xSaILine_32mmx_normal:
// Check delta
mov ecx, pDlt
// load source img
// two quadwords from each of the four rows (I/J, G/K, H/L, M/P)
lea esi, [eax+ebx]
movq mm0, [eax+colorI]
movq mm1, [eax+colorJ]
movq mm2, [eax+ebx+colorG]
movq mm3, [eax+ebx+colorK]
movq mm4, [eax+ebx*2+colorH]
movq mm5, [eax+ebx*2+colorL]
movq mm6, [esi+ebx*2+colorM]
movq mm7, [esi+ebx*2+colorP]
// compare to delta
// same addressing as above, but relative to the delta buffer (ecx)
lea esi, [ecx+ebx]
pcmpeqw mm0, [ecx+colorI]
pcmpeqw mm1, [ecx+colorJ]
pcmpeqw mm2, [ecx+ebx+colorG]
pcmpeqw mm3, [ecx+ebx+colorK]
pcmpeqw mm4, [ecx+ebx*2+colorH]
pcmpeqw mm5, [ecx+ebx*2+colorL]
pcmpeqw mm6, [esi+ebx*2+colorM]
pcmpeqw mm7, [esi+ebx*2+colorP]
// compose results
// AND of all eight comparisons: a word lane of mm0 stays 0xFFFF only if
// every compared quadword matched the delta buffer in that lane
pand mm0, mm1
pand mm2, mm3
pand mm4, mm5
pand mm6, mm7
pand mm0, mm2
pand mm4, mm6
pxor mm7, mm7
pand mm0, mm4
movq mm6, [eax+colorI]
// mm7 = lanes where something differed (mm0 lane == 0)
pcmpeqw mm7, mm0
// refresh the delta buffer with the current source colorI quadword
movq [ecx+colorI], mm6
// pack the four lane flags into ecx; zero means no lane changed -> skip rendering
packsswb mm7, mm7
movd ecx, mm7
test ecx, ecx
jz nx_2xSaILine_32mmx_skipprocess
// End Delta
nx_2xSaILine_32mmx_forcewrite:
//------------------------------
// 1
// Mask1 = lanes where the first output row's second pixel takes colorA
// if ((colorA == colorD) && (colorB != colorC) && (colorA == colorE) && (colorB == colorL))
movq mm0, [eax+ebx+colorA] // mm0 and mm1 contain colorA
movq mm2, [eax+ebx+colorB] // mm2 and mm3 contain colorB
movq mm1, mm0
movq mm3, mm2
pcmpeqw mm0, [eax+ebx*2+colorD]
pcmpeqw mm1, [eax+colorE]
pcmpeqw mm2, [eax+ebx*2+colorL]
pcmpeqw mm3, [eax+ebx*2+colorC]
pand mm0, mm1
pxor mm1, mm1
pand mm0, mm2
// comparing the equality mask with zero inverts it: mm3 = (colorB != colorC)
pcmpeqw mm3, mm1
pand mm0, mm3 // result in mm0
// if ((colorA == colorC) && (colorB != colorE) && (colorA == colorF) && (colorB == colorJ))
movq mm4, [eax+ebx+colorA] // mm4 and mm5 contain colorA
movq mm6, [eax+ebx+colorB] // mm6 and mm7 contain colorB
movq mm5, mm4
movq mm7, mm6
pcmpeqw mm4, [eax+ebx*2+colorC]
pcmpeqw mm5, [eax+colorF]
pcmpeqw mm6, [eax+colorJ]
pcmpeqw mm7, [eax+colorE]
pand mm4, mm5
pxor mm5, mm5
pand mm4, mm6
pcmpeqw mm7, mm5
pand mm4, mm7 // result in mm4
por mm0, mm4 // combine the masks
movq Mask1, mm0
//------------------------------
// 2
// Mask2 = lanes where the first output row's second pixel takes colorB
// if ((colorB == colorC) && (colorA != colorD) && (colorB == colorF) && (colorA == colorH))
movq mm0, [eax+ebx+colorB] // mm0 and mm1 contain colorB
movq mm2, [eax+ebx+colorA] // mm2 and mm3 contain colorA
movq mm1, mm0
movq mm3, mm2
pcmpeqw mm0, [eax+ebx*2+colorC]
pcmpeqw mm1, [eax+colorF]
pcmpeqw mm2, [eax+ebx*2+colorH]
pcmpeqw mm3, [eax+ebx*2+colorD]
pand mm0, mm1
pxor mm1, mm1
pand mm0, mm2
pcmpeqw mm3, mm1
pand mm0, mm3 // result in mm0
// if ((colorB == colorE) && (colorB == colorD) && (colorA != colorF) && (colorA == colorI))
movq mm4, [eax+ebx+colorB] // mm4 and mm5 contain colorB
movq mm6, [eax+ebx+colorA] // mm6 and mm7 contain colorA
movq mm5, mm4
movq mm7, mm6
pcmpeqw mm4, [eax+ebx*2+colorD]
pcmpeqw mm5, [eax+colorE]
pcmpeqw mm6, [eax+colorI]
pcmpeqw mm7, [eax+colorF]
pand mm4, mm5
pxor mm5, mm5
pand mm4, mm6
pcmpeqw mm7, mm5
pand mm4, mm7 // result in mm4
por mm0, mm4 // combine the masks
movq Mask2, mm0
//------------------------------
// interpolate colorA and colorB
//------------------------------
// mm0 = (A & B) + (((A ^ B) & cMask) >> 1)
movq mm0, [eax+ebx+colorA]
movq mm1, [eax+ebx+colorB]
movq mm2, mm0
movq mm3, mm1
movq mm6, cMask
pxor mm3, mm2
pand mm0, mm1
pand mm3, mm6
psrlw mm3, 1
paddw mm0, mm3 // mm0 contains the interpolated values
// assemble the pixels
// mm0 = (A & Mask1) | (B & Mask2) | (interpolated & lanes where both masks are 0)
// mm5/mm6 keep copies of colorA for the interleave below
movq mm1, [eax+ebx+colorA]
movq mm2, [eax+ebx+colorB]
movq mm3, Mask1
movq mm5, mm1
movq mm4, Mask2
movq mm6, mm1
pand mm1, mm3
por mm3, mm4
pxor mm7, mm7
pand mm2, mm4
pcmpeqw mm3, mm7
por mm1, mm2
pand mm0, mm3
por mm0, mm1
// interleave words: colorA, assembled pixel, colorA, assembled pixel, ...
punpcklwd mm5, mm0
punpckhwd mm6, mm0
//------------------------------
// Write image
//------------------------------
// First output row. The expansion needs eax/ebx as scratch, so the source
// cursor and srcPitch are parked in esi/edi and restored afterwards.
// Each step below expands the low word of mm5/mm6 to 32 bits, stores it,
// and shifts the register right by 16 to expose the next pixel.
// save
mov esi, eax
mov edi, ebx
mov ecx, 0x00F8F8F8 // mask
// movq [edx+0], mm5
movd eax, mm5
movd ebx, mm5
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
psrlq mm5, 16
mov [edx+0], ebx
movd eax, mm5
movd ebx, mm5
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
psrlq mm5, 16
mov [edx+4], ebx
movd eax, mm5
movd ebx, mm5
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
psrlq mm5, 16
mov [edx+8], ebx
movd eax, mm5
movd ebx, mm5
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
mov [edx+12], ebx
// movq [edx+8], mm6
movd eax, mm6
movd ebx, mm6
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
psrlq mm6, 16
mov [edx+16], ebx
movd eax, mm6
movd ebx, mm6
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
psrlq mm6, 16
mov [edx+20], ebx
movd eax, mm6
movd ebx, mm6
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
psrlq mm6, 16
mov [edx+24], ebx
movd eax, mm6
movd ebx, mm6
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
mov [edx+28], ebx
// restore
mov eax, esi
mov ebx, edi
//------------------------------
// Create the Nextline
//------------------------------
// 3
// Mask1 = lanes where the second output row's first pixel takes colorA
// if ((colorA == colorD) && (colorB != colorC) && (colorA == colorG) && (colorC == colorO))
lea esi, [eax+ebx]
movq mm0, [eax+ebx+colorA] // mm0 and mm1 contain colorA
movq mm2, [eax+ebx*2+colorC] // mm2 and mm3 contain colorC
movq mm1, mm0
movq mm3, mm2
pcmpeqw mm0, [esi+ebx+colorD]
pcmpeqw mm1, [esi+colorG]
pcmpeqw mm2, [esi+ebx*2+colorO]
pcmpeqw mm3, [esi+colorB]
pand mm0, mm1
pxor mm1, mm1
pand mm0, mm2
pcmpeqw mm3, mm1
pand mm0, mm3 // result in mm0
// if ((colorA == colorB) && (colorG != colorC) && (colorA == colorH) && (colorC == colorM))
movq mm4, [eax+ebx+colorA] // mm4 and mm5 contain colorA
movq mm6, [eax+ebx*2+colorC] // mm6 and mm7 contain colorC
movq mm5, mm4
movq mm7, mm6
pcmpeqw mm4, [esi+ebx+colorH]
pcmpeqw mm5, [esi+colorB]
pcmpeqw mm6, [esi+ebx*2+colorM]
pcmpeqw mm7, [esi+colorG]
pand mm4, mm5
pxor mm5, mm5
pand mm4, mm6
pcmpeqw mm7, mm5
pand mm4, mm7 // result in mm4
por mm0, mm4 // combine the masks
movq Mask1, mm0
//------------------------------
// 4
// Mask2 = lanes where the second output row's first pixel takes colorC
// if ((colorB == colorC) && (colorA != colorD) && (colorC == colorH) && (colorA == colorF))
movq mm0, [eax+ebx*2+colorC] // mm0 and mm1 contain colorC
movq mm2, [eax+ebx+colorA] // mm2 and mm3 contain colorA
movq mm1, mm0
movq mm3, mm2
pcmpeqw mm0, [eax+ebx+colorB]
pcmpeqw mm1, [eax+ebx*2+colorH]
pcmpeqw mm2, [eax+colorF]
pcmpeqw mm3, [eax+ebx*2+colorD]
pand mm0, mm1
pxor mm1, mm1
pand mm0, mm2
pcmpeqw mm3, mm1
pand mm0, mm3 // result in mm0
// if ((colorC == colorG) && (colorC == colorD) && (colorA != colorH) && (colorA == colorI))
movq mm4, [eax+ebx*2+colorC] // mm4 and mm5 contain colorC
movq mm6, [eax+ebx+colorA] // mm6 and mm7 contain colorA
movq mm5, mm4
movq mm7, mm6
pcmpeqw mm4, [eax+ebx*2+colorD]
pcmpeqw mm5, [eax+ebx+colorG]
pcmpeqw mm6, [eax+colorI]
pcmpeqw mm7, [eax+ebx*2+colorH]
pand mm4, mm5
pxor mm5, mm5
pand mm4, mm6
pcmpeqw mm7, mm5
pand mm4, mm7 // result in mm4
por mm0, mm4 ;combine the masks
movq Mask2, mm0
//------------------------------
// interpolate colorA and colorC
//------------------------------
// mm0 = (A & C) + (((A ^ C) & cMask) >> 1)
movq mm0, [eax+ebx+colorA]
movq mm1, [eax+ebx*2+colorC]
movq mm2, mm0
movq mm3, mm1
movq mm6, cMask
pxor mm3, mm2
pand mm0, mm1
pand mm3, mm6
psrlw mm3, 1
paddw mm0, mm3 // mm0 contains the interpolated values
// assemble the pixels
// ACPixel = (A & Mask1) | (C & Mask2) | (interpolated & lanes where both masks are 0)
movq mm1, [eax+ebx+colorA]
movq mm2, [eax+ebx*2+colorC]
movq mm3, Mask1
movq mm4, Mask2
pand mm1, mm3
pand mm2, mm4
por mm3, mm4
pxor mm7, mm7
por mm1, mm2
pcmpeqw mm3, mm7
pand mm0, mm3
por mm0, mm1
movq ACPixel, mm0
//------------------------------
// Decide which "branch" to take
//------------------------------
// Builds the masks for the diagonal pixel of the second output row:
//   Mask1 (take colorA) = (A == D && B != C) or (A == D && B == C && A == B)
//   Mask2 (take colorB) = (A != D && B == C)
//   mm0                 = (A == D && B == C && A != B), the lanes that need
//                         the neighbour vote below
movq mm0, [eax+ebx+colorA]
movq mm1, [eax+ebx+colorB]
movq mm6, mm0
movq mm7, mm1
pcmpeqw mm0, [eax+ebx*2+colorD]
pcmpeqw mm1, [eax+ebx*2+colorC]
pcmpeqw mm6, mm7
movq mm2, mm0
movq mm3, mm0
pand mm0, mm1 // colorA == colorD && colorB == colorC
pxor mm7, mm7
pcmpeqw mm2, mm7
pand mm6, mm0
pand mm2, mm1 // colorA != colorD && colorB == colorC
pcmpeqw mm1, mm7
pand mm1, mm3 // colorA == colorD && colorB != colorC
pxor mm0, mm6
por mm1, mm6
movq mm7, mm0
movq Mask2, mm2
packsswb mm7, mm7
movq Mask1, mm1
// no lane needs the vote -> go straight to the four-way interpolation
movd ecx, mm7
test ecx, ecx
jz nx_2xSaILine_32mmx_skipguess
//------------------------------
// Map of the pixels: I|E F|J
// G|A B|K
// H|C D|L
// M|N O|P
//------------------------------
// Neighbour vote for the undecided lanes.
//   mm4 = colorA, mm5 = colorB, mm7 = signed per-lane score (starts at 0)
//   mm6 = ONE in the undecided lanes, 0 elsewhere
// The same pattern runs for the neighbour pairs (E,G), (F,K), (H,N), (L,O):
//   mm0 = sum of mm6 for each neighbour equal to A,
//   mm2 = sum of mm6 for each neighbour equal to B,
//   score += mm6 where mm0 is not greater than mm6,
//   score -= mm6 where mm2 is not greater than mm6.
movq mm6, mm0
movq mm4, [eax+ebx+colorA]
movq mm5, [eax+ebx+colorB]
pxor mm7, mm7
pand mm6, ONE
// pair (E, G)
movq mm0, [eax+colorE]
movq mm1, [eax+ebx+colorG]
movq mm2, mm0
movq mm3, mm1
pcmpeqw mm0, mm4
pcmpeqw mm1, mm4
pcmpeqw mm2, mm5
pcmpeqw mm3, mm5
pand mm0, mm6
pand mm1, mm6
pand mm2, mm6
pand mm3, mm6
paddw mm0, mm1
paddw mm2, mm3
pxor mm3, mm3
pcmpgtw mm0, mm6
pcmpgtw mm2, mm6
pcmpeqw mm0, mm3
pcmpeqw mm2, mm3
pand mm0, mm6
pand mm2, mm6
paddw mm7, mm0
psubw mm7, mm2
// pair (F, K)
movq mm0, [eax+colorF]
movq mm1, [eax+ebx+colorK]
movq mm2, mm0
movq mm3, mm1
pcmpeqw mm0, mm4
pcmpeqw mm1, mm4
pcmpeqw mm2, mm5
pcmpeqw mm3, mm5
pand mm0, mm6
pand mm1, mm6
pand mm2, mm6
pand mm3, mm6
paddw mm0, mm1
paddw mm2, mm3
// esi = eax + srcPitch, used to reach the lower two rows in the next two pairs
lea esi, [eax+ebx]
pxor mm3, mm3
pcmpgtw mm0, mm6
pcmpgtw mm2, mm6
pcmpeqw mm0, mm3
pcmpeqw mm2, mm3
pand mm0, mm6
pand mm2, mm6
paddw mm7, mm0
psubw mm7, mm2
// pair (H, N)
movq mm0, [esi+ebx+colorH]
movq mm1, [esi+ebx*2+colorN]
movq mm2, mm0
movq mm3, mm1
pcmpeqw mm0, mm4
pcmpeqw mm1, mm4
pcmpeqw mm2, mm5
pcmpeqw mm3, mm5
pand mm0, mm6
pand mm1, mm6
pand mm2, mm6
pand mm3, mm6
paddw mm0, mm1
paddw mm2, mm3
pxor mm3, mm3
pcmpgtw mm0, mm6
pcmpgtw mm2, mm6
pcmpeqw mm0, mm3
pcmpeqw mm2, mm3
pand mm0, mm6
pand mm2, mm6
paddw mm7, mm0
psubw mm7, mm2
// pair (L, O)
movq mm0, [esi+ebx+colorL]
movq mm1, [esi+ebx*2+colorO]
movq mm2, mm0
movq mm3, mm1
pcmpeqw mm0, mm4
pcmpeqw mm1, mm4
pcmpeqw mm2, mm5
pcmpeqw mm3, mm5
pand mm0, mm6
pand mm1, mm6
pand mm2, mm6
pand mm3, mm6
paddw mm0, mm1
paddw mm2, mm3
pxor mm3, mm3
pcmpgtw mm0, mm6
pcmpgtw mm2, mm6
pcmpeqw mm0, mm3
pcmpeqw mm2, mm3
pand mm0, mm6
pand mm2, mm6
paddw mm7, mm0
psubw mm7, mm2
// score > 0 adds the lane to Mask1, score < 0 adds it to Mask2;
// a score of 0 leaves the lane to the four-way interpolation
movq mm1, mm7
pxor mm0, mm0
pcmpgtw mm7, mm0
pcmpgtw mm0, mm1
por mm7, Mask1
por mm0, Mask2
movq Mask1, mm7
movq Mask2, mm0
nx_2xSaILine_32mmx_skipguess:
//------------------------------
// interpolate A, B, C and D
//------------------------------
// mm0 = sum over A,B,C,D of ((x & qMask) >> 2)
//     + (((sum over A,B,C,D of (x & lMask)) >> 2) & lMask)
movq mm0, [eax+ebx+colorA]
movq mm1, [eax+ebx+colorB]
movq mm4, mm0
movq mm2, [eax+ebx*2+colorC]
movq mm5, mm1
movq mm3, qMask // qcolorMask
movq mm6, mm2
movq mm7, lMask // qlowcolorMask
pand mm0, mm3
pand mm1, mm3
pand mm2, mm3
pand mm3, [eax+ebx*2+colorD]
psrlw mm0, 2
pand mm4, mm7
psrlw mm1, 2
pand mm5, mm7
psrlw mm2, 2
pand mm6, mm7
psrlw mm3, 2
pand mm7, [eax+ebx*2+colorD]
paddw mm0, mm1
paddw mm2, mm3
paddw mm4, mm5
paddw mm6, mm7
paddw mm4, mm6
movq mm7, lMask // qlowcolorMask
paddw mm0, mm2
psrlw mm4, 2
pand mm4, mm7
paddw mm0, mm4 // mm0 contains the interpolated value of A, B, C and D
// assemble the pixels
// mm4 = (A & Mask1) | (B & Mask2) | (interpolated & lanes where both masks are 0)
movq mm1, Mask1
movq mm2, Mask2
movq mm4, [eax+ebx+colorA]
movq mm5, [eax+ebx+colorB]
pand mm4, mm1
pand mm5, mm2
pxor mm7, mm7
por mm1, mm2
por mm4, mm5
pcmpeqw mm1, mm7
pand mm0, mm1
por mm4, mm0 // mm4 contains the diagonal pixels
// interleave words: ACPixel, diagonal, ACPixel, diagonal, ...
movq mm0, ACPixel
mov edi, dstPitch
movq mm1, mm0
punpcklwd mm0, mm4
punpckhwd mm1, mm4
//------------------------------
// Write image RGB1555->RGBx888
//------------------------------
// Second output row (dstPitch bytes after the first). edi holds dstPitch and
// is used in the store addresses, so eax/ebx are parked in mm6/mm7 here
// instead of esi/edi. Same per-pixel expansion as for the first row.
// save
movd mm6, eax
movd mm7, ebx
mov ecx, 0x00F8F8F8 // mask
// movq [edx+edi+0], mm0
movd eax, mm0
movd ebx, mm0
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
psrlq mm0, 16
mov [edx+edi+0], ebx
movd eax, mm0
movd ebx, mm0
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
psrlq mm0, 16
mov [edx+edi+4], ebx
movd eax, mm0
movd ebx, mm0
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
psrlq mm0, 16
mov [edx+edi+8], ebx
movd eax, mm0
movd ebx, mm0
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
mov [edx+edi+12], ebx
// movq [edx+edi+8], mm1
movd eax, mm1
movd ebx, mm1
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
psrlq mm1, 16
mov [edx+edi+16], ebx
movd eax, mm1
movd ebx, mm1
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
psrlq mm1, 16
mov [edx+edi+20], ebx
movd eax, mm1
movd ebx, mm1
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
psrlq mm1, 16
mov [edx+edi+24], ebx
movd eax, mm1
movd ebx, mm1
shl eax, 3
shl ebx, 9
shl ah, 3
and ebx, ecx
mov bx, ax
mov [edx+edi+28], ebx
// restore
movd eax, mm6
movd ebx, mm7
nx_2xSaILine_32mmx_skipprocess:
// advance all cursors; the destination is advanced even when rendering was skipped
add pDlt, 8 // 4 pixels
lea eax, [eax+ 8] // 4 pixels
lea edx, [edx+32] // 8 pixels
sub width, 4 // 4 pixels
jg nx_2xSaILine_32mmx_loop
// leave MMX state so later x87 floating-point code works
emms
}
}