/*---------------------------------------------------------------------*
 * The following (piece of) code, (part of) the 2xSaI engine,          *
 * copyright (c) 2001 by Derek Liauw Kie Fa.                           *
 * Non-Commercial use of the engine is allowed and is encouraged,      *
 * provided that appropriate credit be given and that this copyright   *
 * notice will not be removed under any circumstance.                  *
 * You may freely modify this code, but I request                      *
 * that any improvements to the engine be submitted to me, so          *
 * that I can implement these improvements in newer versions of        *
 * the engine.                                                         *
 * If you need more information, have any comments or suggestions,     *
 * you can e-mail me. My e-mail: DerekL666@yahoo.com                   *
 *---------------------------------------------------------------------*/
//
// This code was converted into VirtuaNES by Norix.
//
static	void	nx_Super2xSaILine_16bpp_mmx( euI8* pSrc, euI8* pDlt, euI32 srcPitch, euI32 width, euI8* pDst, euI32 dstPitch, euI32 bForceWrite )
{
	__asm {
		mov		eax, pSrc
		mov		ebx, srcPitch
		mov		edx, pDst
		sub		eax, ebx
nx_Super2xSaILine_16mmx_loop:
		mov		ecx, bForceWrite
		test		ecx, ecx
		jz		nx_Super2xSaILine_16mmx_normal

		mov		esi, pDlt
		movq		mm6, [eax+colorB0]
#if	0
		movq		[esi+2+colorB0], mm6
#else
		movq		[esi+colorB0], mm6
#endif

		jmp		nx_Super2xSaILine_16mmx_forcewrite
nx_Super2xSaILine_16mmx_normal:
		// Check delta
		mov		ecx, pDlt

		// load source img
		lea		esi, [eax+ebx]
		movq		mm0, [eax+colorB0]
		movq		mm1, [eax+colorB3]
		movq		mm2, [eax+ebx+color4]
		movq		mm3, [eax+ebx+colorS2]
		movq		mm4, [eax+ebx*2+color1]
		movq		mm5, [eax+ebx*2+colorS1]
		movq		mm6, [esi+ebx*2+colorA0]
		movq		mm7, [esi+ebx*2+colorA3]

		// compare to delta
		lea		esi, [ecx+ebx]
#if	0
		pcmpeqw		mm0, [ecx+2+colorB0]
		pcmpeqw		mm1, [ecx+2+colorB3]
		pcmpeqw		mm2, [ecx+ebx+2+color4]
		pcmpeqw		mm3, [ecx+ebx+2+colorS2]
		pcmpeqw		mm4, [ecx+ebx*2+2+color1]
		pcmpeqw		mm5, [ecx+ebx*2+2+colorS1]
		pcmpeqw		mm6, [esi+ebx*2+2+colorA0]
		pcmpeqw		mm7, [esi+ebx*2+2+colorA3]
#else
		pcmpeqw		mm0, [ecx+colorB0]
		pcmpeqw		mm1, [ecx+colorB3]
		pcmpeqw		mm2, [ecx+ebx+color4]
		pcmpeqw		mm3, [ecx+ebx+colorS2]
		pcmpeqw		mm4, [ecx+ebx*2+color1]
		pcmpeqw		mm5, [ecx+ebx*2+colorS1]
		pcmpeqw		mm6, [esi+ebx*2+colorA0]
		pcmpeqw		mm7, [esi+ebx*2+colorA3]
#endif

		// compose results
		pand		mm0, mm1
		pand		mm2, mm3
		pand		mm4, mm5
		pand		mm6, mm7
		pand		mm0, mm2
		pand		mm4, mm6
		pxor		mm7, mm7
		pand		mm0, mm4
		movq		mm6, [eax+colorB0]
		pcmpeqw		mm7, mm0

#if	0
		movq		[ecx+2+colorB0], mm6
#else
		movq		[ecx+colorB0], mm6
#endif
		packsswb	mm7, mm7
		movd		ecx, mm7
		test		ecx, ecx
		jz		nx_Super2xSaILine_16mmx_skipprocess
		// End Delta
nx_Super2xSaILine_16mmx_forcewrite:
		//------------------------------
		// Interpolate pixels
		// (c0&c1)+(((c0^c1)&colorMask)>>1)
		//------------------------------
		movq		mm6, cMask

		movq		mm0, [eax+ebx+color5]
		movq		mm1, [eax+ebx+color6]
		movq		mm2, mm0
		movq		mm3, mm1
		movq		mm4, mm0
		movq		mm5, mm1

		pxor		mm3, mm2
		pand		mm0, mm1
		pand		mm3, mm6
		psrlw		mm3, 1
		paddw		mm0, mm3
		movq		I56Pixel, mm0
		movq		mm7, mm0

		//------------------------------
		movq		mm0, mm7
		movq		mm2, mm7
		movq		mm1, mm7
		movq		mm3, mm7
		pxor		mm2, mm4
		pxor		mm3, mm5
		pand		mm0, mm4
		pand		mm2, mm6
		pand		mm1, mm5
		pand		mm3, mm6
		psrlw		mm2, 1
		psrlw		mm3, 1
		paddw		mm0, mm2
		movq		I5556Pixel, mm0
		paddw		mm1, mm3
		movq		I5666Pixel, mm1

		//------------------------------
		//------------------------------
		movq		mm0, [eax+ebx*2+color2]
		movq		mm1, [eax+ebx*2+color3]
		movq		mm2, mm0
		movq		mm3, mm1
		movq		mm4, mm0
		movq		mm5, mm1

		pxor		mm3, mm2
		pand		mm0, mm1
		pand		mm3, mm6
		psrlw		mm3, 1
		paddw		mm0, mm3
		movq		I23Pixel, mm0
		movq		mm7, mm0

		//------------------------------
		movq		mm0, mm7
		movq		mm2, mm7
		movq		mm1, mm7
		movq		mm3, mm7
		pxor		mm2, mm4
		pxor		mm3, mm5
		pand		mm0, mm4
		pand		mm2, mm6
		pand		mm1, mm5
		pand		mm3, mm6
		psrlw		mm2, 1
		psrlw		mm3, 1
		paddw		mm0, mm2
		movq		I2223Pixel, mm0
		paddw		mm1, mm3
		movq		I2333Pixel, mm1

		//------------------------------
		// Decide which "branch" to take
		//------------------------------
		movq		mm0, [eax+ebx+color5]
		movq		mm1, [eax+ebx+color6]
		movq		mm6, mm0
		movq		mm7, mm1
		pcmpeqw		mm0, [eax+ebx*2+color3]
		pcmpeqw		mm1, [eax+ebx*2+color2]
		pcmpeqw		mm6, mm7

		movq		mm2, mm0
		movq		mm3, mm0

		pand		mm0, mm1	// colorA == colorD && colorB == colorC
		pxor		mm7, mm7

		pcmpeqw		mm2, mm7
		pand		mm6, mm0
		pand		mm2, mm1	// colorA != colorD && colorB == colorC

		pcmpeqw		mm1, mm7

		pand		mm1, mm3	// colorA == colorD && colorB != colorC
		pxor		mm0, mm6
		por		mm1, mm6
		movq		mm7, mm0
		movq		Mask26, mm2
		packsswb	mm7, mm7
		movq		Mask35, mm1

		movd		ecx, mm7
		test		ecx, ecx
		jz		nx_Super2xSaILine_16mmx_skipguess

		//------------------------------
		movq		mm6, mm0
		movq		mm4, [eax+ebx+color5]		// colorA
		movq		mm5, [eax+ebx+color6]		// colorB
		pxor		mm7, mm7
		pand		mm6, ONE

		movq		mm0, [eax+colorB1]		// colorE
		movq		mm1, [eax+ebx+color4]		// colorG
		movq		mm2, mm0
		movq		mm3, mm1
		pcmpeqw		mm0, mm4
		pcmpeqw		mm1, mm4
		pcmpeqw		mm2, mm5
		pcmpeqw		mm3, mm5
		pand		mm0, mm6
		pand		mm1, mm6
		pand		mm2, mm6
		pand		mm3, mm6
		paddw		mm0, mm1
		paddw		mm2, mm3

		pxor		mm3, mm3
		pcmpgtw		mm0, mm6
		pcmpgtw		mm2, mm6
		pcmpeqw		mm0, mm3
		pcmpeqw		mm2, mm3
		pand		mm0, mm6
		pand		mm2, mm6
		paddw		mm7, mm0
		psubw		mm7, mm2

		movq		mm0, [eax+colorB2]		// colorF
		movq		mm1, [eax+ebx+colorS2]		// colorK
		movq		mm2, mm0
		movq		mm3, mm1
		pcmpeqw		mm0, mm4
		pcmpeqw		mm1, mm4
		pcmpeqw		mm2, mm5
		pcmpeqw		mm3, mm5
		pand		mm0, mm6
		pand		mm1, mm6
		pand		mm2, mm6
		pand		mm3, mm6
		paddw		mm0, mm1
		paddw		mm2, mm3

		lea		edi, [eax+ebx]
		pxor		mm3, mm3
		pcmpgtw		mm0, mm6
		pcmpgtw		mm2, mm6
		pcmpeqw		mm0, mm3
		pcmpeqw		mm2, mm3
		pand		mm0, mm6
		pand		mm2, mm6
		paddw		mm7, mm0
		psubw		mm7, mm2

		movq		mm0, [edi+ebx+color1]		// colorH
		movq		mm1, [edi+ebx*2+colorA1]	// colorN
		movq		mm2, mm0
		movq		mm3, mm1
		pcmpeqw		mm0, mm4
		pcmpeqw		mm1, mm4
		pcmpeqw		mm2, mm5
		pcmpeqw		mm3, mm5
		pand		mm0, mm6
		pand		mm1, mm6
		pand		mm2, mm6
		pand		mm3, mm6
		paddw		mm0, mm1
		paddw		mm2, mm3

		pxor		mm3, mm3
		pcmpgtw		mm0, mm6
		pcmpgtw		mm2, mm6
		pcmpeqw		mm0, mm3
		pcmpeqw		mm2, mm3
		pand		mm0, mm6
		pand		mm2, mm6
		paddw		mm7, mm0
		psubw		mm7, mm2

		movq		mm0, [edi+ebx+colorS1]		// colorL
		movq		mm1, [edi+ebx*2+colorA2]	// colorO
		movq		mm2, mm0
		movq		mm3, mm1
		pcmpeqw		mm0, mm4
		pcmpeqw		mm1, mm4
		pcmpeqw		mm2, mm5
		pcmpeqw		mm3, mm5
		pand		mm0, mm6
		pand		mm1, mm6
		pand		mm2, mm6
		pand		mm3, mm6
		paddw		mm0, mm1
		paddw		mm2, mm3

		pxor		mm3, mm3
		pcmpgtw		mm0, mm6
		pcmpgtw		mm2, mm6
		pcmpeqw		mm0, mm3
		pcmpeqw		mm2, mm3
		pand		mm0, mm6
		pand		mm2, mm6
		paddw		mm7, mm0
		psubw		mm7, mm2

		movq		mm1, mm7
		pxor		mm0, mm0
		pcmpgtw		mm7, mm0
		pcmpgtw		mm0, mm1

		por		mm7, Mask35
		por		mm0, Mask26
		movq		Mask35, mm7
		movq		Mask26, mm0

nx_Super2xSaILine_16mmx_skipguess:

		// Start the ASSEMBLY !!! eh... compose all the results together to form the final image...

		movq		mm0, [eax+ebx+color5]
		movq		mm1, [eax+ebx*2+color2]
		movq		mm2, mm0

		pand		mm0, mm1
		pxor		mm2, mm1
		pand		mm2, cMask
		psrlw		mm2, 1
		paddw		mm0, mm2

		//------------------------------
		movq		mm7, Mask26
		movq		mm6, [eax+colorB2]
		movq		mm5, [eax+ebx*2+color2]
		movq		mm4, [eax+ebx*2+color1]
		pcmpeqw		mm4, mm5
		pcmpeqw		mm6, mm5
		pxor		mm5, mm5
		pand		mm7, mm4
		pcmpeqw		mm6, mm5
		pand		mm7, mm6

		movq		mm6, [eax+ebx*2+color3]
		movq		mm5, [eax+ebx*2+color2]
		movq		mm4, [eax+ebx*2+color1]
		movq		mm2, [eax+ebx+color5]
		movq		mm1, [eax+ebx+color4]
		movq		mm3, [eax+colorB0]

		pcmpeqw		mm2, mm4
		pcmpeqw		mm6, mm5
		pcmpeqw		mm1, mm5
		pcmpeqw		mm3, mm5
		pxor		mm5, mm5
		pcmpeqw		mm2, mm5
		pcmpeqw		mm3, mm5
		pand		mm6, mm1
		pand		mm2, mm3
		pand		mm6, mm2
		por		mm7, mm6

		movq		mm6, mm7
		pcmpeqw		mm6, mm5
		pand		mm7, mm0

		movq		mm1, [eax+ebx+color5]
		pand		mm6, mm1
		por		mm7, mm6
		movq		final1a, mm7		// finished  1a

		//------------------------------
		lea		esi, [eax+ebx]
		movq		mm7, Mask35
		movq		mm6, [esi+ebx*2+colorA2]

		movq		mm5, [eax+ebx+color5]
		movq		mm4, [eax+ebx+color4]
		pcmpeqw		mm4, mm5
		pcmpeqw		mm6, mm5
		pxor		mm5, mm5
		pand		mm7, mm4
		pcmpeqw		mm6, mm5
		pand		mm7, mm6

		movq		mm6, [eax+ebx+color6]
		movq		mm5, [eax+ebx+color5]
		movq		mm4, [eax+ebx+color4]
		movq		mm2, [eax+ebx*2+color2]
		movq		mm1, [eax+ebx*2+color1]
		movq		mm3, [esi+ebx*2+colorA0]

		pcmpeqw		mm2, mm4
		pcmpeqw		mm6, mm5
		pcmpeqw		mm1, mm5
		pcmpeqw		mm3, mm5
		pxor		mm5, mm5
		pcmpeqw		mm2, mm5
		pcmpeqw		mm3, mm5
		pand		mm6, mm1
		pand		mm2, mm3
		pand		mm6, mm2
		por		mm7, mm6

		movq		mm6, mm7
		pcmpeqw		mm6, mm5
		pand		mm7, mm0

		movq		mm1, [eax+ebx*2+color2]
		pand		mm6, mm1
		por		mm7, mm6
		movq		final2a, mm7		// finished  2a

		pxor		mm7, mm7
		movq		mm0, [esi+ebx*2+colorA0]
		movq		mm1, [esi+ebx*2+colorA1]
		movq		mm2, [esi+ebx*2+colorA2]
		movq		mm3, [esi+ebx*2+colorA3]
		movq		mm4, [eax+ebx*2+color2]
		movq		mm5, [eax+ebx*2+color3]
		movq		mm6, [eax+ebx+color6]

		pcmpeqw		mm6, mm5
		pcmpeqw		mm1, mm5
		pcmpeqw		mm4, mm2
		pcmpeqw		mm0, mm5
		pcmpeqw		mm4, mm7
		pcmpeqw		mm0, mm7
		pand		mm0, mm4
		pand		mm6, mm1
		pand		mm0, mm6

		movq		mm1, [esi+ebx*2+colorA1]
		movq		mm4, [eax+ebx*2+color2]
		movq		mm5, [eax+ebx+color5]
		movq		mm6, [eax+ebx*2+color3]

		pcmpeqw		mm5, mm4
		pcmpeqw		mm2, mm4
		pcmpeqw		mm1, mm6
		pcmpeqw		mm3, mm4
		pcmpeqw		mm1, mm7
		pcmpeqw		mm3, mm7
		pand		mm2, mm5
		pand		mm1, mm3
		pand		mm1, mm2

		movq		mm7, mm0
		por		mm7, mm1

		movq		mm4, Mask35
		movq		mm3, Mask26

		movq		mm6, mm4
		pand		mm6, mm7
		pxor		mm4, mm6

		movq		mm6, mm3
		pand		mm6, mm7
		pxor		mm3, mm6

		movq		mm2, mm0
		movq		mm7, I2333Pixel
		movq		mm6, I2223Pixel
		movq		mm5, I23Pixel

		por		mm2, mm4
		pand		mm4, [eax+ebx*2+color3]
		por		mm2, mm3
		pand		mm3, [eax+ebx*2+color2]
		por		mm2, mm1
		pand		mm0, mm7
		pand		mm1, mm6
		pxor		mm7, mm7
		pcmpeqw		mm2, mm7
		por		mm0, mm1
		por		mm3, mm4
		pand		mm2, mm5
		por		mm0, mm3
		por		mm0, mm2
		movq		final2b, mm0

		//------------------------------
		pxor		mm7, mm7
		movq		mm0, [eax+colorB0]
		movq		mm1, [eax+colorB1]
		movq		mm2, [eax+colorB2]
		movq		mm3, [eax+colorB3]
		movq		mm4, [eax+ebx+color5]
		movq		mm5, [eax+ebx+color6]
		movq		mm6, [eax+ebx*2+color3]

		pcmpeqw		mm6, mm5
		pcmpeqw		mm1, mm5
		pcmpeqw		mm4, mm2
		pcmpeqw		mm0, mm5
		pcmpeqw		mm4, mm7
		pcmpeqw		mm0, mm7
		pand		mm0, mm4
		pand		mm6, mm1
		pand		mm0, mm6

		movq		mm1, [eax+colorB1]
		movq		mm4, [eax+ebx+color5]
		movq		mm5, [eax+ebx*2+color2]
		movq		mm6, [eax+ebx+color6]

		pcmpeqw		mm5, mm4
		pcmpeqw		mm2, mm4
		pcmpeqw		mm1, mm6
		pcmpeqw		mm3, mm4
		pcmpeqw		mm1, mm7
		pcmpeqw		mm3, mm7
		pand		mm2, mm5
		pand		mm1, mm3
		pand		mm1, mm2


		movq		mm7, mm0
		por		mm7, mm1

		movq		mm4, Mask35
		movq		mm3, Mask26

		movq		mm6, mm4
		pand		mm6, mm7
		pxor		mm4, mm6

		movq		mm6, mm3
		pand		mm6, mm7
		pxor		mm3, mm6

		movq		mm2, mm0
		movq		mm7, I5666Pixel
		movq		mm6, I5556Pixel
		movq		mm5, I56Pixel

		por		mm2, mm4
		pand		mm4, [eax+ebx+color5]
		por		mm2, mm3
		pand		mm3, [eax+ebx+color6]
		por		mm2, mm1
		pand		mm0, mm7
		pand		mm1, mm6
		pxor		mm7, mm7
		pcmpeqw		mm2, mm7
		por		mm0, mm1
		por		mm3, mm4
		pand		mm2, mm5
		por		mm0, mm3
		por		mm0, mm2
		movq		final1b, mm0

		//------------------------------
		// Write final image
		//------------------------------
		movq		mm0, final1a
		movq		mm4, final2a
		movq		mm2, final1b
		movq		mm6, final2b

		movq		mm1, mm0
		movq		mm5, mm4

		punpcklwd	mm0, mm2		// B1A1B0A0=B3B2B1B0(dst):A3A2A1A0(src)
		punpckhwd	mm1, mm2		// B3A3B2A2=B3B2B1B0(dst):A3A2A1A0(src)
		punpcklwd	mm4, mm6		// B1A1B0A0=B3B2B1B0(dst):A3A2A1A0(src)
		punpckhwd	mm5, mm6		// B3A3B2A2=B3B2B1B0(dst):A3A2A1A0(src)

		mov		edi, dstPitch
		movq		[edx+0], mm0		// 1st line
		movq		[edx+8], mm1
		movq		[edi+edx+0], mm4	// 2nd line
		movq		[edi+edx+8], mm5

nx_Super2xSaILine_16mmx_skipprocess:
		add		pDlt, 8		// 4 pixels
		add		eax, 8		// 4 pixels
		add		edx, 16		// 8 pixels
		sub		width, 4	// 4 pixels
		jg		nx_Super2xSaILine_16mmx_loop

		emms
	}
}


static	void	nx_Super2xSaILine_32bpp_mmx( euI8* pSrc, euI8* pDlt, euI32 srcPitch, euI32 width, euI8* pDst, euI32 dstPitch, euI32 bForceWrite )
{
	__asm {
		mov		eax, pSrc
		mov		ebx, srcPitch
		mov		edx, pDst
		sub		eax, ebx
nx_Super2xSaILine_32mmx_loop:
		mov		ecx, bForceWrite
		test		ecx, ecx
		jz		nx_Super2xSaILine_32mmx_normal

		mov		esi, pDlt
		movq		mm6, [eax+colorB0]
#if	0
		movq		[esi+2+colorB0], mm6
#else
		movq		[esi+colorB0], mm6
#endif

		jmp		nx_Super2xSaILine_32mmx_forcewrite
nx_Super2xSaILine_32mmx_normal:
		// Check delta
		mov		ecx, pDlt

		// load source img
		lea		esi, [eax+ebx]
		movq		mm0, [eax+colorB0]
		movq		mm1, [eax+colorB3]
		movq		mm2, [eax+ebx+color4]
		movq		mm3, [eax+ebx+colorS2]
		movq		mm4, [eax+ebx*2+color1]
		movq		mm5, [eax+ebx*2+colorS1]
		movq		mm6, [esi+ebx*2+colorA0]
		movq		mm7, [esi+ebx*2+colorA3]

		// compare to delta
		lea		esi, [ecx+ebx]
#if	0
		pcmpeqw		mm0, [ecx+2+colorB0]
		pcmpeqw		mm1, [ecx+2+colorB3]
		pcmpeqw		mm2, [ecx+ebx+2+color4]
		pcmpeqw		mm3, [ecx+ebx+2+colorS2]
		pcmpeqw		mm4, [ecx+ebx*2+2+color1]
		pcmpeqw		mm5, [ecx+ebx*2+2+colorS1]
		pcmpeqw		mm6, [esi+ebx*2+2+colorA0]
		pcmpeqw		mm7, [esi+ebx*2+2+colorA3]
#else
		pcmpeqw		mm0, [ecx+colorB0]
		pcmpeqw		mm1, [ecx+colorB3]
		pcmpeqw		mm2, [ecx+ebx+color4]
		pcmpeqw		mm3, [ecx+ebx+colorS2]
		pcmpeqw		mm4, [ecx+ebx*2+color1]
		pcmpeqw		mm5, [ecx+ebx*2+colorS1]
		pcmpeqw		mm6, [esi+ebx*2+colorA0]
		pcmpeqw		mm7, [esi+ebx*2+colorA3]
#endif

		// compose results
		pand		mm0, mm1
		pand		mm2, mm3
		pand		mm4, mm5
		pand		mm6, mm7
		pand		mm0, mm2
		pand		mm4, mm6
		pxor		mm7, mm7
		pand		mm0, mm4
		movq		mm6, [eax+colorB0]
		pcmpeqw		mm7, mm0
#if	0
		movq		[ecx+2+colorB0], mm6
#else
		movq		[ecx+colorB0], mm6
#endif
		packsswb	mm7, mm7
		movd		ecx, mm7
		test		ecx, ecx
		jz		nx_Super2xSaILine_32mmx_skipprocess
		// End Delta
nx_Super2xSaILine_32mmx_forcewrite:
		//------------------------------
		// Interpolate pixels
		// (c0&c1)+(((c0^c1)&colorMask)>>1)
		//------------------------------
		movq		mm6, cMask

		movq		mm0, [eax+ebx+color5]
		movq		mm1, [eax+ebx+color6]
		movq		mm2, mm0
		movq		mm3, mm1
		movq		mm4, mm0
		movq		mm5, mm1

		pxor		mm3, mm2
		pand		mm0, mm1
		pand		mm3, mm6
		psrlw		mm3, 1
		paddw		mm0, mm3
		movq		I56Pixel, mm0
		movq		mm7, mm0

		//------------------------------
		movq		mm0, mm7
		movq		mm2, mm7
		movq		mm1, mm7
		movq		mm3, mm7
		pxor		mm2, mm4
		pxor		mm3, mm5
		pand		mm0, mm4
		pand		mm2, mm6
		pand		mm1, mm5
		pand		mm3, mm6
		psrlw		mm2, 1
		psrlw		mm3, 1
		paddw		mm0, mm2
		movq		I5556Pixel, mm0
		paddw		mm1, mm3
		movq		I5666Pixel, mm1

		//------------------------------
		//------------------------------
		movq		mm0, [eax+ebx*2+color2]
		movq		mm1, [eax+ebx*2+color3]
		movq		mm2, mm0
		movq		mm3, mm1
		movq		mm4, mm0
		movq		mm5, mm1

		pxor		mm3, mm2
		pand		mm0, mm1
		pand		mm3, mm6
		psrlw		mm3, 1
		paddw		mm0, mm3
		movq		I23Pixel, mm0
		movq		mm7, mm0

		//------------------------------
		movq		mm0, mm7
		movq		mm2, mm7
		movq		mm1, mm7
		movq		mm3, mm7
		pxor		mm2, mm4
		pxor		mm3, mm5
		pand		mm0, mm4
		pand		mm2, mm6
		pand		mm1, mm5
		pand		mm3, mm6
		psrlw		mm2, 1
		psrlw		mm3, 1
		paddw		mm0, mm2
		movq		I2223Pixel, mm0
		paddw		mm1, mm3
		movq		I2333Pixel, mm1

		//------------------------------
		// Decide which "branch" to take
		//------------------------------
		movq		mm0, [eax+ebx+color5]
		movq		mm1, [eax+ebx+color6]
		movq		mm6, mm0
		movq		mm7, mm1
		pcmpeqw		mm0, [eax+ebx*2+color3]
		pcmpeqw		mm1, [eax+ebx*2+color2]
		pcmpeqw		mm6, mm7

		movq		mm2, mm0
		movq		mm3, mm0

		pand		mm0, mm1	// colorA == colorD && colorB == colorC
		pxor		mm7, mm7

		pcmpeqw		mm2, mm7
		pand		mm6, mm0
		pand		mm2, mm1	// colorA != colorD && colorB == colorC

		pcmpeqw		mm1, mm7

		pand		mm1, mm3	// colorA == colorD && colorB != colorC
		pxor		mm0, mm6
		por		mm1, mm6
		movq		mm7, mm0
		movq		Mask26, mm2
		packsswb	mm7, mm7
		movq		Mask35, mm1

		movd		ecx, mm7
		test		ecx, ecx
		jz		nx_Super2xSaILine_32mmx_skipguess

		//------------------------------
		movq		mm6, mm0
		movq		mm4, [eax+ebx+color5]		// colorA
		movq		mm5, [eax+ebx+color6]		// colorB
		pxor		mm7, mm7
		pand		mm6, ONE

		movq		mm0, [eax+colorB1]		// colorE
		movq		mm1, [eax+ebx+color4]		// colorG
		movq		mm2, mm0
		movq		mm3, mm1
		pcmpeqw		mm0, mm4
		pcmpeqw		mm1, mm4
		pcmpeqw		mm2, mm5
		pcmpeqw		mm3, mm5
		pand		mm0, mm6
		pand		mm1, mm6
		pand		mm2, mm6
		pand		mm3, mm6
		paddw		mm0, mm1
		paddw		mm2, mm3

		pxor		mm3, mm3
		pcmpgtw		mm0, mm6
		pcmpgtw		mm2, mm6
		pcmpeqw		mm0, mm3
		pcmpeqw		mm2, mm3
		pand		mm0, mm6
		pand		mm2, mm6
		paddw		mm7, mm0
		psubw		mm7, mm2

		movq		mm0, [eax+colorB2]		// colorF
		movq		mm1, [eax+ebx+colorS2]		// colorK
		movq		mm2, mm0
		movq		mm3, mm1
		pcmpeqw		mm0, mm4
		pcmpeqw		mm1, mm4
		pcmpeqw		mm2, mm5
		pcmpeqw		mm3, mm5
		pand		mm0, mm6
		pand		mm1, mm6
		pand		mm2, mm6
		pand		mm3, mm6
		paddw		mm0, mm1
		paddw		mm2, mm3

		lea		edi, [eax+ebx]
		pxor		mm3, mm3
		pcmpgtw		mm0, mm6
		pcmpgtw		mm2, mm6
		pcmpeqw		mm0, mm3
		pcmpeqw		mm2, mm3
		pand		mm0, mm6
		pand		mm2, mm6
		paddw		mm7, mm0
		psubw		mm7, mm2

		movq		mm0, [edi+ebx+color1]		// colorH
		movq		mm1, [edi+ebx*2+colorA1]	// colorN
		movq		mm2, mm0
		movq		mm3, mm1
		pcmpeqw		mm0, mm4
		pcmpeqw		mm1, mm4
		pcmpeqw		mm2, mm5
		pcmpeqw		mm3, mm5
		pand		mm0, mm6
		pand		mm1, mm6
		pand		mm2, mm6
		pand		mm3, mm6
		paddw		mm0, mm1
		paddw		mm2, mm3

		pxor		mm3, mm3
		pcmpgtw		mm0, mm6
		pcmpgtw		mm2, mm6
		pcmpeqw		mm0, mm3
		pcmpeqw		mm2, mm3
		pand		mm0, mm6
		pand		mm2, mm6
		paddw		mm7, mm0
		psubw		mm7, mm2

		movq		mm0, [edi+ebx+colorS1]		// colorL
		movq		mm1, [edi+ebx*2+colorA2]	// colorO
		movq		mm2, mm0
		movq		mm3, mm1
		pcmpeqw		mm0, mm4
		pcmpeqw		mm1, mm4
		pcmpeqw		mm2, mm5
		pcmpeqw		mm3, mm5
		pand		mm0, mm6
		pand		mm1, mm6
		pand		mm2, mm6
		pand		mm3, mm6
		paddw		mm0, mm1
		paddw		mm2, mm3

		pxor		mm3, mm3
		pcmpgtw		mm0, mm6
		pcmpgtw		mm2, mm6
		pcmpeqw		mm0, mm3
		pcmpeqw		mm2, mm3
		pand		mm0, mm6
		pand		mm2, mm6
		paddw		mm7, mm0
		psubw		mm7, mm2

		movq		mm1, mm7
		pxor		mm0, mm0
		pcmpgtw		mm7, mm0
		pcmpgtw		mm0, mm1

		por		mm7, Mask35
		por		mm0, Mask26
		movq		Mask35, mm7
		movq		Mask26, mm0

nx_Super2xSaILine_32mmx_skipguess:

		// Start the ASSEMBLY !!! eh... compose all the results together to form the final image...

		movq		mm0, [eax+ebx+color5]
		movq		mm1, [eax+ebx*2+color2]
		movq		mm2, mm0

		pand		mm0, mm1
		pxor		mm2, mm1
		pand		mm2, cMask
		psrlw		mm2, 1
		paddw		mm0, mm2

		//------------------------------
		movq		mm7, Mask26
		movq		mm6, [eax+colorB2]
		movq		mm5, [eax+ebx*2+color2]
		movq		mm4, [eax+ebx*2+color1]
		pcmpeqw		mm4, mm5
		pcmpeqw		mm6, mm5
		pxor		mm5, mm5
		pand		mm7, mm4
		pcmpeqw		mm6, mm5
		pand		mm7, mm6

		movq		mm6, [eax+ebx*2+color3]
		movq		mm5, [eax+ebx*2+color2]
		movq		mm4, [eax+ebx*2+color1]
		movq		mm2, [eax+ebx+color5]
		movq		mm1, [eax+ebx+color4]
		movq		mm3, [eax+colorB0]

		pcmpeqw		mm2, mm4
		pcmpeqw		mm6, mm5
		pcmpeqw		mm1, mm5
		pcmpeqw		mm3, mm5
		pxor		mm5, mm5
		pcmpeqw		mm2, mm5
		pcmpeqw		mm3, mm5
		pand		mm6, mm1
		pand		mm2, mm3
		pand		mm6, mm2
		por		mm7, mm6

		movq		mm6, mm7
		pcmpeqw		mm6, mm5
		pand		mm7, mm0

		movq		mm1, [eax+ebx+color5]
		pand		mm6, mm1
		por		mm7, mm6
		movq		final1a, mm7		// finished  1a

		//------------------------------
		lea		esi, [eax+ebx]
		movq		mm7, Mask35
		movq		mm6, [esi+ebx*2+colorA2]

		movq		mm5, [eax+ebx+color5]
		movq		mm4, [eax+ebx+color4]
		pcmpeqw		mm4, mm5
		pcmpeqw		mm6, mm5
		pxor		mm5, mm5
		pand		mm7, mm4
		pcmpeqw		mm6, mm5
		pand		mm7, mm6

		movq		mm6, [eax+ebx+color6]
		movq		mm5, [eax+ebx+color5]
		movq		mm4, [eax+ebx+color4]
		movq		mm2, [eax+ebx*2+color2]
		movq		mm1, [eax+ebx*2+color1]
		movq		mm3, [esi+ebx*2+colorA0]

		pcmpeqw		mm2, mm4
		pcmpeqw		mm6, mm5
		pcmpeqw		mm1, mm5
		pcmpeqw		mm3, mm5
		pxor		mm5, mm5
		pcmpeqw		mm2, mm5
		pcmpeqw		mm3, mm5
		pand		mm6, mm1
		pand		mm2, mm3
		pand		mm6, mm2
		por		mm7, mm6

		movq		mm6, mm7
		pcmpeqw		mm6, mm5
		pand		mm7, mm0

		movq		mm1, [eax+ebx*2+color2]
		pand		mm6, mm1
		por		mm7, mm6
		movq		final2a, mm7		// finished  2a

		pxor		mm7, mm7
		movq		mm0, [esi+ebx*2+colorA0]
		movq		mm1, [esi+ebx*2+colorA1]
		movq		mm2, [esi+ebx*2+colorA2]
		movq		mm3, [esi+ebx*2+colorA3]
		movq		mm4, [eax+ebx*2+color2]
		movq		mm5, [eax+ebx*2+color3]
		movq		mm6, [eax+ebx+color6]

		pcmpeqw		mm6, mm5
		pcmpeqw		mm1, mm5
		pcmpeqw		mm4, mm2
		pcmpeqw		mm0, mm5
		pcmpeqw		mm4, mm7
		pcmpeqw		mm0, mm7
		pand		mm0, mm4
		pand		mm6, mm1
		pand		mm0, mm6

		movq		mm1, [esi+ebx*2+colorA1]
		movq		mm4, [eax+ebx*2+color2]
		movq		mm5, [eax+ebx+color5]
		movq		mm6, [eax+ebx*2+color3]

		pcmpeqw		mm5, mm4
		pcmpeqw		mm2, mm4
		pcmpeqw		mm1, mm6
		pcmpeqw		mm3, mm4
		pcmpeqw		mm1, mm7
		pcmpeqw		mm3, mm7
		pand		mm2, mm5
		pand		mm1, mm3
		pand		mm1, mm2

		movq		mm7, mm0
		por		mm7, mm1

		movq		mm4, Mask35
		movq		mm3, Mask26

		movq		mm6, mm4
		pand		mm6, mm7
		pxor		mm4, mm6

		movq		mm6, mm3
		pand		mm6, mm7
		pxor		mm3, mm6

		movq		mm2, mm0
		movq		mm7, I2333Pixel
		movq		mm6, I2223Pixel
		movq		mm5, I23Pixel

		por		mm2, mm4
		pand		mm4, [eax+ebx*2+color3]
		por		mm2, mm3
		pand		mm3, [eax+ebx*2+color2]
		por		mm2, mm1
		pand		mm0, mm7
		pand		mm1, mm6
		pxor		mm7, mm7
		pcmpeqw		mm2, mm7
		por		mm0, mm1
		por		mm3, mm4
		pand		mm2, mm5
		por		mm0, mm3
		por		mm0, mm2
		movq		final2b, mm0

		//------------------------------
		pxor		mm7, mm7
		movq		mm0, [eax+colorB0]
		movq		mm1, [eax+colorB1]
		movq		mm2, [eax+colorB2]
		movq		mm3, [eax+colorB3]
		movq		mm4, [eax+ebx+color5]
		movq		mm5, [eax+ebx+color6]
		movq		mm6, [eax+ebx*2+color3]

		pcmpeqw		mm6, mm5
		pcmpeqw		mm1, mm5
		pcmpeqw		mm4, mm2
		pcmpeqw		mm0, mm5
		pcmpeqw		mm4, mm7
		pcmpeqw		mm0, mm7
		pand		mm0, mm4
		pand		mm6, mm1
		pand		mm0, mm6

		movq		mm1, [eax+colorB1]
		movq		mm4, [eax+ebx+color5]
		movq		mm5, [eax+ebx*2+color2]
		movq		mm6, [eax+ebx+color6]

		pcmpeqw		mm5, mm4
		pcmpeqw		mm2, mm4
		pcmpeqw		mm1, mm6
		pcmpeqw		mm3, mm4
		pcmpeqw		mm1, mm7
		pcmpeqw		mm3, mm7
		pand		mm2, mm5
		pand		mm1, mm3
		pand		mm1, mm2


		movq		mm7, mm0
		por		mm7, mm1

		movq		mm4, Mask35
		movq		mm3, Mask26

		movq		mm6, mm4
		pand		mm6, mm7
		pxor		mm4, mm6

		movq		mm6, mm3
		pand		mm6, mm7
		pxor		mm3, mm6

		movq		mm2, mm0
		movq		mm7, I5666Pixel
		movq		mm6, I5556Pixel
		movq		mm5, I56Pixel

		por		mm2, mm4
		pand		mm4, [eax+ebx+color5]
		por		mm2, mm3
		pand		mm3, [eax+ebx+color6]
		por		mm2, mm1
		pand		mm0, mm7
		pand		mm1, mm6
		pxor		mm7, mm7
		pcmpeqw		mm2, mm7
		por		mm0, mm1
		por		mm3, mm4
		pand		mm2, mm5
		por		mm0, mm3
		por		mm0, mm2
		movq		final1b, mm0

		//------------------------------
		// Write final image
		//------------------------------
		movq		mm0, final1a
		movq		mm4, final2a
		movq		mm2, final1b
		movq		mm6, final2b

		movq		mm1, mm0
		movq		mm5, mm4

		punpcklwd	mm0, mm2		// B1A1B0A0=B3B2B1B0(dst):A3A2A1A0(src)
		punpckhwd	mm1, mm2		// B3A3B2A2=B3B2B1B0(dst):A3A2A1A0(src)
		punpcklwd	mm4, mm6		// B1A1B0A0=B3B2B1B0(dst):A3A2A1A0(src)
		punpckhwd	mm5, mm6		// B3A3B2A2=B3B2B1B0(dst):A3A2A1A0(src)

		// Write image  RGB1555->RGBx888
		// save
		mov		esi, eax
		movd		mm7, ebx
		mov		edi, dstPitch
		mov		ecx, 0x00F8F8F8		// mask

		// 1st line
		movd		eax, mm0
		movd		ebx, mm0
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		psrlq		mm0, 16
		mov		[edx+0], ebx
		movd		eax, mm0
		movd		ebx, mm0
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		psrlq		mm0, 16
		mov		[edx+4], ebx
		movd		eax, mm0
		movd		ebx, mm0
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		psrlq		mm0, 16
		mov		[edx+8], ebx
		movd		eax, mm0
		movd		ebx, mm0
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		mov		[edx+12], ebx

		movd		eax, mm1
		movd		ebx, mm1
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		psrlq		mm1, 16
		mov		[edx+16], ebx
		movd		eax, mm1
		movd		ebx, mm1
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		psrlq		mm1, 16
		mov		[edx+20], ebx
		movd		eax, mm1
		movd		ebx, mm1
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		psrlq		mm1, 16
		mov		[edx+24], ebx
		movd		eax, mm1
		movd		ebx, mm1
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		mov		[edx+28], ebx

		// 2nd line
		movd		eax, mm4
		movd		ebx, mm4
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		psrlq		mm4, 16
		mov		[edx+edi+0], ebx
		movd		eax, mm4
		movd		ebx, mm4
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		psrlq		mm4, 16
		mov		[edx+edi+4], ebx
		movd		eax, mm4
		movd		ebx, mm4
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		psrlq		mm4, 16
		mov		[edx+edi+8], ebx
		movd		eax, mm4
		movd		ebx, mm4
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		mov		[edx+edi+12], ebx

		movd		eax, mm5
		movd		ebx, mm5
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		psrlq		mm5, 16
		mov		[edx+edi+16], ebx
		movd		eax, mm5
		movd		ebx, mm5
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		psrlq		mm5, 16
		mov		[edx+edi+20], ebx
		movd		eax, mm5
		movd		ebx, mm5
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		psrlq		mm5, 16
		mov		[edx+edi+24], ebx
		movd		eax, mm5
		movd		ebx, mm5
		shl		eax, 3
		shl		ebx, 9
		shl		ah, 3
		and		ebx, ecx
		mov		bx, ax
		mov		[edx+edi+28], ebx

		// restore
		mov		eax, esi
		movd		ebx, mm7

nx_Super2xSaILine_32mmx_skipprocess:
		add		pDlt, 8		// 4 pixels
		add		eax, 8		// 4 pixels
		add		edx, 32		// 8 pixels
		sub		width, 4	// 4 pixels
		jg		nx_Super2xSaILine_32mmx_loop

		emms
	}
}