/*
 * This file is part of the Advance project.
 *
 * Copyright (C) 1999-2002 Andrea Mazzoleni
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 * This file contains a C and MMX implementation of the Scale2x effect.
 *
 * You can find a high-level description of the effect at:
 *
 * http://scale2x.sourceforge.net/scale2x.html
 *
 * Alternatively at the previous license terms, you are allowed to use this
 * code in your program with these conditions:
 * - the program is not used in commercial activities.
 * - the whole source code of the program is released with the binary.
 * - derivative works of the program are allowed.
 */

// Note
//
// Most of the original code has been removed from this file.
// The original code is available from the website of the Advance project.
// MS Visual C++ and the Processor Pack are required to compile this code.
//
static	void	internal_scale2x_16_mmx_single(euI16* dst, const euI16* src0, const euI16* src1, const euI16* src2, unsigned count)
{
	/* always do the first and last run */
	count -= 2*4;

	__asm {
		mov		eax, src0
		mov		ebx, src1
		mov		ecx, src2
		mov		edx, dst
		mov		esi, count

		/* first run */
		/* set the current, current_pre, current_next registers */
		pxor		mm0, mm0		/* use a fake black out of screen */
		movq		mm7, qword ptr [ebx+0]
		movq		mm1, qword ptr [ebx+8]
		psrlq		mm0, 48
		psllq		mm1, 48
		movq		mm2, mm7
		movq		mm3, mm7
		psllq		mm2, 16
		psrlq		mm3, 16
		por		mm0, mm2
		por		mm1, mm3

		/* current_upper */
		movq		mm6, qword ptr [eax]

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		movq		mm2, mm0
		movq		mm4, mm1
		movq		mm3, mm0
		movq		mm5, mm1
		pcmpeqw		mm2, mm6
		pcmpeqw		mm4, mm6
		pcmpeqw		mm3, qword ptr [ecx]
		pcmpeqw		mm5, qword ptr [ecx]
		pandn		mm3, mm2
		pandn		mm5, mm4
		movq		mm2, mm0
		movq		mm4, mm1
		pcmpeqw		mm2, mm1
		pcmpeqw		mm4, mm0
		pandn		mm2, mm3
		pandn		mm4, mm5
		movq		mm3, mm2
		movq		mm5, mm4
		pand		mm2, mm6
		pand		mm4, mm6
		pandn		mm3, mm7
		pandn		mm5, mm7
		por		mm2, mm3
		por		mm4, mm5

		/* set *dst0 */
		movq		mm3, mm2
		punpcklwd	mm2, mm4
		punpckhwd	mm3, mm4
		movq		qword ptr [edx+0], mm2
		movq		qword ptr [edx+8], mm3

		/* next */
		add		eax, 8
		add		ebx, 8
		add		ecx, 8
		add		edx, 16

		/* central runs */
		shr		esi, 2
		jz		label1
		align 4
label0:

		/* set the current, current_pre, current_next registers */
		movq		mm0, qword ptr [ebx-8]
		movq		mm7, qword ptr [ebx+0]
		movq		mm1, qword ptr [ebx+8]
		psrlq		mm0, 48
		psllq		mm1, 48
		movq		mm2, mm7
		movq		mm3, mm7
		psllq		mm2, 16
		psrlq		mm3, 16
		por		mm0, mm2
		por		mm1, mm3

		/* current_upper */
		movq		mm6, qword ptr [eax]

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		movq		mm2, mm0
		movq		mm4, mm1
		movq		mm3, mm0
		movq		mm5, mm1
		pcmpeqw		mm2, mm6
		pcmpeqw		mm4, mm6
		pcmpeqw		mm3, qword ptr [ecx]
		pcmpeqw		mm5, qword ptr [ecx]
		pandn		mm3, mm2
		pandn		mm5, mm4
		movq		mm2, mm0
		movq		mm4, mm1
		pcmpeqw		mm2, mm1
		pcmpeqw		mm4, mm0
		pandn		mm2, mm3
		pandn		mm4, mm5
		movq		mm3, mm2
		movq		mm5, mm4
		pand		mm2, mm6
		pand		mm4, mm6
		pandn		mm3, mm7
		pandn		mm5, mm7
		por		mm2, mm3
		por		mm4, mm5

		/* set *dst0 */
		movq		mm3, mm2
		punpcklwd	mm2, mm4
		punpckhwd	mm3, mm4
		movq		qword ptr [edx+0], mm2
		movq		qword ptr [edx+8], mm3

		/* next */
		add		eax, 8
		add		ebx, 8
		add		ecx, 8
		add		edx, 16

		dec		esi
		jnz		label0
label1:
		/* final run */
		/* set the current, current_pre, current_next registers */
		movq		mm0, qword ptr [ebx-8]
		movq		mm7, qword ptr [ebx+0]
		pxor		mm1, mm1		/* use a fake black out of screen */
		psrlq		mm0, 48
		psllq		mm1, 48
		movq		mm2, mm7
		movq		mm3, mm7
		psllq		mm2, 16
		psrlq		mm3, 16
		por		mm0, mm2
		por		mm1, mm3

		/* current_upper */
		movq		mm6, qword ptr [eax]

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		movq		mm2, mm0
		movq		mm4, mm1
		movq		mm3, mm0
		movq		mm5, mm1
		pcmpeqw		mm2, mm6
		pcmpeqw		mm4, mm6
		pcmpeqw		mm3, qword ptr [ecx]
		pcmpeqw		mm5, qword ptr [ecx]
		pandn		mm3, mm2
		pandn		mm5, mm4
		movq		mm2, mm0
		movq		mm4, mm1
		pcmpeqw		mm2, mm1
		pcmpeqw		mm4, mm0
		pandn		mm2, mm3
		pandn		mm4, mm5
		movq		mm3, mm2
		movq		mm5, mm4
		pand		mm2, mm6
		pand		mm4, mm6
		pandn		mm3, mm7
		pandn		mm5, mm7
		por		mm2, mm3
		por		mm4, mm5

		/* set *dst0 */
		movq		mm3, mm2
		punpcklwd	mm2, mm4
		punpckhwd	mm3, mm4
		movq		qword ptr [edx+0], mm2
		movq		qword ptr [edx+8], mm3

		mov		src0, eax
		mov		src1, ebx
		mov		src2, ecx
		mov		dst, edx
		mov		count, esi

		emms
	}
}

static	void	internal_scale2x_32_mmx_single(euI32* dst, const euI32* src0, const euI32* src1, const euI32* src2, unsigned count)
{
	/* always do the first and last run */
	count -= 2*2;

	__asm {
		mov		eax, src0
		mov		ebx, src1
		mov		ecx, src2
		mov		edx, dst
		mov		esi, count

		/* first run */
		/* set the current, current_pre, current_next registers */
		pxor		mm0, mm0
		movq		mm7, qword ptr [ebx+0]
		movq		mm1, qword ptr [ebx+8]
		psrlq		mm0, 32
		psllq		mm1, 32
		movq		mm2, mm7
		movq		mm3, mm7
		psllq		mm2, 32
		psrlq		mm3, 32
		por		mm0, mm2
		por		mm1, mm3

		/* current_upper */
		movq		mm6, qword ptr [eax]

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		movq		mm2, mm0
		movq		mm4, mm1
		movq		mm3, mm0
		movq		mm5, mm1
		pcmpeqd		mm2, mm6
		pcmpeqd		mm4, mm6
		pcmpeqd		mm3, qword ptr [ecx]
		pcmpeqd		mm5, qword ptr [ecx]
		pandn		mm3, mm2
		pandn		mm5, mm4
		movq		mm2, mm0
		movq		mm4, mm1
		pcmpeqd		mm2, mm1
		pcmpeqd		mm4, mm0
		pandn		mm2, mm3
		pandn		mm4, mm5
		movq		mm3, mm2
		movq		mm5, mm4
		pand		mm2, mm6
		pand		mm4, mm6
		pandn		mm3, mm7
		pandn		mm5, mm7
		por		mm2, mm3
		por		mm4, mm5

		/* set *dst0 */
		movq		mm3, mm2
		punpckldq	mm2, mm4
		punpckhdq	mm3, mm4
		movq		qword ptr [edx+0], mm2
		movq		qword ptr [edx+8], mm3

		/* next */
		add		eax, 8
		add		ebx, 8
		add		ecx, 8
		add		edx, 16

		/* central runs */
		shr		esi, 1
		jz		label1
label0:

		/* set the current, current_pre, current_next registers */
		movq		mm0, qword ptr [ebx-8]
		movq		mm7, qword ptr [ebx+0]
		movq		mm1, qword ptr [ebx+8]
		psrlq		mm0, 32
		psllq		mm1, 32
		movq		mm2, mm7
		movq		mm3, mm7
		psllq		mm2, 32
		psrlq		mm3, 32
		por		mm0, mm2
		por		mm1, mm3

		/* current_upper */
		movq		mm6, qword ptr[eax]

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		movq		mm2, mm0
		movq		mm4, mm1
		movq		mm3, mm0
		movq		mm5, mm1
		pcmpeqd		mm2, mm6
		pcmpeqd		mm4, mm6
		pcmpeqd		mm3, qword ptr[ecx]
		pcmpeqd		mm5, qword ptr[ecx]
		pandn		mm3, mm2
		pandn		mm5, mm4
		movq		mm2, mm0
		movq		mm4, mm1
		pcmpeqd		mm2, mm1
		pcmpeqd		mm4, mm0
		pandn		mm2, mm3
		pandn		mm4, mm5
		movq		mm3, mm2
		movq		mm5, mm4
		pand		mm2, mm6
		pand		mm4, mm6
		pandn		mm3, mm7
		pandn		mm5, mm7
		por		mm2, mm3
		por		mm4, mm5

		/* set *dst0 */
		movq		mm3, mm2
		punpckldq	mm2, mm4
		punpckhdq	mm3, mm4
		movq		qword ptr [edx+0], mm2
		movq		qword ptr [edx+8], mm3

		/* next */
		add		eax, 8
		add		ebx, 8
		add		ecx, 8
		add		edx, 16

		dec		esi
		jnz		label0
label1:
		
		/* final run */
		/* set the current, current_pre, current_next registers */
		movq		mm0, qword ptr [ebx-8]
		movq		mm7, qword ptr [ebx+0]
		pxor		mm1, mm1
		psrlq		mm0, 32
		psllq		mm1, 32
		movq		mm2, mm7
		movq		mm3, mm7
		psllq		mm2, 32
		psrlq		mm3, 32
		por		mm0, mm2
		por		mm1, mm3

		/* current_upper */
		movq		mm6, qword ptr [eax]

		/* compute the upper-left pixel for dst0 on %%mm2 */
		/* compute the upper-right pixel for dst0 on %%mm4 */
		movq		mm2, mm0
		movq		mm4, mm1
		movq		mm3, mm0
		movq		mm5, mm1
		pcmpeqd		mm2, mm6
		pcmpeqd		mm4, mm6
		pcmpeqd		mm3, qword ptr [ecx]
		pcmpeqd		mm5, qword ptr [ecx]
		pandn		mm3, mm2
		pandn		mm5, mm4
		movq		mm2, mm0
		movq		mm4, mm1
		pcmpeqd		mm2, mm1
		pcmpeqd		mm4, mm0
		pandn		mm2, mm3
		pandn		mm4, mm5
		movq		mm3, mm2
		movq		mm5, mm4
		pand		mm2, mm6
		pand		mm4, mm6
		pandn		mm3, mm7
		pandn		mm5, mm7
		por		mm2, mm3
		por		mm4, mm5

		/* set *dst0 */
		movq		mm3, mm2
		punpckldq	mm2, mm4
		punpckhdq	mm3, mm4
		movq		qword ptr [edx+0], mm2
		movq		qword ptr [edx+8], mm3

		mov		src0, eax
		mov		src1, ebx
		mov		src2, ecx
		mov		dst, edx
		mov		count, esi

		emms
	}
}

static	void	internal_scale2x_16_mmx(euI16* dst0, euI16* dst1, const euI16* src0, const euI16* src1, const euI16* src2, unsigned count)
{
	internal_scale2x_16_mmx_single(dst0, src0, src1, src2, count);
	internal_scale2x_16_mmx_single(dst1, src2, src1, src0, count);
}

static	void	internal_scale2x_32_mmx(euI32* dst0, euI32* dst1, const euI32* src0, const euI32* src1, const euI32* src2, unsigned count)
{
	internal_scale2x_32_mmx_single(dst0, src0, src1, src2, count);
	internal_scale2x_32_mmx_single(dst1, src2, src1, src0, count);
}