//
// 32bit Normal
//
void CDirectDraw::Render32bpp_Normal(LPBYTE lpRdr, LPBYTE lpDlt, DDSURFACEDESC2& ddsd, BOOL bForceWrite)
{
	LPBYTE pScn = lpRdr;
	LPBYTE pDst = (LPBYTE)ddsd.lpSurface;	
	LPBYTE pDlt = (LPBYTE)lpDlt;
	LPBYTE pPal;
	DWORD width;
	DWORD pitch = ddsd.lPitch;

	for (INT i = 0; i < SCREEN_HEIGHT; i++) {
		if (!(m_LineColormode[i] & 0x80)) {
			pPal = (LPBYTE)m_cnPalette[m_LineColormode[i] & 0x07];
		}
		else {
			pPal = (LPBYTE)m_mnPalette[m_LineColormode[i] & 0x07];
		}

		BOOL bFWrite = FALSE;
		if (bForceWrite || *pDlt != m_LineColormode[i]) {
			bFWrite = TRUE;
			*pDlt = m_LineColormode[i];
		}
		pDlt += 4;

		width = SCREEN_WIDTH;

		if (true) {
			while (width > 0) {
				DWORD edx = *(DWORD*)pScn;
				*(DWORD*)pDlt = edx;

				for (int j = 0; j < 4; j++) {
					BYTE index = (edx >> (8 * j)) & 0xFF;
					((DWORD*)pDst)[j] = ((DWORD*)pPal)[index];
				}

				edx = *(DWORD*)(pScn + 4);
				*(DWORD*)(pDlt + 4) = edx;

				for (int j = 0; j < 4; j++) {
					BYTE index = (edx >> (8 * j)) & 0xFF;
					((DWORD*)pDst)[4 + j] = ((DWORD*)pPal)[index];
				}

				pScn += 8;
				pDlt += 8;
				pDst += 32;
				width -= 8;
			}

			pScn += RENDER_WIDTH - SCREEN_WIDTH;
			pDlt += SCREEN_WIDTH - SCREEN_WIDTH;
			pDst += pitch - SCREEN_WIDTH * 4;
		}
		else {
			while (width > 0) {
				DWORD edx = *(DWORD*)pScn;
				if (edx != *(DWORD*)pDlt) {
					*(DWORD*)pDlt = edx;

					for (int j = 0; j < 4; j++) {
						BYTE index = (edx >> (8 * j)) & 0xFF;
						((DWORD*)pDst)[j] = ((DWORD*)pPal)[index];
					}
				}

				edx = *(DWORD*)(pScn + 4);
				if (edx != *(DWORD*)(pDlt + 4)) {
					*(DWORD*)(pDlt + 4) = edx;

					for (int j = 0; j < 4; j++) {
						BYTE index = (edx >> (8 * j)) & 0xFF;
						((DWORD*)pDst)[4 + j] = ((DWORD*)pPal)[index];
					}
				}

				pScn += 8;
				pDlt += 8;
				pDst += 32;
				width -= 8;
			}

			pScn += RENDER_WIDTH - SCREEN_WIDTH;
			pDlt += SCREEN_WIDTH - SCREEN_WIDTH;
			pDst += pitch - SCREEN_WIDTH * 4;
		}
	}
}

//
// 32bit Scanline
//
void	CDirectDraw::Render32bpp_Scanline(LPBYTE lpRdr, LPBYTE lpDlt, DDSURFACEDESC2& ddsd, BOOL bForceWrite)
{
	LPBYTE	pScn = lpRdr;
	LPBYTE	pDst = (LPBYTE)ddsd.lpSurface;
	LPBYTE	pDlt = (LPBYTE)lpDlt;
	LPBYTE	pPal, pPalScan;
	DWORD	width = SCREEN_WIDTH;
	DWORD	pitch = ddsd.lPitch;

	for (INT i = 0; i < SCREEN_HEIGHT; i++) {
		if (!(m_LineColormode[i] & 0x80)) {
			pPal = (LPBYTE)m_cnPalette[m_LineColormode[i] & 0x07];
			pPalScan = (LPBYTE)m_csPalette[m_LineColormode[i] & 0x07];
		}
		else {
			pPal = (LPBYTE)m_mnPalette[m_LineColormode[i] & 0x07];
			pPalScan = (LPBYTE)m_msPalette[m_LineColormode[i] & 0x07];
		}

		BOOL	bFWrite = FALSE;
		if (bForceWrite || *pDlt != m_LineColormode[i]) {
			bFWrite = TRUE;
			*pDlt = m_LineColormode[i];
		}
		pDlt += 4;

		width = SCREEN_WIDTH;

		if (bFWrite) {
			__asm {
				mov		eax, pScn
				mov		ebx, pDlt
				mov		esi, pPal
				mov		edi, pDst
				_r32bs_loop_fw :
				mov		edx, [eax + 0]
					mov[ebx + 0], edx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 0], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 4], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 8], ecx
					mov[edi + 12], edx

					mov		edx, [eax + 4]
					mov[ebx + 4], edx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 16], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 20], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 24], ecx
					mov[edi + 28], edx

					lea		eax, [eax + 8]
					lea		ebx, [ebx + 8]
					lea		edi, [edi + 32]

					sub		width, 8
					jg		_r32bs_loop_fw
			}

			width = SCREEN_WIDTH;
			pDst += pitch;

			__asm {
				mov		eax, pScn
				mov		ebx, pDlt
				mov		esi, pPalScan
				mov		edi, pDst
				_r32bs_loop2_fw :
				mov		edx, [eax + 0]
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 0], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 4], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 8], ecx
					mov[edi + 12], edx

					mov		edx, [eax + 4]
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 16], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 20], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 24], ecx
					mov[edi + 28], edx

					lea		eax, [eax + 8]
					lea		ebx, [ebx + 8]
					lea		edi, [edi + 32]

					sub		width, 8
					jg		_r32bs_loop2_fw
			}

			pScn += RENDER_WIDTH;
			pDlt += SCREEN_WIDTH;
			pDst += pitch;
		}
		else {
			__asm {
				mov		eax, pScn
				mov		ebx, pDlt
				mov		esi, pPal
				mov		edi, pDst
				_r32bs_loop :
				// check previous!!
				mov		edx, [eax + 0]
					cmp		edx, [ebx + 0]
					je		_r32bs_skip1
					//			mov		[ebx+0], edx		// 1回目は書き込まない
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 0], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 4], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 8], ecx
					mov[edi + 12], edx
					_r32bs_skip1 :
				// check previous!!
				mov		edx, [eax + 4]
					cmp		edx, [ebx + 4]
					je		_r32bs_skip2
					//			mov		[ebx+4], edx		// 1回目は書き込まない
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 16], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 20], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 24], ecx
					mov[edi + 28], edx
					_r32bs_skip2 :
				lea		eax, [eax + 8]
					lea		ebx, [ebx + 8]
					lea		edi, [edi + 32]

					sub		width, 8
					jg		_r32bs_loop
			}

			width = SCREEN_WIDTH;
			pDst += pitch;

			__asm {
				mov		eax, pScn
				mov		ebx, pDlt
				mov		esi, pPalScan
				mov		edi, pDst
				_r32bs_loop2 :
				// check previous!!
				mov		edx, [eax + 0]
					cmp		edx, [ebx + 0]
					je		_r32bs_skip3
					mov[ebx + 0], edx		// 2回目は書き込む
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 0], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 4], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 8], ecx
					mov[edi + 12], edx
					_r32bs_skip3 :
				// check previous!!
				mov		edx, [eax + 4]
					cmp		edx, [ebx + 4]
					je		_r32bs_skip4
					mov[ebx + 4], edx		// 2回目は書き込む
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 16], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 20], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 24], ecx
					mov[edi + 28], edx
					_r32bs_skip4 :
				lea		eax, [eax + 8]
					lea		ebx, [ebx + 8]
					lea		edi, [edi + 32]

					sub		width, 8
					jg		_r32bs_loop2
			}

			pScn += RENDER_WIDTH;
			pDlt += SCREEN_WIDTH;
			pDst += pitch;
		}
	}
}

//
// 32bit Double
//
void	CDirectDraw::Render32bpp_Double(LPBYTE lpRdr, LPBYTE lpDlt, DDSURFACEDESC2& ddsd, BOOL bForceWrite)
{
	LPBYTE	pScn = lpRdr;
	LPBYTE	pDst = (LPBYTE)ddsd.lpSurface;
	LPBYTE	pDlt = (LPBYTE)lpDlt;
	LPBYTE	pPal;
	DWORD	width;
	DWORD	pitch = ddsd.lPitch;

	for (INT i = 0; i < SCREEN_HEIGHT; i++) {
		if (!(m_LineColormode[i] & 0x80)) {
			pPal = (LPBYTE)m_cnPalette[m_LineColormode[i] & 0x07];
		}
		else {
			pPal = (LPBYTE)m_mnPalette[m_LineColormode[i] & 0x07];
		}

		BOOL	bFWrite = FALSE;
		if (bForceWrite || *pDlt != m_LineColormode[i]) {
			bFWrite = TRUE;
			*pDlt = m_LineColormode[i];
		}
		pDlt += 4;

		width = SCREEN_WIDTH;

		if (bFWrite) {
			__asm {
				mov		eax, pScn
				mov		ebx, pDlt
				mov		esi, pPal
				mov		edi, pDst
				_r32bs_d_loop_fw :
				mov		edx, [eax + 0]
					mov[ebx + 0], edx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 0], ecx
					mov[edi + 4], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 8], ecx
					mov[edi + 12], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 16], ecx
					mov[edi + 20], ecx
					mov[edi + 24], edx
					mov[edi + 28], edx

					mov		edx, [eax + 4]
					mov[ebx + 4], edx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 32], ecx
					mov[edi + 36], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 40], ecx
					mov[edi + 44], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 48], ecx
					mov[edi + 52], ecx
					mov[edi + 56], edx
					mov[edi + 60], edx

					lea		eax, [eax + 8]
					lea		ebx, [ebx + 8]
					lea		edi, [edi + 64]

					sub		width, 8
					jg		_r32bs_d_loop_fw
			}

			width = SCREEN_WIDTH;
			pDst += pitch;

			__asm {
				mov		eax, pScn
				mov		ebx, pDlt
				mov		esi, pPal
				mov		edi, pDst
				_r32bs_d_loop2_fw :
				mov		edx, [eax + 0]
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 0], ecx
					mov[edi + 4], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 8], ecx
					mov[edi + 12], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 16], ecx
					mov[edi + 20], ecx
					mov[edi + 24], edx
					mov[edi + 28], edx

					mov		edx, [eax + 4]
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 32], ecx
					mov[edi + 36], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 40], ecx
					mov[edi + 44], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 48], ecx
					mov[edi + 52], ecx
					mov[edi + 56], edx
					mov[edi + 60], edx

					lea		eax, [eax + 8]
					lea		ebx, [ebx + 8]
					lea		edi, [edi + 64]

					sub		width, 8
					jg		_r32bs_d_loop2_fw
			}

			pScn += RENDER_WIDTH;
			pDlt += SCREEN_WIDTH;
			pDst += pitch;
		}
		else {
			__asm {
				mov		eax, pScn
				mov		ebx, pDlt
				mov		esi, pPal
				mov		edi, pDst
				_r32bs_d_loop :
				// check previous!!
				mov		edx, [eax + 0]
					cmp		edx, [ebx + 0]
					je		_r32bs_d_skip1
					//			mov		[ebx+0], edx		// 1回目は書き込まない
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 0], ecx
					mov[edi + 4], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 8], ecx
					mov[edi + 12], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 16], ecx
					mov[edi + 20], ecx
					mov[edi + 24], edx
					mov[edi + 28], edx
					_r32bs_d_skip1 :
				// check previous!!
				mov		edx, [eax + 4]
					cmp		edx, [ebx + 4]
					je		_r32bs_d_skip2
					//			mov		[ebx+4], edx		// 1回目は書き込まない
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 32], ecx
					mov[edi + 36], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 40], ecx
					mov[edi + 44], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 48], ecx
					mov[edi + 52], ecx
					mov[edi + 56], edx
					mov[edi + 60], edx
					_r32bs_d_skip2 :
				lea		eax, [eax + 8]
					lea		ebx, [ebx + 8]
					lea		edi, [edi + 64]

					sub		width, 8
					jg		_r32bs_d_loop
			}

			width = SCREEN_WIDTH;
			pDst += pitch;

			__asm {
				mov		eax, pScn
				mov		ebx, pDlt
				mov		esi, pPal
				mov		edi, pDst
				_r32bs_d_loop2 :
				// check previous!!
				mov		edx, [eax + 0]
					cmp		edx, [ebx + 0]
					je		_r32bs_d_skip3
					mov[ebx + 0], edx		// 2回目は書き込む
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 0], ecx
					mov[edi + 4], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 8], ecx
					mov[edi + 12], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 16], ecx
					mov[edi + 20], ecx
					mov[edi + 24], edx
					mov[edi + 28], edx
					_r32bs_d_skip3 :
				// check previous!!
				mov		edx, [eax + 4]
					cmp		edx, [ebx + 4]
					je		_r32bs_d_skip4
					mov[ebx + 4], edx		// 2回目は書き込む
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 32], ecx
					mov[edi + 36], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 40], ecx
					mov[edi + 44], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 48], ecx
					mov[edi + 52], ecx
					mov[edi + 56], edx
					mov[edi + 60], edx
					_r32bs_d_skip4 :
				lea		eax, [eax + 8]
					lea		ebx, [ebx + 8]
					lea		edi, [edi + 64]

					sub		width, 8
					jg		_r32bs_d_loop2
			}

			pScn += RENDER_WIDTH;
			pDlt += SCREEN_WIDTH;
			pDst += pitch;
		}
	}
}

//
// 32bit Double Scanline
//
void	CDirectDraw::Render32bpp_DoubleScanline(LPBYTE lpRdr, LPBYTE lpDlt, DDSURFACEDESC2& ddsd, BOOL bForceWrite)
{
	LPBYTE	pScn = lpRdr;
	LPBYTE	pDst = (LPBYTE)ddsd.lpSurface;
	LPBYTE	pDlt = (LPBYTE)lpDlt;
	LPBYTE	pPal, pPalScan;
	DWORD	width;
	DWORD	pitch = ddsd.lPitch;

	for (INT i = 0; i < SCREEN_HEIGHT; i++) {
		if (!(m_LineColormode[i] & 0x80)) {
			pPal = (LPBYTE)m_cnPalette[m_LineColormode[i] & 0x07];
			pPalScan = (LPBYTE)m_csPalette[m_LineColormode[i] & 0x07];
		}
		else {
			pPal = (LPBYTE)m_mnPalette[m_LineColormode[i] & 0x07];
			pPalScan = (LPBYTE)m_msPalette[m_LineColormode[i] & 0x07];
		}

		BOOL	bFWrite = FALSE;
		if (bForceWrite || *pDlt != m_LineColormode[i]) {
			bFWrite = TRUE;
			*pDlt = m_LineColormode[i];
		}
		pDlt += 4;

		width = SCREEN_WIDTH;

		if (bFWrite) {
			__asm {
				mov		eax, pScn
				mov		ebx, pDlt
				mov		esi, pPal
				mov		edi, pDst
				_r32bn_d_loop_fw :
				mov		edx, [eax + 0]
					mov[ebx + 0], edx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 0], ecx
					mov[edi + 4], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 8], ecx
					mov[edi + 12], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 16], ecx
					mov[edi + 20], ecx
					mov[edi + 24], edx
					mov[edi + 28], edx

					mov		edx, [eax + 4]
					mov[ebx + 4], edx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 32], ecx
					mov[edi + 36], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 40], ecx
					mov[edi + 44], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 48], ecx
					mov[edi + 52], ecx
					mov[edi + 56], edx
					mov[edi + 60], edx

					lea		eax, [eax + 8]
					lea		ebx, [ebx + 8]
					lea		edi, [edi + 64]

					sub		width, 8
					jg		_r32bn_d_loop_fw
			}

			width = SCREEN_WIDTH;
			pDst += pitch;

			__asm {
				mov		eax, pScn
				mov		ebx, pDlt
				mov		esi, pPalScan
				mov		edi, pDst
				_r32bn_d_loop2_fw :
				mov		edx, [eax + 0]
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 0], ecx
					mov[edi + 4], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 8], ecx
					mov[edi + 12], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 16], ecx
					mov[edi + 20], ecx
					mov[edi + 24], edx
					mov[edi + 28], edx

					mov		edx, [eax + 4]
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 32], ecx
					mov[edi + 36], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 40], ecx
					mov[edi + 44], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 48], ecx
					mov[edi + 52], ecx
					mov[edi + 56], edx
					mov[edi + 60], edx

					lea		eax, [eax + 8]
					lea		ebx, [ebx + 8]
					lea		edi, [edi + 64]

					sub		width, 8
					jg		_r32bn_d_loop2_fw
			}

			pScn += RENDER_WIDTH;
			pDlt += SCREEN_WIDTH;
			pDst += pitch;
		}
		else {
			__asm {
				mov		eax, pScn
				mov		ebx, pDlt
				mov		esi, pPal
				mov		edi, pDst
				_r32bn_d_loop :
				// check previous!!
				mov		edx, [eax + 0]
					cmp		edx, [ebx + 0]
					je		_r32bn_d_skip1
					//			mov		[ebx+0], edx		// 1回目は書き込まない
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 0], ecx
					mov[edi + 4], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 8], ecx
					mov[edi + 12], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 16], ecx
					mov[edi + 20], ecx
					mov[edi + 24], edx
					mov[edi + 28], edx
					_r32bn_d_skip1 :
				// check previous!!
				mov		edx, [eax + 4]
					cmp		edx, [ebx + 4]
					je		_r32bn_d_skip2
					//			mov		[ebx+4], edx		// 1回目は書き込まない
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 32], ecx
					mov[edi + 36], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 40], ecx
					mov[edi + 44], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 48], ecx
					mov[edi + 52], ecx
					mov[edi + 56], edx
					mov[edi + 60], edx
					_r32bn_d_skip2 :
				lea		eax, [eax + 8]
					lea		ebx, [ebx + 8]
					lea		edi, [edi + 64]

					sub		width, 8
					jg		_r32bn_d_loop
			}

			width = SCREEN_WIDTH;
			pDst += pitch;

			__asm {
				mov		eax, pScn
				mov		ebx, pDlt
				mov		esi, pPalScan
				mov		edi, pDst
				_r32bn_d_loop2 :
				// check previous!!
				mov		edx, [eax + 0]
					cmp		edx, [ebx + 0]
					je		_r32bn_d_skip3
					mov[ebx + 0], edx		// 2回目は書き込む
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 0], ecx
					mov[edi + 4], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 8], ecx
					mov[edi + 12], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 16], ecx
					mov[edi + 20], ecx
					mov[edi + 24], edx
					mov[edi + 28], edx
					_r32bn_d_skip3 :
				// check previous!!
				mov		edx, [eax + 4]
					cmp		edx, [ebx + 4]
					je		_r32bn_d_skip4
					mov[ebx + 4], edx		// 2回目は書き込む
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 32], ecx
					mov[edi + 36], ecx
					movzx		ecx, dl
					mov		ecx, [esi + ecx * 4]
					shr		edx, 8
					mov[edi + 40], ecx
					mov[edi + 44], ecx
					movzx		ecx, dl
					shr		edx, 8
					mov		ecx, [esi + ecx * 4]
					mov		edx, [esi + edx * 4]
					mov[edi + 48], ecx
					mov[edi + 52], ecx
					mov[edi + 56], edx
					mov[edi + 60], edx
					_r32bn_d_skip4 :
				lea		eax, [eax + 8]
					lea		ebx, [ebx + 8]
					lea		edi, [edi + 64]

					sub		width, 8
					jg		_r32bn_d_loop2
			}

			pScn += RENDER_WIDTH;
			pDlt += SCREEN_WIDTH;
			pDst += pitch;
		}
	}
}