/* *************************************************************************
**    INTEL Corporation Proprietary Information
**
**    This listing is supplied under the terms of a license
**    agreement with INTEL Corporation and may not be copied
**    nor disclosed except in accordance with the terms of
**    that agreement.
**
**    Copyright (c) 1995, 1996 Intel Corporation.
**    All Rights Reserved.
**
** *************************************************************************
*/

// $Author: AGUPTA2 $
// $Date: 08 Mar 1996 16:46:34 $
// $Archive: S:\h26x\src\dec\dxblkcpy.cpv $
// $Header: S:\h26x\src\dec\dxblkcpy.cpv 1.4 08 Mar 1996 16:46:34 AGUPTA2 $
// $Log: S:\h26x\src\dec\dxblkcpy.cpv $
//
// Rev 1.4 08 Mar 1996 16:46:34 AGUPTA2
// Rewritten to reduce code size by avoiding 32-bit displacements. Added
// pragma code_seg. May need to optimize for misaligned case.
//
//
// Rev 1.3 31 Jan 1996 13:15:14 RMCKENZX
// Rewrote file to avoid bank conflicts. Fully unrolled the loop.
// Module now really will execute in 52 cycles if the cache is hot.
//
// Rev 1.2 22 Dec 1995 13:51:06 KMILLS
// added new copyright notice
//
// Rev 1.1 25 Sep 1995 09:03:22 CZHU
// Added comments on cycle counts
//
// Rev 1.0 11 Sep 1995 16:52:26 CZHU
// Initial revision.
//
//------------------------------------------------------------------------------
//------------------------------------------------------------------------------
//
// Note:
// - BlockCopy reads and writes in DWORDS.
// - The __fastcall convention is used.
// - Code re-written to minimize code size.
// - We assume the output frame to NOT be in cache.
// - The constants DXPITCH and U32 are defined internally (precomp.h is also
//   included).
// // Registers used: // eax accumulator // ebx accumulator // ecx destination address // edx source address // ebp PITCH // // Pentium cycle count (input cache hot, output cache cold): // 33 + 8*(cache miss time) input aligned // 81 + 8*(cache miss time) input mis-aligned // //------------------------------------------------------------------------------ #include "precomp.h" #define U32 unsigned long // Already defined in precomp.h #define DXPITCH 384 #pragma code_seg("IACODE2") /* * Notes: * The parameter uDstBlock is in ecx and uSrcBlock is in edx. */ __declspec(naked) void __fastcall BlockCopy (U32 uDstBlock, U32 uSrcBlock) { __asm { push edi push ebx push ebp mov ebp, DXPITCH // row 0 mov eax, [edx] mov ebx, [edx+4] add edx, ebp mov edi, [ecx] // heat output cache mov [ecx], eax mov [ecx+4], ebx // row 1 add ecx, ebp mov eax, [edx] mov ebx, [edx+4] add edx, ebp mov edi, [ecx] // heat output cache mov [ecx], eax mov [ecx+4], ebx add ecx, ebp // row 2 mov eax, [edx] mov ebx, [edx+4] add edx, ebp mov edi, [ecx] // heat output cache mov [ecx], eax mov [ecx+4], ebx // row 3 add ecx, ebp mov eax, [edx] mov ebx, [edx+4] add edx, ebp mov edi, [ecx] // heat output cache mov [ecx], eax mov [ecx+4], ebx add ecx, ebp // row 4 mov eax, [edx] mov ebx, [edx+4] add edx, ebp mov edi, [ecx] // heat output cache mov [ecx], eax mov [ecx+4], ebx // row 5 add ecx, ebp mov eax, [edx] mov ebx, [edx+4] add edx, ebp mov edi, [ecx] // heat output cache mov [ecx], eax mov [ecx+4], ebx add ecx, ebp // row 6 mov eax, [edx] mov ebx, [edx+4] add edx, ebp mov edi, [ecx] // heat output cache mov [ecx], eax mov [ecx+4], ebx // row 7 add ecx, ebp pop ebp mov eax, [edx] mov ebx, [edx+4] mov edi, [ecx] // heat output cache mov [ecx], eax mov [ecx+4], ebx pop ebx pop edi ret } // end of asm } #pragma code_seg()