;---------------------------Module-Header------------------------------; ; Module Name: str.asm ; ; Contains the x86 'Asm' versions of some inner-loop routines for the ; partially hardware accelerated StretchBlt. ; ; Copyright (c) 1994-1995 Microsoft Corporation ;-----------------------------------------------------------------------; .386 .model small,c assume cs:FLAT,ds:FLAT,es:FLAT,ss:FLAT assume fs:nothing,gs:nothing .xlist include stdcall.inc ;calling convention cmacros include i386\strucs.inc include i386\hw.inc .list .data ; ; stack based params and local variables ; STACK_STRUC struc ; Feel free to add any local variables here: sp_TempXFrac dd ? sp_YCarry dd ? sp_LeftCase dd ? sp_RightCase dd ? sp_pjSrcScan dd ? sp_SrcIntStep dd ? sp_DstStride dd ? sp_XCntHW dd ? sp_XCount dd ? sp_xyOFfset dd ? sp_yDst dd ? sp_pdev dd ? ; Don't add any fields below here without modifying PROC_MEM_SIZE! sp_ebp dd ? sp_esi dd ? sp_edi dd ? sp_ebx dd ? sp_RetAddr dd ? sp_pSTR_BLT dd ? ; If adding parameters, adjust 'ret' value! STACK_STRUC ends PROC_MEM_SIZE equ 6 * 4 ; ; Make sure this STR_BLT matches that declared in driver.h! ; STR_BLT struc str_ppdev dd ? str_pjSrcScan dd ? str_lDeltaSrc dd ? str_XSrcStart dd ? str_pjDstScan dd ? str_lDeltaDst dd ? str_XDstStart dd ? str_XDstEnd dd ? str_YDstStart dd ? str_YDstCount dd ? str_ulXDstToSrcIntCeil dd ? str_ulXDstToSrcFracCeil dd ? str_ulYDstToSrcIntCeil dd ? str_ulYDstToSrcFracCeil dd ? str_ulXFracAccumulator dd ? str_ulYFracAccumulator dd ? STR_BLT ends .code ;---------------------------Public-Routine------------------------------; ; VOID vDirectStretch8(pStrBlt) ; ; NOTE: This routine doesn't handle cases where the blt stretch starts ; and ends in the same destination dword! vDirectStretchNarrow ; is expected to have been called for that case. ; ; Stretch blt 8 -> 8 ;-----------------------------------------------------------------------; public vDirectStretch8@4 vDirectStretch8@4 proc near ; ; use ebp as general register, use esp for parameter and local access ; save ebp,ebx,esi,edi ; push ebx push edi push esi push ebp sub esp,(size STACK_STRUC) - PROC_MEM_SIZE ; make room for local variables mov ebp,[esp].sp_pSTR_BLT ; load pSTR_BLT into ebp ; ; load up some stack-based parameters to be used by our scan ; duplicator when doing vertical stretches ; mov eax,[ebp].str_ppdev mov ecx,[ebp].str_YDstStart ; get start y coordinate mov [esp].sp_pdev,eax ; save ppdev pointer mov ebx,[eax].pdev_xyOffset mov [esp].sp_xyOffset,ebx ; save xyOffset mov [esp].sp_yDst,ecx ; save current y coordinate ; ; calc starting addressing parameters ; mov esi,[ebp].str_pjSrcScan ; load src DIB pointer add esi,[ebp].str_XSrcStart ; add starting Src Pixel mov edi,[ebp].str_pjDstScan ; load dst DIB pointer add edi,[ebp].str_XDstStart ; add strarting Dst Pixel mov [esp].sp_pjSrcScan,esi ; save scan line start pointer mov eax,[ebp].str_ulYDstToSrcIntCeil ; number of src scan lines to step mul [ebp].str_lDeltaSrc ; calc scan line int lines to step mov [esp].sp_SrcIntStep,eax ; save int portion of Y src step mov edx,4 ; calc left bytes = (4 - LeftCase) & 0x03 sub edx,edi and edx,3 ; left edge bytes mov [esp].sp_LeftCase,edx ; save left edge case pixels (4-LeftCase)&0x03 mov eax,[ebp].str_pjDstScan ; make copy mov ecx,[ebp].str_XDstEnd ; load x end add eax,ecx ; ending dst addr and eax,3 ; calc right edge case mov [esp].sp_RightCase,eax ; save right edge case sub ecx,[ebp].str_XDstStart ; calc x count dec ecx mov [esp].sp_XCntHW,ecx ; x width for accelerator inc ecx mov ebx,[ebp].str_lDeltaDst ; dst scan line stride sub ebx,ecx ; distance from end of one line to start of next mov [esp].sp_DstStride,ebx ; save dst scan line stride sub ecx,eax ; sub right edge from XCount sub ecx,edx ; sub left edge from XCount shr ecx,2 ; convert from byte to DWORD count mov [esp].sp_XCount,ecx ; save DWORD count mov ebx,[ebp].str_ulXDstToSrcFracCeil ; get x frac mov [esp].sp_TempXFrac,ebx ; save x frac to a esp based location NextScan: ; ; Wait until the accelerator is done with current blt ; mov dx,3ceh ;index register mov al,31h ;status reg out dx,al mov dx,3cfh ;data register @@: in al,dx test al,1 jnz short @b SingleLoop: ; ; esi and edi are assumed to be correctly loaded ; mov eax,[ebp].str_ulXDstToSrcIntCeil ; get src integer step for step in dst mov ebx,[ebp].str_ulXDstToSrcFracCeil ; get src frac step for step in dst mov edx,[ebp].str_ulXFracAccumulator ; put it in edx as tmp mov ebp,edi ; get dst pointer to ebp ; ; Can't directly access pSTR_BLT variables through ebp ; mov edi,edx ; get accumulator where we want it mov ecx,[esp].sp_LeftCase ; eax = integer step in source ; ebx = fractional step in source ; ecx = left edge case ; edx = free for pixel data ; esi = pointer to source pixel ; edi = fractional accumulator ; ebp = pointer to dest pixel ; ; first do the left side to align dwords ; test ecx,ecx jz DwordAligned @@: mov dl,[esi] ; fetch pixel mov [ebp],dl ; write it out add edi,ebx ; step fraction adc esi,eax ; add in integer and possible carry inc ebp ; step 1 in dest dec ecx ; dec left count jne @B ; repeat until done DwordAligned: mov ecx,[esp].sp_XCount ; get run length @@: mov dl,[esi] ; get a source pixel edx = ???0 add edi,ebx ; step fraction adc esi,eax ; add integer and carry add edi,ebx ; step fraction mov dh,[esi] ; get source pixel edx = ??10 adc esi,eax ; add integer and carry shl edx,16 ; edx = 10?? add edi,ebx ; step fraction mov dl,[esi] ; get a source pixel edx = 10?2 adc esi,eax ; add integer and carry add edi,ebx ; step fraction mov dh,[esi] ; get source pixel edx = 0132 adc esi,eax ; add integer and carry ror edx,16 ; edx = 3210 mov [ebp],edx ; write everything to dest add ebp,4 ; increment dest pointer by 1 dword dec ecx ; decrement count jnz @b ; do more pixels ; ; now do the right side trailing bytes ; mov ecx,[esp].sp_RightCase test ecx,ecx jz EndScanLine @@: mov dl,[esi] ; fetch pixel mov [ebp],dl ; write it out add edi,ebx ; step fraction adc esi,eax ; add in integer and possible carry inc ebp ; step 1 in dest dec ecx ; dec right count jnz @b ; repeat until done EndScanLine: mov edi,ebp ; get dst pointer back mov ebp,[esp].sp_pSTR_BLT ; load pSTR_BLT into ebp EndSkipScan: mov esi,[esp].sp_pjSrcScan ; load src scan start addr mov ebx,esi ; save a copy mov eax,[ebp].str_ulYFracAccumulator ; get .32 part of Y pointer add eax,[ebp].str_ulYDstToSrcFracCeil ; add in fractional step jnc @f add esi,[ebp].str_lDeltaSrc ; step one extra in src @@: mov [ebp].str_ulYFracAccumulator,eax ; save Y accumulator add esi,[esp].sp_SrcIntStep ; step int part mov [esp].sp_pjSrcScan,esi ; save starting scan addr add edi,[esp].sp_DstStride ; step to next scan in dst dec [ebp].str_YDstCount ; decrement scan count jz Done ; no more scans inc [esp].sp_yDst ; one scan further down in dst cmp esi,ebx ; is src scan same as before? jne NextScan ; if so, fall through to dupe scan ;-------------------------------------------------------------------- ; The source scan is the same one used for the previous destination ; scan, so we can simply use the hardware to copy the previous ; destination scan. ; ; Since on the S3 we can set up a 'rolling blt' to copy one scan ; line to several scans in a single command, we will count up how ; many times this scan should be duplicated. If your hardware ; cannot do a rolling blt, simply issue a new blt command for ; every time the scan should be duplicated. ; ; eax = ulYFracAccumulator ; ebx = original pjSrcScan ; esi = current pjSrcScan ; ebp = pSTR_BLT ; mov ecx,-1 ; number of times scan is to be ; duplicated, less one AnotherDuplicate: inc ecx ; one scan further down dec [ebp].str_YDstCount ; decrement scan count jz OutputDuplicates ; no more scans add eax,[ebp].str_ulYDstToSrcFracCeil ; add in fractional step jnc @f add esi,[ebp].str_lDeltaSrc ; step one extra in src @@: add esi,[esp].sp_SrcIntStep ; step int part add edi,[ebp].str_lDeltaDst ; step entire dest scan cmp esi,ebx ; is src scan same as before? je AnotherDuplicate OutputDuplicates: mov [esp].sp_pjSrcScan,esi ; save starting scan address mov [ebp].str_ulYFracAccumulator,eax ; save Y accumulator ; ; Now output the command to do the 'rolling blt' ; ;; mov edx,[esp].sp_pjBase ; ; Wait until the accelerator is done with current blt ; mov dx,3ceh ;index register mov al,31h ;status reg out dx,al mov dx,3cfh ;data register @@: in al,dx test al,1 jnz short @b mov ebx,[esp].sp_XCntHW mov eax,[esp].sp_yDst ; eax = yDst -- Destination scan line (source scan line is yDst - 1) ; ebx = XCntHW -- Number of bytes across (width) - 1 ; ecx = cy -- Number of times scan is to be duplicated - 1 ; ebp = pSTR_BLT -- Stretch blt info DuplicateViaMmIo: ; ; Do the copy: ; if 0 CP_XCNT(ppdev, pjBase, (WidthXBytes - 1)); CP_YCNT(ppdev, pjBase, (cyDuplicate - 1)); CP_SRC_ADDR(ppdev, pjBase, (xyOffset + ((yDst - 1) * lDelta) + xDstBytes)); SET_DEST_ADDR(ppdev, ((yDst * lDelta) + xDstBytes)); START_ACL(ppdev); endif .errnz RECT_HEIGHT ;; mov [edx+OFFSET_wXCnt],bx ;; mov [edx+OFFSET_wYCnt],cx mov dx,3ceh ;index register ; ; set XCNT=bx and YCNT=cx ; push eax mov al,21h ;BLT_WIDTH_HIGH mov ah,bh ;x count high byte out dx,ax mov al,20h ;BLT_WIDTH_LOW mov ah,bl ;x count low byte out dx,ax mov al,23h ;BLT_HEIGHT_HIGH mov ah,ch ;y count high byte out dx,ax mov al,22h ;BLT_HEIGHT_LOW mov ah,cl ;y count low byte out dx,ax pop eax ; ; Calculate src address ; mov ebx,eax ; ebx <- yDst dec ebx imul ebx,[ebp].str_lDeltaDst add ebx,[esp].sp_xyOffset add ebx,[ebp].str_xDstStart ;; mov [edx+OFFSET_ulSrcAddr],ebx push eax mov al,2ch ;SRC_ADDR_LOW mov ah,bl ;src addr low byte out dx,ax mov al,2dh ;SRC_ADDR_MID mov ah,bh ;src addr mid byte out dx,ax shr ebx,16 mov al,2eh ;SRC_ADDR_HIGH mov ah,bl ;src addr high byte out dx,ax pop eax ; ; Calculate dst address ; inc eax ; account for 'ecx' being ; one less than scan count mov ebx,eax ; ebx <- yDst dec ebx imul ebx,[ebp].str_lDeltaDst add ebx,[esp].sp_xyOffset add ebx,[ebp].str_xDstStart ;; mov [edx+OFFSET_ulDstAddr],ebx push eax mov al,28h ;DST_ADDR_LOW mov ah,bl ;dst addr low byte out dx,ax mov al,29h ;DST_ADDR_MID mov ah,bh ;dst addr mid byte out dx,ax shr ebx,16 mov al,2ah ;DST_ADDR_HIGH mov ah,bl ;dst addr high byte out dx,ax ; ; Start blt ; mov al,31h ;BLT_START_STATUS_REG mov ah,2 ;BLT_START out dx,ax pop eax DoneSetDestAddr: add eax,ecx ; add num scans just done mov [esp].sp_yDst,eax DoneDuplicate: cmp [ebp].str_YDstCount,0 ; we might be all done jne NextScan Done: add esp,(size STACK_STRUC) - PROC_MEM_SIZE pop ebp pop esi pop edi pop ebx ret 4 vDirectStretch8@4 endp ;---------------------------Public-Routine------------------------------; ; VOID vDirectStretch16(pStrBlt) ; ; Stretch blt 16 -> 16 ;-----------------------------------------------------------------------; public vDirectStretch16@4 vDirectStretch16@4 proc near ; ; use ebp as general register, use esp for parameter and local access ; save ebp,ebx,esi,edi ; push ebx push edi push esi push ebp sub esp,(size STACK_STRUC) - PROC_MEM_SIZE ; make room for local variables mov ebp,[esp].sp_pSTR_BLT ; load pSTR_BLT into ebp ; ; load up some stack-based parameters to be used by our scan ; duplicator when doing vertical stretches ; mov eax,[ebp].str_ppdev mov ecx,[ebp].str_YDstStart ; get start y coordinate mov [esp].sp_pdev,eax ; save ppdev pointer mov ebx,[eax].pdev_xyOffset mov [esp].sp_xyOffset,ebx ; save xyOffset mov [esp].sp_yDst,ecx ; save current y coordinate ; ; calc starting addressing parameters ; mov esi,[ebp].str_pjSrcScan ; load src DIB pointer mov eax,[ebp].str_XSrcStart mov edi,[ebp].str_pjDstScan ; load dst DIB pointer mov ebx,[ebp].str_XDstStart add esi,eax add edi,ebx add esi,eax ; add starting Src Pixel add edi,ebx ; add starting Dst Pixel mov [esp].sp_pjSrcScan,esi ; save scan line start pointer mov eax,[ebp].str_ulYDstToSrcIntCeil ; number of src scan lines to step mul [ebp].str_lDeltaSrc ; calc scan line int lines to step mov [esp].sp_SrcIntStep,eax ; save int portion of Y src step mov edx,edi ; make copy of pjDst and edx,2 ; calc left edge case shr edx,1 ; left edge pixels mov [esp].sp_LeftCase,edx ; save left edge case pixels mov eax,[ebp].str_pjDstScan ; make copy mov ecx,[ebp].str_XDstEnd ; load x end add eax,ecx add eax,ecx ; ending dst addr and eax,2 ; calc right edge case shr eax,1 ; right edge pixels mov [esp].sp_RightCase,eax ; save right edge case sub ecx,[ebp].str_XDstStart ; calc x count shl ecx,1 dec ecx mov [esp].sp_XCntHW,ecx ; x width for accelerator inc ecx shr ecx,1 mov ebx,[ebp].str_lDeltaDst ; dst scan line stride sub ebx,ecx sub ebx,ecx ; distance from end of one line to start of next mov [esp].sp_DstStride,ebx ; save dst scan line stride sub ecx,eax ; sub right edge from XCount sub ecx,edx ; sub left edge from XCount shr ecx,1 ; convert from pixels to DWORD count mov [esp].sp_XCount,ecx ; save DWORD count mov ebx,[ebp].str_ulXDstToSrcFracCeil ; get x frac mov [esp].sp_TempXFrac,ebx ; save x frac to a esp based location NextScan: ; ; Wait until the accelerator is done with current blt ; mov dx,3ceh ;index register mov al,31h ;status reg out dx,al mov dx,3cfh ;data register @@: in al,dx test al,1 jnz short @b SingleLoop: ; ; esi and edi are assumed to be correctly loaded ; mov eax,[ebp].str_ulXDstToSrcIntCeil ; get src integer step for step in dst mov ebx,[ebp].str_ulXDstToSrcFracCeil ; get src frac step for step in dst mov edx,[ebp].str_ulXFracAccumulator ; put it in edx as tmp mov ebp,edi ; get dst pointer to ebp ; ; Can't directly access pSTR_BLT variables through ebp ; mov edi,edx ; get accumulator where we want it mov ecx,[esp].sp_LeftCase ; eax = integer step in source ; ebx = fractional step in source ; ecx = left edge case ; edx = free for pixel data ; esi = pointer to source pixel ; edi = fractional accumulator ; ebp = pointer to dest pixel ; ; divide 'esi' by 2 so that we can always dereference it by ; [2*esi] -- this allows us to still use an 'add with carry' ; to jump to the next pixel ; shr esi,1 ; ; first do the left side to align dwords ; test ecx,ecx jz DwordAligned mov dx,[2*esi] ; fetch pixel mov [ebp],dx ; write it out add edi,ebx ; step fraction adc esi,eax ; add in integer and possible carry add ebp,2 ; step 1 in dest DwordAligned: mov ecx,[esp].sp_XCount ; get run length test ecx,ecx jz TrailingBytes ; watch for zero dword case @@: mov dx,[2*esi] ; get a source pixel add edi,ebx ; step fraction adc esi,eax ; add integer and carry shl edx,16 add edi,ebx ; step fraction mov dx,[2*esi] ; get source pixel adc esi,eax ; add integer and carry ror edx,16 mov [ebp],edx ; write everything to dest add ebp,4 ; increment dest pointer by 1 dword dec ecx ; decrement count jnz @b ; do more pixels TrailingBytes: ; ; now do the right side trailing bytes ; mov ecx,[esp].sp_RightCase test ecx,ecx jz EndScanLine mov dx,[2*esi] ; fetch pixel mov [ebp],dx ; write it out add edi,ebx ; step fraction adc esi,eax ; add in integer and possible carry add ebp,2 ; step 1 in dest EndScanLine: mov edi,ebp ; get dst pointer back mov ebp,[esp].sp_pSTR_BLT ; load pSTR_BLT into ebp EndSkipScan: mov esi,[esp].sp_pjSrcScan ; load src scan start addr mov ebx,esi ; save a copy mov eax,[ebp].str_ulYFracAccumulator ; get .32 part of Y pointer add eax,[ebp].str_ulYDstToSrcFracCeil ; add in fractional step jnc @f add esi,[ebp].str_lDeltaSrc ; step one extra in src @@: mov [ebp].str_ulYFracAccumulator,eax ; save Y accumulator add esi,[esp].sp_SrcIntStep ; step int part mov [esp].sp_pjSrcScan,esi ; save starting scan addr add edi,[esp].sp_DstStride ; step to next scan in dst dec [ebp].str_YDstCount ; decrement scan count jz Done ; no more scans inc [esp].sp_yDst ; one scan further down in dst cmp esi,ebx ; is src scan same as before? jne NextScan ; if so, fall through to dupe scan ;-------------------------------------------------------------------- ; The source scan is the same one used for the previous destination ; scan, so we can simply use the hardware to copy the previous ; destination scan. ; ; Since on the S3 we can set up a 'rolling blt' to copy one scan ; line to several scans in a single command, we will count up how ; many times this scan should be duplicated. If your hardware ; cannot do a rolling blt, simply issue a new blt command for ; every time the scan should be duplicated. ; ; eax = ulYFracAccumulator ; ebx = original pjSrcScan ; esi = current pjSrcScan ; ebp = pSTR_BLT ; mov ecx,-1 ; number of times scan is to be ; duplicated, less one AnotherDuplicate: inc ecx ; one scan further down dec [ebp].str_YDstCount ; decrement scan count jz OutputDuplicates ; no more scans add eax,[ebp].str_ulYDstToSrcFracCeil ; add in fractional step jnc @f add esi,[ebp].str_lDeltaSrc ; step one extra in src @@: add esi,[esp].sp_SrcIntStep ; step int part add edi,[ebp].str_lDeltaDst ; step entire dest scan cmp esi,ebx ; is src scan same as before? je AnotherDuplicate OutputDuplicates: mov [esp].sp_pjSrcScan,esi ; save starting scan address mov [ebp].str_ulYFracAccumulator,eax ; save Y accumulator ; ; Now output the command to do the 'rolling blt' ; ;; mov edx,[esp].sp_pjBase ; ; Wait until the accelerator is done with current blt ; mov dx,3ceh ;index register mov al,31h ;status reg out dx,al mov dx,3cfh ;data register @@: in al,dx test al,1 jnz short @b mov ebx,[esp].sp_XCntHW mov eax,[esp].sp_yDst ; eax = yDst -- Destination scan line (source scan line is yDst - 1) ; ebx = XCntHW -- Number of bytes across (width) - 1 ; ecx = cy -- Number of times scan is to be duplicated - 1 ; edx = pjBase -- Pointer to memory mapped accelerator registers ; ebp = pSTR_BLT -- Stretch blt info DuplicateViaMmIo: ; ; Do the copy: ; if 0 CP_XCNT(ppdev, pjBase, (WidthXBytes - 1)); CP_YCNT(ppdev, pjBase, (cyDuplicate - 1)); CP_SRC_ADDR(ppdev, pjBase, (xyOffset + ((yDst - 1) * lDelta) + xDstBytes)); SET_DEST_ADDR(ppdev, ((yDst * lDelta) + xDstBytes)); START_ACL(ppdev); endif .errnz RECT_HEIGHT ;; mov [edx+OFFSET_wXCnt],bx ;; mov [edx+OFFSET_wYCnt],cx mov dx,3ceh ;index register ; ; set XCNT=bx and YCNT=cx ; push eax mov al,21h ;BLT_WIDTH_HIGH mov ah,bh ;x count high byte out dx,ax mov al,20h ;BLT_WIDTH_LOW mov ah,bl ;x count low byte out dx,ax mov al,23h ;BLT_HEIGHT_HIGH mov ah,ch ;y count high byte out dx,ax mov al,22h ;BLT_HEIGHT_LOW mov ah,cl ;y count low byte out dx,ax pop eax ; ; Calculate src address ; mov ebx,eax ; ebx <- yDst dec ebx imul ebx,[ebp].str_lDeltaDst add ebx,[esp].sp_xyOffset add ebx,[ebp].str_xDstStart add ebx,[ebp].str_xDstStart ;; mov [edx+OFFSET_ulSrcAddr],ebx push eax mov al,2ch ;SRC_ADDR_LOW mov ah,bl ;src addr low byte out dx,ax mov al,2dh ;SRC_ADDR_MID mov ah,bh ;src addr mid byte out dx,ax shr ebx,16 mov al,2eh ;SRC_ADDR_HIGH mov ah,bl ;src addr high byte out dx,ax pop eax ; ; Calculate dst address ; inc eax ; account for 'ecx' being ; one less than scan count mov ebx,eax ; ebx <- yDst dec ebx imul ebx,[ebp].str_lDeltaDst add ebx,[esp].sp_xyOffset add ebx,[ebp].str_xDstStart add ebx,[ebp].str_xDstStart ;; mov [edx+OFFSET_ulDstAddr],ebx push eax mov al,28h ;DST_ADDR_LOW mov ah,bl ;dst addr low byte out dx,ax mov al,29h ;DST_ADDR_MID mov ah,bh ;dst addr mid byte out dx,ax shr ebx,16 mov al,2ah ;DST_ADDR_HIGH mov ah,bl ;dst addr high byte out dx,ax ; ; Start blt ; mov al,31h ;BLT_START_STATUS_REG mov ah,2 ;BLT_START out dx,ax pop eax DoneSetDestAddr: add eax,ecx ; add num scans just done mov [esp].sp_yDst,eax DoneDuplicate: cmp [ebp].str_YDstCount,0 ; we might be all done jne NextScan Done: add esp,(size STACK_STRUC) - PROC_MEM_SIZE pop ebp pop esi pop edi pop ebx ret 4 vDirectStretch16@4 endp end