Windows NT 4.0 source code leak
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

2034 lines
42 KiB

//
// Copyright (c) 1995 FirePower Systems, Inc.
//
// Module Name:
// patop.s
//
// Abstract:
// This module includes assembler functions to be used
// in the PSIDISP.DLL display driver for PowerPro & PowerTop. These
// functions are used for faster pattern xor and pattern and operations.
//
// Author:
// Neil Ogura
// 7-20-1995
//
// Environment:
// User mode.
//
// Revision History:
//
//--
//
// Copyright (c) 1995 FirePower Systems, Inc.
// DO NOT DISTRIBUTE without permission
//
// $RCSfile: patop.s $
// $Revision: 1.2 $
// $Date: 1996/04/10 17:59:32 $
// $Locker: $
//
//++
//--
#include "ladj.h"
#include <ksppc.h>
// Register defs
#define pbDst r3
#define pdSrc r4
#define cbX r5
#define cy r6
#define ld r7
#define pSave r8
#define t r9
#define w r10
// Registers to be used for 32 bpp case
#define pixel0 r11
#define pixel1 r12
#define pixel2 r14
#define pixel3 r15
#define pixel4 r16
#define pixel5 r17
#define pixel6 r18
#define pixel7 r19
// Registers to be used for 16 bpp case
#define pixel01 r11
#define pixel23 r12
#define loopcount r14
#define remainder r15
#define w2 r16
#define pixel45 r17
#define pixel67 r18
// Registers to be used for 8 bpp case
#define pixel0123 r11
#define pixel4567 r12
// Stack frame size
#define MINSTACKSIZE 64
// Stack slack offsets (register save slots at negative offsets from sp)
#define SLACK1 -4
#define SLACK2 -8
#define SLACK3 -12
#define SLACK4 -16
#define SLACK5 -20
#define SLACK6 -24
#define SLACK7 -28
#define SLACK8 -32
//
.text
//
// __nxor_pat32(pbDst, pdSrc, cbX, cy, ld, pSave)
//
// Pattern xor with the complemented pattern, 32 bpp. The eight pattern
// words at pdSrc are inverted in place, the shared 32 bpp xor body is
// called, and the same eight words are inverted again so the caller's
// pattern holds its original contents on return.
//
// pbDst: byte addr of destination
// pdSrc: address of the pattern (8 words are read and rewritten here)
// cbX: number of bytes to xor per scan line
// cy: count of scan lines
// ld: stride between scan lines
// pSave: register save area (not referenced by the visible code of this wrapper)
//
// The restore loop relies on __xor_pat32_entry leaving pdSrc unchanged.
//
NESTED_ENTRY(__nxor_pat32, MINSTACKSIZE, 0, 0)
PROLOGUE_END(__nxor_pat32)
//
        li      w,-1                    // w <- 0xffffffff (xor mask)
        li      t,8
        mtctr   t                       // CTR <- 8 pattern words
__nxor_pat32_00:
        lwz     t,0(pdSrc)
        addi    pdSrc,pdSrc,4           // advance first, store through -4
        xor     t,t,w
        stw     t,-4(pdSrc)             // write inverted word back
        bdnz    __nxor_pat32_00
        addi    pdSrc,pdSrc,-32         // rewind to the start of the pattern
//
        bl      ..__xor_pat32_entry     // run the pattern xor body
//
        li      w,-1                    // w <- 0xffffffff (body clobbers w)
        li      t,8
        mtctr   t                       // CTR <- 8 pattern words
__nxor_pat32_10:
        lwz     t,0(pdSrc)
        addi    pdSrc,pdSrc,4
        xor     t,t,w
        stw     t,-4(pdSrc)             // second inversion restores the pattern
        bdnz    __nxor_pat32_10
//
NESTED_EXIT(__nxor_pat32, MINSTACKSIZE, 0, 0)
//
// __xor_pat32(pbDst, pdSrc, cbX, cy, ld, pSave)
//
// 32 bpp pattern xor. Thin wrapper: sets up a frame with the
// NESTED_ENTRY/NESTED_EXIT macros and calls the shared body
// __xor_pat32_entry with the arguments still in their incoming registers.
//
// pbDst: byte addr of destination
// pdSrc: address of the pattern (the body loads 8 words from it)
// cbX: number of bytes to xor per scan line (the body converts it to a pixel count)
// cy: count of scan lines
// ld: stride between scan lines
// pSave: 8 word register save area (not referenced by the visible code of this wrapper)
//
NESTED_ENTRY(__xor_pat32, MINSTACKSIZE, 0, 0)
PROLOGUE_END(__xor_pat32)
//
bl ..__xor_pat32_entry // call pat xor function
//
NESTED_EXIT(__xor_pat32, MINSTACKSIZE, 0, 0)
//
//
// __xor_pat32_entry
// Shared body of the 32 bpp pattern xor; entered with bl from
// __xor_pat32 and __nxor_pat32.
//
// In:  pbDst = destination byte address
//      pdSrc = address of the 8-word pattern
//      cbX   = bytes per scan line (shifted right by 2 here -> pixels)
//      cy    = number of scan lines
//      ld    = stride between scan lines
//
// r31 and r14-r19 are saved in the SLACK slots below sp. r31 keeps the
// return address because LR is reused to dispatch into the unrolled
// line loop. All exits, including the early "nothing to do" exits, go
// through the restore sequence at __xor_32_exit: r31 has already been
// overwritten by the time those early exits are taken, so it must be
// reloaded on every path.
//
SPECIAL_ENTRY(__xor_pat32_entry)
        stw     r31,SLACK1(sp)
        mflr    r31                     // r31 <- return address
//
// Save non-volatile registers
//
        stw     r14,SLACK2(sp)
        stw     r15,SLACK3(sp)
        stw     r16,SLACK4(sp)
        stw     r17,SLACK5(sp)
        stw     r18,SLACK6(sp)
        stw     r19,SLACK7(sp)
//
PROLOGUE_END(__xor_pat32_entry)
//
        and.    cy,cy,cy                // any lines?
        beq-    __xor_32_exit
        srawi.  cbX,cbX,2               // cbX is now number of pixels
        beq-    __xor_32_exit
//
// Load pattern into pixel0 ~ pixel7
//
        lwz     pixel0,0(pdSrc)
        lwz     pixel1,4(pdSrc)
        lwz     pixel2,8(pdSrc)
        lwz     pixel3,12(pdSrc)
        lwz     pixel4,16(pdSrc)
        lwz     pixel5,20(pdSrc)
        lwz     pixel6,24(pdSrc)
        lwz     pixel7,28(pdSrc)
//
// The bl leaves the address of the table that follows it in LR.
// Entry n of the table is the unrolled-loop entry for pattern pixel n.
//
        bl      __xor_32_00
        .ualong __xor_32_30
        .ualong __xor_32_31
        .ualong __xor_32_32
        .ualong __xor_32_33
        .ualong __xor_32_34
        .ualong __xor_32_35
        .ualong __xor_32_36
        .ualong __xor_32_37
//
__xor_32_00:
        mflr    w                       // w <- table address
        rlwinm  t,pbDst,0,27,29         // t <- pbDst & 0x1c: byte offset of first pixel within the pattern
        lwzx    w,w,t                   // w <- entry point for processing each line
        mtlr    w                       // LR <- entry point for processing each line
//
// Per-line setup. LR is not modified below, so every line re-enters the
// unrolled loop at the same pattern pixel.
//
__xor_32_20:
        mtctr   cbX                     // CTR <- number of pixels to xor per line
        mr      t,pbDst                 // t <- starting target address of the line
        blr                             // dispatch to line process routines
//
// Unrolled loop: one destination word per pattern pixel, CTR counts
// the pixels left on the line.
//
__xor_32_30:
        lwz     w,0(t)
        xor     w,w,pixel0
        stw     w,0(t)
        addi    t,t,4
        bdz     __xor_32_50
__xor_32_31:
        lwz     w,0(t)
        xor     w,w,pixel1
        stw     w,0(t)
        addi    t,t,4
        bdz     __xor_32_50
__xor_32_32:
        lwz     w,0(t)
        xor     w,w,pixel2
        stw     w,0(t)
        addi    t,t,4
        bdz     __xor_32_50
__xor_32_33:
        lwz     w,0(t)
        xor     w,w,pixel3
        stw     w,0(t)
        addi    t,t,4
        bdz     __xor_32_50
__xor_32_34:
        lwz     w,0(t)
        xor     w,w,pixel4
        stw     w,0(t)
        addi    t,t,4
        bdz     __xor_32_50
__xor_32_35:
        lwz     w,0(t)
        xor     w,w,pixel5
        stw     w,0(t)
        addi    t,t,4
        bdz     __xor_32_50
__xor_32_36:
        lwz     w,0(t)
        xor     w,w,pixel6
        stw     w,0(t)
        addi    t,t,4
        bdz     __xor_32_50
__xor_32_37:
        lwz     w,0(t)
        xor     w,w,pixel7
        stw     w,0(t)
        addi    t,t,4
        bdnz    __xor_32_30
//
__xor_32_50:
        add     pbDst,pbDst,ld          // pointing to the next line
        addic.  cy,cy,-1                // any more lines?
        bne     __xor_32_20             // yes -> do next line
//
// Common exit (also used by the early exits above).
// Restore non-volatile registers
//
__xor_32_exit:
        lwz     r14,SLACK2(sp)
        lwz     r15,SLACK3(sp)
        lwz     r16,SLACK4(sp)
        lwz     r17,SLACK5(sp)
        lwz     r18,SLACK6(sp)
        lwz     r19,SLACK7(sp)
        mtlr    r31                     // LR <- return address
        lwz     r31,SLACK1(sp)
SPECIAL_EXIT(__xor_pat32_entry)
//
// __nxor_pat16(pbDst, pdSrc, cbX, cy, ld, pSave)
//
// Pattern xor with the complemented pattern, 16 bpp. The four pattern
// words (8 pixels * 2 bytes) at pdSrc are inverted in place, the shared
// 16 bpp xor body is called, and the same four words are inverted again
// so the caller's pattern holds its original contents on return.
//
// pbDst: byte addr of destination
// pdSrc: address of the pattern (4 words are read and rewritten here)
// cbX: number of bytes to xor per scan line
// cy: count of scan lines
// ld: stride between scan lines
// pSave: register save area (not referenced by the visible code of this wrapper)
//
// The restore loop relies on __xor_pat16_entry leaving pdSrc unchanged.
//
NESTED_ENTRY(__nxor_pat16, MINSTACKSIZE, 0, 0)
PROLOGUE_END(__nxor_pat16)
//
        li      w,0                     // w <- 0
        addi    w,w,-1                  // w <- 0xffffffff
        li      t,4
        mtctr   t                       // CTR <- 4
__nxor_pat16_00:
        lwz     t,0(pdSrc)
        xor     t,t,w
        stw     t,0(pdSrc)              // invert pattern
        addi    pdSrc,pdSrc,4
        bdnz    __nxor_pat16_00
        addi    pdSrc,pdSrc,-16         // seek back pointer
//
        bl      ..__xor_pat16_entry     // call pat xor function
//
// Restore exactly the four words inverted above. A count of 8 here
// would also invert the 16 bytes that follow the pattern.
//
        li      w,0                     // w <- 0
        addi    w,w,-1                  // w <- 0xffffffff
        li      t,4
        mtctr   t                       // CTR <- 4
__nxor_pat16_10:
        lwz     t,0(pdSrc)
        xor     t,t,w
        stw     t,0(pdSrc)              // restore pattern
        addi    pdSrc,pdSrc,4
        bdnz    __nxor_pat16_10
//
NESTED_EXIT(__nxor_pat16, MINSTACKSIZE, 0, 0)
//
// __xor_pat16(pbDst, pdSrc, cbX, cy, ld, pSave)
//
// 16 bpp pattern xor. Thin wrapper: sets up a frame with the
// NESTED_ENTRY/NESTED_EXIT macros and calls the shared body
// __xor_pat16_entry with the arguments still in their incoming registers.
//
// pbDst: byte addr of destination
// pdSrc: address of the pattern (8 pixels * 2 bytes)
// cbX: number of bytes to xor per scan line (the body converts it to a pixel count)
// cy: count of scan lines
// ld: stride between scan lines
// pSave: 8 word register save area (not referenced by the visible code of this wrapper)
NESTED_ENTRY(__xor_pat16, MINSTACKSIZE, 0, 0)
PROLOGUE_END(__xor_pat16)
//
bl ..__xor_pat16_entry // call pat xor function
//
NESTED_EXIT(__xor_pat16, MINSTACKSIZE, 0, 0)
//
//
// __xor_pat16_entry
// Shared body of the 16 bpp pattern xor; entered with bl from
// __xor_pat16 and __nxor_pat16.
//
// In:  pbDst = destination byte address
//      pdSrc = address of the pattern (8 pixels * 2 bytes)
//      cbX   = bytes per scan line (shifted right by 1 here -> pixels)
//      cy    = number of scan lines
//      ld    = stride between scan lines
//
// r31 and r14-r18 are saved in the SLACK slots below sp and reloaded at
// __xor_16_exit. r31 keeps the return address because LR is reused for
// dispatch (wide case) and is overwritten by bctrl (short case).
//
// Lines of 1 or 2 pixels go to dedicated "short" routines that loop over
// the scan lines themselves. Wider lines use the unrolled word loop,
// with an optional leading halfword and an optional trailing halfword.
//
SPECIAL_ENTRY(__xor_pat16_entry)
stw r31,SLACK1(sp)
mflr r31 // r31 <- return address
//
// Save non-volatile registers
//
stw r14,SLACK2(sp)
stw r15,SLACK3(sp)
stw r16,SLACK4(sp)
stw r17,SLACK5(sp)
stw r18,SLACK6(sp)
//
PROLOGUE_END(__xor_pat16_entry)
//
and. cy,cy,cy // any lines?
beq- __xor_16_exit
srawi. cbX,cbX,1 // cbX is now number of pixels
beq- __xor_16_exit
//
// The bl leaves the address of the tables that follow it in LR.
// __xor_16_proc: wide-case entry points, indexed by pattern pixel 0-7.
// __xor_16_shortproc: short routines, indexed by
// (pixel count - 1) * 2 + (bit 1 of pbDst).
//
bl __xor_16_00
__xor_16_proc:
.ualong __xor_16_30
.ualong __xor_16_31
.ualong __xor_16_32
.ualong __xor_16_33
.ualong __xor_16_34
.ualong __xor_16_35
.ualong __xor_16_36
.ualong __xor_16_37
__xor_16_shortproc:
.ualong __xor_16_s1
.ualong __xor_16_s1
.ualong __xor_16_s20
.ualong __xor_16_s21
//
__xor_16_00:
mflr w // w <- top of table address
cmplwi cbX,2 // more than 2 pixels?
bgt __xor_16_10 // yes ->
addi t,cbX,-1 // t <- pixel count - 1
rlwinm t,t,3,28,28 // t <- ((pixel count - 1) << 3) & 8
rlwimi t,pbDst,1,29,29 // (1 bits of cbX(-1)) || (MOD 2 of dest h/w addr) || (2 bits of 0)
addi t,t,__xor_16_shortproc-__xor_16_proc // bias past the wide-case table
lwzx w,w,t
mtctr w // CTR <- entry for short routine
rlwinm t,pbDst,0,28,30 // alignment in pat of 1st pixel
bctrl // short routine returns here with blr
b __xor_16_exit
//
// Short routines for xor 16
// At entry: pbDst: pointer to starting target address
// t: initial offset in the pattern (0 to 14, step by 2)
// pdSrc: pointer to the pattern (8 * 2 byte pixel)
// ld: line delta for target
// cy: number of scan lines (moved to CTR)
// w, pixel01 and pixel23 are used for work register
// Only volatile registers are used by these routines.
//
// 1 pixel: one halfword per line
__xor_16_s1:
mtctr cy
lhzx pixel01,pdSrc,t
__xor_16_s1Loop:
lhz w,0(pbDst)
xor w,w,pixel01
sth w,0(pbDst)
add pbDst,pbDst,ld
bdnz __xor_16_s1Loop
blr
//
// 2 pixels, destination word aligned: one word per line
__xor_16_s20:
mtctr cy
lwzx pixel01,pdSrc,t
__xor_16_s20Loop:
lwz w,0(pbDst)
xor w,w,pixel01
stw w,0(pbDst)
add pbDst,pbDst,ld
bdnz __xor_16_s20Loop
blr
//
// 2 pixels, destination not word aligned: two halfwords per line
__xor_16_s21:
mtctr cy
lhzx pixel01,pdSrc,t
addi t,t,2
rlwinm t,t,0,28,31 // wrap pattern offset (t & 0x0f)
lhzx pixel23,pdSrc,t
__xor_16_s21Loop:
lhz w,0(pbDst)
xor w,w,pixel01
sth w,0(pbDst)
lhz w,2(pbDst)
xor w,w,pixel23
sth w,2(pbDst)
add pbDst,pbDst,ld
bdnz __xor_16_s21Loop
blr
//
// More than 2 pixels
//
__xor_16_10:
//
rlwinm t,pbDst,1,27,29 // t <- first pixel offset (0 to 7) index to the table entry
lwzx w,w,t // w <- entry point for processing each line
mtlr w // LR <- entry point
mr loopcount,cbX // loop count <- pixel count
andi. w,pbDst,0x02 // starting word boundary?
beq __xor_16_15 // yes
addi loopcount,loopcount,-1 // no -> adjust for initial pixel operation
__xor_16_15:
andi. remainder,loopcount,0x01 // remainder is # of pixels to do after main loop (0 or 1)
srawi loopcount,loopcount,1 // loopcount <- number of 2 pixels pair (at least 1)
//
// Load pattern into pixel01 ~ pixel67
//
lwz pixel01,0(pdSrc)
lwz pixel23,4(pdSrc)
lwz pixel45,8(pdSrc)
lwz pixel67,12(pdSrc)
//
// Per-line setup; LR still holds the entry point chosen above.
//
__xor_16_20:
mtctr loopcount // CTR <- pixel pair count to operate
mr t,pbDst // t <- starting target address of the line
blr
//
// Wide cases routines
// Odd entries (31/33/35/37) first xor a single halfword with the upper
// half of the matching pattern word, then join the word loop at the
// next even entry.
//
__xor_16_31:
lhz w,0(t)
srawi w2,pixel01,16
xor w,w,w2
sth w,0(t)
addi t,t,2
b __xor_16_32
//
__xor_16_33:
lhz w,0(t)
srawi w2,pixel23,16
xor w,w,w2
sth w,0(t)
addi t,t,2
b __xor_16_34
//
__xor_16_35:
lhz w,0(t)
srawi w2,pixel45,16
xor w,w,w2
sth w,0(t)
addi t,t,2
b __xor_16_36
//
__xor_16_37:
lhz w,0(t)
srawi w2,pixel67,16
xor w,w,w2
sth w,0(t)
addi t,t,2
//
// Word loop: two pixels per iteration, CTR counts the pairs left.
//
__xor_16_30:
lwz w,0(t)
xor w,w,pixel01
stw w,0(t)
addi t,t,4
bdz __xor_16_50
__xor_16_32:
lwz w,0(t)
xor w,w,pixel23
stw w,0(t)
addi t,t,4
bdz __xor_16_50
__xor_16_34:
lwz w,0(t)
xor w,w,pixel45
stw w,0(t)
addi t,t,4
bdz __xor_16_50
__xor_16_36:
lwz w,0(t)
xor w,w,pixel67
stw w,0(t)
addi t,t,4
bdnz __xor_16_30
//
// End of line
//
__xor_16_50:
and. remainder,remainder,remainder // trailing pixel?
beq __xor_16_60
rlwinm w,t,0,28,30 // alignment in pat of last pixel
lhzx w2,pdSrc,w // last pixel to store
lhz w,0(t)
xor w,w,w2
sth w,0(t)
__xor_16_60:
add pbDst,pbDst,ld // pointing to the next line
addic. cy,cy,-1 // any more lines?
bne __xor_16_20 // yes -> do next line
//
__xor_16_exit:
//
// Restore non-volatile registers
//
lwz r14,SLACK2(sp)
lwz r15,SLACK3(sp)
lwz r16,SLACK4(sp)
lwz r17,SLACK5(sp)
lwz r18,SLACK6(sp)
mtlr r31 // LR <- return address
lwz r31,SLACK1(sp)
//
SPECIAL_EXIT(__xor_pat16_entry)
//
// __nxor_pat8(pbDst, pdSrc, cbX, cy, ld, pSave)
//
// Pattern xor with the complemented pattern, 8 bpp. The two pattern
// words (8 pixels * 1 byte) at pdSrc are inverted in place, the shared
// 8 bpp xor body is called, and the same two words are inverted again
// so the caller's pattern holds its original contents on return.
//
// pbDst: byte addr of destination
// pdSrc: address of the pattern (2 words are read and rewritten here)
// cbX: number of bytes (= pixels) to xor per scan line
// cy: count of scan lines
// ld: stride between scan lines
// pSave: register save area (not referenced by the visible code of this wrapper)
//
// The restore step relies on __xor_pat8_entry leaving pdSrc unchanged.
//
NESTED_ENTRY(__nxor_pat8, MINSTACKSIZE, 0, 0)
PROLOGUE_END(__nxor_pat8)
//
        li      w,-1                    // w <- 0xffffffff (xor mask)
        lwz     t,0(pdSrc)
        xor     t,t,w
        stw     t,0(pdSrc)              // invert pattern pixels 0-3
        lwz     t,4(pdSrc)
        xor     t,t,w
        stw     t,4(pdSrc)              // invert pattern pixels 4-7
//
        bl      ..__xor_pat8_entry      // run the pattern xor body
//
        li      w,-1                    // w <- 0xffffffff (body clobbers w)
        lwz     t,0(pdSrc)
        xor     t,t,w
        stw     t,0(pdSrc)              // restore pattern pixels 0-3
        lwz     t,4(pdSrc)
        xor     t,t,w
        stw     t,4(pdSrc)              // restore pattern pixels 4-7
//
NESTED_EXIT(__nxor_pat8, MINSTACKSIZE, 0, 0)
//
// __xor_pat8(pbDst, pdSrc, cbX, cy, ld, pSave)
//
// 8 bpp pattern xor. Thin wrapper: sets up a frame with the
// NESTED_ENTRY/NESTED_EXIT macros and calls the shared body
// __xor_pat8_entry with the arguments still in their incoming registers.
//
// pbDst: byte addr of destination
// pdSrc: address of the pattern (8 pixels * 1 byte)
// cbX: number of bytes (= pixels) to xor per scan line
// cy: count of scan lines
// ld: stride between scan lines
// pSave: 8 word register save area (not referenced by the visible code of this wrapper)
//
NESTED_ENTRY(__xor_pat8, MINSTACKSIZE, 0, 0)
PROLOGUE_END(__xor_pat8)
//
bl ..__xor_pat8_entry // call pat xor function
//
NESTED_EXIT(__xor_pat8, MINSTACKSIZE, 0, 0)
//
//
// __xor_pat8_entry
// Shared body of the 8 bpp pattern xor; entered with bl from
// __xor_pat8 and __nxor_pat8.
//
// In:  pbDst = destination byte address
//      pdSrc = address of the pattern (8 pixels * 1 byte)
//      cbX   = bytes (= pixels) per scan line
//      cy    = number of scan lines
//      ld    = stride between scan lines
//
// r31 and r14-r16 are saved in the SLACK slots below sp and reloaded at
// __xor_08_exit. r31 keeps the return address because LR is reused for
// dispatch (wide case) and is overwritten by bctrl (short case).
//
// Lines of 1 to 6 pixels go to dedicated "short" routines that loop over
// the scan lines themselves. Wider lines use the two-word unrolled loop,
// with leading byte/halfword fix-ups and a trailing halfword/byte.
//
SPECIAL_ENTRY(__xor_pat8_entry)
stw r31,SLACK1(sp)
mflr r31 // r31 <- return address
//
// Save non-volatile registers
//
stw r14,SLACK2(sp)
stw r15,SLACK3(sp)
stw r16,SLACK4(sp)
//
PROLOGUE_END(__xor_pat8_entry)
//
and. cy,cy,cy // any lines?
beq- __xor_08_exit
and. cbX,cbX,cbX // any pixels?
beq- __xor_08_exit
//
// The bl leaves the address of the tables that follow it in LR.
// __xor_08_proc: wide-case entry points, indexed by pbDst & 7.
// __xor_08_shortproc: short routines, indexed by
// (pixel count - 1) * 4 + (pbDst & 3); routine sNA handles N pixels
// starting at destination alignment A.
//
bl __xor_08_00
__xor_08_proc:
.ualong __xor_08_30
.ualong __xor_08_31
.ualong __xor_08_32
.ualong __xor_08_33
.ualong __xor_08_34
.ualong __xor_08_35
.ualong __xor_08_36
.ualong __xor_08_37
__xor_08_shortproc:
.ualong __xor_08_s1
.ualong __xor_08_s1
.ualong __xor_08_s1
.ualong __xor_08_s1
.ualong __xor_08_s20
.ualong __xor_08_s21
.ualong __xor_08_s22
.ualong __xor_08_s23
.ualong __xor_08_s30
.ualong __xor_08_s31
.ualong __xor_08_s32
.ualong __xor_08_s33
.ualong __xor_08_s40
.ualong __xor_08_s41
.ualong __xor_08_s42
.ualong __xor_08_s43
.ualong __xor_08_s50
.ualong __xor_08_s51
.ualong __xor_08_s52
.ualong __xor_08_s53
.ualong __xor_08_s60
.ualong __xor_08_s61
.ualong __xor_08_s62
.ualong __xor_08_s63
//
__xor_08_00:
mflr w // w <- top of table address
cmplwi cbX,6 // more than 6 pixels?
bgt __xor_08_10 // yes ->
addi t,cbX,-1 // t <- pixel count - 1 (0 to 5)
rlwinm t,t,4,25,27 // t <- ((pixel count - 1) << 4) & 0x70
rlwimi t,pbDst,2,28,29 // (3 bits of cbX(-1)) || (MOD 4 of dest addr) || (2 bits of 0)
addi t,t,__xor_08_shortproc-__xor_08_proc // bias past the wide-case table
lwzx w,w,t
mtctr w // CTR <- entry for short routine
andi. t,pbDst,0x07 // alignment in pat of 1st pixel (0 to 7)
bctrl // short routine returns here with blr
b __xor_08_exit
//
// Short routines for xor 08
// At entry: pbDst: pointer to starting target address
// t: initial offset in the pattern (0 to 7)
// pdSrc: pointer to the pattern (8 * 1 byte pixel)
// ld: line delta for target
// cy: number of scan lines (moved to CTR first)
// w, pixel0123 and pixel4567 are used for work register
// cy and cbX can be used as work registers after it's been accessed
// Only volatile registers are used by these routines.
// Each routine splits its pixels into byte/halfword/word pieces and
// wraps the pattern offset with "andi. t,t,0x07" between pieces.
//
// 1 pixel, any alignment: one byte per line
__xor_08_s1:
mtctr cy
lbzx pixel0123,pdSrc,t
__xor_08_s1Loop:
lbz w,0(pbDst)
xor w,w,pixel0123
stb w,0(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s1Loop
blr
//
// 2 pixels, even alignment: one halfword per line
__xor_08_s20:
__xor_08_s22:
mtctr cy
lhzx pixel0123,pdSrc,t
__xor_08_s20Loop:
lhz w,0(pbDst)
xor w,w,pixel0123
sth w,0(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s20Loop
blr
//
// 2 pixels, odd alignment: byte + byte
__xor_08_s21:
__xor_08_s23:
mtctr cy
lbzx pixel0123,pdSrc,t
addi t,t,1
andi. t,t,0x07
lbzx pixel4567,pdSrc,t
__xor_08_s21Loop:
lbz w,0(pbDst)
xor w,w,pixel0123
stb w,0(pbDst)
lbz w,1(pbDst)
xor w,w,pixel4567
stb w,1(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s21Loop
blr
//
// 3 pixels, even alignment: halfword + byte
__xor_08_s30:
__xor_08_s32:
mtctr cy
lhzx pixel0123,pdSrc,t
addi t,t,2
andi. t,t,0x07
lbzx pixel4567,pdSrc,t
__xor_08_s30Loop:
lhz w,0(pbDst)
xor w,w,pixel0123
sth w,0(pbDst)
lbz w,2(pbDst)
xor w,w,pixel4567
stb w,2(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s30Loop
blr
//
// 3 pixels, odd alignment: byte + halfword
__xor_08_s31:
__xor_08_s33:
mtctr cy
lbzx pixel0123,pdSrc,t
addi t,t,1
andi. t,t,0x07
lhzx pixel4567,pdSrc,t
__xor_08_s31Loop:
lbz w,0(pbDst)
xor w,w,pixel0123
stb w,0(pbDst)
lhz w,1(pbDst)
xor w,w,pixel4567
sth w,1(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s31Loop
blr
//
// 4 pixels, word aligned: one word per line
__xor_08_s40:
mtctr cy
lwzx pixel0123,pdSrc,t
__xor_08_s40Loop:
lwz w,0(pbDst)
xor w,w,pixel0123
stw w,0(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s40Loop
blr
//
// 4 pixels, odd alignment: byte + halfword + byte (cy reused as the third pattern piece)
__xor_08_s41:
__xor_08_s43:
mtctr cy
lbzx pixel0123,pdSrc,t
addi t,t,1
andi. t,t,0x07
lhzx pixel4567,pdSrc,t
addi t,t,2
andi. t,t,0x07
lbzx cy,pdSrc,t
__xor_08_s41Loop:
lbz w,0(pbDst)
xor w,w,pixel0123
stb w,0(pbDst)
lhz w,1(pbDst)
xor w,w,pixel4567
sth w,1(pbDst)
lbz w,3(pbDst)
xor w,w,cy
stb w,3(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s41Loop
blr
//
// 4 pixels, alignment 2: halfword + halfword
__xor_08_s42:
mtctr cy
lhzx pixel0123,pdSrc,t
addi t,t,2
andi. t,t,0x07
lhzx pixel4567,pdSrc,t
__xor_08_s42Loop:
lhz w,0(pbDst)
xor w,w,pixel0123
sth w,0(pbDst)
lhz w,2(pbDst)
xor w,w,pixel4567
sth w,2(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s42Loop
blr
//
// 5 pixels, word aligned: word + byte
__xor_08_s50:
mtctr cy
lwzx pixel0123,pdSrc,t
addi t,t,4
andi. t,t,0x07
lbzx pixel4567,pdSrc,t
__xor_08_s50Loop:
lwz w,0(pbDst)
xor w,w,pixel0123
stw w,0(pbDst)
lbz w,4(pbDst)
xor w,w,pixel4567
stb w,4(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s50Loop
blr
//
// 5 pixels, alignment 1: byte + halfword + halfword
__xor_08_s51:
mtctr cy
lbzx pixel0123,pdSrc,t
addi t,t,1
andi. t,t,0x07
lhzx pixel4567,pdSrc,t
addi t,t,2
andi. t,t,0x07
lhzx cy,pdSrc,t
__xor_08_s51Loop:
lbz w,0(pbDst)
xor w,w,pixel0123
stb w,0(pbDst)
lhz w,1(pbDst)
xor w,w,pixel4567
sth w,1(pbDst)
lhz w,3(pbDst)
xor w,w,cy
sth w,3(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s51Loop
blr
//
// 5 pixels, alignment 2: halfword + halfword + byte
__xor_08_s52:
mtctr cy
lhzx pixel0123,pdSrc,t
addi t,t,2
andi. t,t,0x07
lhzx pixel4567,pdSrc,t
addi t,t,2
andi. t,t,0x07
lbzx cy,pdSrc,t
__xor_08_s52Loop:
lhz w,0(pbDst)
xor w,w,pixel0123
sth w,0(pbDst)
lhz w,2(pbDst)
xor w,w,pixel4567
sth w,2(pbDst)
lbz w,4(pbDst)
xor w,w,cy
stb w,4(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s52Loop
blr
//
// 5 pixels, alignment 3: byte + word
__xor_08_s53:
mtctr cy
lbzx pixel0123,pdSrc,t
addi t,t,1
andi. t,t,0x07
lwzx pixel4567,pdSrc,t
__xor_08_s53Loop:
lbz w,0(pbDst)
xor w,w,pixel0123
stb w,0(pbDst)
lwz w,1(pbDst)
xor w,w,pixel4567
stw w,1(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s53Loop
blr
//
// 6 pixels, word aligned: word + halfword
__xor_08_s60:
mtctr cy
lwzx pixel0123,pdSrc,t
addi t,t,4
andi. t,t,0x07
lhzx pixel4567,pdSrc,t
__xor_08_s60Loop:
lwz w,0(pbDst)
xor w,w,pixel0123
stw w,0(pbDst)
lhz w,4(pbDst)
xor w,w,pixel4567
sth w,4(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s60Loop
blr
//
// 6 pixels, alignment 1: byte + halfword + halfword + byte
// (cy and cbX reused as the third and fourth pattern pieces)
__xor_08_s61:
mtctr cy
lbzx pixel0123,pdSrc,t
addi t,t,1
andi. t,t,0x07
lhzx pixel4567,pdSrc,t
addi t,t,2
andi. t,t,0x07
lhzx cy,pdSrc,t
addi t,t,2
andi. t,t,0x07
lbzx cbX,pdSrc,t
__xor_08_s61Loop:
lbz w,0(pbDst)
xor w,w,pixel0123
stb w,0(pbDst)
lhz w,1(pbDst)
xor w,w,pixel4567
sth w,1(pbDst)
lhz w,3(pbDst)
xor w,w,cy
sth w,3(pbDst)
lbz w,5(pbDst)
xor w,w,cbX
stb w,5(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s61Loop
blr
//
// 6 pixels, alignment 2: halfword + word
__xor_08_s62:
mtctr cy
lhzx pixel0123,pdSrc,t
addi t,t,2
andi. t,t,0x07
lwzx pixel4567,pdSrc,t
__xor_08_s62Loop:
lhz w,0(pbDst)
xor w,w,pixel0123
sth w,0(pbDst)
lwz w,2(pbDst)
xor w,w,pixel4567
stw w,2(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s62Loop
blr
//
// 6 pixels, alignment 3: byte + word + byte
__xor_08_s63:
mtctr cy
lbzx pixel0123,pdSrc,t
addi t,t,1
andi. t,t,0x07
lwzx pixel4567,pdSrc,t
addi t,t,4
andi. t,t,0x07
lbzx cy,pdSrc,t
__xor_08_s63Loop:
lbz w,0(pbDst)
xor w,w,pixel0123
stb w,0(pbDst)
lwz w,1(pbDst)
xor w,w,pixel4567
stw w,1(pbDst)
lbz w,5(pbDst)
xor w,w,cy
stb w,5(pbDst)
add pbDst,pbDst,ld
bdnz __xor_08_s63Loop
blr
//
// Wide cases ( >= 7)
//
__xor_08_10:
rlwinm t,pbDst,2,27,29 // t <- first pixel offset (0 to 7) index to the table entry
lwzx w,w,t // w <- entry point for processing each line
mtlr w // LR <- entry point
mr loopcount,cbX // loop count <- pixel count
andi. w,pbDst,0x03 // starting word boundary offset
beq- __xor_08_15 // word aligned -> no extra operation before main loop
subfic w,w,4 // w <- number of pixels (= bytes) to process at first
subf loopcount,w,loopcount // loopcount <- # of byte after initial process
__xor_08_15:
andi. remainder,loopcount,0x03 // remainder is # of pixels to do after main loop (0 to 3)
srawi loopcount,loopcount,2 // loopcount <- number of 4 pixels unit (at least 1)
//
// Load pattern into pixel0123 ~ pixel4567
//
lwz pixel0123,0(pdSrc)
lwz pixel4567,4(pdSrc)
//
// Per-line setup; LR still holds the entry point chosen above.
//
__xor_08_20:
mtctr loopcount // CTR <- count of 4 pixel words to operate
mr t,pbDst // t <- starting target address of the line
blr
//
// Wide cases routines
// Entries 31-33 and 35-37 xor the leading byte and/or halfword with the
// matching upper part of the pattern word (shifted down by srawi), then
// join the word loop at the other pattern word.
//
__xor_08_31:
lbz w,0(t)
srawi w2,pixel0123,8
xor w,w,w2
stb w,0(t)
addi t,t,1
__xor_08_32:
lhz w,0(t)
srawi w2,pixel0123,16
xor w,w,w2
sth w,0(t)
addi t,t,2
b __xor_08_34
//
__xor_08_33:
lbz w,0(t)
srawi w2,pixel0123,24
xor w,w,w2
stb w,0(t)
addi t,t,1
b __xor_08_34
//
__xor_08_35:
lbz w,0(t)
srawi w2,pixel4567,8
xor w,w,w2
stb w,0(t)
addi t,t,1
__xor_08_36:
lhz w,0(t)
srawi w2,pixel4567,16
xor w,w,w2
sth w,0(t)
addi t,t,2
b __xor_08_30
//
__xor_08_37:
lbz w,0(t)
srawi w2,pixel4567,24
xor w,w,w2
stb w,0(t)
addi t,t,1
//
// Word loop: four pixels per word, CTR counts the words left. On exit
// w2 holds the pattern word that would have been used next.
//
__xor_08_30:
lwz w,0(t)
xor w,w,pixel0123
stw w,0(t)
addi t,t,4
bdz __xor_08_50
__xor_08_34:
lwz w,0(t)
xor w,w,pixel4567
stw w,0(t)
addi t,t,4
bdnz __xor_08_30
mr w2,pixel0123
b __xor_08_60
//
// End of line process
//
__xor_08_50:
mr w2,pixel4567
__xor_08_60:
andi. w,remainder,0x02 // equal or more than 2 bytes remaining?
beq __xor_08_70
lhz w,0(t)
xor w,w,w2
sth w,0(t)
addi t,t,2
srawi w2,w2,16 // move the next pattern byte into the low position
__xor_08_70:
andi. w,remainder,0x01 // still byte remaining?
beq __xor_08_80 // No -> next line
lbz w,0(t)
xor w,w,w2
stb w,0(t)
__xor_08_80:
add pbDst,pbDst,ld // pointing to the next line
addic. cy,cy,-1 // any more lines?
bne __xor_08_20 // yes -> do next line
//
__xor_08_exit:
//
// Restore non-volatile registers
//
lwz r14,SLACK2(sp)
lwz r15,SLACK3(sp)
lwz r16,SLACK4(sp)
mtlr r31 // LR <- return address
lwz r31,SLACK1(sp)
//
SPECIAL_EXIT(__xor_pat8_entry)
//
// __nand_pat32(pbDst, pdSrc, cbX, cy, ld, pSave)
//
// Pattern and with the complemented pattern, 32 bpp. The eight pattern
// words at pdSrc are inverted in place, the shared 32 bpp and body is
// called, and the same eight words are inverted again so the caller's
// pattern holds its original contents on return.
//
// pbDst: byte addr of destination
// pdSrc: address of the pattern (8 words are read and rewritten here)
// cbX: number of bytes to and per scan line
// cy: count of scan lines
// ld: stride between scan lines
// pSave: register save area (not referenced by the visible code of this wrapper)
//
// The restore loop relies on __and_pat32_entry leaving pdSrc unchanged.
//
NESTED_ENTRY(__nand_pat32, MINSTACKSIZE, 0, 0)
PROLOGUE_END(__nand_pat32)
//
        li      w,-1                    // w <- 0xffffffff (xor mask)
        li      t,8
        mtctr   t                       // CTR <- 8 pattern words
__nand_pat32_00:
        lwz     t,0(pdSrc)
        addi    pdSrc,pdSrc,4           // advance first, store through -4
        xor     t,t,w
        stw     t,-4(pdSrc)             // write inverted word back
        bdnz    __nand_pat32_00
        addi    pdSrc,pdSrc,-32         // rewind to the start of the pattern
//
        bl      ..__and_pat32_entry     // run the pattern and body
//
        li      w,-1                    // w <- 0xffffffff (body clobbers w)
        li      t,8
        mtctr   t                       // CTR <- 8 pattern words
__nand_pat32_10:
        lwz     t,0(pdSrc)
        addi    pdSrc,pdSrc,4
        xor     t,t,w
        stw     t,-4(pdSrc)             // second inversion restores the pattern
        bdnz    __nand_pat32_10
//
NESTED_EXIT(__nand_pat32, MINSTACKSIZE, 0, 0)
//
// __and_pat32(pbDst, pdSrc, cbX, cy, ld, pSave)
//
// 32 bpp pattern and. Thin wrapper: sets up a frame with the
// NESTED_ENTRY/NESTED_EXIT macros and calls the shared body
// __and_pat32_entry with the arguments still in their incoming registers.
//
// pbDst: byte addr of destination
// pdSrc: address of the pattern (the body loads 8 words from it)
// cbX: number of bytes to and per scan line (the body converts it to a pixel count)
// cy: count of scan lines
// ld: stride between scan lines
// pSave: 8 word register save area (not referenced by the visible code of this wrapper)
//
NESTED_ENTRY(__and_pat32, MINSTACKSIZE, 0, 0)
PROLOGUE_END(__and_pat32)
//
bl ..__and_pat32_entry // call pat and function
//
NESTED_EXIT(__and_pat32, MINSTACKSIZE, 0, 0)
//
//
// __and_pat32_entry
// Shared body of the 32 bpp pattern and; entered with bl from
// __and_pat32 and __nand_pat32.
//
// In:  pbDst = destination byte address
//      pdSrc = address of the 8-word pattern
//      cbX   = bytes per scan line (shifted right by 2 here -> pixels)
//      cy    = number of scan lines
//      ld    = stride between scan lines
//
// r31 and r14-r19 are saved in the SLACK slots below sp and reloaded at
// __and_32_exit. r31 keeps the return address because LR is reused to
// dispatch into the unrolled line loop.
//
SPECIAL_ENTRY(__and_pat32_entry)
stw r31,SLACK1(sp)
mflr r31 // r31 <- return address
//
// Save non-volatile registers
//
stw r14,SLACK2(sp)
stw r15,SLACK3(sp)
stw r16,SLACK4(sp)
stw r17,SLACK5(sp)
stw r18,SLACK6(sp)
stw r19,SLACK7(sp)
//
PROLOGUE_END(__and_pat32_entry)
//
and. cy,cy,cy // any lines?
beq- __and_32_exit
srawi. cbX,cbX,2 // cbX is now number of pixels
beq- __and_32_exit
//
// Load pattern into pixel0 ~ pixel7
//
lwz pixel0,0(pdSrc)
lwz pixel1,4(pdSrc)
lwz pixel2,8(pdSrc)
lwz pixel3,12(pdSrc)
lwz pixel4,16(pdSrc)
lwz pixel5,20(pdSrc)
lwz pixel6,24(pdSrc)
lwz pixel7,28(pdSrc)
//
// The bl leaves the address of the table that follows it in LR.
// Entry n of the table is the unrolled-loop entry for pattern pixel n.
//
bl __and_32_00
.ualong __and_32_30
.ualong __and_32_31
.ualong __and_32_32
.ualong __and_32_33
.ualong __and_32_34
.ualong __and_32_35
.ualong __and_32_36
.ualong __and_32_37
//
__and_32_00:
mflr w // w <- table address
rlwinm t,pbDst,0,27,29 // t <- first pixel offset from pattern starting position (pbDst & 0x1c)
lwzx w,w,t // w <- entry point for processing each line
mtlr w // LR <- entry point for processing each line
//
// Per-line setup. LR is not modified below, so every line re-enters the
// unrolled loop at the same pattern pixel.
//
__and_32_20:
mtctr cbX // CTR <- number of pixel to and per line
mr t,pbDst // t <- starting target address of the line
blr // dispatch to line process routines
//
// Unrolled loop: one destination word per pattern pixel, CTR counts
// the pixels left on the line.
//
__and_32_30:
lwz w,0(t)
and w,w,pixel0
stw w,0(t)
addi t,t,4
bdz __and_32_50
__and_32_31:
lwz w,0(t)
and w,w,pixel1
stw w,0(t)
addi t,t,4
bdz __and_32_50
__and_32_32:
lwz w,0(t)
and w,w,pixel2
stw w,0(t)
addi t,t,4
bdz __and_32_50
__and_32_33:
lwz w,0(t)
and w,w,pixel3
stw w,0(t)
addi t,t,4
bdz __and_32_50
__and_32_34:
lwz w,0(t)
and w,w,pixel4
stw w,0(t)
addi t,t,4
bdz __and_32_50
__and_32_35:
lwz w,0(t)
and w,w,pixel5
stw w,0(t)
addi t,t,4
bdz __and_32_50
__and_32_36:
lwz w,0(t)
and w,w,pixel6
stw w,0(t)
addi t,t,4
bdz __and_32_50
__and_32_37:
lwz w,0(t)
and w,w,pixel7
stw w,0(t)
addi t,t,4
bdnz __and_32_30
//
__and_32_50:
add pbDst,pbDst,ld // pointing to the next line
addic. cy,cy,-1 // any more lines?
bne __and_32_20 // yes -> do next line
//
__and_32_exit:
//
// Restore non-volatile registers
//
lwz r14,SLACK2(sp)
lwz r15,SLACK3(sp)
lwz r16,SLACK4(sp)
lwz r17,SLACK5(sp)
lwz r18,SLACK6(sp)
lwz r19,SLACK7(sp)
mtlr r31 // LR <- return address
lwz r31,SLACK1(sp)
//
SPECIAL_EXIT(__and_pat32_entry)
//
// __nand_pat16(pbDst, pdSrc, cbX, cy, ld, pSave)
//
// Pattern and with the complemented pattern, 16 bpp. The four pattern
// words (8 pixels * 2 bytes) at pdSrc are inverted in place, the shared
// 16 bpp and body is called, and the same four words are inverted again
// so the caller's pattern holds its original contents on return.
//
// pbDst: byte addr of destination
// pdSrc: address of the pattern (4 words are read and rewritten here)
// cbX: number of bytes to and per scan line
// cy: count of scan lines
// ld: stride between scan lines
// pSave: register save area (not referenced by the visible code of this wrapper)
//
// The restore loop relies on __and_pat16_entry leaving pdSrc unchanged.
//
NESTED_ENTRY(__nand_pat16, MINSTACKSIZE, 0, 0)
PROLOGUE_END(__nand_pat16)
//
        li      w,0                     // w <- 0
        addi    w,w,-1                  // w <- 0xffffffff
        li      t,4
        mtctr   t                       // CTR <- 4
__nand_pat16_00:
        lwz     t,0(pdSrc)
        xor     t,t,w
        stw     t,0(pdSrc)              // invert pattern
        addi    pdSrc,pdSrc,4
        bdnz    __nand_pat16_00
        addi    pdSrc,pdSrc,-16         // seek back pointer
//
        bl      ..__and_pat16_entry     // call pat and function
//
// Restore exactly the four words inverted above. A count of 8 here
// would also invert the 16 bytes that follow the pattern.
//
        li      w,0                     // w <- 0
        addi    w,w,-1                  // w <- 0xffffffff
        li      t,4
        mtctr   t                       // CTR <- 4
__nand_pat16_10:
        lwz     t,0(pdSrc)
        xor     t,t,w
        stw     t,0(pdSrc)              // restore pattern
        addi    pdSrc,pdSrc,4
        bdnz    __nand_pat16_10
//
NESTED_EXIT(__nand_pat16, MINSTACKSIZE, 0, 0)
//
// __and_pat16(pbDst, pdSrc, cbX, cy, ld, pSave)
//
// 16 bpp pattern and. Thin wrapper: sets up a frame with the
// NESTED_ENTRY/NESTED_EXIT macros and calls the shared body
// __and_pat16_entry with the arguments still in their incoming registers.
//
// pbDst: byte addr of destination
// pdSrc: address of the pattern (8 pixels * 2 bytes)
// cbX: number of bytes to and per scan line (the body converts it to a pixel count)
// cy: count of scan lines
// ld: stride between scan lines
// pSave: 8 word register save area (not referenced by the visible code of this wrapper)
//
NESTED_ENTRY(__and_pat16, MINSTACKSIZE, 0, 0)
PROLOGUE_END(__and_pat16)
//
bl ..__and_pat16_entry // call pat and function
//
NESTED_EXIT(__and_pat16, MINSTACKSIZE, 0, 0)
//
//
// __and_pat16_entry
// Shared body of the 16 bpp pattern and; entered with bl from
// __and_pat16 and __nand_pat16.
//
// In:  pbDst = destination byte address
//      pdSrc = address of the pattern (8 pixels * 2 bytes)
//      cbX   = bytes per scan line (shifted right by 1 here -> pixels)
//      cy    = number of scan lines
//      ld    = stride between scan lines
//
// r31 and r14-r18 are saved in the SLACK slots below sp and reloaded at
// __and_16_exit. r31 keeps the return address because LR is reused for
// dispatch (wide case) and is overwritten by bctrl (short case).
//
// Lines of 1 or 2 pixels go to dedicated "short" routines that loop over
// the scan lines themselves. Wider lines use the unrolled word loop,
// with an optional leading halfword and an optional trailing halfword.
//
SPECIAL_ENTRY(__and_pat16_entry)
stw r31,SLACK1(sp)
mflr r31 // r31 <- return address
//
// Save non-volatile registers
//
stw r14,SLACK2(sp)
stw r15,SLACK3(sp)
stw r16,SLACK4(sp)
stw r17,SLACK5(sp)
stw r18,SLACK6(sp)
//
PROLOGUE_END(__and_pat16_entry)
//
and. cy,cy,cy // any lines?
beq- __and_16_exit
srawi. cbX,cbX,1 // cbX is now number of pixels
beq- __and_16_exit
//
// The bl leaves the address of the tables that follow it in LR.
// __and_16_proc: wide-case entry points, indexed by pattern pixel 0-7.
// __and_16_shortproc: short routines, indexed by
// (pixel count - 1) * 2 + (bit 1 of pbDst).
//
bl __and_16_00
__and_16_proc:
.ualong __and_16_30
.ualong __and_16_31
.ualong __and_16_32
.ualong __and_16_33
.ualong __and_16_34
.ualong __and_16_35
.ualong __and_16_36
.ualong __and_16_37
__and_16_shortproc:
.ualong __and_16_s1
.ualong __and_16_s1
.ualong __and_16_s20
.ualong __and_16_s21
//
__and_16_00:
mflr w // w <- top of table address
cmplwi cbX,2 // more than 2 pixels?
bgt __and_16_10 // yes ->
addi t,cbX,-1 // t <- pixel count - 1
rlwinm t,t,3,28,28 // t <- ((pixel count - 1) << 3) & 8
rlwimi t,pbDst,1,29,29 // (1 bits of cbX(-1)) || (MOD 2 of dest h/w addr) || (2 bits of 0)
addi t,t,__and_16_shortproc-__and_16_proc // bias past the wide-case table
lwzx w,w,t
mtctr w // CTR <- entry for short routine
rlwinm t,pbDst,0,28,30 // alignment in pat of 1st pixel
bctrl // short routine returns here with blr
b __and_16_exit
//
// Short routines for and 16
// At entry: pbDst: pointer to starting target address
// t: initial offset in the pattern (0 to 14, step by 2)
// pdSrc: pointer to the pattern (8 * 2 byte pixel)
// ld: line delta for target
// cy: number of scan lines (moved to CTR)
// w, pixel01 and pixel23 are used for work register
// Only volatile registers are used by these routines.
//
// 1 pixel: one halfword per line
__and_16_s1:
mtctr cy
lhzx pixel01,pdSrc,t
__and_16_s1Loop:
lhz w,0(pbDst)
and w,w,pixel01
sth w,0(pbDst)
add pbDst,pbDst,ld
bdnz __and_16_s1Loop
blr
//
// 2 pixels, destination word aligned: one word per line
__and_16_s20:
mtctr cy
lwzx pixel01,pdSrc,t
__and_16_s20Loop:
lwz w,0(pbDst)
and w,w,pixel01
stw w,0(pbDst)
add pbDst,pbDst,ld
bdnz __and_16_s20Loop
blr
//
// 2 pixels, destination not word aligned: two halfwords per line
__and_16_s21:
mtctr cy
lhzx pixel01,pdSrc,t
addi t,t,2
rlwinm t,t,0,28,31 // wrap pattern offset (t & 0x0f)
lhzx pixel23,pdSrc,t
__and_16_s21Loop:
lhz w,0(pbDst)
and w,w,pixel01
sth w,0(pbDst)
lhz w,2(pbDst)
and w,w,pixel23
sth w,2(pbDst)
add pbDst,pbDst,ld
bdnz __and_16_s21Loop
blr
//
// Wide cases ( >= 3 pixel)
//
__and_16_10:
rlwinm t,pbDst,1,27,29 // t <- first pixel offset (0 to 7) index to the table entry
lwzx w,w,t // w <- entry point for processing each line
mtlr w // LR <- entry point
mr loopcount,cbX // loop count <- pixel count
andi. w,pbDst,0x02 // starting word boundary?
beq __and_16_15 // yes
addi loopcount,loopcount,-1 // no -> adjust for initial pixel operation
__and_16_15:
andi. remainder,loopcount,0x01 // remainder is # of pixels to do after main loop (0 or 1)
srawi loopcount,loopcount,1 // loopcount <- number of 2 pixels pair (at least 1)
//
// Load pattern into pixel01 ~ pixel67
//
lwz pixel01,0(pdSrc)
lwz pixel23,4(pdSrc)
lwz pixel45,8(pdSrc)
lwz pixel67,12(pdSrc)
//
// Per-line setup; LR still holds the entry point chosen above.
//
__and_16_20:
mtctr loopcount // CTR <- pixel pair count to operate
mr t,pbDst // t <- starting target address of the line
blr
//
// Wide cases routines
// Odd entries (31/33/35/37) first and a single halfword with the upper
// half of the matching pattern word, then join the word loop at the
// next even entry.
//
__and_16_31:
lhz w,0(t)
srawi w2,pixel01,16
and w,w,w2
sth w,0(t)
addi t,t,2
b __and_16_32
//
__and_16_33:
lhz w,0(t)
srawi w2,pixel23,16
and w,w,w2
sth w,0(t)
addi t,t,2
b __and_16_34
//
__and_16_35:
lhz w,0(t)
srawi w2,pixel45,16
and w,w,w2
sth w,0(t)
addi t,t,2
b __and_16_36
//
__and_16_37:
lhz w,0(t)
srawi w2,pixel67,16
and w,w,w2
sth w,0(t)
addi t,t,2
//
// Word loop: two pixels per iteration, CTR counts the pairs left.
//
__and_16_30:
lwz w,0(t)
and w,w,pixel01
stw w,0(t)
addi t,t,4
bdz __and_16_50
__and_16_32:
lwz w,0(t)
and w,w,pixel23
stw w,0(t)
addi t,t,4
bdz __and_16_50
__and_16_34:
lwz w,0(t)
and w,w,pixel45
stw w,0(t)
addi t,t,4
bdz __and_16_50
__and_16_36:
lwz w,0(t)
and w,w,pixel67
stw w,0(t)
addi t,t,4
bdnz __and_16_30
//
// End of line
//
__and_16_50:
and. remainder,remainder,remainder // trailing pixel?
beq __and_16_60
rlwinm w,t,0,28,30 // alignment in pat of last pixel
lhzx w2,pdSrc,w // last pixel to store
lhz w,0(t)
and w,w,w2
sth w,0(t)
__and_16_60:
add pbDst,pbDst,ld // pointing to the next line
addic. cy,cy,-1 // any more lines?
bne __and_16_20 // yes -> do next line
//
__and_16_exit:
//
// Restore non-volatile registers
//
lwz r14,SLACK2(sp)
lwz r15,SLACK3(sp)
lwz r16,SLACK4(sp)
lwz r17,SLACK5(sp)
lwz r18,SLACK6(sp)
mtlr r31 // LR <- return address
lwz r31,SLACK1(sp)
//
SPECIAL_EXIT(__and_pat16_entry)
//
// __nand_pat8(pbDst, pdSrc, cbX, cy, ld, pSave)
//
// Pattern and with the complemented pattern, 8 bpp. The two pattern
// words at pdSrc are inverted in place, the shared 8 bpp and body is
// called, and the same two words are inverted again so the caller's
// pattern holds its original contents on return.
//
// pbDst: byte addr of destination
// pdSrc: address of the pattern (2 words are read and rewritten here)
// cbX: number of bytes (= pixels) to and per scan line
// cy: count of scan lines
// ld: stride between scan lines
// pSave: register save area (not referenced by the visible code of this wrapper)
//
// The restore step uses pdSrc again after the call, so it assumes
// __and_pat8_entry leaves pdSrc unchanged.
//
NESTED_ENTRY(__nand_pat8, MINSTACKSIZE, 0, 0)
PROLOGUE_END(__nand_pat8)
//
        li      w,-1                    // w <- 0xffffffff (xor mask)
        lwz     t,0(pdSrc)
        xor     t,t,w
        stw     t,0(pdSrc)              // invert first pattern word
        lwz     t,4(pdSrc)
        xor     t,t,w
        stw     t,4(pdSrc)              // invert second pattern word
//
        bl      ..__and_pat8_entry      // run the pattern and body
//
        li      w,-1                    // w <- 0xffffffff (reloaded after the call)
        lwz     t,0(pdSrc)
        xor     t,t,w
        stw     t,0(pdSrc)              // restore first pattern word
        lwz     t,4(pdSrc)
        xor     t,t,w
        stw     t,4(pdSrc)              // restore second pattern word
//
NESTED_EXIT(__nand_pat8, MINSTACKSIZE, 0, 0)
//
// __and_pat8(pbDst, pdSrc, cbX, cy, ld, pSave)
//
// 8 bpp pattern and. Thin wrapper: sets up a frame with the
// NESTED_ENTRY/NESTED_EXIT macros and calls the shared body
// __and_pat8_entry with the arguments still in their incoming registers.
//
// pbDst: byte addr of destination
// pdSrc: address of the pattern (8 * 1 byte pixel per the body's comments)
// cbX: number of bytes (= pixels) to and per scan line
// cy: count of scan lines
// ld: stride between scan lines
// pSave: 8 word register save area (not referenced by the visible code of this wrapper)
//
NESTED_ENTRY(__and_pat8, MINSTACKSIZE, 0, 0)
PROLOGUE_END(__and_pat8)
//
bl ..__and_pat8_entry // call pat and function
//
NESTED_EXIT(__and_pat8, MINSTACKSIZE, 0, 0)
//
SPECIAL_ENTRY(__and_pat8_entry)
//
// Common 8 bpp pattern-AND worker: dst <- dst AND pattern for cy scan lines
// of cbX pixels (1 byte per pixel). Reached with "bl" from __and_pat8 and
// __nand_pat8. LR is reused below for table lookup and dispatch, so the
// return address is parked in r31 (the old r31 goes to the SLACK1 slot).
//
stw r31,SLACK1(sp)
mflr r31
//
// Save non-volatile registers
// (r14 = loopcount, r15 = remainder, r16 = w2 per the register defines)
//
stw r14,SLACK2(sp)
stw r15,SLACK3(sp)
stw r16,SLACK4(sp)
//
PROLOGUE_END(__and_pat8_entry)
//
and. cy,cy,cy // any lines?
beq- __and_08_exit
and. cbX,cbX,cbX // any pixels?
beq- __and_08_exit
//
// "bl" over the tables: LR <- address of __and_08_proc (the word after the bl)
//
bl __and_08_00
//
// Wide-case table: line entry point indexed by (destination address MOD 8)
//
__and_08_proc:
.ualong __and_08_30
.ualong __and_08_31
.ualong __and_08_32
.ualong __and_08_33
.ualong __and_08_34
.ualong __and_08_35
.ualong __and_08_36
.ualong __and_08_37
//
// Short-case table: 4 entries per width (cbX = 1 to 6), indexed inside each
// group by (destination address MOD 4)
//
__and_08_shortproc:
.ualong __and_08_s1
.ualong __and_08_s1
.ualong __and_08_s1
.ualong __and_08_s1
.ualong __and_08_s20
.ualong __and_08_s21
.ualong __and_08_s22
.ualong __and_08_s23
.ualong __and_08_s30
.ualong __and_08_s31
.ualong __and_08_s32
.ualong __and_08_s33
.ualong __and_08_s40
.ualong __and_08_s41
.ualong __and_08_s42
.ualong __and_08_s43
.ualong __and_08_s50
.ualong __and_08_s51
.ualong __and_08_s52
.ualong __and_08_s53
.ualong __and_08_s60
.ualong __and_08_s61
.ualong __and_08_s62
.ualong __and_08_s63
//
__and_08_00:
mflr w // w <- top of table address
cmplwi cbX,6 // more than 6 pixels?
bgt __and_08_10 // yes -> wide case
addi t,cbX,-1 // t <- pixel count - 1 (0 to 5)
rlwinm t,t,4,25,27 // t <- (cbX-1) * 16: byte offset of this width's 4-entry group
rlwimi t,pbDst,2,28,29 // (3 bits of cbX-1) || (MOD 4 of dest addr) || (2 bits of 0)
addi t,t,__and_08_shortproc-__and_08_proc // skip over the wide-case table
lwzx w,w,t
mtctr w // CTR <- entry for short routine
andi. t,pbDst,0x07 // alignment in pat of 1st pixel (0 to 7)
bctrl // call the short routine; it handles all cy lines and returns with blr
b __and_08_exit
//
// Short routines for and 08 (1 to 6 pixels per scan line)
// At entry: pbDst: pointer to starting target address
// t: initial offset in the pattern (0 to 7)
// pdSrc: pointer to the pattern (8 * 1 byte pixel)
// ld: line delta for target
// w, pixel0123 and pixel4567 are used as work registers
// cy and cbX can be used as work registers after they have been read
// r13 and above can't be used (as not saved in case of short)
//
__and_08_s1:
// 1 pixel per scan line (used for every destination alignment):
// AND one pattern byte into one destination byte on each of cy lines.
mtctr cy // CTR <- number of scan lines
lbzx pixel0123,pdSrc,t // pattern byte at offset t
__and_08_s1Loop:
lbz w,0(pbDst)
and w,w,pixel0123
stb w,0(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s1Loop
blr // back to the dispatcher
//
__and_08_s20:
__and_08_s22:
// 2 pixels per scan line, destination address MOD 4 = 0 or 2 (per the
// dispatch table): both pixels are handled as one halfword.
mtctr cy // CTR <- number of scan lines
lhzx pixel0123,pdSrc,t // pattern halfword at offset t (t is even here)
__and_08_s20Loop:
lhz w,0(pbDst)
and w,w,pixel0123
sth w,0(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s20Loop
blr // back to the dispatcher
//
__and_08_s21:
__and_08_s23:
// 2 pixels per scan line, destination address MOD 4 = 1 or 3 (per the
// dispatch table): handled as two separate bytes.
mtctr cy // CTR <- number of scan lines
lbzx pixel0123,pdSrc,t // pattern byte for 1st pixel
addi t,t,1
andi. t,t,0x07 // next pattern offset, wrapping 7 -> 0
lbzx pixel4567,pdSrc,t // pattern byte for 2nd pixel
__and_08_s21Loop:
lbz w,0(pbDst)
and w,w,pixel0123
stb w,0(pbDst)
lbz w,1(pbDst)
and w,w,pixel4567
stb w,1(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s21Loop
blr // back to the dispatcher
//
__and_08_s30:
__and_08_s32:
// 3 pixels per scan line, destination address MOD 4 = 0 or 2 (per the
// dispatch table): halfword followed by a byte.
mtctr cy // CTR <- number of scan lines
lhzx pixel0123,pdSrc,t // pattern halfword for pixels 0-1
addi t,t,2
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lbzx pixel4567,pdSrc,t // pattern byte for pixel 2
__and_08_s30Loop:
lhz w,0(pbDst)
and w,w,pixel0123
sth w,0(pbDst)
lbz w,2(pbDst)
and w,w,pixel4567
stb w,2(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s30Loop
blr // back to the dispatcher
//
__and_08_s31:
__and_08_s33:
// 3 pixels per scan line, destination address MOD 4 = 1 or 3 (per the
// dispatch table): byte followed by a halfword (pbDst+1 is even).
mtctr cy // CTR <- number of scan lines
lbzx pixel0123,pdSrc,t // pattern byte for pixel 0
addi t,t,1
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lhzx pixel4567,pdSrc,t // pattern halfword for pixels 1-2
__and_08_s31Loop:
lbz w,0(pbDst)
and w,w,pixel0123
stb w,0(pbDst)
lhz w,1(pbDst)
and w,w,pixel4567
sth w,1(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s31Loop
blr // back to the dispatcher
//
__and_08_s40:
// 4 pixels per scan line, destination address MOD 4 = 0 (per the dispatch
// table): all four pixels are handled as one word.
mtctr cy // CTR <- number of scan lines
lwzx pixel0123,pdSrc,t // pattern word at offset t (0 or 4 here)
__and_08_s40Loop:
lwz w,0(pbDst)
and w,w,pixel0123
stw w,0(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s40Loop
blr // back to the dispatcher
//
__and_08_s41:
__and_08_s43:
// 4 pixels per scan line, destination address MOD 4 = 1 or 3 (per the
// dispatch table): byte, halfword, byte.
mtctr cy // CTR <- number of scan lines; cy is free as a work register from here
lbzx pixel0123,pdSrc,t // pattern byte for pixel 0
addi t,t,1
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lhzx pixel4567,pdSrc,t // pattern halfword for pixels 1-2
addi t,t,2
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lbzx cy,pdSrc,t // pattern byte for pixel 3 (kept in cy)
__and_08_s41Loop:
lbz w,0(pbDst)
and w,w,pixel0123
stb w,0(pbDst)
lhz w,1(pbDst)
and w,w,pixel4567
sth w,1(pbDst)
lbz w,3(pbDst)
and w,w,cy
stb w,3(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s41Loop
blr // back to the dispatcher
//
__and_08_s42:
// 4 pixels per scan line, destination address MOD 4 = 2 (per the dispatch
// table): two halfwords.
mtctr cy // CTR <- number of scan lines
lhzx pixel0123,pdSrc,t // pattern halfword for pixels 0-1
addi t,t,2
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lhzx pixel4567,pdSrc,t // pattern halfword for pixels 2-3
__and_08_s42Loop:
lhz w,0(pbDst)
and w,w,pixel0123
sth w,0(pbDst)
lhz w,2(pbDst)
and w,w,pixel4567
sth w,2(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s42Loop
blr // back to the dispatcher
//
__and_08_s50:
// 5 pixels per scan line, destination address MOD 4 = 0 (per the dispatch
// table): word followed by a byte.
mtctr cy // CTR <- number of scan lines
lwzx pixel0123,pdSrc,t // pattern word for pixels 0-3
addi t,t,4
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lbzx pixel4567,pdSrc,t // pattern byte for pixel 4
__and_08_s50Loop:
lwz w,0(pbDst)
and w,w,pixel0123
stw w,0(pbDst)
lbz w,4(pbDst)
and w,w,pixel4567
stb w,4(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s50Loop
blr // back to the dispatcher
//
__and_08_s51:
// 5 pixels per scan line, destination address MOD 4 = 1 (per the dispatch
// table): byte, halfword, halfword.
mtctr cy // CTR <- number of scan lines; cy is free as a work register from here
lbzx pixel0123,pdSrc,t // pattern byte for pixel 0
addi t,t,1
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lhzx pixel4567,pdSrc,t // pattern halfword for pixels 1-2
addi t,t,2
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lhzx cy,pdSrc,t // pattern halfword for pixels 3-4 (kept in cy)
__and_08_s51Loop:
lbz w,0(pbDst)
and w,w,pixel0123
stb w,0(pbDst)
lhz w,1(pbDst)
and w,w,pixel4567
sth w,1(pbDst)
lhz w,3(pbDst)
and w,w,cy
sth w,3(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s51Loop
blr // back to the dispatcher
//
__and_08_s52:
// 5 pixels per scan line, destination address MOD 4 = 2 (per the dispatch
// table): halfword, halfword, byte.
mtctr cy // CTR <- number of scan lines; cy is free as a work register from here
lhzx pixel0123,pdSrc,t // pattern halfword for pixels 0-1
addi t,t,2
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lhzx pixel4567,pdSrc,t // pattern halfword for pixels 2-3
addi t,t,2
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lbzx cy,pdSrc,t // pattern byte for pixel 4 (kept in cy)
__and_08_s52Loop:
lhz w,0(pbDst)
and w,w,pixel0123
sth w,0(pbDst)
lhz w,2(pbDst)
and w,w,pixel4567
sth w,2(pbDst)
lbz w,4(pbDst)
and w,w,cy
stb w,4(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s52Loop
blr // back to the dispatcher
//
__and_08_s53:
// 5 pixels per scan line, destination address MOD 4 = 3 (per the dispatch
// table): byte followed by a word (pbDst+1 is word aligned).
mtctr cy // CTR <- number of scan lines
lbzx pixel0123,pdSrc,t // pattern byte for pixel 0
addi t,t,1
andi. t,t,0x07 // next pattern offset, wrapping MOD 8 (0 or 4 here)
lwzx pixel4567,pdSrc,t // pattern word for pixels 1-4
__and_08_s53Loop:
lbz w,0(pbDst)
and w,w,pixel0123
stb w,0(pbDst)
lwz w,1(pbDst)
and w,w,pixel4567
stw w,1(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s53Loop
blr // back to the dispatcher
//
__and_08_s60:
// 6 pixels per scan line, destination address MOD 4 = 0 (per the dispatch
// table): word followed by a halfword.
mtctr cy // CTR <- number of scan lines
lwzx pixel0123,pdSrc,t // pattern word for pixels 0-3
addi t,t,4
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lhzx pixel4567,pdSrc,t // pattern halfword for pixels 4-5
__and_08_s60Loop:
lwz w,0(pbDst)
and w,w,pixel0123
stw w,0(pbDst)
lhz w,4(pbDst)
and w,w,pixel4567
sth w,4(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s60Loop
blr // back to the dispatcher
//
__and_08_s61:
// 6 pixels per scan line, destination address MOD 4 = 1 (per the dispatch
// table): byte, halfword, halfword, byte.
mtctr cy // CTR <- number of scan lines; cy is free as a work register from here
lbzx pixel0123,pdSrc,t // pattern byte for pixel 0
addi t,t,1
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lhzx pixel4567,pdSrc,t // pattern halfword for pixels 1-2
addi t,t,2
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lhzx cy,pdSrc,t // pattern halfword for pixels 3-4 (kept in cy)
addi t,t,2
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lbzx cbX,pdSrc,t // pattern byte for pixel 5 (kept in cbX, no longer needed as a count)
__and_08_s61Loop:
lbz w,0(pbDst)
and w,w,pixel0123
stb w,0(pbDst)
lhz w,1(pbDst)
and w,w,pixel4567
sth w,1(pbDst)
lhz w,3(pbDst)
and w,w,cy
sth w,3(pbDst)
lbz w,5(pbDst)
and w,w,cbX
stb w,5(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s61Loop
blr // back to the dispatcher
//
__and_08_s62:
// 6 pixels per scan line, destination address MOD 4 = 2 (per the dispatch
// table): halfword followed by a word (pbDst+2 is word aligned).
mtctr cy // CTR <- number of scan lines
lhzx pixel0123,pdSrc,t // pattern halfword for pixels 0-1
addi t,t,2
andi. t,t,0x07 // next pattern offset, wrapping MOD 8 (0 or 4 here)
lwzx pixel4567,pdSrc,t // pattern word for pixels 2-5
__and_08_s62Loop:
lhz w,0(pbDst)
and w,w,pixel0123
sth w,0(pbDst)
lwz w,2(pbDst)
and w,w,pixel4567
stw w,2(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s62Loop
blr // back to the dispatcher
//
__and_08_s63:
// 6 pixels per scan line, destination address MOD 4 = 3 (per the dispatch
// table): byte, word, byte (pbDst+1 is word aligned).
mtctr cy // CTR <- number of scan lines; cy is free as a work register from here
lbzx pixel0123,pdSrc,t // pattern byte for pixel 0
addi t,t,1
andi. t,t,0x07 // next pattern offset, wrapping MOD 8 (0 or 4 here)
lwzx pixel4567,pdSrc,t // pattern word for pixels 1-4
addi t,t,4
andi. t,t,0x07 // next pattern offset, wrapping MOD 8
lbzx cy,pdSrc,t // pattern byte for pixel 5 (kept in cy)
__and_08_s63Loop:
lbz w,0(pbDst)
and w,w,pixel0123
stb w,0(pbDst)
lwz w,1(pbDst)
and w,w,pixel4567
stw w,1(pbDst)
lbz w,5(pbDst)
and w,w,cy
stb w,5(pbDst)
add pbDst,pbDst,ld // next scan line
bdnz __and_08_s63Loop
blr // back to the dispatcher
//
// Wide cases (>= 7)
//
__and_08_10:
// Wide case (cbX >= 7). One-time setup: pick the per-line entry point from
// the table at w according to the destination alignment, and split the pixel
// count into leading bytes, whole words and trailing bytes. The same entry
// point and counts are reused for every scan line.
rlwinm t,pbDst,2,27,29 // t <- (dest addr MOD 8) * 4 = byte offset of the table entry
lwzx w,w,t // w <- entry point for processing each line
mtlr w // LR <- entry point
mr loopcount,cbX // loop count <- pixel count
andi. w,pbDst,0x03 // starting word boundary offset
beq- __and_08_15 // word aligned -> no extra operation before main loop
subfic w,w,4 // w <- number of pixels (= bytes) to process at first
subf loopcount,w,loopcount // loopcount <- # of byte after initial process
__and_08_15:
andi. remainder,loopcount,0x03 // remainder is # of pixels to do after main loop (0 to 3)
srawi loopcount,loopcount,2 // loopcount <- number of 4 pixels unit (at least 1)
//
// Load pattern into pixel0123 ~ pixel4567
//
lwz pixel0123,0(pdSrc)
lwz pixel4567,4(pdSrc)
//
// Start of each scan line
//
__and_08_20:
mtctr loopcount // CTR <- number of whole words (4 pixels each) on this line
mr t,pbDst // t <- starting target address of the line
blr // not a return: jump to the line entry point held in LR
//
// Wide cases routines
//
// Entry points __and_08_30 .. __and_08_37 correspond to destination address
// MOD 8 = 0 .. 7. The odd/halfword entries first handle the 1 to 3 bytes up
// to the next word boundary, then join the word loop at the word that matches
// the pattern phase. The shifts below take pattern byte k from bits
// 8k..8k+7 of the loaded word, i.e. they rely on little-endian byte order.
// srawi sign-extends, but only the low byte/halfword is stored and w was
// zero-extended by lbz/lhz, so the extra high bits have no effect.
__and_08_31:
lbz w,0(t)
srawi w2,pixel0123,8 // pattern byte 1 -> low byte
and w,w,w2
stb w,0(t)
addi t,t,1
__and_08_32:
lhz w,0(t)
srawi w2,pixel0123,16 // pattern bytes 2-3 -> low halfword
and w,w,w2
sth w,0(t)
addi t,t,2
b __and_08_34 // next word uses the second half of the pattern
//
__and_08_33:
lbz w,0(t)
srawi w2,pixel0123,24 // pattern byte 3 -> low byte
and w,w,w2
stb w,0(t)
addi t,t,1
b __and_08_34 // next word uses the second half of the pattern
//
__and_08_35:
lbz w,0(t)
srawi w2,pixel4567,8 // pattern byte 5 -> low byte
and w,w,w2
stb w,0(t)
addi t,t,1
__and_08_36:
lhz w,0(t)
srawi w2,pixel4567,16 // pattern bytes 6-7 -> low halfword
and w,w,w2
sth w,0(t)
addi t,t,2
b __and_08_30 // next word uses the first half of the pattern
//
__and_08_37:
lbz w,0(t)
srawi w2,pixel4567,24 // pattern byte 7 -> low byte
and w,w,w2
stb w,0(t)
addi t,t,1
//
// Word loop: alternates the two pattern words; CTR counts words left
//
__and_08_30:
lwz w,0(t)
and w,w,pixel0123
stw w,0(t)
addi t,t,4
bdz __and_08_50 // last word done -> trailing bytes continue with pixel4567
__and_08_34:
lwz w,0(t)
and w,w,pixel4567
stw w,0(t)
addi t,t,4
bdnz __and_08_30
mr w2,pixel0123 // trailing bytes continue with pixel0123
b __and_08_60
//
// End of line
//
// Trailing 0 to 3 bytes of the line. w2 holds the pattern word that follows
// the last full word written; its low-order bytes are consumed first.
__and_08_50:
mr w2,pixel4567
__and_08_60:
andi. w,remainder,0x02 // equal or more than 2 bytes remaining?
beq __and_08_70
lhz w,0(t)
and w,w,w2
sth w,0(t)
addi t,t,2
srawi w2,w2,16 // bring the next pattern byte down to the low byte
__and_08_70:
andi. w,remainder,0x01 // still byte remaining?
beq __and_08_80 // No -> next line
lbz w,0(t)
and w,w,w2
stb w,0(t)
__and_08_80:
add pbDst,pbDst,ld // pointing to the next line
addic. cy,cy,-1 // any more lines?
bne __and_08_20 // yes -> do next line
//
// Common exit (also reached directly when cy or cbX is zero, and after a
// short routine returns)
//
__and_08_exit:
//
// Restore non-volatile registers
//
lwz r14,SLACK2(sp)
lwz r15,SLACK3(sp)
lwz r16,SLACK4(sp)
mtlr r31 // LR <- return address parked in r31 at entry
lwz r31,SLACK1(sp) // restore caller's r31
//
// SPECIAL_EXIT is defined in ksppc.h (not visible here); presumably it emits the return
//
SPECIAL_EXIT(__and_pat8_entry)