//
// Copyright (c) 1994 FirePower Systems, Inc.
//
// Module Name:
//	rectops.s
//
// Abstract:
//	This module includes the rect fill, copy, and XOR operations used
//	in the PSIDISP.DLL display driver for PowerPro & PowerTop.
//
// Author:
//	Neil Ogura: 11-23-1994
//
// Environment:
//	User mode.
//
// Assumption:
//	The width of a cache line is assumed to be 32 bytes. If that assumption
//	becomes untrue, some modifications are necessary. There are other
//	restrictions for each function - see the function headers.
//	Also, if the number of L1 cache entries is changed for a future processor,
//	the parameter passed from the upper routine has to be updated, too. This
//	number should be taken care of in PSIDISP.SYS using the PVR value.
//
// Revision History:
//
//--

//
// Copyright (c) 1995 FirePower Systems, Inc.
// DO NOT DISTRIBUTE without permission
//
// $RCSfile: rectops.s $
// $Revision: 1.2 $
// $Date: 1996/04/10 17:59:38 $
// $Locker: $
//
#include "ksppc.h"
#include "ladj.h"	// To make mapping to line numbers in error messages easy -- subtract 1500.

// Cache flush control bit parameters stored in the MS half word.
#define SFLUSHBIT 0x8000
#define TFLUSHBIT 0x4000
#define TTOUCHBIT 0x2000

// RectOp operation flag -- currently only XOR is supported
#define OPXOR 0x0100

// This flag selects whether to use just "dcbz" for filling zero.
// 0 is used for safety (a possible "dcbz" bug), because using it improves
// performance very little - almost negligibly.
#define CLEAR_BY_DCBZ 0
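
// Annotation (added note, not in the original source): "dcbz" zeroes an entire
// 32-byte cache line in the data cache without reading the line from memory,
// so a zero fill could in principle be done with one "dcbz" per 32 bytes and
// no stores at all - that is the optimization this flag would enable (see
// __fillmain00 below, which is exactly that loop).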

// Thresholds for selecting which routine to use, long or short.
// The MINLENGTH_XXX values have to be at least 63 to ensure that there will be
// at least one innermost (32-byte) operation, since the long routine assumes
// there is at least one. For copy, 31 bytes is the minimum length that can be
// processed in the long routine - the case with no innermost loop is handled there.
#define MINLENGTH_FILL 63
#define MINLENGTH_OP 63
#define MINLENGTH_COPY 31

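// Annotation (added note): the 63-byte threshold works out as follows - the
// long fill path consumes at most 7 bytes reaching 8-byte alignment plus at
// most 24 more reaching 32-byte alignment (31 bytes worst case), so a width
// of at least 63 always leaves 63 - 31 = 32 bytes, i.e. at least one full
// 32-byte innermost iteration.
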
// MINDISTANCE is the minimum distance between source and target at which it is
// safe to use "dcbz" on the target (i.e. without destroying uncopied source data).
#define MINDISTANCE 29

// Parameter structure offsets
#define PARAM1 0
#define PARAM2 4
#define PARAM3 8
#define PARAM4 12
#define PARAM5 16
#define PARAM6 20
#define PARAM7 24
#define PARAM8 28
#define PARAM9 32
#define PARAM10 36
#define PARAM11 40
#define PARAM12 44
#define PARAM13 48
#define PARAM14 52
#define PARAM15 56
#define PARAM16 60
#define PARAM17 64

// Stack frame size
#define MINSTACKSIZE 64
// Stack slack offsets
#define SLACK1 -4
#define SLACK2 -8
#define SLACK3 -12
#define SLACK4 -16
#define SLACK5 -20
#define SLACK6 -24
#define SLACK7 -28
#define SLACK8 -32

// Dispatch tables

	.data
	.align 3

	.globl __xorentrytable
__xorentrytable:
__XorsShortTable:
	.ualong __xors1_A0
	.ualong __xors1_A1
	.ualong __xors1_A2
	.ualong __xors1_A3
	.ualong __xors2_A0
	.ualong __xors2_A1
	.ualong __xors2_A2
	.ualong __xors2_A3
__XorsInitProcsB:
	.ualong __xorsInit_0B
	.ualong __xorsInit_1B
	.ualong __xorsInit_2B
	.ualong __xorsInit_3B
__XorsMainProcsB:
	.ualong __xorsmains_0B
	.ualong __xorsmains_1B
	.ualong __xorsmains_2B
	.ualong __xorsmains_3B
__XorsEndProcsB:
	.ualong __xorsEnd_0B
	.ualong __xorsEnd_1B
	.ualong __xorsEnd_2B
	.ualong __xorsEnd_3B
__XorsInitProcsF:
	.ualong __xorsInit_0F
	.ualong __xorsInit_3F
	.ualong __xorsInit_2F
	.ualong __xorsInit_1F
__XorsMainProcsF:
	.ualong __xorsmains_0F
	.ualong __xorsmains_1F
	.ualong __xorsmains_2F
	.ualong __xorsmains_3F
__XorsEndProcsF:
	.ualong __xorsEnd_0F
	.ualong __xorsEnd_1F
	.ualong __xorsEnd_2F
	.ualong __xorsEnd_3F
//
	.globl __andentrytable
__andentrytable:
__AndsShortTable:
	.ualong __ands1_A0
	.ualong __ands1_A1
	.ualong __ands1_A2
	.ualong __ands1_A3
	.ualong __ands2_A0
	.ualong __ands2_A1
	.ualong __ands2_A2
	.ualong __ands2_A3
__AndsInitProcsB:
	.ualong __andsInit_0B
	.ualong __andsInit_1B
	.ualong __andsInit_2B
	.ualong __andsInit_3B
__AndsMainProcsB:
	.ualong __andsmains_0B
	.ualong __andsmains_1B
	.ualong __andsmains_2B
	.ualong __andsmains_3B
__AndsEndProcsB:
	.ualong __andsEnd_0B
	.ualong __andsEnd_1B
	.ualong __andsEnd_2B
	.ualong __andsEnd_3B
__AndsInitProcsF:
	.ualong __andsInit_0F
	.ualong __andsInit_3F
	.ualong __andsInit_2F
	.ualong __andsInit_1F
__AndsMainProcsF:
	.ualong __andsmains_0F
	.ualong __andsmains_1F
	.ualong __andsmains_2F
	.ualong __andsmains_3F
__AndsEndProcsF:
	.ualong __andsEnd_0F
	.ualong __andsEnd_1F
	.ualong __andsEnd_2F
	.ualong __andsEnd_3F
//
	.globl __orentrytable
__orentrytable:
__OrsShortTable:
	.ualong __ors1_A0
	.ualong __ors1_A1
	.ualong __ors1_A2
	.ualong __ors1_A3
	.ualong __ors2_A0
	.ualong __ors2_A1
	.ualong __ors2_A2
	.ualong __ors2_A3
__OrsInitProcsB:
	.ualong __orsInit_0B
	.ualong __orsInit_1B
	.ualong __orsInit_2B
	.ualong __orsInit_3B
__OrsMainProcsB:
	.ualong __orsmains_0B
	.ualong __orsmains_1B
	.ualong __orsmains_2B
	.ualong __orsmains_3B
__OrsEndProcsB:
	.ualong __orsEnd_0B
	.ualong __orsEnd_1B
	.ualong __orsEnd_2B
	.ualong __orsEnd_3B
__OrsInitProcsF:
	.ualong __orsInit_0F
	.ualong __orsInit_3F
	.ualong __orsInit_2F
	.ualong __orsInit_1F
__OrsMainProcsF:
	.ualong __orsmains_0F
	.ualong __orsmains_1F
	.ualong __orsmains_2F
	.ualong __orsmains_3F
__OrsEndProcsF:
	.ualong __orsEnd_0F
	.ualong __orsEnd_1F
	.ualong __orsEnd_2F
	.ualong __orsEnd_3F
//
	.globl __orcentrytable
__orcentrytable:
__OrcsShortTable:
	.ualong __orcs1_A0
	.ualong __orcs1_A1
	.ualong __orcs1_A2
	.ualong __orcs1_A3
	.ualong __orcs2_A0
	.ualong __orcs2_A1
	.ualong __orcs2_A2
	.ualong __orcs2_A3
__OrcsInitProcsB:
	.ualong __orcsInit_0B
	.ualong __orcsInit_1B
	.ualong __orcsInit_2B
	.ualong __orcsInit_3B
__OrcsMainProcsB:
	.ualong __orcsmains_0B
	.ualong __orcsmains_1B
	.ualong __orcsmains_2B
	.ualong __orcsmains_3B
__OrcsEndProcsB:
	.ualong __orcsEnd_0B
	.ualong __orcsEnd_1B
	.ualong __orcsEnd_2B
	.ualong __orcsEnd_3B
__OrcsInitProcsF:
	.ualong __orcsInit_0F
	.ualong __orcsInit_3F
	.ualong __orcsInit_2F
	.ualong __orcsInit_1F
__OrcsMainProcsF:
	.ualong __orcsmains_0F
	.ualong __orcsmains_1F
	.ualong __orcsmains_2F
	.ualong __orcsmains_3F
__OrcsEndProcsF:
	.ualong __orcsEnd_0F
	.ualong __orcsEnd_1F
	.ualong __orcsEnd_2F
	.ualong __orcsEnd_3F
//
	.globl __b8opentrytable
__b8opentrytable:
__B8opsShortTable:
	.ualong __b8ops1_A0
	.ualong __b8ops1_A1
	.ualong __b8ops1_A2
	.ualong __b8ops1_A3
	.ualong __b8ops2_A0
	.ualong __b8ops2_A1
	.ualong __b8ops2_A2
	.ualong __b8ops2_A3
__B8opsInitProcsB:
	.ualong __b8opsInit_0B
	.ualong __b8opsInit_1B
	.ualong __b8opsInit_2B
	.ualong __b8opsInit_3B
__B8opsMainProcsB:
	.ualong __b8opsmains_0B
	.ualong __b8opsmains_1B
	.ualong __b8opsmains_2B
	.ualong __b8opsmains_3B
__B8opsEndProcsB:
	.ualong __b8opsEnd_0B
	.ualong __b8opsEnd_1B
	.ualong __b8opsEnd_2B
	.ualong __b8opsEnd_3B
__B8opsInitProcsF:
	.ualong __b8opsInit_0F
	.ualong __b8opsInit_3F
	.ualong __b8opsInit_2F
	.ualong __b8opsInit_1F
__B8opsMainProcsF:
	.ualong __b8opsmains_0F
	.ualong __b8opsmains_1F
	.ualong __b8opsmains_2F
	.ualong __b8opsmains_3F
__B8opsEndProcsF:
	.ualong __b8opsEnd_0F
	.ualong __b8opsEnd_1F
	.ualong __b8opsEnd_2F
	.ualong __b8opsEnd_3F
//
	.globl __andcentrytable
__andcentrytable:
__AndcsShortTable:
	.ualong __andcs1_A0
	.ualong __andcs1_A1
	.ualong __andcs1_A2
	.ualong __andcs1_A3
	.ualong __andcs2_A0
	.ualong __andcs2_A1
	.ualong __andcs2_A2
	.ualong __andcs2_A3
__AndcsInitProcsB:
	.ualong __andcsInit_0B
	.ualong __andcsInit_1B
	.ualong __andcsInit_2B
	.ualong __andcsInit_3B
__AndcsMainProcsB:
	.ualong __andcsmains_0B
	.ualong __andcsmains_1B
	.ualong __andcsmains_2B
	.ualong __andcsmains_3B
__AndcsEndProcsB:
	.ualong __andcsEnd_0B
	.ualong __andcsEnd_1B
	.ualong __andcsEnd_2B
	.ualong __andcsEnd_3B
__AndcsInitProcsF:
	.ualong __andcsInit_0F
	.ualong __andcsInit_3F
	.ualong __andcsInit_2F
	.ualong __andcsInit_1F
__AndcsMainProcsF:
	.ualong __andcsmains_0F
	.ualong __andcsmains_1F
	.ualong __andcsmains_2F
	.ualong __andcsmains_3F
__AndcsEndProcsF:
	.ualong __andcsEnd_0F
	.ualong __andcsEnd_1F
	.ualong __andcsEnd_2F
	.ualong __andcsEnd_3F
//
	.globl __norentrytable
__norentrytable:
__NorsShortTable:
	.ualong __nors1_A0
	.ualong __nors1_A1
	.ualong __nors1_A2
	.ualong __nors1_A3
	.ualong __nors2_A0
	.ualong __nors2_A1
	.ualong __nors2_A2
	.ualong __nors2_A3
__NorsInitProcsB:
	.ualong __norsInit_0B
	.ualong __norsInit_1B
	.ualong __norsInit_2B
	.ualong __norsInit_3B
__NorsMainProcsB:
	.ualong __norsmains_0B
	.ualong __norsmains_1B
	.ualong __norsmains_2B
	.ualong __norsmains_3B
__NorsEndProcsB:
	.ualong __norsEnd_0B
	.ualong __norsEnd_1B
	.ualong __norsEnd_2B
	.ualong __norsEnd_3B
__NorsInitProcsF:
	.ualong __norsInit_0F
	.ualong __norsInit_3F
	.ualong __norsInit_2F
	.ualong __norsInit_1F
__NorsMainProcsF:
	.ualong __norsmains_0F
	.ualong __norsmains_1F
	.ualong __norsmains_2F
	.ualong __norsmains_3F
__NorsEndProcsF:
	.ualong __norsEnd_0F
	.ualong __norsEnd_1F
	.ualong __norsEnd_2F
	.ualong __norsEnd_3F
//
	.globl __nsrcentrytable
__nsrcentrytable:
__NsrcsShortTable:
	.ualong __nsrcs1_A0
	.ualong __nsrcs1_A1
	.ualong __nsrcs1_A2
	.ualong __nsrcs1_A3
	.ualong __nsrcs2_A0
	.ualong __nsrcs2_A1
	.ualong __nsrcs2_A2
	.ualong __nsrcs2_A3
__NsrcsInitProcsB:
	.ualong __nsrcsInit_0B
	.ualong __nsrcsInit_1B
	.ualong __nsrcsInit_2B
	.ualong __nsrcsInit_3B
__NsrcsMainProcsB:
	.ualong __nsrcsmains_0B
	.ualong __nsrcsmains_1B
	.ualong __nsrcsmains_2B
	.ualong __nsrcsmains_3B
__NsrcsEndProcsB:
	.ualong __nsrcsEnd_0B
	.ualong __nsrcsEnd_1B
	.ualong __nsrcsEnd_2B
	.ualong __nsrcsEnd_3B
__NsrcsInitProcsF:
	.ualong __nsrcsInit_0F
	.ualong __nsrcsInit_3F
	.ualong __nsrcsInit_2F
	.ualong __nsrcsInit_1F
__NsrcsMainProcsF:
	.ualong __nsrcsmains_0F
	.ualong __nsrcsmains_1F
	.ualong __nsrcsmains_2F
	.ualong __nsrcsmains_3F
__NsrcsEndProcsF:
	.ualong __nsrcsEnd_0F
	.ualong __nsrcsEnd_1F
	.ualong __nsrcsEnd_2F
	.ualong __nsrcsEnd_3F
//
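// Annotation (added note): every entry table above has the same shape - eight
// short-case entries (two size variants x four target alignments A0~A3),
// then init/main/end routine addresses in a "B" and an "F" variant
// (presumably backward and forward processing directions). The *InitProcsF
// lists are deliberately ordered 0,3,2,1 rather than 0,1,2,3; the dispatch
// code that indexes them relies on that ordering.
//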
	.text
//
//*************************************************************************************************
	NESTED_ENTRY(RectFill, MINSTACKSIZE, 1, 0)
//
// Input Parameters:
//	r3: The pointer to the parameter structure as follows.
//	PARAM1 [00] : Target address
//	PARAM2 [04] : Number of bytes to fill per line
//	PARAM3 [08] : Number of lines to fill
//	PARAM4 [12] : Target line increment bytes per line
//	PARAM5 [16] : First word of the dword solid brush to use (duplicated brush)
//	PARAM6 [20] : Second word of the dword solid brush to use (same as the first word)
//	PARAM7 [24] : Maximum number of cache lines to flush
//	PARAM8 [28] : Maximum number of display lines to flush
//	PARAM9 [32] : Operation control flag
//	    bit 1 (TFLUSHBIT): Target flush flag 0:No Flush, 1:Flush
//	    bit 2 (TTOUCHBIT): Target touch using "dcbz" 0:No Touch, 1:Touch
//	PARAM10 [36] ~ PARAM15 [56] : Register save areas 1~6
//
// Register usage:
//	r4: Solid word brush to be used for the fill operation
//	r5: Number of bytes to fill per line -> innermost loop counter
//	r6: Remaining number of lines to fill
//	r7: Gap between the last byte of the previous line and the top byte of the next line
//	r8: Before-loop fill routine address
//	r9: Updating target address
//	r10: Work register
//	r11: Main-loop fill routine address
//	r12: After-loop fill routine address
//	r31: Work register to save r3 when calling RectFillS (saved by the NESTED_ENTRY macro)
//	CTR: Used for loop counting and linking
//	f1: Solid dword brush to be used for the fill operation
//
// Restrictions:
//	If the pixel width is 2 bytes, the target address has to be half-word aligned.
//	If the pixel width is 4 bytes, the target address has to be word aligned.
//	The number of bytes to fill per line must be a multiple of the pixel width in bytes.
//	The fill width is assumed to be equal to or shorter than the target delta.
//	If the target memory is not cachable, TFLUSHBIT and TTOUCHBIT have to be
//	set to 0 - otherwise an exception occurs.
//	The target line increment bytes have to be a multiple of 4.
//	If the increment is a multiple of 32 (the cache line width), RectFill is used;
//	if it's not, RectFillS is used.
//
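// Annotation (added note): the fill is driven by computed jumps rather than a
// conventional loop. The init routine (r8) runs first via "blrl" and falls
// through to the main routine (r11, reached through CTR), which runs the
// 32-byte innermost loop and jumps to the end routine (r12). The end routine
// advances r9 to the next line and jumps back to the init routine while lines
// remain, or does "blr" back to the "blrl" call site when r6 reaches zero.
//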
	PROLOGUE_END(RectFill)
//
	lwz r6,PARAM3(r3)	// r6 <- number of lines to fill
	and. r6,r6,r6	// Any lines to fill?
	beq- fill_exit	// No -> exit
	lwz r9,PARAM1(r3)	// r9 <- target address
	lwz r5,PARAM2(r3)	// r5 <- bytes to fill per line
	lwz r7,PARAM4(r3)	// r7 <- byte distance between lines
	lwz r4,PARAM5(r3)	// r4 <- GPR brush
	cmplwi r5,MINLENGTH_FILL	// Is it wide enough to do in this routine?
	blt- fill_00	// No -> use RectFillS
#if (! FULLCACHE)
	lwz r10,PARAM9(r3)	// r10 <- cache control bits
	andis. r10,r10,TTOUCHBIT	// Can touch the target cache?
	beq- fill_01	// No -> use RectFillS
#endif
	andi. r10,r7,0x1f	// Target delta is a multiple of 32?
	beq fill_05	// Yes -> go ahead, otherwise use RectFillS
//
fill_00:
	and. r5,r5,r5	// Width zero?
	beq fill_exit	// Yes -> just exit
fill_01:
	mr r31,r3	// Save r3
	mr r3,r9	// r3 <- target address
	bl ..RectFillS	// and call RectFillS
	mr r3,r31	// Restore r3
	b fill_10	// and jump to flush the cache
//
fill_05:
	subf r7,r5,r7	// r7 <- gap between the last byte of the previous line and the top byte of the next line
	lfd f1,PARAM5(r3)	// f1 <- FPR brush
	bl fill_06
__InitFillProc:	// Procedures to handle the initial 8-byte alignment adjustment
	.ualong __fillinit_0
	.ualong __fillinit_7
	.ualong __fillinit_6
	.ualong __fillinit_5
	.ualong __fillinit_4
	.ualong __fillinit_3
	.ualong __fillinit_2
	.ualong __fillinit_1
__MainFillProc:	// Procedures to handle the main loop (plus initial 32-byte alignment from dword alignment)
	.ualong __fillmain_0_0
	.ualong __fillmain_0_1
	.ualong __fillmain_3_0
	.ualong __fillmain_3_1
	.ualong __fillmain_2_0
	.ualong __fillmain_2_1
	.ualong __fillmain_1_0
	.ualong __fillmain_1_1
__EndFillProc:	// Procedures to handle the up-to-31-byte fill at the end of each line
	.ualong __fillend_0
	.ualong __fillend_1
	.ualong __fillend_2
	.ualong __fillend_3
	.ualong __fillend_4
	.ualong __fillend_5
	.ualong __fillend_6
	.ualong __fillend_7
	.ualong __fillend_8
	.ualong __fillend_9
	.ualong __fillend_10
	.ualong __fillend_11
	.ualong __fillend_12
	.ualong __fillend_13
	.ualong __fillend_14
	.ualong __fillend_15
	.ualong __fillend_16
	.ualong __fillend_17
	.ualong __fillend_18
	.ualong __fillend_19
	.ualong __fillend_20
	.ualong __fillend_21
	.ualong __fillend_22
	.ualong __fillend_23
	.ualong __fillend_24
	.ualong __fillend_25
	.ualong __fillend_26
	.ualong __fillend_27
	.ualong __fillend_28
	.ualong __fillend_29
	.ualong __fillend_30
	.ualong __fillend_31
fill_06:
	mflr r10
	rlwinm. r8,r9,2,27,29	// r8 <- table index for the init loop
	beq fill_06x	// index zero -> r8 will be set to r11 later
	lwzx r8,r10,r8	// r8 <- init routine address
fill_06x:
	andi. r12,r9,0x07
	beq fill_07
	subfic r12,r12,8	// r12 <- byte length filled by the init routine
fill_07:
	add r11,r9,r12	// r11 <- target address after the initial fill
	andi. r11,r11,0x18	// r11 (bits 27&28) = 00:0, 01:24, 10:16, 11:8 bytes to fill to reach 32-byte alignment
#if (USE_DCBZ && CLEAR_BY_DCBZ)
	and. r4,r4,r4	// Filling zero?
	beq fill_08	// Yes -> use r11 as an index as is
#endif
	ori r11,r11,0x04	// No -> set bit 29 of r11 to index the non-zero fill routine
fill_08:
	addi r10,r10,__MainFillProc-__InitFillProc
	lwzx r11,r10,r11	// r11 <- main fill routine address
	andi. r12,r9,0x1f	// misalignment from 32-byte alignment
	beq fill_09
	subfic r12,r12,32	// r12 <- number of bytes to be filled before the innermost loop
fill_09:
	subf r12,r12,r5	// r12 <- number of bytes to be filled in the innermost loop and end routine
	srawi. r5,r12,5	// r5 <- innermost loop counter
	rlwinm r12,r12,2,25,29	// r12 <- end routine table index
	addi r10,r10,__EndFillProc-__MainFillProc
	lwzx r12,r10,r12	// r12 <- end routine address
//
	and. r8,r8,r8	// No initial routine?
	bne fill_09x
	mr r8,r11	// -> skip the initial routine
fill_09x:
	mtlr r8
	blrl	// Call the init proc --> it will chain to the main routine -> end routine and loop over all lines
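//
// Annotation (added note): worked example of the dispatch above - for a target
// address ending in 0x13 (addr & 7 = 3, addr & 0x1f = 0x13): the init index is
// (3 << 2), selecting __fillinit_5 from the table order above (0,7,6,5,...),
// which stores 5 bytes to reach 8-byte alignment; the address is then ...0x18,
// so r11 = 0x18 selects a main entry that stores one more 8-byte dword before
// the 32-byte loop; the leftover (width - 13) bytes split into 32-byte loop
// iterations plus a __fillend_N tail, where N = (width - 13) & 0x1f.
//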
//
fill_10:
#if (! FULLCACHE)
	bl ..flush_cache	// Flush the cache
#endif
fill_exit:
	NESTED_EXIT(RectFill, MINSTACKSIZE, 1, 0)
//
//*************************************************************************************************
	SPECIAL_ENTRY(RectFillS)
//
// Input Parameters:
//	r3: Target address
//	r4: Solid brush to be used for the fill operation (duplicated)
//	r5: Number of bytes --> inner loop count
//	r6: Number of lines
//	r7: Target line increment bytes per line
//
// Register usage:
//
//	r0: Saved return address
//	r8: Init subroutine address
//	r9: Target address to use
//	r10: Work register
//	r11: Main routine address
//	r12: Ending subroutine address
//	CTR: Used for loop counting and linking
//
// Restrictions:
//	If the pixel width is 2 bytes, the target address has to be half-word aligned.
//	If the pixel width is 4 bytes, the target address has to be word aligned.
//	The number of bytes must be a multiple of the pixel width in bytes.
//	The fill width is assumed to be equal to or shorter than the target delta.
//	The target line increment bytes have to be a multiple of 4.
//
	mflr r0	// Save the return address in r0
//
	PROLOGUE_END(RectFillS)
//
	and. r6,r6,r6	// Any lines to fill?
	beq fills_exit	// No -> exit
	mr r9,r3	// r9 <- target address to use
	cmplwi r5,8	// More than 8 bytes?
	bgt fills_40	// Yes -> do the normal fill
	and. r5,r5,r5	// Width zero?
	beq fills_exit	// Yes -> just exit
	bl fills_10
__ShortFillProcS:
	.ualong __fillshort_1
	.ualong __fillshort_1
	.ualong __fillshort_1
	.ualong __fillshort_1
	.ualong __fillshort_2_0
	.ualong __fillshort_2_1
	.ualong __fillshort_2_2
	.ualong __fillshort_2_3
	.ualong __fillshort_3_0
	.ualong __fillshort_3_1
	.ualong __fillshort_3_2
	.ualong __fillshort_3_3
	.ualong __fillshort_4_0
	.ualong __fillshort_4_1
	.ualong __fillshort_4_2
	.ualong __fillshort_4_3
	.ualong __fillshort_5_0
	.ualong __fillshort_5_1
	.ualong __fillshort_5_2
	.ualong __fillshort_5_3
	.ualong __fillshort_6_0
	.ualong __fillshort_6_1
	.ualong __fillshort_6_2
	.ualong __fillshort_6_3
	.ualong __fillshort_7_0
	.ualong __fillshort_7_1
	.ualong __fillshort_7_2
	.ualong __fillshort_7_3
	.ualong __fillshort_8_0
	.ualong __fillshort_8_1
	.ualong __fillshort_8_2
	.ualong __fillshort_8_3
//
// Short fill <= 8 bytes
//
fills_10:
	mflr r10	// r10 <- ShortFillProcS table address
	addi r8,r5,-1	// r8 <- width - 1 (0~7)
	rlwinm r8,r8,4,25,27	// bits 25~27 of r8 <- width - 1 (0~7)
	rlwimi r8,r9,2,28,29	// bits 28~29 of r8 <- target address mod 4
	lwzx r8,r10,r8	// r8 <- subroutine to call
	mtlr r8
	mtctr r6	// CTR <- number of lines to fill
	blrl	// Call the short fill subroutine
	b fills_90
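//
// Annotation (added note): the short-table index packs (width - 1) into bits
// 25~27 and (target address mod 4) into bits 28~29, i.e. a word offset of
// ((width - 1) * 4 + (addr & 3)) * 4 into __ShortFillProcS. For example,
// width 5 at an address with addr & 3 == 2 selects __fillshort_5_2, which
// stores a halfword, a halfword, and a byte to cover 5 bytes per line.
//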
//
// width > 8 -- normal process
//
fills_40:
	subf r7,r5,r7	// r7 <- gap between the last byte of the previous line and the top byte of the next line
	bl fills_50
__InitFillProcS:
	.ualong __fillinit_0
	.ualong __fillinit_3
	.ualong __fillinit_2
	.ualong __fillinit_1
__MainFillProcS:
	.ualong __fillmainS
__EndFillProcS:
	.ualong __fillend_0
	.ualong __fillend_1
	.ualong __fillend_2
	.ualong __fillend_3
fills_50:
	mflr r10	// r10 <- InitProcS table address
	rlwinm. r8,r9,2,28,29	// r8 <- table index for the init loop
	beq fills_50x	// No initial routine -> r8 is set later
	lwzx r8,r10,r8	// r8 <- init routine address
fills_50x:
	andi. r12,r9,0x3
	beq fills_55
	subfic r12,r12,4	// r12 <- number of initially filled bytes
fills_55:
	subf r12,r12,r5	// r12 <- number of bytes to fill after the initial routine
	srawi. r5,r12,2	// r5 <- inner loop count
	rlwinm r12,r12,2,28,29	// r12 <- 2-bit-shifted number of remaining bytes to fill after the main loop
	addi r10,r10,__MainFillProcS-__InitFillProcS
	lwz r11,0(r10)	// r11 <- main routine address
	addi r10,r10,__EndFillProcS-__MainFillProcS
	lwzx r12,r10,r12	// r12 <- end routine address
	and. r8,r8,r8	// No initial routine?
	bne fills_55x
	mr r8,r11	// -> skip the initial routine
fills_55x:
//
	mtlr r8
	blrl	// Call the init proc --> it will chain to the main routine -> end routine and loop over all lines
//
fills_90:
	mtlr r0	// Restore the return address
fills_exit:
	SPECIAL_EXIT(RectFillS)
//
	LEAF_ENTRY(FillProcs)
//
// fill short routines
//
__fillshort_1:
	stb r4,0(r9)
	add r9,r9,r7
	bdnz __fillshort_1
	blr
__fillshort_2_0:
__fillshort_2_2:
	sth r4,0(r9)
	add r9,r9,r7
	bdnz __fillshort_2_2
	blr
__fillshort_2_1:
__fillshort_2_3:
	stb r4,0(r9)
	stb r4,1(r9)
	add r9,r9,r7
	bdnz __fillshort_2_3
	blr
__fillshort_3_0:
__fillshort_3_2:
	sth r4,0(r9)
	stb r4,2(r9)
	add r9,r9,r7
	bdnz __fillshort_3_2
	blr
__fillshort_3_1:
__fillshort_3_3:
	stb r4,0(r9)
	sth r4,1(r9)
	add r9,r9,r7
	bdnz __fillshort_3_3
	blr
__fillshort_4_0:
	stw r4,0(r9)
	add r9,r9,r7
	bdnz __fillshort_4_0
	blr
__fillshort_4_1:
__fillshort_4_3:
	stb r4,0(r9)
	sth r4,1(r9)
	stb r4,3(r9)
	add r9,r9,r7
	bdnz __fillshort_4_3
	blr
__fillshort_4_2:
	sth r4,0(r9)
	sth r4,2(r9)
	add r9,r9,r7
	bdnz __fillshort_4_2
	blr
__fillshort_5_0:
	stw r4,0(r9)
	stb r4,4(r9)
	add r9,r9,r7
	bdnz __fillshort_5_0
	blr
__fillshort_5_1:
	stb r4,0(r9)
	sth r4,1(r9)
	sth r4,3(r9)
	add r9,r9,r7
	bdnz __fillshort_5_1
	blr
__fillshort_5_2:
	sth r4,0(r9)
	sth r4,2(r9)
	stb r4,4(r9)
	add r9,r9,r7
	bdnz __fillshort_5_2
	blr
__fillshort_5_3:
	stb r4,0(r9)
	stw r4,1(r9)
	add r9,r9,r7
	bdnz __fillshort_5_3
	blr
__fillshort_6_0:
	stw r4,0(r9)
	sth r4,4(r9)
	add r9,r9,r7
	bdnz __fillshort_6_0
	blr
__fillshort_6_1:
	stb r4,0(r9)
	sth r4,1(r9)
	sth r4,3(r9)
	stb r4,5(r9)
	add r9,r9,r7
	bdnz __fillshort_6_1
	blr
__fillshort_6_2:
	sth r4,0(r9)
	stw r4,2(r9)
	add r9,r9,r7
	bdnz __fillshort_6_2
	blr
__fillshort_6_3:
	stb r4,0(r9)
	stw r4,1(r9)
	stb r4,5(r9)
	add r9,r9,r7
	bdnz __fillshort_6_3
	blr
__fillshort_7_0:
	stw r4,0(r9)
	sth r4,4(r9)
	stb r4,6(r9)
	add r9,r9,r7
	bdnz __fillshort_7_0
	blr
__fillshort_7_1:
	stb r4,0(r9)
	sth r4,1(r9)
	stw r4,3(r9)
	add r9,r9,r7
	bdnz __fillshort_7_1
	blr
__fillshort_7_2:
	sth r4,0(r9)
	stw r4,2(r9)
	stb r4,6(r9)
	add r9,r9,r7
	bdnz __fillshort_7_2
	blr
__fillshort_7_3:
	stb r4,0(r9)
	stw r4,1(r9)
	sth r4,5(r9)
	add r9,r9,r7
	bdnz __fillshort_7_3
	blr
__fillshort_8_0:
	stw r4,0(r9)
	stw r4,4(r9)
	add r9,r9,r7
	bdnz __fillshort_8_0
	blr
__fillshort_8_1:
	stb r4,0(r9)
	sth r4,1(r9)
	stw r4,3(r9)
	stb r4,7(r9)
	add r9,r9,r7
	bdnz __fillshort_8_1
	blr
__fillshort_8_2:
	sth r4,0(r9)
	stw r4,2(r9)
	sth r4,6(r9)
	add r9,r9,r7
	bdnz __fillshort_8_2
	blr
__fillshort_8_3:
	stb r4,0(r9)
	stw r4,1(r9)
	sth r4,5(r9)
	stb r4,7(r9)
	add r9,r9,r7
	bdnz __fillshort_8_3
	blr
//
// Fill routines
//
__fillinit_0:
	mtctr r11	// Main loop address
	bctr	// Jump to the main loop
__fillinit_1:
	mtctr r11	// Main loop address
	stb r4,0(r9)
	addi r9,r9,1
	bctr	// Jump to the main loop
__fillinit_2:
	mtctr r11	// Main loop address
	sth r4,0(r9)
	addi r9,r9,2
	bctr	// Jump to the main loop
__fillinit_3:
	mtctr r11	// Main loop address
	stb r4,0(r9)
	sth r4,1(r9)
	addi r9,r9,3
	bctr	// Jump to the main loop
__fillinit_4:
	mtctr r11	// Main loop address
	stw r4,0(r9)
	addi r9,r9,4
	bctr	// Jump to the main loop
__fillinit_5:
	mtctr r11	// Main loop address
	stb r4,0(r9)
	stw r4,1(r9)
	addi r9,r9,5
	bctr	// Jump to the main loop
__fillinit_6:
	mtctr r11	// Main loop address
	sth r4,0(r9)
	stw r4,2(r9)
	addi r9,r9,6
	bctr	// Jump to the main loop
__fillinit_7:
	mtctr r11	// Main loop address
	stb r4,0(r9)
	sth r4,1(r9)
	stw r4,3(r9)
	addi r9,r9,7
	bctr	// Jump to the main loop
//
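// Annotation (added note): __fillmain_X_Y naming - X is how many extra 8-byte
// dword stores run before the 32-byte innermost loop (to finish 32-byte
// alignment), and Y selects the loop body: _0 entries fall into the
// "dcbz"-only zero-fill loop, _1 entries into the general loop that stores
// the brush. The dispatch code ORs in 0x04 to pick the _1 column whenever the
// brush is non-zero (or always, while CLEAR_BY_DCBZ is disabled as it is here).
//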
__fillmain_3_0:
	stfd f1,0(r9)
	addi r9,r9,8
__fillmain_2_0:
	stfd f1,0(r9)
	addi r9,r9,8
__fillmain_1_0:
	stfd f1,0(r9)
	addi r9,r9,8
__fillmain_0_0:
	mtctr r5	// Use CTR as a counter for the 32-byte units to fill
__fillmain00:
	dcbz 0,r9	// Filling zero -> just "dcbz" is enough
	addi r9,r9,32	// Increment the target pointer
	bdnz __fillmain00
	mtctr r12	// End proc address
	bctr	// Jump to the end proc
__fillmain_3_1:
	stfd f1,0(r9)
	addi r9,r9,8
__fillmain_2_1:
	stfd f1,0(r9)
	addi r9,r9,8
__fillmain_1_1:
	stfd f1,0(r9)
	addi r9,r9,8
__fillmain_0_1:
	mtctr r5	// Use CTR as a counter for the 32-byte units to fill
__fillmainNZ:
#if USE_DCBZ
	dcbz 0,r9	// Clear the cache line
#endif
	stfd f1,0(r9)	// Fill 32 bytes of data
	stfd f1,8(r9)
	stfd f1,16(r9)
	stfd f1,24(r9)
	addi r9,r9,32	// Increment the target pointer
	bdnz __fillmainNZ
	mtctr r12	// End proc address
	bctr	// Jump to the end proc
//
__fillend_31:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_23:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_15:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_7:
	mtctr r8	// Initial routine address
	addic. r6,r6,-1	// Decrement the line counter
	stw r4,0(r9)
	sth r4,4(r9)
	stb r4,6(r9)
	addi r9,r9,7
	add r9,r9,r7	// Update the target address to point to the top byte of the next line
	bnectr	// Jump to the initial fill routine if not all lines are done
	blr	// Return to the original calling point
__fillend_30:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_22:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_14:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_6:
	mtctr r8	// Initial routine address
	addic. r6,r6,-1	// Decrement the line counter
	stw r4,0(r9)
	sth r4,4(r9)
	addi r9,r9,6
	add r9,r9,r7	// Update the target address to point to the top byte of the next line
	bnectr	// Jump to the initial fill routine if not all lines are done
	blr	// Return to the original calling point
__fillend_29:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_21:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_13:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_5:
	mtctr r8	// Initial routine address
	addic. r6,r6,-1	// Decrement the line counter
	stw r4,0(r9)
	stb r4,4(r9)
	addi r9,r9,5
	add r9,r9,r7	// Update the target address to point to the top byte of the next line
	bnectr	// Jump to the initial fill routine if not all lines are done
	blr	// Return to the original calling point
__fillend_28:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_20:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_12:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_4:
	mtctr r8	// Initial routine address
	addic. r6,r6,-1	// Decrement the line counter
	stw r4,0(r9)
	addi r9,r9,4
	add r9,r9,r7	// Update the target address to point to the top byte of the next line
	bnectr	// Jump to the initial fill routine if not all lines are done
	blr	// Return to the original calling point
__fillend_27:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_19:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_11:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_3:
	mtctr r8	// Initial routine address
	addic. r6,r6,-1	// Decrement the line counter
	sth r4,0(r9)
	stb r4,2(r9)
	addi r9,r9,3
	add r9,r9,r7	// Update the target address to point to the top byte of the next line
	bnectr	// Jump to the initial fill routine if not all lines are done
	blr	// Return to the original calling point
__fillend_26:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_18:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_10:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_2:
	mtctr r8	// Initial routine address
	addic. r6,r6,-1	// Decrement the line counter
	sth r4,0(r9)
	addi r9,r9,2
	add r9,r9,r7	// Update the target address to point to the top byte of the next line
	bnectr	// Jump to the initial fill routine if not all lines are done
	blr	// Return to the original calling point
__fillend_25:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_17:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_9:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_1:
	mtctr r8	// Initial routine address
	addic. r6,r6,-1	// Decrement the line counter
	stb r4,0(r9)
	addi r9,r9,1
	add r9,r9,r7	// Update the target address to point to the top byte of the next line
	bnectr	// Jump to the initial fill routine if not all lines are done
	blr	// Return to the original calling point
__fillend_24:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_16:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_8:
	stfd f1,0(r9)
	addi r9,r9,8
__fillend_0:
	mtctr r8	// Initial routine address
	addic. r6,r6,-1	// Decrement the line counter
	add r9,r9,r7	// Update the target address to point to the top byte of the next line
	bnectr	// Jump to the initial fill routine if not all lines are done
	blr	// Return to the original calling point
//
__fillmainS:
	mtctr r5	// No need to check r5 for zero, because width > 8 (so r5 >= 1)
__fillmainS_00:
	stw r4,0(r9)	// Innermost loop -> fill word by word
	addi r9,r9,4
	bdnz __fillmainS_00
	mtctr r12	// End proc address
	bctr	// Jump to the end proc
//
// End of fill routines
//
	LEAF_EXIT(FillProcs)
//
#if (! FULLCACHE)
//
	LEAF_ENTRY(flush_cache)
//
// Register usage for flushing the cache (* indicates input parameters)
//
//	*r3: The pointer to the parameter structure (same as above)
//	r4: Maximum number of cache lines to flush
//	r5: Number of bytes to fill per line
//	r6: Number of target lines
//	r7: Delta bytes per line
//	r8: Starting cache line address
//	*r9: Ending cache line address (pointing to the first byte of the next line on entry)
//	r10: Updating cache line address
//	r11: Number of cache entries to flush per line
//
	lwz r11,PARAM9(r3)	// r11 <- cache control flag
	andis. r11,r11,TFLUSHBIT	// Need to flush the target cache?
	beq- flush_exit	// No -> exit
	lwz r5,PARAM2(r3)	// r5 <- bytes to fill per line
	lwz r4,PARAM7(r3)	// r4 <- maximum number of cache lines to flush
	lwz r7,PARAM4(r3)	// r7 <- target line increment
	lwz r6,PARAM8(r3)	// r6 <- maximum number of display lines to flush
	lwz r8,PARAM3(r3)	// r8 <- number of target lines
	cmplw r8,r6	// Compare the two
	bge flush_05	// and take whichever is
	mr r6,r8	// smaller
flush_05:
	subf r8,r7,r9	// r8 <- pointing to the first byte in the last line
	add r9,r8,r5	// r9 <- pointing to one byte after the last filled byte
	rlwinm r8,r8,0,0,26	// r8 <- 32-byte aligned start address
	addi r9,r9,-1	// r9 <- pointing to the last byte stored in the last line
	rlwinm r9,r9,0,0,26	// r9 <- 32-byte aligned end address
	subf r11,r8,r9	// r11 <- end - start
	srawi r11,r11,5
	addi r11,r11,1	// r11 <- number of cache entries to flush per line
flush_10:
	mr r10,r9	// r10 <- address at which to start flushing
flush_20:
	dcbf 0,r10	// Flush cached data
	addi r10,r10,-32	// Decrement the address to flush
	cmplw r10,r8	// Reached below the start address?
	bge flush_20	// No -> loop to flush the previous cache line
	subf. r4,r11,r4	// Flushed enough entries?
	blt- flush_exit	// Yes -> exit
	addic. r6,r6,-1	// Flushed all lines?
	subf r8,r7,r8	// Update the start
	subf r9,r7,r9	// and end addresses to point to the previous line
	bne flush_10	// No -> continue flushing
flush_exit:
	LEAF_EXIT(flush_cache)
#endif // (! FULLCACHE)
//
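// Annotation (added note): the flush walks backward - for each display line it
// flushes from the aligned address of the last stored byte down to the aligned
// address of the first stored byte, 32 bytes at a time, then steps r8/r9 back
// one line. For example, 100 bytes stored starting 8 bytes into a 32-byte
// aligned line give start = line base and end = base + 0x60 (byte 107 aligned
// down), so r11 = 4 cache entries are flushed per line. It stops early once
// the PARAM7 cache-line budget is exhausted.
//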
//*************************************************************************************************
	SPECIAL_ENTRY(RectOp)
//
// Input Parameters:
//	r3: The pointer to the parameter structure as follows.
//	PARAM1 [00] : Target address
//	PARAM2 [04] : Number of bytes to operate on per line
//	PARAM3 [08] : Number of lines to operate on
//	PARAM4 [12] : Target line increment bytes per line
//	PARAM5 [16] : Dword solid brush to use (duplicated brush)
//	PARAM6 [20] : [reserved]
//	PARAM7 [24] : Maximum number of cache lines to flush
//	PARAM8 [28] : Maximum number of display lines to flush
//	PARAM9 [32] : Operation control flag
//	    bits 16 ~ 23: Operation
//	        bit 23 (OPXOR) : XOR brush & target
//	        Currently, only XOR is supported
//	    bit 1 (TFLUSHBIT): Target flush flag 0:No Flush, 1:Flush
//	PARAM10 [36] ~ PARAM15 [56] : Register save areas 1~6
//
// Register usage:
//	r4: Solid word brush to be used for the operation
//	r5: Number of bytes to operate on per line -> innermost loop counter
//	r6: Remaining number of lines to operate on
//	r7: Gap between the last byte of the previous line and the top byte of the next line
//	r8: Operation control flag -> before-loop operation routine address
//	r9: Updating target address
//	r10: Work register
//	r11: Main operation routine address
//	r12: After-loop operation routine address
//	r14 ~ r20: Work registers
//	r31: Register to save LR
//	CTR: Used for loop counting and linking
//
// Restrictions:
//	If the pixel width is 2 bytes, the target address has to be half-word aligned.
//	If the pixel width is 4 bytes, the target address has to be word aligned.
//	The number of bytes must be a multiple of the pixel width in bytes.
//	The fill width is assumed to be equal to or shorter than the target delta.
//	The target line increment bytes have to be a multiple of 4.
//	This routine tries to utilize 32-byte alignment between lines, but it
//	doesn't have to, because "dcbz" is not needed in this routine.
//
	stw r31,SLACK1(sp)
	mflr r31
//
// Save non-volatile registers
//
	stw r14,SLACK2(sp)
	stw r15,SLACK3(sp)
	stw r16,SLACK4(sp)
	stw r17,SLACK5(sp)
	stw r18,SLACK6(sp)
	stw r19,SLACK7(sp)
	stw r20,SLACK8(sp)
//
	PROLOGUE_END(RectOp)
//
	lwz r6,PARAM3(r3)	// r6 <- number of lines to operate on
	and. r6,r6,r6	// Any lines to operate on?
	beq- op_exit	// No -> exit
	lwz r9,PARAM1(r3)	// r9 <- target address
	lwz r5,PARAM2(r3)	// r5 <- bytes to operate on per line
	lwz r7,PARAM4(r3)	// r7 <- byte distance between lines
	lwz r4,PARAM5(r3)	// r4 <- solid brush
	lwz r8,PARAM9(r3)	// r8 <- operation control flag
	cmplwi r5,MINLENGTH_OP	// Is it wide enough to do in this routine?
	bge op_05	// Yes -> go ahead
//
	and. r5,r5,r5	// Width zero?
	beq op_exit	// Yes -> just exit
	mr r14,r3	// Save r3
	mr r3,r9	// r3 <- target address
	bl ..RectOpS	// and call RectOpS
	mr r3,r14	// Restore r3
	b op_10	// and jump to flush the cache
//
op_05:
	subf r7,r5,r7	// r7 <- gap between the last byte of the previous line and the top byte of the next line
	bl op_06
__InitXorProc:	// Procedures to handle the initial 8-byte alignment adjustment
	.ualong __xorinit_0
	.ualong __xorinit_7
	.ualong __xorinit_6
	.ualong __xorinit_5
	.ualong __xorinit_4
	.ualong __xorinit_3
	.ualong __xorinit_2
	.ualong __xorinit_1
__MainXorProc:	// Procedures to handle the main loop (plus initial 32-byte alignment from dword alignment)
	.ualong __xormain_0
	.ualong __xormain_3
	.ualong __xormain_2
	.ualong __xormain_1
__EndXorProc:	// Procedures to handle the up-to-31-byte operation at the end of each line
	.ualong __xorend_0
	.ualong __xorend_1
	.ualong __xorend_2
	.ualong __xorend_3
	.ualong __xorend_4
	.ualong __xorend_5
	.ualong __xorend_6
	.ualong __xorend_7
	.ualong __xorend_8
	.ualong __xorend_9
	.ualong __xorend_10
	.ualong __xorend_11
	.ualong __xorend_12
	.ualong __xorend_13
	.ualong __xorend_14
	.ualong __xorend_15
	.ualong __xorend_16
	.ualong __xorend_17
	.ualong __xorend_18
	.ualong __xorend_19
	.ualong __xorend_20
	.ualong __xorend_21
	.ualong __xorend_22
	.ualong __xorend_23
	.ualong __xorend_24
	.ualong __xorend_25
	.ualong __xorend_26
	.ualong __xorend_27
	.ualong __xorend_28
	.ualong __xorend_29
	.ualong __xorend_30
	.ualong __xorend_31
//
__xormain_3:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xormain_2:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xormain_1:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xormain_0:
	mtctr r5	// Use CTR as a counter for the 32-byte units to operate on
__xormain:
	lwz r10,4(r9)
	lwz r14,8(r9)
	lwz r15,12(r9)
	lwz r16,16(r9)
	lwz r17,20(r9)
	lwz r18,24(r9)
	lwz r19,28(r9)
	lwz r20,32(r9)
	xor r10,r10,r4
	xor r14,r14,r4
	xor r15,r15,r4
	xor r16,r16,r4
	xor r17,r17,r4
	xor r18,r18,r4
	xor r19,r19,r4
	xor r20,r20,r4
	stwu r10,4(r9)
	stwu r14,4(r9)
	stwu r15,4(r9)
	stwu r16,4(r9)
	stwu r17,4(r9)
	stwu r18,4(r9)
	stwu r19,4(r9)
	stwu r20,4(r9)
	bdnz __xormain
	mtctr r12	// End proc address
	bctr	// Jump to the end proc
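//
// Annotation (added note): this is why RectOp saves the non-volatile registers
// r14~r20 in its prologue - the innermost loop above XORs one full 32-byte
// cache line per iteration, using eight registers so that all eight loads are
// issued before the eight stores instead of alternating load/store on one
// register.
//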
//
op_06:
	mflr r10
//
// If we ever need to support operations other than XOR, examine the operation
// kind bits in r8 and adjust r10 here so that it points to the correct
// operation table.
//
	rlwinm r12,r9,2,27,29	// r12 <- table index for the init loop
	lwzx r8,r10,r12	// r8 <- init routine address
	andi. r12,r9,0x07
	beq op_07
	subfic r12,r12,8	// r12 <- byte length operated on by the init routine
op_07:
	add r11,r9,r12	// r11 <- target address after the initial operation
	rlwinm r11,r11,31,28,29	// r11 (bits 28&29) = 00:0, 01:24, 10:16, 11:8 bytes to operate on to reach 32-byte alignment
	addi r10,r10,__MainXorProc-__InitXorProc
	lwzx r11,r10,r11	// r11 <- main operation routine address
	andi. r12,r9,0x1f	// misalignment from 32-byte alignment
	beq op_09
	subfic r12,r12,32	// r12 <- number of bytes to be operated on before the innermost loop
op_09:
	subf r12,r12,r5	// r12 <- number of bytes to be operated on in the innermost loop and end routine
	srawi. r5,r12,5	// r5 <- innermost loop counter
	rlwinm r12,r12,2,25,29	// r12 <- end routine table index
	addi r10,r10,__EndXorProc-__MainXorProc
	lwzx r12,r10,r12	// r12 <- end routine address
//
	mtlr r8
	blrl	// Call the init proc --> it will chain to the main routine -> end routine and loop over all lines
//
op_10:
#if (! FULLCACHE)
	bl ..flush_cache	// Flush the cache
#endif
//
// Restore non-volatile registers
//
	lwz r14,SLACK2(sp)
	lwz r15,SLACK3(sp)
	lwz r16,SLACK4(sp)
	lwz r17,SLACK5(sp)
	lwz r18,SLACK6(sp)
	lwz r19,SLACK7(sp)
	lwz r20,SLACK8(sp)
	mtlr r31
	lwz r31,SLACK1(sp)
//
op_exit:
	SPECIAL_EXIT(RectOp)
//
//*************************************************************************************************
	SPECIAL_ENTRY(RectOpS)
//
// Input Parameters:
//	r3: Target address
//	r4: Solid brush to be used for the operation (duplicated)
//	r5: Number of bytes --> inner loop count
//	r6: Number of lines
//	r7: Target line increment bytes per line
//	r8: Operation --> used for the init subroutine address
//
// Register usage:
//
//	r0: Saved return address
//	r9: Target address to use
//	r10: Work register
//	r11: Main routine address
//	r12: Ending subroutine address
//
// Restrictions:
//	If the pixel width is 2 bytes, the target address has to be half-word aligned.
//	If the pixel width is 4 bytes, the target address has to be word aligned.
//	The number of bytes must be a multiple of the pixel width in bytes.
//	The fill width is assumed to be equal to or shorter than the target delta.
//	The target line increment bytes have to be a multiple of 4.
//
	mflr r0	// Save the return address
//
	PROLOGUE_END(RectOpS)
//
	and. r6,r6,r6	// Any lines to operate on?
	beq ops_exit	// No -> exit
	mr r9,r3	// r9 <- target address to use
	cmplwi r5,8	// More than 8 bytes?
	bgt ops_40	// Yes -> do the normal operation
	and. r5,r5,r5	// Width zero?
	beq ops_exit	// Yes -> just exit
	bl ops_10
__ShortXorProcS:
	.ualong __xorshort_1
	.ualong __xorshort_1
	.ualong __xorshort_1
	.ualong __xorshort_1
	.ualong __xorshort_2_0
	.ualong __xorshort_2_1
	.ualong __xorshort_2_2
	.ualong __xorshort_2_3
	.ualong __xorshort_3_0
	.ualong __xorshort_3_1
	.ualong __xorshort_3_2
	.ualong __xorshort_3_3
	.ualong __xorshort_4_0
	.ualong __xorshort_4_1
	.ualong __xorshort_4_2
	.ualong __xorshort_4_3
	.ualong __xorshort_5_0
	.ualong __xorshort_5_1
	.ualong __xorshort_5_2
	.ualong __xorshort_5_3
	.ualong __xorshort_6_0
	.ualong __xorshort_6_1
	.ualong __xorshort_6_2
	.ualong __xorshort_6_3
	.ualong __xorshort_7_0
	.ualong __xorshort_7_1
	.ualong __xorshort_7_2
	.ualong __xorshort_7_3
	.ualong __xorshort_8_0
	.ualong __xorshort_8_1
	.ualong __xorshort_8_2
	.ualong __xorshort_8_3
//
// Short operation <= 8 bytes
//
ops_10:
	mflr r10	// r10 <- ShortXorProcS table address
//
// If we ever need to support operations other than XOR, examine the operation
// kind bits in r8 and adjust r10 here so that it points to the correct
// operation table.
//
	addi r8,r5,-1	// r8 <- width - 1 (0~7)
	rlwinm r8,r8,4,25,27	// bits 25~27 of r8 <- width - 1 (0~7)
	rlwimi r8,r9,2,28,29	// bits 28~29 of r8 <- target address mod 4
	lwzx r8,r10,r8	// r8 <- subroutine to call
	mtlr r8
	mtctr r6	// CTR <- number of lines on which to perform the operation
	blrl	// Call the short operation subroutine
	b ops_90
//
// width > 8 -- normal process
//
ops_40:
	subf r7,r5,r7	// r7 <- gap between the last byte of the previous line and the top byte of the next line
	bl ops_50
__InitXorProcS:
	.ualong __xorinit_0
	.ualong __xorinit_3
	.ualong __xorinit_2
	.ualong __xorinit_1
__MainXorProcS:
	.ualong __xormainS
__EndXorProcS:
	.ualong __xorend_0
	.ualong __xorend_1
	.ualong __xorend_2
	.ualong __xorend_3
//
ops_50:
	mflr r10	// r10 <- InitProcS table address
	rlwinm r12,r9,2,28,29	// r12 <- table index for the init loop
	lwzx r8,r10,r12	// r8 <- init routine address
	andi. r12,r9,0x3
	beq ops_55
	subfic r12,r12,4	// r12 <- number of initially operated-on bytes
ops_55:
	subf r12,r12,r5	// r12 <- number of bytes to operate on after the initial routine
	srawi. r5,r12,2	// r5 <- inner loop count
	rlwinm r12,r12,2,28,29	// r12 <- 2-bit-shifted number of remaining bytes to operate on after the main loop
	addi r10,r10,__MainXorProcS-__InitXorProcS
	lwz r11,0(r10)	// r11 <- main routine address
	addi r10,r10,__EndXorProcS-__MainXorProcS
	lwzx r12,r10,r12	// r12 <- end routine address
	mtlr r8
	blrl	// Call the init proc --> it will chain to the main routine -> end routine and loop over all lines
//
ops_90:
	mtlr r0	// Restore the return address
ops_exit:
	SPECIAL_EXIT(RectOpS)
//
	LEAF_ENTRY(XorProcs)
//
// Subroutines for xor
//
__xorinit_0:
	mtctr r11	// Main loop address
	addi r9,r9,-4	// Decrement r9 to use update-form load/store
	bctr	// Jump to the main loop
__xorinit_1:
	mtctr r11	// Main loop address
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	addi r9,r9,-3	// Decrement r9 to use update-form load/store
	bctr	// Jump to the main loop
__xorinit_2:
	mtctr r11	// Main loop address
	lhz r10,0(r9)
	xor r10,r10,r4
	sth r10,0(r9)
	addi r9,r9,-2	// Decrement r9 to use update-form load/store
	bctr	// Jump to the main loop
__xorinit_3:
	mtctr r11	// Main loop address
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lhz r10,1(r9)
	xor r10,r10,r4
	sth r10,1(r9)
	addi r9,r9,-1	// Decrement r9 to use update-form load/store
	bctr	// Jump to the main loop
__xorinit_4:
	mtctr r11	// Main loop address
	lwz r10,0(r9)
	xor r10,r10,r4
	stw r10,0(r9)	// Don't increment r9, to use update-form load/store
	bctr	// Jump to the main loop
__xorinit_5:
	mtctr r11	// Main loop address
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lwz r10,1(r9)
	xor r10,r10,r4
	stw r10,1(r9)
	addi r9,r9,1	// Adjust r9 to use update-form load/store
	bctr	// Jump to the main loop
__xorinit_6:
	mtctr r11	// Main loop address
	lhz r10,0(r9)
	xor r10,r10,r4
	sth r10,0(r9)
	lwz r10,2(r9)
	xor r10,r10,r4
	stw r10,2(r9)
	addi r9,r9,2	// Adjust r9 to use update-form load/store
	bctr	// Jump to the main loop
__xorinit_7:
	mtctr r11	// Main loop address
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lhz r10,1(r9)
	xor r10,r10,r4
	sth r10,1(r9)
	lwz r10,3(r9)
	xor r10,r10,r4
	stw r10,3(r9)
	addi r9,r9,3	// Adjust r9 to use update-form load/store
	bctr	// Jump to the main loop
//
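// Annotation (added note): the XOR routines below use the update form of
// load/store ("lwz r10,4(r9)" paired with "stwu r10,4(r9)"): r9 is kept
// pointing 4 bytes *before* the next word, and each stwu both writes the word
// and advances r9. That is why the init routines above leave r9 adjusted by
// (bytes handled - 4) instead of the usual byte count.
//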
__xorend_31:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_27:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_23:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_19:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_15:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_11:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_7:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_3:
	mtctr r8	// Initial routine address
	addic. r6,r6,-1	// Decrement the line counter
	lhz r10,4(r9)
	xor r10,r10,r4
	sth r10,4(r9)
	lbz r10,6(r9)
	xor r10,r10,r4
	stb r10,6(r9)
	addi r9,r9,7
	add r9,r9,r7	// Update the target address to point to the top byte of the next line
	bnectr	// Jump to the initial routine if not all lines are done
	blr	// Return to the original calling point
__xorend_30:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_26:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_22:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_18:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_14:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_10:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_6:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_2:
	mtctr r8	// Initial routine address
	addic. r6,r6,-1	// Decrement the line counter
	lhz r10,4(r9)
	xor r10,r10,r4
	sth r10,4(r9)
	addi r9,r9,6
	add r9,r9,r7	// Update the target address to point to the top byte of the next line
	bnectr	// Jump to the initial routine if not all lines are done
	blr	// Return to the original calling point
__xorend_29:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_25:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_21:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_17:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_13:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_9:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_5:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_1:
	mtctr r8	// Initial routine address
	addic. r6,r6,-1	// Decrement the line counter
	lbz r10,4(r9)
	xor r10,r10,r4
	stb r10,4(r9)
	addi r9,r9,5
	add r9,r9,r7	// Update the target address to point to the top byte of the next line
	bnectr	// Jump to the initial routine if not all lines are done
	blr	// Return to the original calling point
__xorend_28:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_24:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_20:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_16:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_12:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_8:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_4:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
__xorend_0:
	mtctr r8	// Initial routine address
	addic. r6,r6,-1	// Decrement the line counter
	addi r9,r9,4
	add r9,r9,r7	// Update the target address to point to the top byte of the next line
	bnectr	// Jump to the initial routine if not all lines are done
	blr	// Return to the original calling point
//
__xormainS:
	mtctr r5	// No need to check r5 for zero, because width > 8 (so r5 >= 1)
__xormainS_00:
	lwz r10,4(r9)
	xor r10,r10,r4
	stwu r10,4(r9)
	bdnz __xormainS_00
	mtctr r12	// End proc address
	bctr	// Jump to the end proc
//
__xorshort_1:
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	add r9,r9,r7
	bdnz __xorshort_1
	blr
__xorshort_2_0:
__xorshort_2_2:
	lhz r10,0(r9)
	xor r10,r10,r4
	sth r10,0(r9)
	add r9,r9,r7
	bdnz __xorshort_2_2
	blr
__xorshort_2_1:
__xorshort_2_3:
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lbz r10,1(r9)
	xor r10,r10,r4
	stb r10,1(r9)
	add r9,r9,r7
	bdnz __xorshort_2_3
	blr
__xorshort_3_0:
__xorshort_3_2:
	lhz r10,0(r9)
	xor r10,r10,r4
	sth r10,0(r9)
	lbz r10,2(r9)
	xor r10,r10,r4
	stb r10,2(r9)
	add r9,r9,r7
	bdnz __xorshort_3_2
	blr
__xorshort_3_1:
__xorshort_3_3:
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lhz r10,1(r9)
	xor r10,r10,r4
	sth r10,1(r9)
	add r9,r9,r7
	bdnz __xorshort_3_3
	blr
__xorshort_4_0:
	lwz r10,0(r9)
	xor r10,r10,r4
	stw r10,0(r9)
	add r9,r9,r7
	bdnz __xorshort_4_0
	blr
__xorshort_4_1:
__xorshort_4_3:
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lhz r10,1(r9)
	xor r10,r10,r4
	sth r10,1(r9)
	lbz r10,3(r9)
	xor r10,r10,r4
	stb r10,3(r9)
	add r9,r9,r7
	bdnz __xorshort_4_3
	blr
__xorshort_4_2:
	lhz r10,0(r9)
	xor r10,r10,r4
	sth r10,0(r9)
	lhz r10,2(r9)
	xor r10,r10,r4
	sth r10,2(r9)
	add r9,r9,r7
	bdnz __xorshort_4_2
	blr
__xorshort_5_0:
	lwz r10,0(r9)
	xor r10,r10,r4
	stw r10,0(r9)
	lbz r10,4(r9)
	xor r10,r10,r4
	stb r10,4(r9)
	add r9,r9,r7
	bdnz __xorshort_5_0
	blr
__xorshort_5_1:
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lhz r10,1(r9)
	xor r10,r10,r4
	sth r10,1(r9)
	lhz r10,3(r9)
	xor r10,r10,r4
	sth r10,3(r9)
	add r9,r9,r7
	bdnz __xorshort_5_1
	blr
__xorshort_5_2:
	lhz r10,0(r9)
	xor r10,r10,r4
	sth r10,0(r9)
	lhz r10,2(r9)
	xor r10,r10,r4
	sth r10,2(r9)
	lbz r10,4(r9)
	xor r10,r10,r4
	stb r10,4(r9)
	add r9,r9,r7
	bdnz __xorshort_5_2
	blr
__xorshort_5_3:
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lwz r10,1(r9)
	xor r10,r10,r4
	stw r10,1(r9)
	add r9,r9,r7
	bdnz __xorshort_5_3
	blr
__xorshort_6_0:
	lwz r10,0(r9)
	xor r10,r10,r4
	stw r10,0(r9)
	lhz r10,4(r9)
	xor r10,r10,r4
	sth r10,4(r9)
	add r9,r9,r7
	bdnz __xorshort_6_0
	blr
__xorshort_6_1:
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lhz r10,1(r9)
	xor r10,r10,r4
	sth r10,1(r9)
	lhz r10,3(r9)
	xor r10,r10,r4
	sth r10,3(r9)
	lbz r10,5(r9)
	xor r10,r10,r4
	stb r10,5(r9)
	add r9,r9,r7
	bdnz __xorshort_6_1
	blr
__xorshort_6_2:
	lhz r10,0(r9)
	xor r10,r10,r4
	sth r10,0(r9)
	lwz r10,2(r9)
	xor r10,r10,r4
	stw r10,2(r9)
	add r9,r9,r7
	bdnz __xorshort_6_2
	blr
__xorshort_6_3:
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lwz r10,1(r9)
	xor r10,r10,r4
	stw r10,1(r9)
	lbz r10,5(r9)
	xor r10,r10,r4
	stb r10,5(r9)
	add r9,r9,r7
	bdnz __xorshort_6_3
	blr
__xorshort_7_0:
	lwz r10,0(r9)
	xor r10,r10,r4
	stw r10,0(r9)
	lhz r10,4(r9)
	xor r10,r10,r4
	sth r10,4(r9)
	lbz r10,6(r9)
	xor r10,r10,r4
	stb r10,6(r9)
	add r9,r9,r7
	bdnz __xorshort_7_0
	blr
__xorshort_7_1:
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lhz r10,1(r9)
	xor r10,r10,r4
	sth r10,1(r9)
	lwz r10,3(r9)
	xor r10,r10,r4
	stw r10,3(r9)
	add r9,r9,r7
	bdnz __xorshort_7_1
	blr
__xorshort_7_2:
	lhz r10,0(r9)
	xor r10,r10,r4
	sth r10,0(r9)
	lwz r10,2(r9)
	xor r10,r10,r4
	stw r10,2(r9)
	lbz r10,6(r9)
	xor r10,r10,r4
	stb r10,6(r9)
	add r9,r9,r7
	bdnz __xorshort_7_2
	blr
__xorshort_7_3:
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lwz r10,1(r9)
	xor r10,r10,r4
	stw r10,1(r9)
	lhz r10,5(r9)
	xor r10,r10,r4
	sth r10,5(r9)
	add r9,r9,r7
	bdnz __xorshort_7_3
	blr
__xorshort_8_0:
	lwz r10,0(r9)
	xor r10,r10,r4
	stw r10,0(r9)
	lwz r10,4(r9)
	xor r10,r10,r4
	stw r10,4(r9)
	add r9,r9,r7
	bdnz __xorshort_8_0
	blr
__xorshort_8_1:
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lhz r10,1(r9)
	xor r10,r10,r4
	sth r10,1(r9)
	lwz r10,3(r9)
	xor r10,r10,r4
	stw r10,3(r9)
	lbz r10,7(r9)
	xor r10,r10,r4
	stb r10,7(r9)
	add r9,r9,r7
	bdnz __xorshort_8_1
	blr
__xorshort_8_2:
	lhz r10,0(r9)
	xor r10,r10,r4
	sth r10,0(r9)
	lwz r10,2(r9)
	xor r10,r10,r4
	stw r10,2(r9)
	lhz r10,6(r9)
	xor r10,r10,r4
	sth r10,6(r9)
	add r9,r9,r7
	bdnz __xorshort_8_2
	blr
__xorshort_8_3:
	lbz r10,0(r9)
	xor r10,r10,r4
	stb r10,0(r9)
	lwz r10,1(r9)
	xor r10,r10,r4
	stw r10,1(r9)
	lhz r10,5(r9)
	xor r10,r10,r4
	sth r10,5(r9)
	lbz r10,7(r9)
	xor r10,r10,r4
	stb r10,7(r9)
	add r9,r9,r7
	bdnz __xorshort_8_3
	blr
//
	LEAF_EXIT(XorProcs)
//
|
|
//
|
|
//*************************************************************************************************
|
|
SPECIAL_ENTRY(RectCopy)
//
// Input Parameters:
// r3: The pointer to the parameter structure as follows.
// PARAM1 [00] : Target address
// PARAM2 [04] : Source address
// PARAM3 [08] : Number of bytes to copy per line
// PARAM4 [12] : Number of lines to copy
// PARAM5 [16] : Target line increment in bytes per line
// PARAM6 [20] : Source line increment in bytes per line
// PARAM7 [24] : Maximum number of cache lines to flush
// PARAM8 [28] : Maximum number of display lines to flush
// PARAM9 [32] : Operation control flag
// bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
// bit 2 (TTOUCHBIT): Target Touch flag using "dcbz" 0:No Touch, 1:Touch
// PARAM10 [36] : Register save area 1
// PARAM11 [40] : Register save area 2
// PARAM12 [44] : Register save area 3
// PARAM13 [48] : Register save area 4
// PARAM14 [52] : Register save area 5
// PARAM15 [56] : Register save area 6 (r0 is saved when calling RectCopyS)
//
// Register usage:
// r0: Work register
// r4: Updating source address
// r5: Number of bytes to copy per line --> used as counter (and destroyed) in main copy routine
// r6: Updating remaining number of lines to copy
// r7: Target increment bytes per line (changed to a pre-calculated value)
// r8: Source increment bytes per line (changed to a pre-calculated value)
// r9: Updating target address
// r10: Work register
// r11: Work register
// r12: Inner most loop counter (8-byte units)
// r14: Subroutine for init copy
// r15: Subroutine for main loop
// r16: Subroutine for final copy
// r17: Cache touch offset
// CTR: Used for link
// f1~f4: Work registers to be used for dword aligned copy
//
// Restrictions:
// Copy width is assumed to be equal to or shorter than the target delta.
// If target and source overlap, both of them must have the same amount of
// line increments.
// Target memory has to be cachable - otherwise an exception occurs.
// Target and source line increments have to be a multiple of 4 bytes.
// If the target delta is a multiple of 32 (cache line width), RectCopy is used;
// if it's not, RectCopyS is used.
// If the target delta is not a multiple of 32, the TFLUSH bit has to be off.
// If the source delta is not a multiple of 32, the SFLUSH bit has to be off.
//
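//
// For reference, the parameter block above corresponds to a layout like
// the following C struct (an illustrative sketch only -- the field names
// are invented here and this is not part of the build):
//
//   typedef struct _RECT_COPY_PARAMS {
//       void          *Target;          // PARAM1 [00]
//       void          *Source;          // PARAM2 [04]
//       unsigned long  Width;           // PARAM3 [08] bytes per line
//       unsigned long  Lines;           // PARAM4 [12] lines to copy
//       long           TargetDelta;     // PARAM5 [16] target line increment
//       long           SourceDelta;     // PARAM6 [20] source line increment
//       unsigned long  MaxCacheLines;   // PARAM7 [24]
//       unsigned long  MaxDisplayLines; // PARAM8 [28]
//       unsigned long  Control;         // PARAM9 [32] SFLUSH/TFLUSH/TTOUCH bits
//       unsigned long  Save[6];         // PARAM10~15 register save area
//   } RECT_COPY_PARAMS;
//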
|
|
mflr r0 // LR
|
|
stw r14,SLACK1(sp)
|
|
stw r15,SLACK2(sp)
|
|
stw r16,SLACK3(sp)
|
|
stw r17,SLACK4(sp)
|
|
stwu sp,-(MINSTACKSIZE+16)(sp)
|
|
stw r0,MINSTACKSIZE+16-4*(4+1)(sp)
|
|
//
|
|
PROLOGUE_END(RectCopy)
|
|
//
|
|
lwz r6,PARAM4(r3) // r6 <- number of lines to copy
|
|
and. r6,r6,r6 // Any lines to copy?
|
|
beq- copy_exit // No -> exit
|
|
lwz r9,PARAM1(r3) // r9 <- target address
|
|
lwz r4,PARAM2(r3) // r4 <- source address
|
|
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
|
|
lwz r7,PARAM5(r3) // r7 <- target byte distance between lines
|
|
lwz r8,PARAM6(r3) // r8 <- source byte distance between lines
|
|
cmplw r9,r4 // Compare source & target address
|
|
blt- copy_100 // Target is lower -> copy from top to bottom
|
|
//
|
|
// Copy from bottom to top
|
|
//
|
|
cmplwi r5,MINLENGTH_COPY // Is it wide enough to do in this routine?
|
|
blt- copy_10 // No -> use RectCopyS
|
|
subf r10,r4,r9 // Check distance between source & target
|
|
cmplwi r10,MINDISTANCE // Too close?
|
|
blt- copy_10 // Yes -> use RectCopyS
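//
// (The distance test above matters because the USE_DCBZ path zeroes whole
// 32-byte target cache lines before storing into them; if the target is
// within a cache line of the source, dcbz could wipe source bytes that
// have not been copied yet, so such cases fall back to RectCopyS.)
//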
|
|
#if (! FULLCACHE)
|
|
lwz r10,PARAM9(r3) // r10 <- cache control bit
|
|
andis. r10,r10,TTOUCHBIT // Can touch target cache?
|
|
beq- copy_10 // No -> use RectCopyS
|
|
#endif
|
|
andi. r10,r7,0x1f // Target delta is multiple of 32?
|
|
beq copy_20 // Yes -> we can use RectCopy, otherwise we need to use RectCopyS
|
|
//
|
|
copy_10:
|
|
bl ..RectCopyS // and call RectCopyS
|
|
#if (! FULLCACHE)
|
|
bl ..copyflush
|
|
#endif
|
|
b copy_exit
|
|
//
|
|
copy_20:
|
|
mullw r10,r7,r6 // Target is higher -> copy from bottom to top
|
|
add r9,r9,r10 // r9 <- top target address of the line after last
|
|
mullw r10,r8,r6
|
|
add r4,r4,r10 // r4 <- top source address of the line after last
|
|
subf r7,r5,r7 // r7 <- target delta after pointer increment
|
|
subf r8,r5,r8 // r8 <- source delta after pointer increment
|
|
neg r7,r7 // r7 <- negative target delta
|
|
neg r8,r8 // r8 <- negative source delta
|
|
add r9,r9,r7 // r9 <- one byte after the last byte of the last line
|
|
add r4,r4,r8 // r4 <- one byte after the last byte of the last line
|
|
li r17,-8 // r17 is used for "dcbz" offset
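//
// A worked example of the setup above (invented numbers, not from the
// source): with target 0x1000, delta r7 = 1024, width r5 = 640 and
// r6 = 4 lines, r9 first becomes 0x1000 + 4*1024 = 0x2000 (the line after
// the last), the deltas become -(1024 - 640) = -384, and r9 ends at
// 0x2000 - 384 = 0x1E80, one byte past the last byte of the last line.
// The backward subroutines then pre-decrement from there, and r17 = -8
// lets "dcbz r17,r9" touch the cache line about to be stored downward.
//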
|
|
bl copy_30 // To get table address in LR
|
|
__CopyInitProcB:
|
|
.ualong __copyInit_0B
|
|
.ualong __copyInit_1B
|
|
.ualong __copyInit_2B
|
|
.ualong __copyInit_3B
|
|
.ualong __copyInit_4B
|
|
.ualong __copyInit_5B
|
|
.ualong __copyInit_6B
|
|
.ualong __copyInit_7B
|
|
__CopyMainProcB:
|
|
.ualong __copymain_0B
|
|
.ualong __copymain_1B
|
|
.ualong __copymain_2B
|
|
.ualong __copymain_3B
|
|
.ualong __copymain_4B
|
|
__CopyEndProcB:
|
|
.ualong __copyEnd_0B
|
|
.ualong __copyEnd_1B
|
|
.ualong __copyEnd_2B
|
|
.ualong __copyEnd_3B
|
|
.ualong __copyEnd_4B
|
|
.ualong __copyEnd_5B
|
|
.ualong __copyEnd_6B
|
|
.ualong __copyEnd_7B
|
|
//
|
|
copy_30:
|
|
mflr r10 // r10 <- Address of top table
|
|
rlwinm. r14,r9,2,27,29 // r14 <- table index to use depending on the ending alignment
|
|
beq copy_30x // No initial routine -> set r14 later
|
|
lwzx r14,r10,r14 // r14 <- subroutine to be called at first
|
|
copy_30x:
|
|
andi. r11,r9,0x07 // r11 <- number of bytes to be copied at first
|
|
subf r15,r11,r4 // r15 <- pointing one byte after initial copy adjustment (source)
|
|
rlwinm. r12,r15,2,28,29 // r12 <- table index for main loop routine
|
|
bne copy_35 // word unaligned -> proceed
|
|
andi. r15,r15,0x04 // word aligned -> check for dword aligned
|
|
bne copy_35 // not dword aligned -> use word aligned routine (index = 0)
|
|
lwz r15,PARAM6(r3) // r15 <- source byte distance between lines
|
|
andi. r15,r15,0x07 // Source delta multiple of 8?
|
|
bne copy_35
|
|
li r12,4*4 // dword aligned -> use dword aligned routine (index = 4)
|
|
copy_35:
|
|
addi r10,r10,__CopyMainProcB-__CopyInitProcB
|
|
lwzx r15,r10,r12 // r15 <- subroutine address for main loop
|
|
subf r11,r11,r5 // r11 <- remaining number of bytes to be copied
|
|
srawi r12,r11,3 // r12 <- number of dwords (8 byte unit) to be copied in the main loop
|
|
rlwinm r16,r11,2,27,29 // r16 <- table index for ending copy
|
|
addi r10,r10,__CopyEndProcB-__CopyMainProcB
|
|
lwzx r16,r10,r16 // r16 <- subroutine to be called after the main loop
|
|
//
|
|
and. r14,r14,r14 // Initial routine exist?
|
|
bne copy_35x // Yes -> proceed
|
|
mr r14,r15 // No -> skip initial routine
|
|
copy_35x:
|
|
//
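//
// The three lookups above reduce to the following illustrative C,
// indexing the .ualong tables by alignment (a sketch, not driver source;
// on this backward path r9 is the byte just past the end of the last
// line, so the init piece copies the trailing fragment):
//
//   frag = target & 7;                        // trailing 0~7 byte fragment
//   init = frag ? InitTable[frag] : NULL;
//   idx  = source_after_init & 3;             // word alignment of source
//   if (idx == 0 && ((source_after_init | source_delta) & 7) == 0)
//       idx = 4;                              // dword aligned -> lfd/stfd body
//   main = MainTable[idx];
//   end  = EndTable[remaining_bytes & 7];     // leading 0~7 byte fragment
//   if (init == NULL)
//       init = main;                          // no fragment: start in main
//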
|
|
// Main process for copying
|
|
//
|
|
mtctr r14
|
|
bctrl // Jump to entry routine -> link to main routine -> link to end routine and loop
|
|
// back to here after all lines are copied
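//
// Note on the control flow: the pieces are chained through CTR rather
// than called. bctrl enters the init piece; each piece ends with
// "mtctr next ... bctr", and the end piece steps to the next line and
// branches back to the init piece (bnectr) until every line is done,
// when its blr returns here. Roughly, as illustrative C (a sketch, not
// driver source):
//
//   do {
//       init();      // copy the 1~7 byte fragment at the line boundary
//       main();      // unrolled 8/32-byte body
//       end();       // copy the other fragment, advance both pointers
//   } while (--lines != 0);
//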
|
|
//
|
|
copy_90:
|
|
#if (! FULLCACHE)
|
|
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
|
|
subf r4,r5,r4 // adjust source and
|
|
subf r9,r5,r9 // target pointer
|
|
subf r7,r5,r7 // also delta need to be
|
|
subf r8,r5,r8 // adjusted
|
|
bl ..copyflush
|
|
#endif
|
|
b copy_exit
|
|
//
|
|
//
|
|
// Initial copy routines for 1~7 bytes for forward direction
|
|
//
|
|
__copyInit_0F:
|
|
mtctr r15
|
|
bctr
|
|
__copyInit_1F:
|
|
mtctr r15
|
|
lbz r10,0(r4)
|
|
stb r10,0(r9)
|
|
addi r4,r4,1
|
|
addi r9,r9,1
|
|
bctr
|
|
__copyInit_2F:
|
|
mtctr r15
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sth r10,0(r9)
|
|
addi r4,r4,2
|
|
addi r9,r9,2
|
|
bctr
|
|
__copyInit_3F:
|
|
mtctr r15
|
|
lbz r10,0(r4)
|
|
stb r10,0(r9)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sth r10,1(r9)
|
|
addi r4,r4,3
|
|
addi r9,r9,3
|
|
bctr
|
|
__copyInit_4F:
|
|
mtctr r15
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stw r10,0(r9)
|
|
addi r4,r4,4
|
|
addi r9,r9,4
|
|
bctr
|
|
__copyInit_5F:
|
|
mtctr r15
|
|
lbz r10,0(r4)
|
|
stb r10,0(r9)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,4(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stw r10,1(r9)
|
|
addi r4,r4,5
|
|
addi r9,r9,5
|
|
bctr
|
|
__copyInit_6F:
|
|
mtctr r15
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sth r10,0(r9)
|
|
lbz r10,2(r4)
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,4(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,5(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stw r10,2(r9)
|
|
addi r4,r4,6
|
|
addi r9,r9,6
|
|
bctr
|
|
__copyInit_7F:
|
|
mtctr r15
|
|
lbz r10,0(r4)
|
|
stb r10,0(r9)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sth r10,1(r9)
|
|
lbz r10,3(r4)
|
|
lbz r11,4(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,5(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,6(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stw r10,3(r9)
|
|
addi r4,r4,7
|
|
addi r9,r9,7
|
|
bctr
|
|
//
|
|
// Ending copy routines for 1~7 bytes for forward direction
|
|
//
|
|
__copyEnd_0F:
|
|
addic. r6,r6,-1
|
|
mtctr r14
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_1F:
|
|
mtctr r14
|
|
lbz r10,0(r4)
|
|
stb r10,0(r9)
|
|
addic. r6,r6,-1
|
|
addi r4,r4,1
|
|
addi r9,r9,1
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_2F:
|
|
mtctr r14
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sth r10,0(r9)
|
|
addic. r6,r6,-1
|
|
addi r4,r4,2
|
|
addi r9,r9,2
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_3F:
|
|
mtctr r14
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sth r10,0(r9)
|
|
lbz r10,2(r4)
|
|
stb r10,2(r9)
|
|
addic. r6,r6,-1
|
|
addi r4,r4,3
|
|
addi r9,r9,3
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_4F:
|
|
mtctr r14
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stw r10,0(r9)
|
|
addic. r6,r6,-1
|
|
addi r4,r4,4
|
|
addi r9,r9,4
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_5F:
|
|
mtctr r14
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stw r10,0(r9)
|
|
lbz r10,4(r4)
|
|
stb r10,4(r9)
|
|
addic. r6,r6,-1
|
|
addi r4,r4,5
|
|
addi r9,r9,5
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_6F:
|
|
mtctr r14
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stw r10,0(r9)
|
|
lbz r10,4(r4)
|
|
lbz r11,5(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sth r10,4(r9)
|
|
addic. r6,r6,-1
|
|
addi r4,r4,6
|
|
addi r9,r9,6
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_7F:
|
|
mtctr r14
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stw r10,0(r9)
|
|
lbz r10,4(r4)
|
|
lbz r11,5(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sth r10,4(r9)
|
|
lbz r10,6(r4)
|
|
stb r10,6(r9)
|
|
addic. r6,r6,-1
|
|
addi r4,r4,7
|
|
addi r9,r9,7
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
//
|
|
// Main copy routines for long case (32 bytes unit) forward direction
|
|
//
|
|
__copymain_0F:
|
|
mtctr r16
|
|
mr r0,r12 // r0 <- number of loops (8 bytes units)
|
|
__copy0F_00:
|
|
andi. r10,r9,0x1f
|
|
beq __copy0F_10 // Target 32 byte aligned -> jump to main loop
|
|
lwz r10,0(r4) // Load next
|
|
lwz r11,4(r4) // two words
|
|
stw r10,0(r9) // And store
|
|
stw r11,4(r9)
|
|
addi r4,r4,8
|
|
addi r9,r9,8
|
|
addic r0,r0,-1
|
|
b __copy0F_00
|
|
__copy0F_10:
|
|
srawi. r5,r0,2 // r5 <- number of 32 bytes units
|
|
beq __copy0F_25
|
|
__copy0F_20:
|
|
addic. r5,r5,-1
|
|
lwz r10,0(r4) // Load and store 8 times (32 bytes)
|
|
#if USE_DCBZ
|
|
dcbz 0,r9 // Touch next target cache line
|
|
#endif
|
|
lwz r11,4(r4)
|
|
stw r10,0(r9)
|
|
stw r11,4(r9)
|
|
lwz r10,8(r4)
|
|
lwz r11,12(r4)
|
|
stw r10,8(r9)
|
|
stw r11,12(r9)
|
|
lwz r10,16(r4)
|
|
lwz r11,20(r4)
|
|
stw r10,16(r9)
|
|
stw r11,20(r9)
|
|
lwz r10,24(r4)
|
|
lwz r11,28(r4)
|
|
stw r10,24(r9)
|
|
stw r11,28(r9)
|
|
addi r4,r4,32
|
|
addi r9,r9,32
|
|
bne __copy0F_20 // End of main loop
|
|
__copy0F_25:
|
|
andi. r0,r0,0x03 // r0 <- remaining number of 8 byte unit to move after this loop is done
|
|
beq __copy0F_90
|
|
__copy0F_30:
|
|
lwz r10,0(r4) // Load next
|
|
lwz r11,4(r4) // two words
|
|
stw r10,0(r9) // And store
|
|
stw r11,4(r9)
|
|
addi r4,r4,8
|
|
addi r9,r9,8
|
|
addic. r0,r0,-1
|
|
bne __copy0F_30
|
|
__copy0F_90:
|
|
bctr
|
|
//
|
|
__copymain_1F:
|
|
mtctr r16
|
|
mr r0,r12 // r0 <- number of loops (8 bytes units)
|
|
addi r4,r4,-1
|
|
lwz r10,0(r4)
|
|
__copy1F_00:
|
|
andi. r11,r9,0x1f
|
|
beq __copy1F_10 // Target 32 byte aligned -> jump to main loop
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,0(r9)
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,4(r9)
|
|
addi r9,r9,8
|
|
addic r0,r0,-1
|
|
b __copy1F_00
|
|
__copy1F_10:
|
|
srawi. r5,r0,2 // r5 <- number of 32 bytes units
|
|
beq __copy1F_25
|
|
__copy1F_20:
|
|
addic. r5,r5,-1
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
#if USE_DCBZ
|
|
dcbz 0,r9 // Touch next target cache line
|
|
#endif
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,0(r9)
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,4(r9)
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,8(r9)
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,12(r9)
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,16(r9)
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,20(r9)
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,24(r9)
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,28(r9)
|
|
addi r9,r9,32
|
|
bne __copy1F_20 // End of main loop
|
|
__copy1F_25:
|
|
andi. r0,r0,0x03 // r0 <- remaining number of 8 byte unit to move after this loop is done
|
|
beq __copy1F_90
|
|
__copy1F_30:
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,0(r9)
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,4(r9)
|
|
addi r9,r9,8
|
|
addic. r0,r0,-1
|
|
bne __copy1F_30
|
|
__copy1F_90:
|
|
addi r4,r4,1
|
|
bctr
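//
// __copymain_1F above is a shift-and-merge copy for a source one byte off
// word alignment: each stored word combines the upper three bytes of the
// previously loaded word with the low byte of the next (NT/PowerPC runs
// little-endian, so lower addresses hold the low-order register bytes).
// Illustrative C (a sketch, not driver source):
//
//   unsigned int prev = *src;                 // word holding bytes -1..2
//   while (words--) {
//       unsigned int next = *++src;           // the lwzu r10,4(r4)
//       *dst++ = (prev >> 8) | (next << 24);  // the rlwinm/rlwimi pair
//       prev = next;
//   }
//
// __copymain_2F and __copymain_3F are the same idea with 16- and 24-bit
// shifts for sources two and three bytes off word alignment.
//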
|
|
//
|
|
__copymain_2F:
|
|
mtctr r16
|
|
mr r0,r12 // r0 <- number of loops (8 bytes units)
|
|
lhz r10,0(r4)
|
|
addi r4,r4,-2
|
|
__copy2F_00:
|
|
andi. r11,r9,0x1f
|
|
beq __copy2F_10 // Target 32 byte aligned -> jump to main loop
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,0(r9)
|
|
rlwinm r10,r11,16,16,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,4(r9)
|
|
rlwinm r10,r11,16,16,31
|
|
addi r9,r9,8
|
|
addic r0,r0,-1
|
|
b __copy2F_00
|
|
__copy2F_10:
|
|
srawi. r5,r0,2 // r5 <- number of 32 bytes units
|
|
beq __copy2F_25
|
|
__copy2F_20:
|
|
addic. r5,r5,-1
|
|
lwzu r11,4(r4)
|
|
#if USE_DCBZ
|
|
dcbz 0,r9 // Touch next target cache line
|
|
#endif
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,0(r9)
|
|
rlwinm r10,r11,16,16,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,4(r9)
|
|
rlwinm r10,r11,16,16,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,8(r9)
|
|
rlwinm r10,r11,16,16,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,12(r9)
|
|
rlwinm r10,r11,16,16,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,16(r9)
|
|
rlwinm r10,r11,16,16,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,20(r9)
|
|
rlwinm r10,r11,16,16,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,24(r9)
|
|
rlwinm r10,r11,16,16,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,28(r9)
|
|
rlwinm r10,r11,16,16,31
|
|
addi r9,r9,32
|
|
bne __copy2F_20 // End of main loop
|
|
__copy2F_25:
|
|
andi. r0,r0,0x03 // r0 <- remaining number of 8 byte unit to move after this loop is done
|
|
beq __copy2F_90
|
|
__copy2F_30:
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,0(r9)
|
|
rlwinm r10,r11,16,16,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,4(r9)
|
|
rlwinm r10,r11,16,16,31
|
|
addi r9,r9,8
|
|
addic. r0,r0,-1
|
|
bne __copy2F_30
|
|
__copy2F_90:
|
|
addi r4,r4,2
|
|
bctr
|
|
//
|
|
__copymain_3F:
|
|
mtctr r16
|
|
mr r0,r12 // r0 <- number of loops (8 bytes units)
|
|
lbz r10,0(r4)
|
|
addi r4,r4,-3
|
|
__copy3F_00:
|
|
andi. r11,r9,0x1f
|
|
beq __copy3F_10 // Target 32 byte aligned -> jump to main loop
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,0(r9)
|
|
rlwinm r10,r11,8,24,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,4(r9)
|
|
rlwinm r10,r11,8,24,31
|
|
addi r9,r9,8
|
|
addic r0,r0,-1
|
|
b __copy3F_00
|
|
__copy3F_10:
|
|
srawi. r5,r0,2 // r5 <- number of 32 bytes units
|
|
beq __copy3F_25
|
|
__copy3F_20:
|
|
addic. r5,r5,-1
|
|
lwzu r11,4(r4)
|
|
#if USE_DCBZ
|
|
dcbz 0,r9 // Touch next target cache line
|
|
#endif
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,0(r9)
|
|
rlwinm r10,r11,8,24,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,4(r9)
|
|
rlwinm r10,r11,8,24,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,8(r9)
|
|
rlwinm r10,r11,8,24,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,12(r9)
|
|
rlwinm r10,r11,8,24,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,16(r9)
|
|
rlwinm r10,r11,8,24,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,20(r9)
|
|
rlwinm r10,r11,8,24,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,24(r9)
|
|
rlwinm r10,r11,8,24,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,28(r9)
|
|
rlwinm r10,r11,8,24,31
|
|
addi r9,r9,32
|
|
bne __copy3F_20 // End of main loop
|
|
__copy3F_25:
|
|
andi. r0,r0,0x03 // r0 <- remaining number of 8 byte unit to move after this loop is done
|
|
beq __copy3F_90
|
|
__copy3F_30:
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,0(r9)
|
|
rlwinm r10,r11,8,24,31
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,4(r9)
|
|
rlwinm r10,r11,8,24,31
|
|
addi r9,r9,8
|
|
addic. r0,r0,-1
|
|
bne __copy3F_30
|
|
__copy3F_90:
|
|
addi r4,r4,3
|
|
bctr
|
|
//
|
|
__copymain_4F:
|
|
mtctr r16
|
|
mr r0,r12 // r0 <- number of loops (8 bytes units)
|
|
__copy4F_00:
|
|
andi. r10,r9,0x1f
|
|
beq __copy4F_10 // Target 32 byte aligned -> jump to main loop
|
|
lfd f1,0(r4)
|
|
stfd f1,0(r9)
|
|
addi r4,r4,8
|
|
addi r9,r9,8
|
|
addic r0,r0,-1
|
|
b __copy4F_00
|
|
__copy4F_10:
|
|
srawi. r5,r0,2 // r5 <- number of 32 bytes units
|
|
beq __copy4F_25
|
|
__copy4F_20:
|
|
addic. r5,r5,-1
|
|
lfd f1,0(r4)
|
|
#if USE_DCBZ
|
|
dcbz 0,r9 // Touch next target cache line
|
|
#endif
|
|
lfd f2,8(r4)
|
|
lfd f3,16(r4)
|
|
lfd f4,24(r4)
|
|
stfd f1,0(r9)
|
|
stfd f2,8(r9)
|
|
stfd f3,16(r9)
|
|
stfd f4,24(r9)
|
|
addi r4,r4,32
|
|
addi r9,r9,32
|
|
bne __copy4F_20 // End of main loop
|
|
__copy4F_25:
|
|
andi. r0,r0,0x03 // r0 <- remaining number of 8 byte unit to move after this loop is done
|
|
beq __copy4F_90
|
|
__copy4F_30:
|
|
lfd f1,0(r4)
|
|
stfd f1,0(r9)
|
|
addi r4,r4,8
|
|
addi r9,r9,8
|
|
addic. r0,r0,-1
|
|
bne __copy4F_30
|
|
__copy4F_90:
|
|
bctr
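//
// __copymain_4F above is the fast path: when source and target are both
// 8-byte aligned (and the source delta keeps them that way), the inner
// block moves a whole 32-byte cache line with four lfd/stfd pairs, using
// the FPRs purely as 64-bit data registers. An illustrative C equivalent
// (a sketch, not driver source):
//
//   unsigned long long *s = (unsigned long long *)src;
//   unsigned long long *d = (unsigned long long *)dst;
//   while (blocks--) {       // one iteration per 32-byte cache line
//       d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
//       s += 4; d += 4;
//   }
//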
|
|
//
|
|
// Initial copy routines for 1~7 bytes for backward direction
|
|
//
|
|
__copyInit_0B:
|
|
mtctr r15
|
|
bctr
|
|
__copyInit_1B:
|
|
mtctr r15
|
|
lbzu r10,-1(r4)
|
|
stbu r10,-1(r9)
|
|
bctr
|
|
__copyInit_2B:
|
|
mtctr r15
|
|
lbzu r10,-2(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sthu r10,-2(r9)
|
|
bctr
|
|
__copyInit_3B:
|
|
mtctr r15
|
|
lbzu r10,-1(r4)
|
|
stbu r10,-1(r9)
|
|
lbzu r10,-2(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sthu r10,-2(r9)
|
|
bctr
|
|
__copyInit_4B:
|
|
mtctr r15
|
|
lbzu r10,-4(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stwu r10,-4(r9)
|
|
bctr
|
|
__copyInit_5B:
|
|
mtctr r15
|
|
lbzu r10,-1(r4)
|
|
stbu r10,-1(r9)
|
|
lbzu r10,-4(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stwu r10,-4(r9)
|
|
bctr
|
|
__copyInit_6B:
|
|
mtctr r15
|
|
lbzu r10,-2(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sthu r10,-2(r9)
|
|
lbzu r10,-4(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stwu r10,-4(r9)
|
|
bctr
|
|
__copyInit_7B:
|
|
mtctr r15
|
|
lbzu r10,-1(r4)
|
|
stbu r10,-1(r9)
|
|
lbzu r10,-2(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sthu r10,-2(r9)
|
|
lbzu r10,-4(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stwu r10,-4(r9)
|
|
bctr
|
|
//
|
|
// Ending copy routines for 1~7 bytes for backward direction
|
|
//
|
|
__copyEnd_0B:
|
|
addic. r6,r6,-1
|
|
mtctr r14
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_1B:
|
|
mtctr r14
|
|
lbzu r10,-1(r4)
|
|
stbu r10,-1(r9)
|
|
addic. r6,r6,-1
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_2B:
|
|
mtctr r14
|
|
lbzu r10,-2(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sthu r10,-2(r9)
|
|
addic. r6,r6,-1
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_3B:
|
|
mtctr r14
|
|
lbzu r10,-2(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sthu r10,-2(r9)
|
|
lbzu r10,-1(r4)
|
|
stbu r10,-1(r9)
|
|
addic. r6,r6,-1
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_4B:
|
|
mtctr r14
|
|
lbzu r10,-4(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stwu r10,-4(r9)
|
|
addic. r6,r6,-1
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_5B:
|
|
mtctr r14
|
|
lbzu r10,-4(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stwu r10,-4(r9)
|
|
lbzu r10,-1(r4)
|
|
stbu r10,-1(r9)
|
|
addic. r6,r6,-1
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_6B:
|
|
mtctr r14
|
|
lbzu r10,-4(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stwu r10,-4(r9)
|
|
lbzu r10,-2(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sthu r10,-2(r9)
|
|
addic. r6,r6,-1
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
__copyEnd_7B:
|
|
mtctr r14
|
|
lbzu r10,-4(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stwu r10,-4(r9)
|
|
lbzu r10,-2(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sthu r10,-2(r9)
|
|
lbzu r10,-1(r4)
|
|
stbu r10,-1(r9)
|
|
addic. r6,r6,-1
|
|
add r9,r9,r7
|
|
add r4,r4,r8
|
|
bnectr
|
|
blr
|
|
//
|
|
// Main copy routines for long case (32 bytes unit) backward direction
|
|
//
|
|
__copymain_0B:
|
|
mtctr r16
|
|
mr r0,r12 // r0 <- number of loops (8 bytes units)
|
|
__copy0B_00:
|
|
andi. r10,r9,0x1f
|
|
beq __copy0B_10 // Target 32 byte aligned -> jump to main loop
|
|
lwzu r10,-4(r4)
|
|
lwzu r11,-4(r4)
|
|
stwu r10,-4(r9)
|
|
stwu r11,-4(r9)
|
|
addic r0,r0,-1
|
|
b __copy0B_00
|
|
__copy0B_10:
|
|
srawi. r5,r0,2 // r5 <- number of 32 bytes units
|
|
beq __copy0B_25
|
|
__copy0B_20:
|
|
addic. r5,r5,-1
|
|
lwzu r10,-4(r4)
|
|
#if USE_DCBZ
|
|
dcbz r17,r9 // Touch next target cache line
|
|
#endif
|
|
lwzu r11,-4(r4)
|
|
stwu r10,-4(r9)
|
|
stwu r11,-4(r9)
|
|
lwzu r10,-4(r4)
|
|
lwzu r11,-4(r4)
|
|
stwu r10,-4(r9)
|
|
stwu r11,-4(r9)
|
|
lwzu r10,-4(r4)
|
|
lwzu r11,-4(r4)
|
|
stwu r10,-4(r9)
|
|
stwu r11,-4(r9)
|
|
lwzu r10,-4(r4)
|
|
lwzu r11,-4(r4)
|
|
stwu r10,-4(r9)
|
|
stwu r11,-4(r9)
|
|
bne __copy0B_20 // End of main loop
|
|
__copy0B_25:
|
|
andi. r0,r0,0x03 // r0 <- remaining number of 8 byte unit to move after this loop is done
|
|
beq __copy0B_90
|
|
__copy0B_30:
|
|
lwzu r10,-4(r4)
|
|
lwzu r11,-4(r4)
|
|
stwu r10,-4(r9)
|
|
stwu r11,-4(r9)
|
|
addic. r0,r0,-1
|
|
bne __copy0B_30
|
|
__copy0B_90:
|
|
bctr
|
|
//
|
|
__copymain_1B:
|
|
mtctr r16
|
|
mr r0,r12 // r0 <- number of loops (8 bytes units)
|
|
lbzu r10,-1(r4) // Load last byte
|
|
__copy1B_00:
|
|
andi. r11,r9,0x1f
|
|
beq __copy1B_10 // Target 32 byte aligned -> jump to main loop
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
addic r0,r0,-1
|
|
b __copy1B_00
|
|
__copy1B_10:
|
|
srawi. r5,r0,2 // r5 <- number of 32 bytes units
|
|
beq __copy1B_25
|
|
__copy1B_20:
|
|
addic. r5,r5,-1
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
#if USE_DCBZ
|
|
dcbz r17,r9 // Touch next target cache line
|
|
#endif
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
bne __copy1B_20 // End of main loop
|
|
__copy1B_25:
|
|
andi. r0,r0,0x03 // r0 <- remaining number of 8 byte unit to move after this loop is done
|
|
beq __copy1B_90
|
|
__copy1B_30:
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
addic. r0,r0,-1
|
|
bne __copy1B_30
|
|
__copy1B_90:
|
|
addi r4,r4,1 // Adjust source pointer
|
|
bctr
|
|
//
|
|
__copymain_2B:
|
|
mtctr r16
|
|
mr r0,r12 // r0 <- number of loops (8 bytes units)
|
|
lhzu r10,-2(r4) // Load needed two bytes in r10
|
|
__copy2B_00:
|
|
andi. r11,r9,0x1f
|
|
beq __copy2B_10 // Target 32 byte aligned -> jump to main loop
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word in r10
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word in r10
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
addic r0,r0,-1
|
|
b __copy2B_00
|
|
__copy2B_10:
|
|
srawi. r5,r0,2 // r5 <- number of 32 bytes units
|
|
beq __copy2B_25
|
|
__copy2B_20:
|
|
addic. r5,r5,-1
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word in r10
|
|
#if USE_DCBZ
|
|
dcbz r17,r9 // Touch next target cache line
|
|
#endif
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word in r10
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word in r10
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word in r10
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word in r10
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word in r10
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word in r10
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word in r10
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
bne __copy2B_20 // End of main loop
|
|
__copy2B_25:
|
|
andi. r0,r0,0x03 // r0 <- remaining number of 8 byte unit to move after this loop is done
|
|
beq __copy2B_90
|
|
__copy2B_30:
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word in r10
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word in r10
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
addic. r0,r0,-1
|
|
bne __copy2B_30
|
|
__copy2B_90:
|
|
addi r4,r4,2 // Adjust source pointer
|
|
bctr
|
|
//
|
|
__copymain_3B:
|
|
mtctr r16
|
|
mr r0,r12 // r0 <- number of loops (8 bytes units)
|
|
addi r4,r4,1 // Adjust source pointer so the update-form word loads stay aligned
|
|
lwzu r10,-4(r4) // Load needed three bytes into MS bytes of r10
|
|
__copy3B_00:
|
|
andi. r11,r9,0x1f
|
|
beq __copy3B_10 // Target 32 byte aligned -> jump to main loop
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
addic r0,r0,-1
|
|
b __copy3B_00
|
|
__copy3B_10:
|
|
srawi. r5,r0,2 // r5 <- number of 32 bytes units
|
|
beq __copy3B_25
|
|
__copy3B_20:
|
|
addic. r5,r5,-1
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
#if USE_DCBZ
|
|
dcbz r17,r9 // Touch next target cache line
|
|
#endif
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
bne __copy3B_20 // End of main loop
|
|
__copy3B_25:
|
|
andi. r0,r0,0x03 // r0 <- remaining number of 8 byte unit to move after this loop is done
|
|
beq __copy3B_90
|
|
__copy3B_30:
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
addic. r0,r0,-1
|
|
bne __copy3B_30
|
|
__copy3B_90:
|
|
addi r4,r4,3 // Adjust source pointer
|
|
bctr
|
|
//
|
|
__copymain_4B:
|
|
mtctr r16
|
|
mr r0,r12 // r0 <- number of loops (8 bytes units)
|
|
__copy4B_00:
|
|
andi. r10,r9,0x1f
|
|
beq __copy4B_10 // Target 32 byte aligned -> jump to main loop
|
|
lfd f1,-8(r4)
|
|
stfd f1,-8(r9)
|
|
addi r4,r4,-8
|
|
addi r9,r9,-8
|
|
addic r0,r0,-1
|
|
b __copy4B_00
|
|
__copy4B_10:
|
|
srawi. r5,r0,2 // r5 <- number of 32 bytes units
|
|
beq __copy4B_25
|
|
__copy4B_20:
|
|
addic. r5,r5,-1
|
|
lfd f1,-8(r4)
|
|
#if USE_DCBZ
|
|
dcbz r17,r9 // Touch next target cache line
|
|
#endif
|
|
lfd f2,-16(r4)
|
|
lfd f3,-24(r4)
|
|
lfd f4,-32(r4)
|
|
stfd f1,-8(r9)
|
|
stfd f2,-16(r9)
|
|
stfd f3,-24(r9)
|
|
stfd f4,-32(r9)
|
|
addi r4,r4,-32
|
|
addi r9,r9,-32
|
|
bne __copy4B_20 // End of main loop
|
|
__copy4B_25:
|
|
andi. r0,r0,0x03 // r0 <- remaining number of 8 byte unit to move after this loop is done
|
|
beq __copy4B_90
|
|
__copy4B_30:
|
|
lfd f1,-8(r4)
|
|
stfd f1,-8(r9)
|
|
addi r4,r4,-8
|
|
addi r9,r9,-8
|
|
addic. r0,r0,-1
|
|
bne __copy4B_30
|
|
__copy4B_90:
|
|
bctr
|
|
//
|
|
copy_100:
|
|
//
|
|
// Copy from top to bottom
|
|
//
|
|
cmplwi r5,MINLENGTH_COPY // Is it wide enough to do in this routine?
|
|
blt- copy_110 // No -> use RectCopyS
|
|
subf r10,r9,r4 // Check distance between source & target
|
|
cmplwi r10,MINDISTANCE // Too close?
|
|
blt- copy_110 // Yes -> use RectCopyS
|
|
#if (! FULLCACHE)
|
|
lwz r10,PARAM9(r3) // r10 <- cache control bit
|
|
andis. r10,r10,TTOUCHBIT // Can touch target cache?
|
|
beq- copy_110 // No -> use RectCopyS
|
|
#endif
|
|
andi. r10,r7,0x1f // Target delta is multiple of 32?
|
|
beq copy_120 // Yes -> we can use RectCopy, otherwise we need to use RectCopyS
|
|
//
|
|
copy_110:
|
|
bl ..RectCopyS // and call RectCopyS
|
|
b copy_195 // and flush cache
|
|
//
|
|
copy_120:
|
|
li r17,-8
|
|
bl copy_130 // To get table address in LR
|
|
__CopyInitProcF:
|
|
.ualong __copyInit_0F
|
|
.ualong __copyInit_7F
|
|
.ualong __copyInit_6F
|
|
.ualong __copyInit_5F
|
|
.ualong __copyInit_4F
|
|
.ualong __copyInit_3F
|
|
.ualong __copyInit_2F
|
|
.ualong __copyInit_1F
|
|
__CopyMainProcF:
|
|
.ualong __copymain_0F
|
|
.ualong __copymain_1F
|
|
.ualong __copymain_2F
|
|
.ualong __copymain_3F
|
|
.ualong __copymain_4F
|
|
__CopyEndProcF:
|
|
.ualong __copyEnd_0F
|
|
.ualong __copyEnd_1F
|
|
.ualong __copyEnd_2F
|
|
.ualong __copyEnd_3F
|
|
.ualong __copyEnd_4F
|
|
.ualong __copyEnd_5F
|
|
.ualong __copyEnd_6F
|
|
.ualong __copyEnd_7F
|
|
//
|
|
copy_130:
|
|
mflr r10 // r10 <- Address of top table
|
|
rlwinm. r14,r9,2,27,29 // r14 <- table index to use depending on the initial alignment
|
|
beq copy_130x // No init routine -> set r14 later
|
|
lwzx r14,r10,r14 // r14 <- subroutine to be called at first
|
|
copy_130x:
|
|
andi. r11,r9,0x07 // r11 <- initial target alignment
|
|
beq- copy_132
|
|
subfic r11,r11,8 // r11 <- number of bytes to be copied at first
|
|
copy_132:
|
|
add r15,r11,r4 // r15 <- source pointer after initial copy
|
|
rlwinm. r12,r15,2,28,29 // r12 <- table index for main loop routine
|
|
bne copy_135 // word unaligned -> proceed
|
|
andi. r15,r15,0x04 // word aligned -> check for dword aligned
|
|
bne copy_135 // not dword aligned -> use word aligned routine (index = 0)
|
|
lwz r15,PARAM6(r3) // r15 <- source byte distance between lines
|
|
andi. r15,r15,0x07 // Source delta multiple of 8?
|
|
bne copy_135
|
|
li r12,4*4 // dword aligned -> use dword aligned routine (index = 4)
|
|
copy_135:
|
|
addi r10,r10,__CopyMainProcF-__CopyInitProcF
|
|
lwzx r15,r10,r12 // r15 <- subroutine address for main loop
|
|
subf r11,r11,r5 // r11 <- remaining number of bytes to be copied
|
|
srawi r12,r11,3 // r12 <- number of dwords (8 byte unit) to be copied in the main loop
|
|
rlwinm r16,r11,2,27,29 // r16 <- table index for ending copy
|
|
addi r10,r10,__CopyEndProcF-__CopyMainProcF
|
|
lwzx r16,r10,r16 // r16 <- subroutine to be called after the main loop
|
|
//
|
|
and. r14,r14,r14 // Initial routine exist?
|
|
bne copy_135x // Yes -> proceed
|
|
mr r14,r15 // No -> skip initial routine
|
|
copy_135x:
|
|
//
|
|
// Main process for copying
|
|
//
|
|
subf r7,r5,r7 // r7 <- line delta after updating pointer (target)
|
|
subf r8,r5,r8 // r8 <- line delta after updating pointer (source)
|
|
mtctr r14
|
|
bctrl // Jump to entry routine -> link to main routine -> link to end routine and loop
|
|
// back to here after all lines are copied
|
|
//
|
|
copy_190:
|
|
#if (! FULLCACHE)
|
|
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
|
|
add r7,r5,r7 // restore source and target delta
|
|
add r8,r5,r8
|
|
#endif
|
|
copy_195:
|
|
#if (! FULLCACHE)
|
|
bl ..copyflush
|
|
#endif
|
|
//
|
|
copy_exit:
|
|
lwz r0,MINSTACKSIZE+16-4*5(sp)
|
|
lwz r17,MINSTACKSIZE+16-4*4(sp)
|
|
lwz r16,MINSTACKSIZE+16-4*3(sp)
|
|
lwz r15,MINSTACKSIZE+16-4*2(sp)
|
|
lwz r14,MINSTACKSIZE+16-4*1(sp)
|
|
mtlr r0
|
|
addi sp,sp,(MINSTACKSIZE+16)
|
|
//
|
|
SPECIAL_EXIT(RectCopy)
|
|
//
|
|
#if (! FULLCACHE)
|
|
LEAF_ENTRY(copyflush)
//
// Register usage for flushing cache (* indicates input parameters)
//
// *r3: The pointer to the parameter structure (same as above)
// *r4: Starting source address (pointing to the first byte of the next line on entry)
// r5: Ending address
// r6: Number of target lines
// *r7: Target delta bytes per line (positive or negative depending on the direction)
// *r8: Source delta bytes per line (positive or negative depending on the direction)
// *r9: Starting target address (pointing to the first byte of the next line on entry)
// r10: Updating address to flush
// r11: Number of cache entries to flush per line
// r12: Maximum number of cache lines to flush
//
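//
// The flush loops below walk backward line by line, flushing each 32-byte
// cache block of a line with dcbf, and stop early once the budget of cache
// entries (PARAM7) or display lines (PARAM8) is used up. Illustrative C
// (a sketch, not driver source; __dcbf stands in for the instruction):
//
//   lines = min(lines_copied, max_display_lines);
//   start = last_line_start & ~31;            // 32-byte align
//   end   = (last_line_start + width - 1) & ~31;
//   per   = (end - start) / 32 + 1;           // blocks per line
//   while (budget >= 0 && lines--) {
//       for (p = start; p <= end; p += 32)
//           __dcbf(p);                        // hypothetical intrinsic
//       budget -= per;
//       start -= delta; end -= delta;         // step to the previous line
//   }
//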
|
|
lwz r5,PARAM9(r3) // r5 <- cache control flag
|
|
andis. r6,r5,TFLUSHBIT // Need to flush target cache?
|
|
beq- flushcopy_50 // No -> check source flush
|
|
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
|
|
flushcopy_00:
|
|
lwz r6,PARAM8(r3) // r6 <- Maximum number of display lines to flush
|
|
lwz r12,PARAM4(r3) // r12 <- Number of target lines
|
|
cmplw r12,r6 // compare those two
|
|
bge flushcopy_05 // and take whichever
|
|
mr r6,r12 // smaller
|
|
flushcopy_05:
|
|
lwz r12,PARAM7(r3) // r12 <- Maximum number of cache lines to flush
|
|
subf r9,r7,r9 // r9 <- starting byte of the last line
|
|
add r5,r9,r5 // r5 <- one byte after last byte to flush
|
|
addi r5,r5,-1 // r5 <- last byte to flush
|
|
rlwinm r9,r9,0,0,26 // r9 <- 32 byte aligned start address
|
|
rlwinm r5,r5,0,0,26 // r5 <- 32 byte aligned end address
|
|
subf r11,r9,r5 // r11 <- end - start
|
|
srawi r11,r11,5
|
|
addi r11,r11,1 // r11 <- Number of cache entries to flush per line
|
|
flushcopy_10:
|
|
mr r10,r9 // r10 <- address to flush cache
|
|
flushcopy_20:
|
|
dcbf 0,r10 // Flush cached data
|
|
addi r10,r10,32 // Next cache line address
|
|
cmplw r10,r5 // Exceeding end address?
|
|
ble flushcopy_20 // No -> loop to flush previous cache line
|
|
subf. r12,r11,r12 // Flush enough entries?
|
|
blt- flushcopy_50 // Yes -> check source flush necessity
|
|
addic. r6,r6,-1 // Flush all lines?
|
|
subf r9,r7,r9 // Update start
|
|
subf r5,r7,r5 // and end address to flush cache to point to the previous line
|
|
bne flushcopy_10 // No -> continue to flush
|
|
//
|
|
flushcopy_50:
|
|
lwz r5,PARAM9(r3) // r5 <- cache control flag
|
|
andis. r6,r5,SFLUSHBIT // Need to flush source cache?
|
|
beq- flushcopy_90 // No -> exit
|
|
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
|
|
lwz r6,PARAM8(r3) // r6 <- Maximum number of display lines to flush
|
|
lwz r12,PARAM4(r3) // r12 <- Number of target lines
|
|
cmplw r12,r6 // compare those two
|
|
bge flushcopy_55 // and take whichever
|
|
mr r6,r12 // smaller
|
|
flushcopy_55:
|
|
lwz r12,PARAM7(r3) // r12 <- Maximum number of cache lines to flush
|
|
subf r4,r8,r4 // r4 <- starting byte of the last line
|
|
add r5,r4,r5 // r5 <- one byte after last byte to flush
|
|
addi r5,r5,-1 // r5 <- last byte to flush
|
|
rlwinm r4,r4,0,0,26 // r4 <- 32 byte aligned start address
|
|
rlwinm r5,r5,0,0,26 // r5 <- 32 byte aligned end address
|
|
subf r11,r4,r5 // r11 <- end - start
|
|
srawi r11,r11,5
|
|
addi r11,r11,1 // r11 <- Number of cache entries to flush per line
|
|
flushcopy_60:
|
|
mr r10,r4 // r10 <- address to flush cache
|
|
flushcopy_70:
|
|
dcbf 0,r10 // Flush cached data
|
|
addi r10,r10,32 // Next cache line address
|
|
cmplw r10,r5 // Exceeding end address?
|
|
ble flushcopy_70 // No -> loop to flush previous cache line
|
|
subf. r12,r11,r12 // Flush enough entries?
|
|
blt- flushcopy_90 // Yes -> exit
|
|
addic. r6,r6,-1 // Flush all lines?
|
|
subf r4,r8,r4 // Update start
|
|
subf r5,r8,r5 // and end address to flush cache to point to the previous line
|
|
bne flushcopy_60 // No -> continue to flush
|
|
flushcopy_90:
|
|
LEAF_EXIT(copyflush)
|
|
#endif // (! FULLCACHE)
|
|
//
|
|
//
|
|
//*************************************************************************************************
|
|
SPECIAL_ENTRY(RectCopyS)
//
// Input Parameters:
// r3: The pointer to the parameter structure as follows.
// PARAM1 [00] : Target address
// PARAM2 [04] : Source address
// PARAM3 [08] : Number of bytes to copy per line
// PARAM4 [12] : Number of lines to copy
// PARAM5 [16] : Target line increment in bytes per line
// PARAM6 [20] : Source line increment in bytes per line
// PARAM7 [24] : <reserved -- don't change>
// PARAM8 [28] : <reserved -- don't change>
// PARAM9 [32] : <reserved -- don't change>
// PARAM10 [36] : Register save area 1
// PARAM11 [40] : Register save area 2
// PARAM12 [44] : Register save area 3
// PARAM13 [48] : Register save area 4
// PARAM14 [52] : Register save area 5
// PARAM15 [56] : Register save area 6 (Used by RectCopy when calling RectCopyS)
//
// Register usage:
// r0: Saved return address
// r4: Updating source address
// r5: Number of bytes to copy per line -> used as work register
// r6: Remaining number of lines to copy
// r7: Target increment bytes per line (may be changed to a pre-calculated value)
// r8: Source increment bytes per line (may be changed to a pre-calculated value)
// r9: Updating target address
// r10: Work register
// r11: Work register
// r12: Inner most loop counter (work register for width <= 8 case)
// r14: Subroutine for init copy
// r15: Subroutine for main loop
// r16: Subroutine for final copy
// CTR: Used for link
//
// Restrictions:
// Copy width is assumed to be equal to or shorter than the target delta.
// If target and source overlap, both of them must have the same amount of
// line increments.
// Target and source line increments have to be a multiple of 4 bytes.
//
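//
// Like memmove, RectCopyS picks the copy direction from the relative
// position of the two rectangles so overlapping copies stay safe.
// Illustrative C (a sketch, not driver source):
//
//   if (dst < src) {
//       /* copy lines top to bottom, bytes low to high */
//   } else {
//       /* start from the line after the last and copy bottom to top,
//          bytes high to low, so overlapped source bytes are still
//          unread when they are overwritten */
//   }
//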
|
|
mflr r0 // LR
|
|
stw r14,SLACK1(sp)
|
|
stw r15,SLACK2(sp)
|
|
stw r16,SLACK3(sp)
|
|
stw r17,SLACK4(sp)
|
|
stwu sp,-(MINSTACKSIZE+16)(sp)
|
|
stw r0,MINSTACKSIZE+16-4*(4+1)(sp)
|
|
//
|
|
PROLOGUE_END(RectCopyS)
|
|
//
|
|
lwz r6,PARAM4(r3) // r6 <- number of lines
|
|
and. r6,r6,r6 // Any lines to copy?
|
|
beq copys_exit // No -> exit
|
|
lwz r9,PARAM1(r3) // r9 <- target address
|
|
lwz r4,PARAM2(r3) // r4 <- source address
|
|
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
|
|
lwz r7,PARAM5(r3) // r7 <- target byte distance between lines
|
|
lwz r8,PARAM6(r3) // r8 <- source byte distance between lines
|
|
//
|
|
cmplwi r5,8 // More than 8 bytes?
|
|
bgt copys_20 // Yes -> do normal process
|
|
addic. r11,r5,-1 // r11 <- Length - 1
|
|
blt copys_exit // length = 0 -> just exit
|
|
bl copys_10
|
|
__CopyShortTable:
|
|
.ualong __copy1_A0
|
|
.ualong __copy1_A1
|
|
.ualong __copy1_A2
|
|
.ualong __copy1_A3
|
|
.ualong __copy2_A0
|
|
.ualong __copy2_A1
|
|
.ualong __copy2_A2
|
|
.ualong __copy2_A3
|
|
.ualong __copy3_A0
|
|
.ualong __copy3_A1
|
|
.ualong __copy3_A2
|
|
.ualong __copy3_A3
|
|
.ualong __copy4_A0
|
|
.ualong __copy4_A1
|
|
.ualong __copy4_A2
|
|
.ualong __copy4_A3
|
|
.ualong __copy5_A0
|
|
.ualong __copy5_A1
|
|
.ualong __copy5_A2
|
|
.ualong __copy5_A3
|
|
.ualong __copy6_A0
|
|
.ualong __copy6_A1
|
|
.ualong __copy6_A2
|
|
.ualong __copy6_A3
|
|
.ualong __copy7_A0
|
|
.ualong __copy7_A1
|
|
.ualong __copy7_A2
|
|
.ualong __copy7_A3
|
|
.ualong __copy8_A0
|
|
.ualong __copy8_A1
|
|
.ualong __copy8_A2
|
|
.ualong __copy8_A3
|
|
//
|
|
// Short copy routines for 1~8 bytes with 4 target word alignment cases
|
|
//
|
|
__copy1_A0:
|
|
__copy1_A1:
|
|
__copy1_A2:
|
|
__copy1_A3:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
stb r10,0(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy2_A0:
|
|
__copy2_A2:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
sth r10,0(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy2_A1:
|
|
__copy2_A3:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
stb r10,0(r9)
|
|
stb r11,1(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy3_A0:
|
|
__copy3_A2:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r12,2(r4)
|
|
sth r10,0(r9)
|
|
stb r12,2(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy3_A1:
|
|
__copy3_A3:
|
|
addic. r6,r6,-1
|
|
lbz r12,0(r4)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
stb r12,0(r9)
|
|
sth r10,1(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy4_A0:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stw r10,0(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy4_A1:
|
|
__copy4_A3:
|
|
addic. r6,r6,-1
|
|
lbz r12,0(r4)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r5,3(r4)
|
|
stb r12,0(r9)
|
|
sth r10,1(r9)
|
|
stb r5,3(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy4_A2:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r12,2(r4)
|
|
lbz r11,3(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
sth r10,0(r9)
|
|
sth r12,2(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy5_A0:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
lbz r12,4(r4)
|
|
stw r10,0(r9)
|
|
stb r12,4(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy5_A1:
|
|
addic. r6,r6,-1
|
|
lbz r5,0(r4)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r12,3(r4)
|
|
lbz r11,4(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
stb r5,0(r9)
|
|
sth r10,1(r9)
|
|
sth r12,3(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy5_A2:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r12,2(r4)
|
|
lbz r11,3(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
lbz r11,4(r4)
|
|
sth r10,0(r9)
|
|
sth r12,2(r9)
|
|
stb r11,4(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy5_A3:
|
|
addic. r6,r6,-1
|
|
lbz r12,0(r4)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,4(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
stb r12,0(r9)
|
|
stw r10,1(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy6_A0:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
lbz r12,4(r4)
|
|
lbz r11,5(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
stw r10,0(r9)
|
|
sth r12,4(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy6_A1:
|
|
addic. r6,r6,-1
|
|
lbz r5,0(r4)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r12,3(r4)
|
|
lbz r11,4(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
lbz r11,5(r4)
|
|
stb r5,0(r9)
|
|
sth r10,1(r9)
|
|
sth r12,3(r9)
|
|
stb r11,5(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy6_A2:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r12,2(r4)
|
|
lbz r11,3(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
lbz r11,4(r4)
|
|
rlwimi r12,r11,16,8,15
|
|
lbz r11,5(r4)
|
|
rlwimi r12,r11,24,0,7
|
|
sth r10,0(r9)
|
|
stw r12,2(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy6_A3:
|
|
addic. r6,r6,-1
|
|
lbz r12,0(r4)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,4(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
lbz r11,5(r4)
|
|
stb r12,0(r9)
|
|
stw r10,1(r9)
|
|
stb r11,5(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy7_A0:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
lbz r12,4(r4)
|
|
lbz r11,5(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
lbz r11,6(r4)
|
|
stw r10,0(r9)
|
|
sth r12,4(r9)
|
|
stb r11,6(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy7_A1:
|
|
addic. r6,r6,-1
|
|
lbz r5,0(r4)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r12,3(r4)
|
|
lbz r11,4(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
lbz r11,5(r4)
|
|
rlwimi r12,r11,16,8,15
|
|
lbz r11,6(r4)
|
|
rlwimi r12,r11,24,0,7
|
|
stb r5,0(r9)
|
|
sth r10,1(r9)
|
|
stw r12,3(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy7_A2:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r12,2(r4)
|
|
lbz r11,3(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
lbz r11,4(r4)
|
|
rlwimi r12,r11,16,8,15
|
|
lbz r11,5(r4)
|
|
rlwimi r12,r11,24,0,7
|
|
lbz r11,6(r4)
|
|
sth r10,0(r9)
|
|
stw r12,2(r9)
|
|
stb r11,6(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy7_A3:
|
|
addic. r6,r6,-1
|
|
lbz r5,0(r4)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,4(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
lbz r12,5(r4)
|
|
lbz r11,6(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
stb r5,0(r9)
|
|
stw r10,1(r9)
|
|
sth r12,5(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy8_A0:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
lbz r12,4(r4)
|
|
lbz r11,5(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
lbz r11,6(r4)
|
|
rlwimi r12,r11,16,8,15
|
|
lbz r11,7(r4)
|
|
rlwimi r12,r11,24,0,7
|
|
stw r10,0(r9)
|
|
stw r12,4(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy8_A1:
|
|
addic. r6,r6,-1
|
|
lbz r5,0(r4)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r12,3(r4)
|
|
lbz r11,4(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
lbz r11,5(r4)
|
|
rlwimi r12,r11,16,8,15
|
|
lbz r11,6(r4)
|
|
rlwimi r12,r11,24,0,7
|
|
lbz r11,7(r4)
|
|
stb r5,0(r9)
|
|
sth r10,1(r9)
|
|
stw r12,3(r9)
|
|
stb r11,7(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy8_A2:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r12,2(r4)
|
|
lbz r11,3(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
lbz r11,4(r4)
|
|
rlwimi r12,r11,16,8,15
|
|
lbz r11,5(r4)
|
|
rlwimi r12,r11,24,0,7
|
|
lbz r5,6(r4)
|
|
lbz r11,7(r4)
|
|
rlwimi r5,r11,8,16,23
|
|
sth r10,0(r9)
|
|
stw r12,2(r9)
|
|
sth r5,6(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__copy8_A3:
|
|
addic. r6,r6,-1
|
|
lbz r5,0(r4)
|
|
lbz r10,1(r4)
|
|
lbz r11,2(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lbz r11,3(r4)
|
|
rlwimi r10,r11,16,8,15
|
|
lbz r11,4(r4)
|
|
rlwimi r10,r11,24,0,7
|
|
lbz r12,5(r4)
|
|
lbz r11,6(r4)
|
|
rlwimi r12,r11,8,16,23
|
|
lbz r11,7(r4)
|
|
stb r5,0(r9)
|
|
stw r10,1(r9)
|
|
sth r12,5(r9)
|
|
stb r11,7(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
//
|
|
// Main copy routines for short case (4 bytes unit) forward direction
|
|
//
|
|
__copymains_0F:
|
|
mtctr r16
|
|
mr r5,r12
|
|
__copys0F_00:
|
|
addic. r5,r5,-1
|
|
lwz r10,0(r4)
|
|
stw r10,0(r9)
|
|
addi r4,r4,4
|
|
addi r9,r9,4
|
|
bne __copys0F_00
|
|
bctr
|
|
//
|
|
__copymains_1F:
|
|
mtctr r16
|
|
mr r5,r12
|
|
addi r4,r4,-1
|
|
lwz r10,0(r4)
|
|
__copys1F_00:
|
|
addic. r5,r5,-1
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
stw r11,0(r9)
|
|
addi r9,r9,4
|
|
bne __copys1F_00
|
|
addi r4,r4,1
|
|
bctr
|
|
//
|
|
__copymains_2F:
|
|
mtctr r16
|
|
mr r5,r12
|
|
lhz r10,0(r4)
|
|
addi r4,r4,-2
|
|
__copys2F_00:
|
|
addic. r5,r5,-1
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
stw r10,0(r9)
|
|
addi r9,r9,4
|
|
rlwinm r10,r11,16,16,31
|
|
bne __copys2F_00
|
|
addi r4,r4,2
|
|
bctr
|
|
//
|
|
__copymains_3F:
|
|
mtctr r16
|
|
mr r5,r12
|
|
lbz r10,0(r4)
|
|
addi r4,r4,-3
|
|
__copys3F_00:
|
|
addic. r5,r5,-1
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
stw r10,0(r9)
|
|
addi r9,r9,4
|
|
rlwinm r10,r11,8,24,31
|
|
bne __copys3F_00
|
|
addi r4,r4,3
|
|
bctr
|
|
//
|
|
// Main copy routines for short case (4 bytes unit) backward direction
|
|
//
|
|
__copymains_0B:
|
|
mtctr r16
|
|
mr r5,r12
|
|
__copys0B_00:
|
|
addic. r5,r5,-1
|
|
lwzu r11,-4(r4)
|
|
stwu r11,-4(r9)
|
|
bne __copys0B_00
|
|
bctr
|
|
//
|
|
__copymains_1B:
|
|
mtctr r16
|
|
mr r5,r12
|
|
lbzu r10,-1(r4) // Load last byte
|
|
__copys1B_00:
|
|
addic. r5,r5,-1
|
|
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
|
|
lwzu r10,-4(r4) // Load preceding word
|
|
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
bne __copys1B_00
|
|
addi r4,r4,1 // Adjust source pointer
|
|
bctr
|
|
//
|
|
__copymains_2B:
|
|
mtctr r16
|
|
mr r5,r12
|
|
lhzu r10,-2(r4) // Load needed two bytes in r10
|
|
__copys2B_00:
|
|
addic. r5,r5,-1
|
|
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
|
|
lwzu r10,-4(r4) // Load preceeding word in r10
|
|
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
bne __copys2B_00
|
|
addi r4,r4,2 // Adjust source pointer
|
|
bctr
|
|
//
|
|
__copymains_3B:
|
|
mtctr r16
|
|
mr r5,r12
|
|
addi r4,r4,1 // Adjust source pointer to make update word access
|
|
lwzu r10,-4(r4) // Load needed three bytes in MS r10
|
|
__copys3B_00:
|
|
addic. r5,r5,-1
|
|
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
|
|
lwzu r10,-4(r4) // Load preceeding word
|
|
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
|
|
stwu r11,-4(r9) // Store r11
|
|
bne __copys3B_00
|
|
addi r4,r4,3 // Adjust source pointer
|
|
bctr
|
|
//
|
|
// End of short copy routines
|
|
//
|
|
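//
// The backward loops are the mirror image: a sketch under the same
// assumptions, walking down from the end of the line.  The low bytes
// of the word loaded last supply the high bytes of the word stored
// next.  'tail' (1..3) is the number of source bytes above the last
// word boundary; unlike the asm (which loads exactly those bytes with
// lbzu/lhzu), this sketch simply reads the whole aligned word.
//
//	#include <stdint.h>
//
//	static void copy_words_misaligned_back(const uint8_t *src_end,
//	                                       uint32_t *dst_end,
//	                                       unsigned words, unsigned tail)
//	{
//	    const uint32_t *s = (const uint32_t *)(src_end - tail);
//	    unsigned shift = 8u * tail;
//	    uint32_t carry = *s;		/* low 'tail' bytes are live */
//
//	    while (words--) {
//	        uint32_t lower = *--s;
//	        *--dst_end = (lower >> shift) | (carry << (32u - shift));
//	        carry = lower;
//	    }
//	}
//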
copys_10:
	mflr	r10			// r10 <- top of table address
	rlwinm	r11,r11,4,25,27		// get length part of table index
	rlwimi	r11,r9,2,28,29		// get target alignment part of table index
	lwzx	r12,r10,r11		// r12 <- short copy routine address
	cmplw	r9,r4			// Compare source & target address
	blt-	copys_15		// Target is lower -> copy from top to bottom
	mullw	r10,r7,r6		// Target is higher -> copy from bottom to top
	add	r9,r9,r10		// r9 <- top target address of the line after last
	mullw	r10,r8,r6
	add	r4,r4,r10		// r4 <- top source address of the line after last
	neg	r7,r7			// r7 <- negative target distance between lines
	neg	r8,r8			// r8 <- negative source distance between lines
	add	r9,r9,r7		// r9 <- top target address of the last line
	add	r4,r4,r8		// r4 <- top source address of the last line
copys_15:
	mtctr	r12
	bctrl				// jump to short copy routine
//
	b	copys_exit		// return to this point after completing all lines
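//
// The direction choice above is the per-scan-line form of the memmove()
// overlap rule: if the target starts below the source, copy top-down;
// otherwise step both pointers past the last line, negate the per-line
// deltas and copy bottom-up.  A C sketch (names illustrative):
//
//	#include <stdint.h>
//	#include <string.h>
//
//	static void rect_copy(uint8_t *dst, const uint8_t *src, unsigned bytes,
//	                      unsigned lines, int dst_delta, int src_delta)
//	{
//	    if (dst >= src) {			/* overlap: go bottom-up */
//	        dst += (long)dst_delta * (lines - 1);
//	        src += (long)src_delta * (lines - 1);
//	        dst_delta = -dst_delta;
//	        src_delta = -src_delta;
//	    }
//	    while (lines--) {
//	        memmove(dst, src, bytes);
//	        dst += dst_delta;
//	        src += src_delta;
//	    }
//	}
//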
//
// normal case (width > 8)
//
copys_20:
	cmplw	r9,r4			// Compare source & target address
	blt-	copys_50		// Target is lower -> copy from top to bottom
	mullw	r10,r7,r6		// Target is higher -> copy from bottom to top
	add	r9,r9,r10		// r9 <- top target address of the line after last
	mullw	r10,r8,r6
	add	r4,r4,r10		// r4 <- top source address of the line after last
	subf	r7,r5,r7		// r7 <- target delta after pointer increment
	subf	r8,r5,r8		// r8 <- source delta after pointer increment
	neg	r7,r7			// r7 <- negative target delta
	neg	r8,r8			// r8 <- negative source delta
	add	r9,r9,r7		// r9 <- one byte after the last byte of the last line
	add	r4,r4,r8		// r4 <- one byte after the last byte of the last line
	bl	copys_30		// To get table address in LR
__CopyInitProcsB:
	.ualong	__copyInit_0B
	.ualong	__copyInit_1B
	.ualong	__copyInit_2B
	.ualong	__copyInit_3B
__CopyMainProcsB:
	.ualong	__copymains_0B
	.ualong	__copymains_1B
	.ualong	__copymains_2B
	.ualong	__copymains_3B
__CopyEndProcsB:
	.ualong	__copyEnd_0B
	.ualong	__copyEnd_1B
	.ualong	__copyEnd_2B
	.ualong	__copyEnd_3B
//
copys_30:
	mflr	r10			// r10 <- address of top table
	rlwinm.	r14,r9,2,28,29		// r14 <- table index to use depending on the ending alignment
	beq	copys_30x		// No init routine -> set r14 later
	lwzx	r14,r10,r14		// r14 <- subroutine to be called at first
copys_30x:
	andi.	r11,r9,0x03		// r11 <- number of bytes to be copied at first
	subf	r15,r11,r4		// r15 <- pointing one byte after initial copy adjustment (source)
	rlwinm	r12,r15,2,28,29		// r12 <- table index for main loop routine
	addi	r10,r10,__CopyMainProcsB-__CopyInitProcsB
	lwzx	r15,r10,r12		// r15 <- subroutine address for main loop
	subf	r11,r11,r5		// r11 <- remaining number of bytes to be copied
	srawi	r12,r11,2		// r12 <- number of words (4 byte unit) to be copied in the main loop
	rlwinm	r16,r11,2,28,29		// r16 <- table index for ending copy
	addi	r10,r10,__CopyEndProcsB-__CopyMainProcsB
	lwzx	r16,r10,r16		// r16 <- subroutine to be called after the main loop
//
	and.	r14,r14,r14		// Does an initial routine exist?
	bne	copys_35x		// Yes -> proceed
	mr	r14,r15			// No -> skip initial routine
copys_35x:
//
// Main process for copying
//
	mtctr	r14
	bctrl				// Jump to entry routine -> link to main routine -> link to end routine and loop
// back to here after all lines are copied
#if (! FULLCACHE)
	lwz	r5,PARAM3(r3)		// r5 <- bytes to copy per line
	subf	r4,r5,r4		// adjust source and
	subf	r9,r5,r9		// target pointer
	subf	r7,r5,r7		// also the deltas need to be
	subf	r8,r5,r8		// adjusted
#endif
	b	copys_exit
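//
// How a line is split across the three routines, in C: the target
// alignment fixes the number of bytes the init routine peels off, the
// remainder is divided into whole words for the main loop and a 0..3
// byte tail for the end routine.  A sketch of the backward case above
// (for the forward case the head is (4 - (tgt & 3)) & 3 instead):
//
//	static void split_line(unsigned tgt_addr, unsigned len,
//	                       unsigned *head, unsigned *words, unsigned *tail)
//	{
//	    *head  = tgt_addr & 3u;		/* init routine bytes */
//	    *words = (len - *head) >> 2;	/* main loop words    */
//	    *tail  = (len - *head) & 3u;	/* end routine bytes  */
//	}
//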
//
copys_50:
//
// Copy forward
//
	bl	copys_60
__CopyInitProcsF:
	.ualong	__copyInit_0F		// alignment 0 -> no initial copy
	.ualong	__copyInit_3F		// alignment 1 -> 3 bytes copied first
	.ualong	__copyInit_2F		// alignment 2 -> 2 bytes copied first
	.ualong	__copyInit_1F		// alignment 3 -> 1 byte copied first
__CopyMainProcsF:
	.ualong	__copymains_0F
	.ualong	__copymains_1F
	.ualong	__copymains_2F
	.ualong	__copymains_3F
__CopyEndProcsF:
	.ualong	__copyEnd_0F
	.ualong	__copyEnd_1F
	.ualong	__copyEnd_2F
	.ualong	__copyEnd_3F
//
copys_60:
	mflr	r10
	rlwinm.	r14,r9,2,28,29		// r14 <- table index to use depending on the initial alignment
	beq	copys_60x		// No init routine -> set r14 later
	lwzx	r14,r10,r14		// r14 <- subroutine to be called at first
copys_60x:
	andi.	r11,r9,0x03		// r11 <- initial target alignment
	beq-	copys_65
	subfic	r11,r11,4		// r11 <- number of bytes to be copied at first
copys_65:
	add	r15,r11,r4		// r15 <- source pointer after initial copy
	rlwinm	r15,r15,2,28,29		// r15 <- table index for main loop routine
	addi	r10,r10,__CopyMainProcsF-__CopyInitProcsF
	lwzx	r15,r10,r15		// r15 <- subroutine address for main loop
	subf	r11,r11,r5		// r11 <- remaining number of bytes to be copied
	srawi	r12,r11,2		// r12 <- number of words (4 byte unit) to be copied in the main loop
	rlwinm	r16,r11,2,28,29		// r16 <- table index for ending copy
	addi	r10,r10,__CopyEndProcsF-__CopyMainProcsF
	lwzx	r16,r10,r16		// r16 <- subroutine to be called after the main loop
//
	and.	r14,r14,r14		// Does an initial routine exist?
	bne	copy_65x		// Yes -> proceed
	mr	r14,r15			// No -> skip initial routine
copy_65x:
//
// Main process for copying
//
	subf	r7,r5,r7		// r7 <- line delta after updating pointer (target)
	subf	r8,r5,r8		// r8 <- line delta after updating pointer (source)
	mtctr	r14
	bctrl				// Jump to entry routine -> link to main routine -> link to end routine and loop
//
#if (! FULLCACHE)
	lwz	r5,PARAM3(r3)		// r5 <- bytes to copy per line
	add	r7,r5,r7		// restore target and
	add	r8,r5,r8		// source delta
#endif
//
copys_exit:
	lwz	r0,MINSTACKSIZE+16-4*5(sp)
	lwz	r17,MINSTACKSIZE+16-4*4(sp)
	lwz	r16,MINSTACKSIZE+16-4*3(sp)
	lwz	r15,MINSTACKSIZE+16-4*2(sp)
	lwz	r14,MINSTACKSIZE+16-4*1(sp)
	mtlr	r0
	addi	sp,sp,(MINSTACKSIZE+16)
//
	SPECIAL_EXIT(RectCopyS)
//
//*************************************************************************************************
	LEAF_ENTRY(RectFlushCache)
//
// Input Parameters:
//	r3: Target address (pointing to top left of the rectangle)
//	r4: Width of the rectangle (in bytes)
//	r5: Number of lines of the rectangle
//	r6: Target delta per line (in bytes)
//	r7: Maximum number of cache lines to flush
//	r8: Maximum number of display lines to flush
//
	addi	r10,r5,-1		// r10 <- number of lines - 1
	mullw	r9,r10,r6		// r9 <- offset to the last line
	add	r3,r3,r9		// r3 <- top address of the last line
	cmplw	r5,r8			// compare the number of lines against the
	ble	rect_flush_05		// maximum display lines to flush and
	mr	r5,r8			// take the smaller of the two
rect_flush_05:
	add	r8,r3,r4		// r8 <- one byte past the last byte of the last line
	addi	r8,r8,-1		// r8 <- the last byte of that line
	rlwinm	r8,r8,0,0,26		// r8 <- 32 byte aligned end address
	rlwinm	r3,r3,0,0,26		// r3 <- 32 byte aligned start address
	subf	r9,r3,r8		// r9 <- end - start
	srawi	r9,r9,5
	addi	r9,r9,1			// r9 <- number of cache entries to be flushed per line
rect_flush_10:
	mr	r10,r3			// r10 <- address to flush cache to start with
rect_flush_20:
	dcbf	0,r10			// Flush cached data
	addi	r10,r10,32		// Increment address to flush
	cmplw	r10,r8			// Exceeding end address?
	ble	rect_flush_20		// No -> loop to flush next cache line
	subf.	r7,r9,r7		// Flushed enough entries?
	blt-	rect_flush_exit		// Yes -> exit
	addic.	r5,r5,-1		// Flushed all lines?
	subf	r3,r6,r3		// Update start
	subf	r8,r6,r8		// and end address to flush cache to point to the previous line
	bne	rect_flush_10		// No -> continue to flush
rect_flush_exit:
	LEAF_EXIT(RectFlushCache)
//
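//
// The flush loop in C, as a sketch: one dcbf per 32-byte cache line,
// walking each scan line between its aligned start and end addresses
// and moving upward through the rectangle until every line, or the
// caller-supplied budget of cache lines, has been flushed.  dcbf is
// modeled by a stub; names are illustrative.
//
//	#include <stdint.h>
//
//	#define CACHE_LINE 32u
//
//	static void flush_line(uintptr_t addr) { /* dcbf 0,addr */ (void)addr; }
//
//	static void rect_flush(uintptr_t top, unsigned width, unsigned lines,
//	                       unsigned delta, long max_cache_lines)
//	{
//	    uintptr_t start = top + (uintptr_t)(lines - 1) * delta;
//	    uintptr_t end = (start + width - 1) & ~(uintptr_t)(CACHE_LINE - 1);
//	    start &= ~(uintptr_t)(CACHE_LINE - 1);
//	    long per_line = (long)((end - start) / CACHE_LINE) + 1;
//
//	    while (lines--) {
//	        for (uintptr_t p = start; p <= end; p += CACHE_LINE)
//	            flush_line(p);
//	        if ((max_cache_lines -= per_line) < 0)
//	            break;			/* budget exhausted   */
//	        start -= delta;			/* previous scan line */
//	        end -= delta;
//	    }
//	}
//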
//*************************************************************************************************
	SPECIAL_ENTRY(RectSrcOpTgt)
//
// Input Parameters:
//	r3: The pointer to the parameter structure as follows.
//	PARAM1  [00] : Target address
//	PARAM2  [04] : Source address
//	PARAM3  [08] : Number of bytes to op per line
//	PARAM4  [12] : Number of lines to op
//	PARAM5  [16] : Target line increment bytes per line
//	PARAM6  [20] : Source line increment bytes per line
//	PARAM7  [24] : Maximum number of cache lines to flush
//	PARAM8  [28] : Maximum number of display lines to flush
//	PARAM9  [32] : Operation control flag
//		bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
//		bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
//	PARAM10 [36] : Function entry
//	PARAM11 [40] : Solid Brush (if any)
//	PARAM12 [44] : Register save area 1
//	PARAM13 [48] : Register save area 2
//	PARAM14 [52] : Register save area 3
//	PARAM15 [56] : Register save area 4
//	PARAM16 [60] : Register save area 5
//	PARAM17 [64] : Register save area 6
//
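//
// For reference, the same parameter block as a C struct -- a sketch:
// only the offsets (PARAM1..PARAM17) come from this file, the field
// names are illustrative.
//
//	#include <stdint.h>
//
//	typedef struct _RECT_OP_PARAMS {		/* offset      */
//	    uint8_t       *Target;			/* 00  PARAM1  */
//	    const uint8_t *Source;			/* 04  PARAM2  */
//	    uint32_t       BytesPerLine;		/* 08  PARAM3  */
//	    uint32_t       Lines;			/* 12  PARAM4  */
//	    int32_t        TargetDelta;			/* 16  PARAM5  */
//	    int32_t        SourceDelta;			/* 20  PARAM6  */
//	    uint32_t       MaxCacheLines;		/* 24  PARAM7  */
//	    uint32_t       MaxDisplayLines;		/* 28  PARAM8  */
//	    uint32_t       ControlFlag;			/* 32  PARAM9  */
//	    void          *FunctionEntry;		/* 36  PARAM10 */
//	    uint32_t       SolidBrush;			/* 40  PARAM11 */
//	    uint32_t       RegisterSave[6];		/* 44  PARAM12..PARAM17 */
//	} RECT_OP_PARAMS;
//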
// Register usage:
//	r0: Return address save register
//	r4: Updating source address
//	r5: Number of bytes to op per line --> used as a counter (and destroyed) in the main op routine,
//	    used for the solid brush in case of a short operation (<= 2 bytes)
//	r6: Updating remaining number of lines to op
//	r7: Target increment bytes per line (changed to a precalculated value)
//	r8: Source increment bytes per line (changed to a precalculated value)
//	r9: Updating target address
//	r10: Work register
//	r11: Work register
//	r12: Innermost loop counter (8 bytes unit),
//	     used for the short op routine entry and then as a work register
//	     in case of a short operation (<= 2 bytes)
//	r14: Subroutine for init op
//	r15: Subroutine for main loop
//	r16: Subroutine for final op
//	r17: Work register
//	r18: Work register
//	r19: Solid Brush (if any)
//	CTR: Used for link
//
//
	mflr	r0			// save LR
//
// Save non-volatile registers
//
	stw	r14,SLACK2(sp)
	stw	r15,SLACK3(sp)
	stw	r16,SLACK4(sp)
	stw	r17,SLACK5(sp)
	stw	r18,SLACK6(sp)
	stw	r19,SLACK7(sp)
//
	PROLOGUE_END(RectSrcOpTgt)
//
	lwz	r6,PARAM4(r3)		// r6 <- number of lines to op
	and.	r6,r6,r6		// Any lines to op?
	beq-	opsrcs_exit		// No -> exit
	lwz	r9,PARAM1(r3)		// r9 <- target address
	lwz	r4,PARAM2(r3)		// r4 <- source address
	lwz	r5,PARAM3(r3)		// r5 <- bytes to op per line
	lwz	r7,PARAM5(r3)		// r7 <- target byte distance between lines
	lwz	r8,PARAM6(r3)		// r8 <- source byte distance between lines
	lwz	r10,PARAM10(r3)		// r10 <- asm function table entry
	cmplwi	r5,2			// More than 2 bytes?
	bgt	opsrcs_20		// Yes -> do normal process
	addic.	r11,r5,-1		// r11 <- length - 1
	blt	opsrcs_exit		// length = 0 -> just exit
//
	lwz	r5,PARAM11(r3)		// r5 <- solid brush for short operation
	rlwinm	r11,r11,4,25,27		// get length part of table index
	rlwimi	r11,r9,2,28,29		// get target alignment part of table index
	lwzx	r12,r10,r11		// r12 <- short op routine address
	cmplw	r9,r4			// Compare source & target address
	blt-	opsrcs_15		// Target is lower -> op from top to bottom
	mullw	r10,r7,r6		// Target is higher -> op from bottom to top
	add	r9,r9,r10		// r9 <- top target address of the line after last
	mullw	r10,r8,r6
	add	r4,r4,r10		// r4 <- top source address of the line after last
	neg	r7,r7			// r7 <- negative target distance between lines
	neg	r8,r8			// r8 <- negative source distance between lines
	add	r9,r9,r7		// r9 <- top target address of the last line
	add	r4,r4,r8		// r4 <- top source address of the last line
opsrcs_15:
	mtctr	r12
	bctrl				// jump to short op routine
//
	b	opsrcs_90		// return to this point after completing all lines
//
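//
// The short-path dispatch index packs the length and the target word
// alignment into one table offset, as the rlwinm/rlwimi pair above
// does.  In C (names illustrative): each length has a row of four
// 4-byte table entries, one per target alignment.
//
//	static unsigned short_op_index(unsigned len, unsigned tgt_addr)
//	{
//	    return (((len - 1) << 4) & 0x70) | ((tgt_addr & 3u) << 2);
//	}
//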
// normal case (width > 2)
//
opsrcs_20:
	lwz	r19,PARAM11(r3)		// r19 <- solid brush
	cmplw	r9,r4			// Compare source & target address
	blt-	opsrcs_50		// Target is lower -> op from top to bottom
	mullw	r11,r7,r6		// Target is higher -> op from bottom to top
	add	r9,r9,r11		// r9 <- top target address of the line after last
	mullw	r11,r8,r6
	add	r4,r4,r11		// r4 <- top source address of the line after last
	subf	r7,r5,r7		// r7 <- target delta after pointer increment
	subf	r8,r5,r8		// r8 <- source delta after pointer increment
	neg	r7,r7			// r7 <- negative target delta
	neg	r8,r8			// r8 <- negative source delta
	add	r9,r9,r7		// r9 <- one byte after the last byte of the last line
	add	r4,r4,r8		// r4 <- one byte after the last byte of the last line
//
	addi	r10,r10,__XorsInitProcsB-__XorsShortTable
	rlwinm	r17,r9,2,28,29		// r17 <- table index to use depending on the ending alignment
	lwzx	r14,r10,r17		// r14 <- subroutine to be called at first
	andi.	r11,r9,0x03		// r11 <- number of bytes to be processed at first
	subf	r15,r11,r4		// r15 <- pointing one byte after initial op adjustment (source)
	rlwinm	r12,r15,2,28,29		// r12 <- table index for main loop routine
	addi	r10,r10,__XorsMainProcsB-__XorsInitProcsB
	lwzx	r15,r10,r12		// r15 <- subroutine address for main loop
	subf	r11,r11,r5		// r11 <- remaining number of bytes to be processed
	srawi	r12,r11,2		// r12 <- number of words (4 byte unit) to be processed in the main loop
	rlwinm	r16,r11,2,28,29		// r16 <- table index for ending op
	addi	r10,r10,__XorsEndProcsB-__XorsMainProcsB
	lwzx	r16,r10,r16		// r16 <- subroutine to be called after the main loop
	and.	r12,r12,r12		// Internal loop counter 0?
	bne	opsrcs_30
	mr	r15,r16			// Yes -> skip main loop
opsrcs_30:
	and.	r17,r17,r17		// Does any initial operation exist?
	bne	opsrcs_40
	mr	r14,r15			// No -> skip initial routine
opsrcs_40:
//
// Main process for the op
//
	mtctr	r14
	bctrl				// Jump to entry routine -> main routine -> end routine and loop
// back here after all lines are processed
#if (! FULLCACHE)
	lwz	r5,PARAM3(r3)		// r5 <- bytes to op per line
	subf	r4,r5,r4		// adjust source and
	subf	r9,r5,r9		// target pointer
	subf	r7,r5,r7		// also the deltas need to be
	subf	r8,r5,r8		// adjusted
#endif
	b	opsrcs_90
//
opsrcs_50:
//
// Op forward
//
	addi	r10,r10,__XorsInitProcsF-__XorsShortTable
	rlwinm	r17,r9,2,28,29		// r17 <- table index to use depending on the initial alignment
	lwzx	r14,r10,r17		// r14 <- subroutine to be called at first
	andi.	r11,r9,0x03		// r11 <- initial target alignment
	beq-	opsrcs_60
	subfic	r11,r11,4		// r11 <- number of bytes to be processed at first
opsrcs_60:
	add	r15,r11,r4		// r15 <- source pointer after initial op
	rlwinm	r15,r15,2,28,29		// r15 <- table index for main loop routine
	addi	r10,r10,__XorsMainProcsF-__XorsInitProcsF
	lwzx	r15,r10,r15		// r15 <- subroutine address for main loop
	subf	r11,r11,r5		// r11 <- remaining number of bytes to be processed
	srawi	r12,r11,2		// r12 <- number of words (4 byte unit) to be processed in the main loop
	rlwinm	r16,r11,2,28,29		// r16 <- table index for ending op
	addi	r10,r10,__XorsEndProcsF-__XorsMainProcsF
	lwzx	r16,r10,r16		// r16 <- subroutine to be called after the main loop
	and.	r12,r12,r12		// Internal loop counter 0?
	bne	opsrcs_70
	mr	r15,r16			// Yes -> skip main loop
opsrcs_70:
	and.	r17,r17,r17		// Does any initial operation exist?
	bne	opsrcs_80
	mr	r14,r15			// No -> skip initial routine
opsrcs_80:
//
// Main process for the op
//
	subf	r7,r5,r7		// r7 <- line delta after updating pointer (target)
	subf	r8,r5,r8		// r8 <- line delta after updating pointer (source)
	mtctr	r14
	bctrl				// Jump to entry routine -> main routine -> end routine and loop
//
#if (! FULLCACHE)
	lwz	r5,PARAM3(r3)		// r5 <- bytes to op per line
	add	r7,r5,r7		// restore target and
	add	r8,r5,r8		// source delta
#endif
//
opsrcs_90:
#if (! FULLCACHE)
	bl	..copyflush
#endif
	b	opsrcs_exit
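//
// Every op family below (xor, and, or, orc, b8op, nor) repeats one
// read-modify-write shape: fetch the target word, combine it with the
// (possibly re-aligned) source word, store it back.  The aligned xor
// case in C as a sketch (the real loops add the three misaligned
// source variants and the per-line init/end trimming):
//
//	#include <stdint.h>
//
//	static void xor_words(const uint32_t *src, uint32_t *dst, unsigned words)
//	{
//	    while (words--) {
//	        *dst ^= *src++;		/* lwz, lwz, xor, stw */
//	        dst++;
//	    }
//	}
//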
//
// Short xor routines for 1~2 bytes with 4 target word alignment cases
//
__xors1_A0:
__xors1_A1:
__xors1_A2:
__xors1_A3:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	xor	r10,r10,r11
	stb	r10,0(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
__xors2_A0:
__xors2_A2:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	xor	r10,r10,r11
	sth	r10,0(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
__xors2_A1:
__xors2_A3:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	xor	r10,r10,r11
	lbz	r12,1(r4)
	lbz	r11,1(r9)
	xor	r12,r12,r11
	stb	r10,0(r9)
	stb	r12,1(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
//
// Main xor routines for short case (4 bytes unit) forward direction
//
__xorsmains_0F:
	mtctr	r16
	mr	r5,r12
__xorss0F_00:
	addic.	r5,r5,-1
	lwz	r10,0(r4)
	lwz	r17,0(r9)
	xor	r10,r10,r17
	stw	r10,0(r9)
	addi	r4,r4,4
	addi	r9,r9,4
	bne	__xorss0F_00
	bctr
//
__xorsmains_1F:
	mtctr	r16
	mr	r5,r12
	addi	r4,r4,-1
	lwz	r10,0(r4)
__xorss1F_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,24,8,31
	lwzu	r10,4(r4)
	rlwimi	r11,r10,24,0,7
	lwz	r17,0(r9)
	xor	r11,r11,r17
	stw	r11,0(r9)
	addi	r9,r9,4
	bne	__xorss1F_00
	addi	r4,r4,1
	bctr
//
__xorsmains_2F:
	mtctr	r16
	mr	r5,r12
	lhz	r10,0(r4)
	addi	r4,r4,-2
__xorss2F_00:
	addic.	r5,r5,-1
	lwzu	r11,4(r4)
	rlwimi	r10,r11,16,0,15
	lwz	r17,0(r9)
	xor	r10,r10,r17
	stw	r10,0(r9)
	addi	r9,r9,4
	rlwinm	r10,r11,16,16,31
	bne	__xorss2F_00
	addi	r4,r4,2
	bctr
//
__xorsmains_3F:
	mtctr	r16
	mr	r5,r12
	lbz	r10,0(r4)
	addi	r4,r4,-3
__xorss3F_00:
	addic.	r5,r5,-1
	lwzu	r11,4(r4)
	rlwimi	r10,r11,8,0,23
	lwz	r17,0(r9)
	xor	r10,r10,r17
	stw	r10,0(r9)
	addi	r9,r9,4
	rlwinm	r10,r11,8,24,31
	bne	__xorss3F_00
	addi	r4,r4,3
	bctr
//
// Initial xor routines for 1~3 bytes for forward direction
//
__xorsInit_0F:
	mtctr	r15
	bctr
__xorsInit_1F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	xor	r10,r10,r11
	stb	r10,0(r9)
	addi	r4,r4,1
	addi	r9,r9,1
	bctr
__xorsInit_2F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	xor	r10,r10,r11
	sth	r10,0(r9)
	addi	r4,r4,2
	addi	r9,r9,2
	bctr
__xorsInit_3F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	xor	r10,r10,r11
	stb	r10,0(r9)
	lbz	r10,1(r4)
	lbz	r11,2(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,1(r9)
	xor	r10,r10,r11
	sth	r10,1(r9)
	addi	r4,r4,3
	addi	r9,r9,3
	bctr
//
// Ending xor routines for 1~3 bytes for forward direction
//
__xorsEnd_0F:
	addic.	r6,r6,-1
	mtctr	r14
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__xorsEnd_1F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	xor	r10,r10,r11
	stb	r10,0(r9)
	addic.	r6,r6,-1
	addi	r4,r4,1
	addi	r9,r9,1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__xorsEnd_2F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	xor	r10,r10,r11
	sth	r10,0(r9)
	addic.	r6,r6,-1
	addi	r4,r4,2
	addi	r9,r9,2
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__xorsEnd_3F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	xor	r10,r10,r11
	sth	r10,0(r9)
	lbz	r10,2(r4)
	lbz	r11,2(r9)
	xor	r10,r10,r11
	stb	r10,2(r9)
	addic.	r6,r6,-1
	addi	r4,r4,3
	addi	r9,r9,3
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
//
// Main xor routines for short case (4 bytes unit) backward direction
//
__xorsmains_0B:
	mtctr	r16
	mr	r5,r12
__xorss0B_00:
	addic.	r5,r5,-1
	lwzu	r11,-4(r4)
	lwzu	r17,-4(r9)
	xor	r11,r11,r17
	stw	r11,0(r9)
	bne	__xorss0B_00
	bctr
//
__xorsmains_1B:
	mtctr	r16
	mr	r5,r12
	lbzu	r10,-1(r4)		// Load last byte
__xorss1B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,24,0,7		// Move LS byte in r10 to MS byte in r11
	lwzu	r10,-4(r4)		// Load preceding word
	rlwimi	r11,r10,24,8,31		// Insert MS 3 bytes in r10 into LS 3 bytes in r11
	lwzu	r17,-4(r9)
	xor	r11,r11,r17
	stw	r11,0(r9)		// Store r11
	bne	__xorss1B_00
	addi	r4,r4,1			// Adjust source pointer
	bctr
//
__xorsmains_2B:
	mtctr	r16
	mr	r5,r12
	lhzu	r10,-2(r4)		// Load the last two needed bytes into r10
__xorss2B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,16,0,15		// Move LS 2 bytes in r10 to MS 2 bytes in r11
	lwzu	r10,-4(r4)		// Load preceding word in r10
	rlwimi	r11,r10,16,16,31	// Insert MS 2 bytes in r10 into LS 2 bytes in r11
	lwzu	r17,-4(r9)
	xor	r11,r11,r17
	stw	r11,0(r9)		// Store r11
	bne	__xorss2B_00
	addi	r4,r4,2			// Adjust source pointer
	bctr
//
__xorsmains_3B:
	mtctr	r16
	mr	r5,r12
	addi	r4,r4,1			// Adjust source pointer to enable update-form word access
	lwzu	r10,-4(r4)		// Load the word holding the three needed tail bytes
__xorss3B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,8,0,23		// Move LS 3 bytes in r10 to MS bytes in r11
	lwzu	r10,-4(r4)		// Load preceding word
	rlwimi	r11,r10,8,24,31		// Insert MS byte in r10 into LS byte in r11
	lwzu	r17,-4(r9)
	xor	r11,r11,r17
	stw	r11,0(r9)		// Store r11
	bne	__xorss3B_00
	addi	r4,r4,3			// Adjust source pointer
	bctr
//
// Initial xor routines for 1~3 bytes for backward direction
//
__xorsInit_0B:
	mtctr	r15
	bctr
__xorsInit_1B:
	mtctr	r15
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	xor	r10,r10,r11
	stb	r10,0(r9)
	bctr
__xorsInit_2B:
	mtctr	r15
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	xor	r10,r10,r11
	sth	r10,0(r9)
	bctr
__xorsInit_3B:
	mtctr	r15
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	xor	r10,r10,r11
	stb	r10,0(r9)
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	xor	r10,r10,r11
	sth	r10,0(r9)
	bctr
//
// Ending xor routines for 1~3 bytes for backward direction
//
__xorsEnd_0B:
	addic.	r6,r6,-1
	mtctr	r14
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__xorsEnd_1B:
	mtctr	r14
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	xor	r10,r10,r11
	stb	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__xorsEnd_2B:
	mtctr	r14
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	xor	r10,r10,r11
	sth	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__xorsEnd_3B:
	mtctr	r14
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	xor	r10,r10,r11
	sth	r10,0(r9)
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	xor	r10,r10,r11
	stb	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
//
// Short and routines for 1~2 bytes with 4 target word alignment cases
//
__ands1_A0:
__ands1_A1:
__ands1_A2:
__ands1_A3:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	and	r10,r10,r11
	stb	r10,0(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
__ands2_A0:
__ands2_A2:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	and	r10,r10,r11
	sth	r10,0(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
__ands2_A1:
__ands2_A3:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	and	r10,r10,r11
	lbz	r12,1(r4)
	lbz	r11,1(r9)
	and	r12,r12,r11
	stb	r10,0(r9)
	stb	r12,1(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
//
// Main and routines for short case (4 bytes unit) forward direction
//
__andsmains_0F:
	mtctr	r16
	mr	r5,r12
__andss0F_00:
	addic.	r5,r5,-1
	lwz	r10,0(r4)
	lwz	r17,0(r9)
	and	r10,r10,r17
	stw	r10,0(r9)
	addi	r4,r4,4
	addi	r9,r9,4
	bne	__andss0F_00
	bctr
//
__andsmains_1F:
	mtctr	r16
	mr	r5,r12
	addi	r4,r4,-1
	lwz	r10,0(r4)
__andss1F_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,24,8,31
	lwzu	r10,4(r4)
	rlwimi	r11,r10,24,0,7
	lwz	r17,0(r9)
	and	r11,r11,r17
	stw	r11,0(r9)
	addi	r9,r9,4
	bne	__andss1F_00
	addi	r4,r4,1
	bctr
//
__andsmains_2F:
	mtctr	r16
	mr	r5,r12
	lhz	r10,0(r4)
	addi	r4,r4,-2
__andss2F_00:
	addic.	r5,r5,-1
	lwzu	r11,4(r4)
	rlwimi	r10,r11,16,0,15
	lwz	r17,0(r9)
	and	r10,r10,r17
	stw	r10,0(r9)
	addi	r9,r9,4
	rlwinm	r10,r11,16,16,31
	bne	__andss2F_00
	addi	r4,r4,2
	bctr
//
__andsmains_3F:
	mtctr	r16
	mr	r5,r12
	lbz	r10,0(r4)
	addi	r4,r4,-3
__andss3F_00:
	addic.	r5,r5,-1
	lwzu	r11,4(r4)
	rlwimi	r10,r11,8,0,23
	lwz	r17,0(r9)
	and	r10,r10,r17
	stw	r10,0(r9)
	addi	r9,r9,4
	rlwinm	r10,r11,8,24,31
	bne	__andss3F_00
	addi	r4,r4,3
	bctr
//
// Initial and routines for 1~3 bytes for forward direction
//
__andsInit_0F:
	mtctr	r15
	bctr
__andsInit_1F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	and	r10,r10,r11
	stb	r10,0(r9)
	addi	r4,r4,1
	addi	r9,r9,1
	bctr
__andsInit_2F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	and	r10,r10,r11
	sth	r10,0(r9)
	addi	r4,r4,2
	addi	r9,r9,2
	bctr
__andsInit_3F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	and	r10,r10,r11
	stb	r10,0(r9)
	lbz	r10,1(r4)
	lbz	r11,2(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,1(r9)
	and	r10,r10,r11
	sth	r10,1(r9)
	addi	r4,r4,3
	addi	r9,r9,3
	bctr
//
// Ending and routines for 1~3 bytes for forward direction
//
__andsEnd_0F:
	addic.	r6,r6,-1
	mtctr	r14
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__andsEnd_1F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	and	r10,r10,r11
	stb	r10,0(r9)
	addic.	r6,r6,-1
	addi	r4,r4,1
	addi	r9,r9,1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__andsEnd_2F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	and	r10,r10,r11
	sth	r10,0(r9)
	addic.	r6,r6,-1
	addi	r4,r4,2
	addi	r9,r9,2
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__andsEnd_3F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	and	r10,r10,r11
	sth	r10,0(r9)
	lbz	r10,2(r4)
	lbz	r11,2(r9)
	and	r10,r10,r11
	stb	r10,2(r9)
	addic.	r6,r6,-1
	addi	r4,r4,3
	addi	r9,r9,3
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
//
// Main and routines for short case (4 bytes unit) backward direction
//
__andsmains_0B:
	mtctr	r16
	mr	r5,r12
__andss0B_00:
	addic.	r5,r5,-1
	lwzu	r11,-4(r4)
	lwzu	r17,-4(r9)
	and	r11,r11,r17
	stw	r11,0(r9)
	bne	__andss0B_00
	bctr
//
__andsmains_1B:
	mtctr	r16
	mr	r5,r12
	lbzu	r10,-1(r4)		// Load last byte
__andss1B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,24,0,7		// Move LS byte in r10 to MS byte in r11
	lwzu	r10,-4(r4)		// Load preceding word
	rlwimi	r11,r10,24,8,31		// Insert MS 3 bytes in r10 into LS 3 bytes in r11
	lwzu	r17,-4(r9)
	and	r11,r11,r17
	stw	r11,0(r9)		// Store r11
	bne	__andss1B_00
	addi	r4,r4,1			// Adjust source pointer
	bctr
//
__andsmains_2B:
	mtctr	r16
	mr	r5,r12
	lhzu	r10,-2(r4)		// Load the last two needed bytes into r10
__andss2B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,16,0,15		// Move LS 2 bytes in r10 to MS 2 bytes in r11
	lwzu	r10,-4(r4)		// Load preceding word in r10
	rlwimi	r11,r10,16,16,31	// Insert MS 2 bytes in r10 into LS 2 bytes in r11
	lwzu	r17,-4(r9)
	and	r11,r11,r17
	stw	r11,0(r9)		// Store r11
	bne	__andss2B_00
	addi	r4,r4,2			// Adjust source pointer
	bctr
//
__andsmains_3B:
	mtctr	r16
	mr	r5,r12
	addi	r4,r4,1			// Adjust source pointer to enable update-form word access
	lwzu	r10,-4(r4)		// Load the word holding the three needed tail bytes
__andss3B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,8,0,23		// Move LS 3 bytes in r10 to MS bytes in r11
	lwzu	r10,-4(r4)		// Load preceding word
	rlwimi	r11,r10,8,24,31		// Insert MS byte in r10 into LS byte in r11
	lwzu	r17,-4(r9)
	and	r11,r11,r17
	stw	r11,0(r9)		// Store r11
	bne	__andss3B_00
	addi	r4,r4,3			// Adjust source pointer
	bctr
//
// Initial and routines for 1~3 bytes for backward direction
//
__andsInit_0B:
	mtctr	r15
	bctr
__andsInit_1B:
	mtctr	r15
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	and	r10,r10,r11
	stb	r10,0(r9)
	bctr
__andsInit_2B:
	mtctr	r15
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	and	r10,r10,r11
	sth	r10,0(r9)
	bctr
__andsInit_3B:
	mtctr	r15
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	and	r10,r10,r11
	stb	r10,0(r9)
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	and	r10,r10,r11
	sth	r10,0(r9)
	bctr
//
// Ending and routines for 1~3 bytes for backward direction
//
__andsEnd_0B:
	addic.	r6,r6,-1
	mtctr	r14
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__andsEnd_1B:
	mtctr	r14
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	and	r10,r10,r11
	stb	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__andsEnd_2B:
	mtctr	r14
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	and	r10,r10,r11
	sth	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__andsEnd_3B:
	mtctr	r14
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	and	r10,r10,r11
	sth	r10,0(r9)
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	and	r10,r10,r11
	stb	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
//
// Short or routines for 1~2 bytes with 4 target word alignment cases
//
__ors1_A0:
__ors1_A1:
__ors1_A2:
__ors1_A3:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	or	r10,r10,r11
	stb	r10,0(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
__ors2_A0:
__ors2_A2:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	or	r10,r10,r11
	sth	r10,0(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
__ors2_A1:
__ors2_A3:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	or	r10,r10,r11
	lbz	r12,1(r4)
	lbz	r11,1(r9)
	or	r12,r12,r11
	stb	r10,0(r9)
	stb	r12,1(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
//
// Main or routines for short case (4 bytes unit) forward direction
//
__orsmains_0F:
	mtctr	r16
	mr	r5,r12
__orss0F_00:
	addic.	r5,r5,-1
	lwz	r10,0(r4)
	lwz	r17,0(r9)
	or	r10,r10,r17
	stw	r10,0(r9)
	addi	r4,r4,4
	addi	r9,r9,4
	bne	__orss0F_00
	bctr
//
__orsmains_1F:
	mtctr	r16
	mr	r5,r12
	addi	r4,r4,-1
	lwz	r10,0(r4)
__orss1F_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,24,8,31
	lwzu	r10,4(r4)
	rlwimi	r11,r10,24,0,7
	lwz	r17,0(r9)
	or	r11,r11,r17
	stw	r11,0(r9)
	addi	r9,r9,4
	bne	__orss1F_00
	addi	r4,r4,1
	bctr
//
__orsmains_2F:
	mtctr	r16
	mr	r5,r12
	lhz	r10,0(r4)
	addi	r4,r4,-2
__orss2F_00:
	addic.	r5,r5,-1
	lwzu	r11,4(r4)
	rlwimi	r10,r11,16,0,15
	lwz	r17,0(r9)
	or	r10,r10,r17
	stw	r10,0(r9)
	addi	r9,r9,4
	rlwinm	r10,r11,16,16,31
	bne	__orss2F_00
	addi	r4,r4,2
	bctr
//
__orsmains_3F:
	mtctr	r16
	mr	r5,r12
	lbz	r10,0(r4)
	addi	r4,r4,-3
__orss3F_00:
	addic.	r5,r5,-1
	lwzu	r11,4(r4)
	rlwimi	r10,r11,8,0,23
	lwz	r17,0(r9)
	or	r10,r10,r17
	stw	r10,0(r9)
	addi	r9,r9,4
	rlwinm	r10,r11,8,24,31
	bne	__orss3F_00
	addi	r4,r4,3
	bctr
//
// Initial or routines for 1~3 bytes for forward direction
//
__orsInit_0F:
	mtctr	r15
	bctr
__orsInit_1F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	or	r10,r10,r11
	stb	r10,0(r9)
	addi	r4,r4,1
	addi	r9,r9,1
	bctr
__orsInit_2F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	or	r10,r10,r11
	sth	r10,0(r9)
	addi	r4,r4,2
	addi	r9,r9,2
	bctr
__orsInit_3F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	or	r10,r10,r11
	stb	r10,0(r9)
	lbz	r10,1(r4)
	lbz	r11,2(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,1(r9)
	or	r10,r10,r11
	sth	r10,1(r9)
	addi	r4,r4,3
	addi	r9,r9,3
	bctr
//
// Ending or routines for 1~3 bytes for forward direction
//
__orsEnd_0F:
	addic.	r6,r6,-1
	mtctr	r14
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__orsEnd_1F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	or	r10,r10,r11
	stb	r10,0(r9)
	addic.	r6,r6,-1
	addi	r4,r4,1
	addi	r9,r9,1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__orsEnd_2F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	or	r10,r10,r11
	sth	r10,0(r9)
	addic.	r6,r6,-1
	addi	r4,r4,2
	addi	r9,r9,2
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__orsEnd_3F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	or	r10,r10,r11
	sth	r10,0(r9)
	lbz	r10,2(r4)
	lbz	r11,2(r9)
	or	r10,r10,r11
	stb	r10,2(r9)
	addic.	r6,r6,-1
	addi	r4,r4,3
	addi	r9,r9,3
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
//
// Main or routines for short case (4 bytes unit) backward direction
//
__orsmains_0B:
	mtctr	r16
	mr	r5,r12
__orss0B_00:
	addic.	r5,r5,-1
	lwzu	r11,-4(r4)
	lwzu	r17,-4(r9)
	or	r11,r11,r17
	stw	r11,0(r9)
	bne	__orss0B_00
	bctr
//
__orsmains_1B:
	mtctr	r16
	mr	r5,r12
	lbzu	r10,-1(r4)		// Load last byte
__orss1B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,24,0,7		// Move LS byte in r10 to MS byte in r11
	lwzu	r10,-4(r4)		// Load preceding word
	rlwimi	r11,r10,24,8,31		// Insert MS 3 bytes in r10 into LS 3 bytes in r11
	lwzu	r17,-4(r9)
	or	r11,r11,r17
	stw	r11,0(r9)		// Store r11
	bne	__orss1B_00
	addi	r4,r4,1			// Adjust source pointer
	bctr
//
__orsmains_2B:
	mtctr	r16
	mr	r5,r12
	lhzu	r10,-2(r4)		// Load the last two needed bytes into r10
__orss2B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,16,0,15		// Move LS 2 bytes in r10 to MS 2 bytes in r11
	lwzu	r10,-4(r4)		// Load preceding word in r10
	rlwimi	r11,r10,16,16,31	// Insert MS 2 bytes in r10 into LS 2 bytes in r11
	lwzu	r17,-4(r9)
	or	r11,r11,r17
	stw	r11,0(r9)		// Store r11
	bne	__orss2B_00
	addi	r4,r4,2			// Adjust source pointer
	bctr
//
__orsmains_3B:
	mtctr	r16
	mr	r5,r12
	addi	r4,r4,1			// Adjust source pointer to enable update-form word access
	lwzu	r10,-4(r4)		// Load the word holding the three needed tail bytes
__orss3B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,8,0,23		// Move LS 3 bytes in r10 to MS bytes in r11
	lwzu	r10,-4(r4)		// Load preceding word
	rlwimi	r11,r10,8,24,31		// Insert MS byte in r10 into LS byte in r11
	lwzu	r17,-4(r9)
	or	r11,r11,r17
	stw	r11,0(r9)		// Store r11
	bne	__orss3B_00
	addi	r4,r4,3			// Adjust source pointer
	bctr
//
// Initial or routines for 1~3 bytes for backward direction
//
__orsInit_0B:
	mtctr	r15
	bctr
__orsInit_1B:
	mtctr	r15
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	or	r10,r10,r11
	stb	r10,0(r9)
	bctr
__orsInit_2B:
	mtctr	r15
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	or	r10,r10,r11
	sth	r10,0(r9)
	bctr
__orsInit_3B:
	mtctr	r15
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	or	r10,r10,r11
	stb	r10,0(r9)
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	or	r10,r10,r11
	sth	r10,0(r9)
	bctr
//
// Ending or routines for 1~3 bytes for backward direction
//
__orsEnd_0B:
	addic.	r6,r6,-1
	mtctr	r14
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__orsEnd_1B:
	mtctr	r14
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	or	r10,r10,r11
	stb	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__orsEnd_2B:
	mtctr	r14
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	or	r10,r10,r11
	sth	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__orsEnd_3B:
	mtctr	r14
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	or	r10,r10,r11
	sth	r10,0(r9)
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	or	r10,r10,r11
	stb	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
//
// Short orc routines for 1~2 bytes with 4 target word alignment cases
//
__orcs1_A0:
__orcs1_A1:
__orcs1_A2:
__orcs1_A3:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	orc	r10,r11,r10
	stb	r10,0(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
__orcs2_A0:
__orcs2_A2:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	orc	r10,r11,r10
	sth	r10,0(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
__orcs2_A1:
__orcs2_A3:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	orc	r10,r11,r10
	lbz	r12,1(r4)
	lbz	r11,1(r9)
	orc	r12,r11,r12
	stb	r10,0(r9)
	stb	r12,1(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
//
// Main orc routines for short case (4 bytes unit) forward direction
//
__orcsmains_0F:
	mtctr	r16
	mr	r5,r12
__orcss0F_00:
	addic.	r5,r5,-1
	lwz	r10,0(r4)
	lwz	r17,0(r9)
	orc	r10,r17,r10
	stw	r10,0(r9)
	addi	r4,r4,4
	addi	r9,r9,4
	bne	__orcss0F_00
	bctr
//
__orcsmains_1F:
	mtctr	r16
	mr	r5,r12
	addi	r4,r4,-1
	lwz	r10,0(r4)
__orcss1F_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,24,8,31
	lwzu	r10,4(r4)
	rlwimi	r11,r10,24,0,7
	lwz	r17,0(r9)
	orc	r11,r17,r11
	stw	r11,0(r9)
	addi	r9,r9,4
	bne	__orcss1F_00
	addi	r4,r4,1
	bctr
//
__orcsmains_2F:
	mtctr	r16
	mr	r5,r12
	lhz	r10,0(r4)
	addi	r4,r4,-2
__orcss2F_00:
	addic.	r5,r5,-1
	lwzu	r11,4(r4)
	rlwimi	r10,r11,16,0,15
	lwz	r17,0(r9)
	orc	r10,r17,r10
	stw	r10,0(r9)
	addi	r9,r9,4
	rlwinm	r10,r11,16,16,31
	bne	__orcss2F_00
	addi	r4,r4,2
	bctr
//
__orcsmains_3F:
	mtctr	r16
	mr	r5,r12
	lbz	r10,0(r4)
	addi	r4,r4,-3
__orcss3F_00:
	addic.	r5,r5,-1
	lwzu	r11,4(r4)
	rlwimi	r10,r11,8,0,23
	lwz	r17,0(r9)
	orc	r10,r17,r10
	stw	r10,0(r9)
	addi	r9,r9,4
	rlwinm	r10,r11,8,24,31
	bne	__orcss3F_00
	addi	r4,r4,3
	bctr
//
// Initial orc routines for 1~3 bytes for forward direction
//
__orcsInit_0F:
	mtctr	r15
	bctr
__orcsInit_1F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	orc	r10,r11,r10
	stb	r10,0(r9)
	addi	r4,r4,1
	addi	r9,r9,1
	bctr
__orcsInit_2F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	orc	r10,r11,r10
	sth	r10,0(r9)
	addi	r4,r4,2
	addi	r9,r9,2
	bctr
__orcsInit_3F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	orc	r10,r11,r10
	stb	r10,0(r9)
	lbz	r10,1(r4)
	lbz	r11,2(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,1(r9)
	orc	r10,r11,r10
	sth	r10,1(r9)
	addi	r4,r4,3
	addi	r9,r9,3
	bctr
//
// Ending orc routines for 1~3 bytes for forward direction
//
__orcsEnd_0F:
	addic.	r6,r6,-1
	mtctr	r14
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__orcsEnd_1F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	orc	r10,r11,r10
	stb	r10,0(r9)
	addic.	r6,r6,-1
	addi	r4,r4,1
	addi	r9,r9,1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__orcsEnd_2F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	orc	r10,r11,r10
	sth	r10,0(r9)
	addic.	r6,r6,-1
	addi	r4,r4,2
	addi	r9,r9,2
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__orcsEnd_3F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	orc	r10,r11,r10
	sth	r10,0(r9)
	lbz	r10,2(r4)
	lbz	r11,2(r9)
	orc	r10,r11,r10
	stb	r10,2(r9)
	addic.	r6,r6,-1
	addi	r4,r4,3
	addi	r9,r9,3
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
//
// Main orc routines for short case (4 bytes unit) backward direction
//
__orcsmains_0B:
	mtctr	r16
	mr	r5,r12
__orcss0B_00:
	addic.	r5,r5,-1
	lwzu	r11,-4(r4)
	lwzu	r17,-4(r9)
	orc	r11,r17,r11
	stw	r11,0(r9)
	bne	__orcss0B_00
	bctr
//
__orcsmains_1B:
	mtctr	r16
	mr	r5,r12
	lbzu	r10,-1(r4)		// Load last byte
__orcss1B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,24,0,7		// Move LS byte in r10 to MS byte in r11
	lwzu	r10,-4(r4)		// Load preceding word
	rlwimi	r11,r10,24,8,31		// Insert MS 3 bytes in r10 into LS 3 bytes in r11
	lwzu	r17,-4(r9)
	orc	r11,r17,r11
	stw	r11,0(r9)		// Store r11
	bne	__orcss1B_00
	addi	r4,r4,1			// Adjust source pointer
	bctr
//
__orcsmains_2B:
	mtctr	r16
	mr	r5,r12
	lhzu	r10,-2(r4)		// Load the last two needed bytes into r10
__orcss2B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,16,0,15		// Move LS 2 bytes in r10 to MS 2 bytes in r11
	lwzu	r10,-4(r4)		// Load preceding word in r10
	rlwimi	r11,r10,16,16,31	// Insert MS 2 bytes in r10 into LS 2 bytes in r11
	lwzu	r17,-4(r9)
	orc	r11,r17,r11
	stw	r11,0(r9)		// Store r11
	bne	__orcss2B_00
	addi	r4,r4,2			// Adjust source pointer
	bctr
//
__orcsmains_3B:
	mtctr	r16
	mr	r5,r12
	addi	r4,r4,1			// Adjust source pointer to enable update-form word access
	lwzu	r10,-4(r4)		// Load the word holding the three needed tail bytes
__orcss3B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,8,0,23		// Move LS 3 bytes in r10 to MS bytes in r11
	lwzu	r10,-4(r4)		// Load preceding word
	rlwimi	r11,r10,8,24,31		// Insert MS byte in r10 into LS byte in r11
	lwzu	r17,-4(r9)
	orc	r11,r17,r11
	stw	r11,0(r9)		// Store r11
	bne	__orcss3B_00
	addi	r4,r4,3			// Adjust source pointer
	bctr
//
// Initial orc routines for 1~3 bytes for backward direction
//
__orcsInit_0B:
	mtctr	r15
	bctr
__orcsInit_1B:
	mtctr	r15
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	orc	r10,r11,r10
	stb	r10,0(r9)
	bctr
__orcsInit_2B:
	mtctr	r15
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	orc	r10,r11,r10
	sth	r10,0(r9)
	bctr
__orcsInit_3B:
	mtctr	r15
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	orc	r10,r11,r10
	stb	r10,0(r9)
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	orc	r10,r11,r10
	sth	r10,0(r9)
	bctr
//
// Ending orc routines for 1~3 bytes for backward direction
//
__orcsEnd_0B:
	addic.	r6,r6,-1
	mtctr	r14
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__orcsEnd_1B:
	mtctr	r14
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	orc	r10,r11,r10
	stb	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__orcsEnd_2B:
	mtctr	r14
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	orc	r10,r11,r10
	sth	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__orcsEnd_3B:
	mtctr	r14
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	orc	r10,r11,r10
	sth	r10,0(r9)
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	orc	r10,r11,r10
	stb	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
//
// Short b8op routines for 1~2 bytes with 4 target word alignment cases
//
__b8ops1_A0:
__b8ops1_A1:
__b8ops1_A2:
__b8ops1_A3:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	andc	r12,r5,r10
	and	r11,r11,r10
	or	r10,r11,r12
	stb	r10,0(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
__b8ops2_A0:
__b8ops2_A2:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	andc	r12,r5,r10
	and	r11,r11,r10
	or	r10,r11,r12
	sth	r10,0(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
__b8ops2_A1:
__b8ops2_A3:
	addic.	r6,r6,-1
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	andc	r12,r5,r10
	and	r11,r11,r10
	or	r10,r11,r12
	lbz	r12,1(r4)
	lbz	r11,1(r9)
	stb	r10,0(r9)
	andc	r10,r5,r12
	and	r11,r11,r12
	or	r12,r10,r11
	stb	r12,1(r9)
	add	r4,r4,r8
	add	r9,r9,r7
	bnectr
	blr
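//
// The b8op routines above apply the solid brush through the source as
// a bit mask: where a source bit is 1 the target bit is kept, where it
// is 0 the brush bit is taken.  The andc/and/or triple in C:
//
//	#include <stdint.h>
//
//	static uint32_t b8op(uint32_t tgt, uint32_t src, uint32_t brush)
//	{
//	    return (tgt & src) | (brush & ~src);	/* and, andc, or */
//	}
//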
//
// Main b8op routines for short case (4 bytes unit) forward direction
//
__b8opsmains_0F:
	mtctr	r16
	mr	r5,r12
__b8opss0F_00:
	addic.	r5,r5,-1
	lwz	r10,0(r4)
	lwz	r11,0(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	stw	r10,0(r9)
	addi	r4,r4,4
	addi	r9,r9,4
	bne	__b8opss0F_00
	bctr
//
__b8opsmains_1F:
	mtctr	r16
	mr	r5,r12
	addi	r4,r4,-1
	lwz	r10,0(r4)
__b8opss1F_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,24,8,31
	lwzu	r10,4(r4)
	rlwimi	r11,r10,24,0,7
	lwz	r17,0(r9)
	andc	r18,r19,r11
	and	r17,r17,r11
	or	r11,r17,r18
	stw	r11,0(r9)
	addi	r9,r9,4
	bne	__b8opss1F_00
	addi	r4,r4,1
	bctr
//
__b8opsmains_2F:
	mtctr	r16
	mr	r5,r12
	lhz	r10,0(r4)
	addi	r4,r4,-2
__b8opss2F_00:
	addic.	r5,r5,-1
	lwzu	r11,4(r4)
	rlwimi	r10,r11,16,0,15
	lwz	r17,0(r9)
	andc	r18,r19,r10
	and	r17,r17,r10
	or	r10,r17,r18
	stw	r10,0(r9)
	addi	r9,r9,4
	rlwinm	r10,r11,16,16,31
	bne	__b8opss2F_00
	addi	r4,r4,2
	bctr
//
__b8opsmains_3F:
	mtctr	r16
	mr	r5,r12
	lbz	r10,0(r4)
	addi	r4,r4,-3
__b8opss3F_00:
	addic.	r5,r5,-1
	lwzu	r11,4(r4)
	rlwimi	r10,r11,8,0,23
	lwz	r17,0(r9)
	andc	r18,r19,r10
	and	r17,r17,r10
	or	r10,r17,r18
	stw	r10,0(r9)
	addi	r9,r9,4
	rlwinm	r10,r11,8,24,31
	bne	__b8opss3F_00
	addi	r4,r4,3
	bctr
//
// Initial b8op routines for 1~3 bytes for forward direction
//
__b8opsInit_0F:
	mtctr	r15
	bctr
__b8opsInit_1F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	stb	r10,0(r9)
	addi	r4,r4,1
	addi	r9,r9,1
	bctr
__b8opsInit_2F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	sth	r10,0(r9)
	addi	r4,r4,2
	addi	r9,r9,2
	bctr
__b8opsInit_3F:
	mtctr	r15
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	stb	r10,0(r9)
	lbz	r10,1(r4)
	lbz	r11,2(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,1(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	sth	r10,1(r9)
	addi	r4,r4,3
	addi	r9,r9,3
	bctr
//
// Ending b8op routines for 1~3 bytes for forward direction
//
__b8opsEnd_0F:
	addic.	r6,r6,-1
	mtctr	r14
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__b8opsEnd_1F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,0(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	stb	r10,0(r9)
	addic.	r6,r6,-1
	addi	r4,r4,1
	addi	r9,r9,1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__b8opsEnd_2F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	sth	r10,0(r9)
	addic.	r6,r6,-1
	addi	r4,r4,2
	addi	r9,r9,2
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__b8opsEnd_3F:
	mtctr	r14
	lbz	r10,0(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhz	r11,0(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	sth	r10,0(r9)
	lbz	r10,2(r4)
	lbz	r11,2(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	stb	r10,2(r9)
	addic.	r6,r6,-1
	addi	r4,r4,3
	addi	r9,r9,3
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
//
// Main b8op routines for short case (4 bytes unit) backward direction
//
__b8opsmains_0B:
	mtctr	r16
	mr	r5,r12
__b8opss0B_00:
	addic.	r5,r5,-1
	lwzu	r11,-4(r4)
	lwzu	r17,-4(r9)
	andc	r18,r19,r11
	and	r17,r17,r11
	or	r11,r17,r18
	stw	r11,0(r9)
	bne	__b8opss0B_00
	bctr
//
__b8opsmains_1B:
	mtctr	r16
	mr	r5,r12
	lbzu	r10,-1(r4)		// Load last byte
__b8opss1B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,24,0,7		// Move LS byte in r10 to MS byte in r11
	lwzu	r10,-4(r4)		// Load preceding word
	rlwimi	r11,r10,24,8,31		// Insert MS 3 bytes in r10 into LS 3 bytes in r11
	lwzu	r17,-4(r9)
	andc	r18,r19,r11
	and	r17,r17,r11
	or	r11,r17,r18
	stw	r11,0(r9)		// Store r11
	bne	__b8opss1B_00
	addi	r4,r4,1			// Adjust source pointer
	bctr
//
__b8opsmains_2B:
	mtctr	r16
	mr	r5,r12
	lhzu	r10,-2(r4)		// Load the last two needed bytes into r10
__b8opss2B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,16,0,15		// Move LS 2 bytes in r10 to MS 2 bytes in r11
	lwzu	r10,-4(r4)		// Load preceding word in r10
	rlwimi	r11,r10,16,16,31	// Insert MS 2 bytes in r10 into LS 2 bytes in r11
	lwzu	r17,-4(r9)
	andc	r18,r19,r11
	and	r17,r17,r11
	or	r11,r17,r18
	stw	r11,0(r9)		// Store r11
	bne	__b8opss2B_00
	addi	r4,r4,2			// Adjust source pointer
	bctr
//
__b8opsmains_3B:
	mtctr	r16
	mr	r5,r12
	addi	r4,r4,1			// Adjust source pointer to enable update-form word access
	lwzu	r10,-4(r4)		// Load the word holding the three needed tail bytes
__b8opss3B_00:
	addic.	r5,r5,-1
	rlwinm	r11,r10,8,0,23		// Move LS 3 bytes in r10 to MS bytes in r11
	lwzu	r10,-4(r4)		// Load preceding word
	rlwimi	r11,r10,8,24,31		// Insert MS byte in r10 into LS byte in r11
	lwzu	r17,-4(r9)
	andc	r18,r19,r11
	and	r17,r17,r11
	or	r11,r17,r18
	stw	r11,0(r9)		// Store r11
	bne	__b8opss3B_00
	addi	r4,r4,3			// Adjust source pointer
	bctr
//
// Initial b8op routines for 1~3 bytes for backward direction
//
__b8opsInit_0B:
	mtctr	r15
	bctr
__b8opsInit_1B:
	mtctr	r15
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	stb	r10,0(r9)
	bctr
__b8opsInit_2B:
	mtctr	r15
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	sth	r10,0(r9)
	bctr
__b8opsInit_3B:
	mtctr	r15
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	stb	r10,0(r9)
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	sth	r10,0(r9)
	bctr
//
// Ending b8op routines for 1~3 bytes for backward direction
//
__b8opsEnd_0B:
	addic.	r6,r6,-1
	mtctr	r14
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__b8opsEnd_1B:
	mtctr	r14
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	stb	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__b8opsEnd_2B:
	mtctr	r14
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	sth	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
__b8opsEnd_3B:
	mtctr	r14
	lbzu	r10,-2(r4)
	lbz	r11,1(r4)
	rlwimi	r10,r11,8,16,23
	lhzu	r11,-2(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	sth	r10,0(r9)
	lbzu	r10,-1(r4)
	lbzu	r11,-1(r9)
	andc	r18,r19,r10
	and	r11,r11,r10
	or	r10,r11,r18
	stb	r10,0(r9)
	addic.	r6,r6,-1
	add	r9,r9,r7
	add	r4,r4,r8
	bnectr
	blr
//
|
|
// Short nor routines for 1~2 bytes with 4 target word alignment cases
|
|
//
|
|
__nors1_A0:
|
|
__nors1_A1:
|
|
__nors1_A2:
|
|
__nors1_A3:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,0(r9)
|
|
nor r10,r10,r11
|
|
stb r10,0(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__nors2_A0:
|
|
__nors2_A2:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,1(r4)
|
|
rlwimi r10,r11,8,16,23
|
|
lhz r11,0(r9)
|
|
nor r10,r10,r11
|
|
sth r10,0(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
__nors2_A1:
|
|
__nors2_A3:
|
|
addic. r6,r6,-1
|
|
lbz r10,0(r4)
|
|
lbz r11,0(r9)
|
|
nor r10,r10,r11
|
|
lbz r12,1(r4)
|
|
lbz r11,1(r9)
|
|
nor r12,r12,r11
|
|
stb r10,0(r9)
|
|
stb r12,1(r9)
|
|
add r4,r4,r8
|
|
add r9,r9,r7
|
|
bnectr
|
|
blr
|
|
//
|
|
// Main nor routines for short case (4 bytes unit) forward direction
|
|
//
|
|
__norsmains_0F:
|
|
mtctr r16
|
|
mr r5,r12
|
|
__norss0F_00:
|
|
addic. r5,r5,-1
|
|
lwz r10,0(r4)
|
|
lwz r17,0(r9)
|
|
nor r10,r10,r17
|
|
stw r10,0(r9)
|
|
addi r4,r4,4
|
|
addi r9,r9,4
|
|
bne __norss0F_00
|
|
bctr
|
|
//
|
|
__norsmains_1F:
|
|
mtctr r16
|
|
mr r5,r12
|
|
addi r4,r4,-1
|
|
lwz r10,0(r4)
|
|
__norss1F_00:
|
|
addic. r5,r5,-1
|
|
rlwinm r11,r10,24,8,31
|
|
lwzu r10,4(r4)
|
|
rlwimi r11,r10,24,0,7
|
|
lwz r17,0(r9)
|
|
nor r11,r11,r17
|
|
stw r11,0(r9)
|
|
addi r9,r9,4
|
|
bne __norss1F_00
|
|
addi r4,r4,1
|
|
bctr
|
|
//
|
|
__norsmains_2F:
|
|
mtctr r16
|
|
mr r5,r12
|
|
lhz r10,0(r4)
|
|
addi r4,r4,-2
|
|
__norss2F_00:
|
|
addic. r5,r5,-1
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,16,0,15
|
|
lwz r17,0(r9)
|
|
nor r10,r10,r17
|
|
stw r10,0(r9)
|
|
addi r9,r9,4
|
|
rlwinm r10,r11,16,16,31
|
|
bne __norss2F_00
|
|
addi r4,r4,2
|
|
bctr
|
|
//
|
|
__norsmains_3F:
|
|
mtctr r16
|
|
mr r5,r12
|
|
lbz r10,0(r4)
|
|
addi r4,r4,-3
|
|
__norss3F_00:
|
|
addic. r5,r5,-1
|
|
lwzu r11,4(r4)
|
|
rlwimi r10,r11,8,0,23
|
|
lwz r17,0(r9)
|
|
nor r10,r10,r17
|
|
stw r10,0(r9)
|
|
addi r9,r9,4
|
|
rlwinm r10,r11,8,24,31
|
|
bne __norss3F_00
|
|
addi r4,r4,3
|
|
bctr
|
|
//
|
|
// Initial nor routines for 1~3 bytes for forward direction
//
__norsInit_0F:
mtctr r15
bctr
__norsInit_1F:
mtctr r15
lbz r10,0(r4)
lbz r11,0(r9)
nor r10,r10,r11
stb r10,0(r9)
addi r4,r4,1
addi r9,r9,1
bctr
__norsInit_2F:
mtctr r15
lbz r10,0(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhz r11,0(r9)
nor r10,r10,r11
sth r10,0(r9)
addi r4,r4,2
addi r9,r9,2
bctr
__norsInit_3F:
mtctr r15
lbz r10,0(r4)
lbz r11,0(r9)
nor r10,r10,r11
stb r10,0(r9)
lbz r10,1(r4)
lbz r11,2(r4)
rlwimi r10,r11,8,16,23
lhz r11,1(r9)
nor r10,r10,r11
sth r10,1(r9)
addi r4,r4,3
addi r9,r9,3
bctr
//
// Ending nor routines for 1~3 bytes for forward direction
//
__norsEnd_0F:
addic. r6,r6,-1
mtctr r14
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__norsEnd_1F:
mtctr r14
lbz r10,0(r4)
lbz r11,0(r9)
nor r10,r10,r11
stb r10,0(r9)
addic. r6,r6,-1
addi r4,r4,1
addi r9,r9,1
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__norsEnd_2F:
mtctr r14
lbz r10,0(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhz r11,0(r9)
nor r10,r10,r11
sth r10,0(r9)
addic. r6,r6,-1
addi r4,r4,2
addi r9,r9,2
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__norsEnd_3F:
mtctr r14
lbz r10,0(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhz r11,0(r9)
nor r10,r10,r11
sth r10,0(r9)
lbz r10,2(r4)
lbz r11,2(r9)
nor r10,r10,r11
stb r10,2(r9)
addic. r6,r6,-1
addi r4,r4,3
addi r9,r9,3
add r9,r9,r7
add r4,r4,r8
bnectr
blr
//
|
|
// Main nor routines for short case (4 bytes unit) backward direction
//
__norsmains_0B:
mtctr r16
mr r5,r12
__norss0B_00:
addic. r5,r5,-1
lwzu r11,-4(r4)
lwzu r17,-4(r9)
nor r11,r11,r17
stw r11,0(r9)
bne __norss0B_00
bctr
//
__norsmains_1B:
mtctr r16
mr r5,r12
lbzu r10,-1(r4) // Load last byte
__norss1B_00:
addic. r5,r5,-1
rlwinm r11,r10,24,0,7 // Move LS byte in r10 to MS byte in r11
lwzu r10,-4(r4) // Load preceding word
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
lwzu r17,-4(r9)
nor r11,r11,r17
stw r11,0(r9) // Store r11
bne __norss1B_00
addi r4,r4,1 // Adjust source pointer
bctr
//
__norsmains_2B:
mtctr r16
mr r5,r12
lhzu r10,-2(r4) // Load needed two bytes in r10
__norss2B_00:
addic. r5,r5,-1
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
lwzu r10,-4(r4) // Load preceding word in r10
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
lwzu r17,-4(r9)
nor r11,r11,r17
stw r11,0(r9) // Store r11
bne __norss2B_00
addi r4,r4,2 // Adjust source pointer
bctr
//
__norsmains_3B:
mtctr r16
mr r5,r12
addi r4,r4,1 // Adjust source pointer to make update word access
lwzu r10,-4(r4) // Load needed three bytes in MS r10
__norss3B_00:
addic. r5,r5,-1
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
lwzu r10,-4(r4) // Load preceding word
rlwimi r11,r10,8,24,31 // Insert MS byte in r10 to LS byte in r11
lwzu r17,-4(r9)
nor r11,r11,r17
stw r11,0(r9) // Store r11
bne __norss3B_00
addi r4,r4,3 // Adjust source pointer
bctr
//
// Initial nor routines for 1~3 bytes for backward direction
//
__norsInit_0B:
mtctr r15
bctr
__norsInit_1B:
mtctr r15
lbzu r10,-1(r4)
lbzu r11,-1(r9)
nor r10,r10,r11
stb r10,0(r9)
bctr
__norsInit_2B:
mtctr r15
lbzu r10,-2(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhzu r11,-2(r9)
nor r10,r10,r11
sth r10,0(r9)
bctr
__norsInit_3B:
mtctr r15
lbzu r10,-1(r4)
lbzu r11,-1(r9)
nor r10,r10,r11
stb r10,0(r9)
lbzu r10,-2(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhzu r11,-2(r9)
nor r10,r10,r11
sth r10,0(r9)
bctr
//
// Ending nor routines for 1~3 bytes for backward direction
//
__norsEnd_0B:
addic. r6,r6,-1
mtctr r14
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__norsEnd_1B:
mtctr r14
lbzu r10,-1(r4)
lbzu r11,-1(r9)
nor r10,r10,r11
stb r10,0(r9)
addic. r6,r6,-1
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__norsEnd_2B:
mtctr r14
lbzu r10,-2(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhzu r11,-2(r9)
nor r10,r10,r11
sth r10,0(r9)
addic. r6,r6,-1
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__norsEnd_3B:
mtctr r14
lbzu r10,-2(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhzu r11,-2(r9)
nor r10,r10,r11
sth r10,0(r9)
lbzu r10,-1(r4)
lbzu r11,-1(r9)
nor r10,r10,r11
stb r10,0(r9)
addic. r6,r6,-1
add r9,r9,r7
add r4,r4,r8
bnectr
blr
//
|
|
// Short andc routines for 1~2 bytes with 4 target word alignment cases
//
__andcs1_A0:
__andcs1_A1:
__andcs1_A2:
__andcs1_A3:
addic. r6,r6,-1
lbz r10,0(r4)
lbz r11,0(r9)
andc r10,r10,r11
stb r10,0(r9)
add r4,r4,r8
add r9,r9,r7
bnectr
blr
__andcs2_A0:
__andcs2_A2:
addic. r6,r6,-1
lbz r10,0(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhz r11,0(r9)
andc r10,r10,r11
sth r10,0(r9)
add r4,r4,r8
add r9,r9,r7
bnectr
blr
__andcs2_A1:
__andcs2_A3:
addic. r6,r6,-1
lbz r10,0(r4)
lbz r11,0(r9)
andc r10,r10,r11
lbz r12,1(r4)
lbz r11,1(r9)
andc r12,r12,r11
stb r10,0(r9)
stb r12,1(r9)
add r4,r4,r8
add r9,r9,r7
bnectr
blr
//
// Main andc routines for short case (4 bytes unit) forward direction
//
__andcsmains_0F:
mtctr r16
mr r5,r12
__andcss0F_00:
addic. r5,r5,-1
lwz r10,0(r4)
lwz r17,0(r9)
andc r10,r10,r17
stw r10,0(r9)
addi r4,r4,4
addi r9,r9,4
bne __andcss0F_00
bctr
//
__andcsmains_1F:
mtctr r16
mr r5,r12
addi r4,r4,-1
lwz r10,0(r4)
__andcss1F_00:
addic. r5,r5,-1
rlwinm r11,r10,24,8,31
lwzu r10,4(r4)
rlwimi r11,r10,24,0,7
lwz r17,0(r9)
andc r11,r11,r17
stw r11,0(r9)
addi r9,r9,4
bne __andcss1F_00
addi r4,r4,1
bctr
//
__andcsmains_2F:
mtctr r16
mr r5,r12
lhz r10,0(r4)
addi r4,r4,-2
__andcss2F_00:
addic. r5,r5,-1
lwzu r11,4(r4)
rlwimi r10,r11,16,0,15
lwz r17,0(r9)
andc r10,r10,r17
stw r10,0(r9)
addi r9,r9,4
rlwinm r10,r11,16,16,31
bne __andcss2F_00
addi r4,r4,2
bctr
//
__andcsmains_3F:
mtctr r16
mr r5,r12
lbz r10,0(r4)
addi r4,r4,-3
__andcss3F_00:
addic. r5,r5,-1
lwzu r11,4(r4)
rlwimi r10,r11,8,0,23
lwz r17,0(r9)
andc r10,r10,r17
stw r10,0(r9)
addi r9,r9,4
rlwinm r10,r11,8,24,31
bne __andcss3F_00
addi r4,r4,3
bctr
//
// Initial andc routines for 1~3 bytes for forward direction
//
__andcsInit_0F:
mtctr r15
bctr
__andcsInit_1F:
mtctr r15
lbz r10,0(r4)
lbz r11,0(r9)
andc r10,r10,r11
stb r10,0(r9)
addi r4,r4,1
addi r9,r9,1
bctr
__andcsInit_2F:
mtctr r15
lbz r10,0(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhz r11,0(r9)
andc r10,r10,r11
sth r10,0(r9)
addi r4,r4,2
addi r9,r9,2
bctr
__andcsInit_3F:
mtctr r15
lbz r10,0(r4)
lbz r11,0(r9)
andc r10,r10,r11
stb r10,0(r9)
lbz r10,1(r4)
lbz r11,2(r4)
rlwimi r10,r11,8,16,23
lhz r11,1(r9)
andc r10,r10,r11
sth r10,1(r9)
addi r4,r4,3
addi r9,r9,3
bctr
//
// Ending andc routines for 1~3 bytes for forward direction
//
__andcsEnd_0F:
addic. r6,r6,-1
mtctr r14
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__andcsEnd_1F:
mtctr r14
lbz r10,0(r4)
lbz r11,0(r9)
andc r10,r10,r11
stb r10,0(r9)
addic. r6,r6,-1
addi r4,r4,1
addi r9,r9,1
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__andcsEnd_2F:
mtctr r14
lbz r10,0(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhz r11,0(r9)
andc r10,r10,r11
sth r10,0(r9)
addic. r6,r6,-1
addi r4,r4,2
addi r9,r9,2
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__andcsEnd_3F:
mtctr r14
lbz r10,0(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhz r11,0(r9)
andc r10,r10,r11
sth r10,0(r9)
lbz r10,2(r4)
lbz r11,2(r9)
andc r10,r10,r11
stb r10,2(r9)
addic. r6,r6,-1
addi r4,r4,3
addi r9,r9,3
add r9,r9,r7
add r4,r4,r8
bnectr
blr
//
|
|
// Main andc routines for short case (4 bytes unit) backward direction
//
__andcsmains_0B:
mtctr r16
mr r5,r12
__andcss0B_00:
addic. r5,r5,-1
lwzu r11,-4(r4)
lwzu r17,-4(r9)
andc r11,r11,r17
stw r11,0(r9)
bne __andcss0B_00
bctr
//
__andcsmains_1B:
mtctr r16
mr r5,r12
lbzu r10,-1(r4) // Load last byte
__andcss1B_00:
addic. r5,r5,-1
rlwinm r11,r10,24,0,7 // Move LS byte in r10 to MS byte in r11
lwzu r10,-4(r4) // Load preceding word
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
lwzu r17,-4(r9)
andc r11,r11,r17
stw r11,0(r9) // Store r11
bne __andcss1B_00
addi r4,r4,1 // Adjust source pointer
bctr
//
__andcsmains_2B:
mtctr r16
mr r5,r12
lhzu r10,-2(r4) // Load needed two bytes in r10
__andcss2B_00:
addic. r5,r5,-1
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
lwzu r10,-4(r4) // Load preceding word in r10
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
lwzu r17,-4(r9)
andc r11,r11,r17
stw r11,0(r9) // Store r11
bne __andcss2B_00
addi r4,r4,2 // Adjust source pointer
bctr
//
__andcsmains_3B:
mtctr r16
mr r5,r12
addi r4,r4,1 // Adjust source pointer to make update word access
lwzu r10,-4(r4) // Load needed three bytes in MS r10
__andcss3B_00:
addic. r5,r5,-1
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
lwzu r10,-4(r4) // Load preceding word
rlwimi r11,r10,8,24,31 // Insert MS byte in r10 to LS byte in r11
lwzu r17,-4(r9)
andc r11,r11,r17
stw r11,0(r9) // Store r11
bne __andcss3B_00
addi r4,r4,3 // Adjust source pointer
bctr
//
// Initial andc routines for 1~3 bytes for backward direction
//
__andcsInit_0B:
mtctr r15
bctr
__andcsInit_1B:
mtctr r15
lbzu r10,-1(r4)
lbzu r11,-1(r9)
andc r10,r10,r11
stb r10,0(r9)
bctr
__andcsInit_2B:
mtctr r15
lbzu r10,-2(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhzu r11,-2(r9)
andc r10,r10,r11
sth r10,0(r9)
bctr
__andcsInit_3B:
mtctr r15
lbzu r10,-1(r4)
lbzu r11,-1(r9)
andc r10,r10,r11
stb r10,0(r9)
lbzu r10,-2(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhzu r11,-2(r9)
andc r10,r10,r11
sth r10,0(r9)
bctr
//
// Ending andc routines for 1~3 bytes for backward direction
//
__andcsEnd_0B:
addic. r6,r6,-1
mtctr r14
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__andcsEnd_1B:
mtctr r14
lbzu r10,-1(r4)
lbzu r11,-1(r9)
andc r10,r10,r11
stb r10,0(r9)
addic. r6,r6,-1
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__andcsEnd_2B:
mtctr r14
lbzu r10,-2(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhzu r11,-2(r9)
andc r10,r10,r11
sth r10,0(r9)
addic. r6,r6,-1
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__andcsEnd_3B:
mtctr r14
lbzu r10,-2(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lhzu r11,-2(r9)
andc r10,r10,r11
sth r10,0(r9)
lbzu r10,-1(r4)
lbzu r11,-1(r9)
andc r10,r10,r11
stb r10,0(r9)
addic. r6,r6,-1
add r9,r9,r7
add r4,r4,r8
bnectr
blr
//
|
|
// Short not src copy routines for 1~2 bytes with 4 target word alignment cases
//
__nsrcs1_A0:
__nsrcs1_A1:
__nsrcs1_A2:
__nsrcs1_A3:
addic. r6,r6,-1
lbz r10,0(r4)
xori r10,r10,0xffff
stb r10,0(r9)
add r4,r4,r8
add r9,r9,r7
bnectr
blr
__nsrcs2_A0:
__nsrcs2_A2:
addic. r6,r6,-1
lbz r10,0(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
xori r10,r10,0xffff
sth r10,0(r9)
add r4,r4,r8
add r9,r9,r7
bnectr
blr
__nsrcs2_A1:
__nsrcs2_A3:
addic. r6,r6,-1
lbz r10,0(r4)
xori r10,r10,0xffff
lbz r12,1(r4)
xori r12,r12,0xffff
stb r10,0(r9)
stb r12,1(r9)
add r4,r4,r8
add r9,r9,r7
bnectr
blr
//
// Main not src copy routines for short case (4 bytes unit) forward direction
//
__nsrcsmains_0F:
mtctr r16
mr r5,r12
__nsrcss0F_00:
addic. r5,r5,-1
lwz r10,0(r4)
xori r10,r10,0xffff
xoris r10,r10,0xffff
stw r10,0(r9)
addi r4,r4,4
addi r9,r9,4
bne __nsrcss0F_00
bctr
//
__nsrcsmains_1F:
mtctr r16
mr r5,r12
addi r4,r4,-1
lwz r10,0(r4)
__nsrcss1F_00:
addic. r5,r5,-1
rlwinm r11,r10,24,8,31
lwzu r10,4(r4)
rlwimi r11,r10,24,0,7
xori r11,r11,0xffff
xoris r11,r11,0xffff
stw r11,0(r9)
addi r9,r9,4
bne __nsrcss1F_00
addi r4,r4,1
bctr
//
__nsrcsmains_2F:
mtctr r16
mr r5,r12
lhz r10,0(r4)
addi r4,r4,-2
__nsrcss2F_00:
addic. r5,r5,-1
lwzu r11,4(r4)
rlwimi r10,r11,16,0,15
xori r10,r10,0xffff
xoris r10,r10,0xffff
stw r10,0(r9)
addi r9,r9,4
rlwinm r10,r11,16,16,31
bne __nsrcss2F_00
addi r4,r4,2
bctr
//
__nsrcsmains_3F:
mtctr r16
mr r5,r12
lbz r10,0(r4)
addi r4,r4,-3
__nsrcss3F_00:
addic. r5,r5,-1
lwzu r11,4(r4)
rlwimi r10,r11,8,0,23
xori r10,r10,0xffff
xoris r10,r10,0xffff
stw r10,0(r9)
addi r9,r9,4
rlwinm r10,r11,8,24,31
bne __nsrcss3F_00
addi r4,r4,3
bctr
//
// Initial not src copy routines for 1~3 bytes for forward direction
//
__nsrcsInit_0F:
mtctr r15
bctr
__nsrcsInit_1F:
mtctr r15
lbz r10,0(r4)
xori r10,r10,0xffff
stb r10,0(r9)
addi r4,r4,1
addi r9,r9,1
bctr
__nsrcsInit_2F:
mtctr r15
lbz r10,0(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
xori r10,r10,0xffff
sth r10,0(r9)
addi r4,r4,2
addi r9,r9,2
bctr
__nsrcsInit_3F:
mtctr r15
lbz r10,0(r4)
xori r10,r10,0xffff
stb r10,0(r9)
lbz r10,1(r4)
lbz r11,2(r4)
rlwimi r10,r11,8,16,23
xori r10,r10,0xffff
sth r10,1(r9)
addi r4,r4,3
addi r9,r9,3
bctr
//
// Ending not src copy routines for 1~3 bytes for forward direction
//
__nsrcsEnd_0F:
addic. r6,r6,-1
mtctr r14
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__nsrcsEnd_1F:
mtctr r14
lbz r10,0(r4)
xori r10,r10,0xffff
stb r10,0(r9)
addic. r6,r6,-1
addi r4,r4,1
addi r9,r9,1
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__nsrcsEnd_2F:
mtctr r14
lbz r10,0(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
xori r10,r10,0xffff
sth r10,0(r9)
addic. r6,r6,-1
addi r4,r4,2
addi r9,r9,2
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__nsrcsEnd_3F:
mtctr r14
lbz r10,0(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
xori r10,r10,0xffff
sth r10,0(r9)
lbz r10,2(r4)
xori r10,r10,0xffff
stb r10,2(r9)
addic. r6,r6,-1
addi r4,r4,3
addi r9,r9,3
add r9,r9,r7
add r4,r4,r8
bnectr
blr
//
// Main not src copy routines for short case (4 bytes unit) backward direction
//
__nsrcsmains_0B:
mtctr r16
mr r5,r12
__nsrcss0B_00:
addic. r5,r5,-1
lwzu r11,-4(r4)
xori r11,r11,0xffff
xoris r11,r11,0xffff
stwu r11,-4(r9)
bne __nsrcss0B_00
bctr
//
__nsrcsmains_1B:
mtctr r16
mr r5,r12
lbzu r10,-1(r4) // Load last byte
__nsrcss1B_00:
addic. r5,r5,-1
rlwinm r11,r10,24,0,7 // Move LS byte in r10 to MS byte in r11
lwzu r10,-4(r4) // Load preceding word
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
xori r11,r11,0xffff
xoris r11,r11,0xffff
stwu r11,-4(r9) // Store r11
bne __nsrcss1B_00
addi r4,r4,1 // Adjust source pointer
bctr
//
__nsrcsmains_2B:
mtctr r16
mr r5,r12
lhzu r10,-2(r4) // Load needed two bytes in r10
__nsrcss2B_00:
addic. r5,r5,-1
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
lwzu r10,-4(r4) // Load preceding word in r10
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
xori r11,r11,0xffff
xoris r11,r11,0xffff
stwu r11,-4(r9) // Store r11
bne __nsrcss2B_00
addi r4,r4,2 // Adjust source pointer
bctr
//
__nsrcsmains_3B:
mtctr r16
mr r5,r12
addi r4,r4,1 // Adjust source pointer to make update word access
lwzu r10,-4(r4) // Load needed three bytes in MS r10
__nsrcss3B_00:
addic. r5,r5,-1
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
lwzu r10,-4(r4) // Load preceding word
rlwimi r11,r10,8,24,31 // Insert MS byte in r10 to LS byte in r11
xori r11,r11,0xffff
xoris r11,r11,0xffff
stwu r11,-4(r9) // Store r11
bne __nsrcss3B_00
addi r4,r4,3 // Adjust source pointer
bctr
//
// Initial not src copy routines for 1~3 bytes for backward direction
//
__nsrcsInit_0B:
mtctr r15
bctr
__nsrcsInit_1B:
mtctr r15
lbzu r10,-1(r4)
xori r10,r10,0xffff
stbu r10,-1(r9)
bctr
__nsrcsInit_2B:
mtctr r15
lbzu r10,-2(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
xori r10,r10,0xffff
sthu r10,-2(r9)
bctr
__nsrcsInit_3B:
mtctr r15
lbzu r10,-1(r4)
xori r10,r10,0xffff
stbu r10,-1(r9)
lbzu r10,-2(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
xori r10,r10,0xffff
sthu r10,-2(r9)
bctr
//
// Ending not src copy routines for 1~3 bytes for backward direction
//
__nsrcsEnd_0B:
addic. r6,r6,-1
mtctr r14
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__nsrcsEnd_1B:
mtctr r14
lbzu r10,-1(r4)
xori r10,r10,0xffff
stbu r10,-1(r9)
addic. r6,r6,-1
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__nsrcsEnd_2B:
mtctr r14
lbzu r10,-2(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
xori r10,r10,0xffff
sthu r10,-2(r9)
addic. r6,r6,-1
add r9,r9,r7
add r4,r4,r8
bnectr
blr
__nsrcsEnd_3B:
mtctr r14
lbzu r10,-2(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
xori r10,r10,0xffff
sthu r10,-2(r9)
lbzu r10,-1(r4)
xori r10,r10,0xffff
stbu r10,-1(r9)
addic. r6,r6,-1
add r9,r9,r7
add r4,r4,r8
bnectr
blr
//
opsrcs_exit:
//
// Restore non-volatile registers
//
lwz r14,SLACK2(sp)
lwz r15,SLACK3(sp)
lwz r16,SLACK4(sp)
lwz r17,SLACK5(sp)
lwz r18,SLACK6(sp)
lwz r19,SLACK7(sp)
mtlr r0
SPECIAL_EXIT(RectSrcOpTgt)
|
|
//
//
//*************************************************************************************************
SPECIAL_ENTRY(RectCopy24to32)
//
// Input Parameters:
// r3: The pointer to the parameter structure as follows.
// PARAM1 [00] : Target address
// PARAM2 [04] : Source address
// PARAM3 [08] : Number of bytes to copy per line
// PARAM4 [12] : Number of lines to copy
// PARAM5 [16] : Target line increments byte per line
// PARAM6 [20] : Source line increments byte per line
// PARAM7 [24] : Maximum number of cache lines to flush
// PARAM8 [28] : Maximum number of display lines to flush
// PARAM9 [32] : Operation control flag
// bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
// bit 2 (TTOUCHBIT): Target Touch flag using "dcbz" 0:No Touch, 1:Touch
// PARAM10 [36] : Register save area 1
// PARAM11 [40] : Register save area 2
// PARAM12 [44] : Register save area 3
// PARAM13 [48] : Register save area 4
// PARAM14 [52] : Register save area 5
// PARAM15 [56] : Register save area 6
// r4: Pointer to the palette (not used)
//
// Register usage:
// r0: Save LR
// r4: Updating source address
// r5: Number of bytes to copy per line (target)
// r6: Updating remaining number of lines to copy
// r7: Target increment bytes per line (changed for pre-calculated value)
// r8: Source increment bytes per line (changed for pre-calculated value)
// r9: Updating target address
// r10: Work register
// r11: Work register
// r12: Pixel count
// CTR: Used for counter
//
// Restrictions:
// Copy width is assumed to be equal to or shorter than target delta.
// Target is always cached VRAM and the source is always DRAM.
//
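// (Note: a minimal C sketch, not part of the driver, of the conversion
// performed by the copy2432_20 loop below: each 3-byte source pixel is
// widened to a 4-byte target pixel with the top byte left zero:)
//
//     unsigned int b0 = src[0], b1 = src[1], b2 = src[2];
//     *(unsigned int *)dst = b0 | (b1 << 8) | (b2 << 16);
//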
|
|
mflr r0 // Save return address
//
PROLOGUE_END(RectCopy24to32)
//
lwz r6,PARAM4(r3) // r6 <- number of lines to copy
and. r6,r6,r6 // Any lines to copy?
beq- copy2432_exit // No -> exit
lwz r9,PARAM1(r3) // r9 <- target address
lwz r4,PARAM2(r3) // r4 <- source address
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
lwz r7,PARAM5(r3) // r7 <- target byte distance between lines
lwz r8,PARAM6(r3) // r8 <- source byte distance between lines
subf r7,r5,r7 // r7 <- line delta after updating pointer (target)
subf r8,r5,r8 // r8 <- line delta after updating pointer (source)
srawi. r12,r5,2 // r12 <- pixel count
beq- copy2432_exit // No pixel -> exit
add r8,r12,r8 // adjust r8 for 3 source bytes per pixel
//
copy2432_10:
mtctr r12
#if USE_DCBZ
addi r10,r9,-1 // r10 <- starting cache line address which can be dcbz'ed minus 32
add r11,r9,r5 // r11 <- one byte after last byte
addi r11,r11,-31 // r11 <- ending cache line address which can be dcbz'ed
copy2432_15:
addi r10,r10,32
cmplw r10,r11 // no more cache line to dcbz?
bge copy2432_20
dcbz 0,r10
b copy2432_15
#endif
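// (Note: the USE_DCBZ block above establishes the target cache lines
// with dcbz before they are written, so the copy loop never has to
// fetch VRAM contents that are about to be fully overwritten. A minimal
// C-style sketch of the address walk, assuming the 32-byte cache line
// noted in the file header; dcbz() stands in for the instruction:)
//
//     char *p   = tgt - 1;              /* candidate pointer minus 32  */
//     char *end = tgt + bytes - 31;     /* only lines fully covered    */
//     while ((p += 32) < end)
//         dcbz(p);                      /* zero-establish cache line   */
//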
|
|
copy2432_20:
lbz r10,0(r4)
lbz r11,1(r4)
rlwimi r10,r11,8,16,23
lbz r11,2(r4)
rlwimi r10,r11,16,8,15
stw r10,0(r9)
addi r4,r4,3
addi r9,r9,4
bdnz copy2432_20
add r4,r8,r4
add r9,r7,r9
addic. r6,r6,-1
bne copy2432_10
//
#if (! FULLCACHE)
add r7,r5,r7 // restore target delta
bl flushcopy_00
#endif
mtlr r0
//
copy2432_exit:
//
SPECIAL_EXIT(RectCopy24to32)
//
//*************************************************************************************************
SPECIAL_ENTRY(RectCopy24to16)
//
// Input Parameters:
// r3: The pointer to the parameter structure as follows.
// PARAM1 [00] : Target address
// PARAM2 [04] : Source address
// PARAM3 [08] : Number of bytes to copy per line
// PARAM4 [12] : Number of lines to copy
// PARAM5 [16] : Target line increments byte per line
// PARAM6 [20] : Source line increments byte per line
// PARAM7 [24] : Maximum number of cache lines to flush
// PARAM8 [28] : Maximum number of display lines to flush
// PARAM9 [32] : Operation control flag
// bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
// bit 2 (TTOUCHBIT): Target Touch flag using "dcbz" 0:No Touch, 1:Touch
// PARAM10 [36] : Register save area 1
// PARAM11 [40] : Register save area 2
// PARAM12 [44] : Register save area 3
// PARAM13 [48] : Register save area 4
// PARAM14 [52] : Register save area 5
// PARAM15 [56] : Register save area 6
// r4: Pointer to the palette (not used)
//
// Register usage:
// r0: Save LR
// r4: Updating source address
// r5: Number of bytes to copy per line (target)
// r6: Updating remaining number of lines to copy
// r7: Target increment bytes per line (changed for pre-calculated value)
// r8: Source increment bytes per line (changed for pre-calculated value)
// r9: Updating target address
// r10: Work register
// r11: Work register
// r12: Pixel count
// CTR: Used for counter
//
// Restrictions:
// Copy width is assumed to be equal to or shorter than target delta.
// Target is always cached VRAM and the source is always DRAM.
//
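// (Note: a minimal C sketch, not part of the driver, of the 24 BPP to
// 5:6:5 packing done by the copy2416_20 loop below; src holds the three
// bytes of one pixel in memory order:)
//
//     unsigned int b0 = src[0], b1 = src[1], b2 = src[2];
//     unsigned short pix = (unsigned short)
//         ((b0 >> 3) | ((b1 >> 2) << 5) | ((b2 >> 3) << 11));
//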
|
|
mflr r0 // Save return address
//
PROLOGUE_END(RectCopy24to16)
//
lwz r6,PARAM4(r3) // r6 <- number of lines to copy
and. r6,r6,r6 // Any lines to copy?
beq- copy2416_exit // No -> exit
lwz r9,PARAM1(r3) // r9 <- target address
lwz r4,PARAM2(r3) // r4 <- source address
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
lwz r7,PARAM5(r3) // r7 <- target byte distance between lines
lwz r8,PARAM6(r3) // r8 <- source byte distance between lines
subf r7,r5,r7 // r7 <- line delta after updating pointer (target)
srawi. r12,r5,1 // r12 <- pixel count
beq- copy2416_exit // No pixel -> exit
subf r8,r12,r8 // r8 <- line delta after updating pointer (source):
subf r8,r12,r8 // subtract the pixel count three times because
subf r8,r12,r8 // each source pixel is 3 bytes
//
copy2416_10:
mtctr r12
#if USE_DCBZ
addi r10,r9,-1 // r10 <- starting cache line address which can be dcbz'ed minus 32
add r11,r9,r5 // r11 <- one byte after last byte
addi r11,r11,-31 // r11 <- ending cache line address which can be dcbz'ed
copy2416_15:
addi r10,r10,32
cmplw r10,r11 // no more cache line to dcbz?
bge copy2416_20
dcbz 0,r10
b copy2416_15
#endif
copy2416_20:
lbz r10,0(r4)
rlwinm r10,r10,29,27,31
lbz r11,1(r4)
rlwimi r10,r11,3,21,26
lbz r11,2(r4)
rlwimi r10,r11,8,16,20
sth r10,0(r9)
addi r4,r4,3
addi r9,r9,2
bdnz copy2416_20
add r4,r8,r4
add r9,r7,r9
addic. r6,r6,-1
bne copy2416_10
//
#if (! FULLCACHE)
add r7,r5,r7 // restore target delta
bl flushcopy_00
#endif
mtlr r0
//
copy2416_exit:
//
SPECIAL_EXIT(RectCopy24to16)
//
//*************************************************************************************************
SPECIAL_ENTRY(RectCopy24to15)
//
// Input Parameters:
// r3: The pointer to the parameter structure as follows.
// PARAM1 [00] : Target address
// PARAM2 [04] : Source address
// PARAM3 [08] : Number of bytes to copy per line
// PARAM4 [12] : Number of lines to copy
// PARAM5 [16] : Target line increments byte per line
// PARAM6 [20] : Source line increments byte per line
// PARAM7 [24] : Maximum number of cache lines to flush
// PARAM8 [28] : Maximum number of display lines to flush
// PARAM9 [32] : Operation control flag
// bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
// bit 2 (TTOUCHBIT): Target Touch flag using "dcbz" 0:No Touch, 1:Touch
// PARAM10 [36] : Register save area 1
// PARAM11 [40] : Register save area 2
// PARAM12 [44] : Register save area 3
// PARAM13 [48] : Register save area 4
// PARAM14 [52] : Register save area 5
// PARAM15 [56] : Register save area 6
// r4: Pointer to the palette (not used)
//
// Register usage:
// r0: Save LR
// r4: Updating source address
// r5: Number of bytes to copy per line (target)
// r6: Updating remaining number of lines to copy
// r7: Target increment bytes per line (changed for pre-calculated value)
// r8: Source increment bytes per line (changed for pre-calculated value)
// r9: Updating target address
// r10: Work register
// r11: Work register
// r12: Pixel count
// CTR: Used for counter
//
// Restrictions:
// Copy width is assumed to be equal to or shorter than target delta.
// Target is always cached VRAM and the source is always DRAM.
//
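// (Note: a minimal C sketch, not part of the driver, of the 24 BPP to
// 5:5:5 packing done by the copy2415_20 loop below:)
//
//     unsigned int b0 = src[0], b1 = src[1], b2 = src[2];
//     unsigned short pix = (unsigned short)
//         ((b0 >> 3) | ((b1 >> 3) << 5) | ((b2 >> 3) << 10));
//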
|
|
mflr r0 // Save return address
//
PROLOGUE_END(RectCopy24to15)
//
lwz r6,PARAM4(r3) // r6 <- number of lines to copy
and. r6,r6,r6 // Any lines to copy?
beq- copy2415_exit // No -> exit
lwz r9,PARAM1(r3) // r9 <- target address
lwz r4,PARAM2(r3) // r4 <- source address
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
lwz r7,PARAM5(r3) // r7 <- target byte distance between lines
lwz r8,PARAM6(r3) // r8 <- source byte distance between lines
subf r7,r5,r7 // r7 <- line delta after updating pointer (target)
srawi. r12,r5,1 // r12 <- pixel count
beq- copy2415_exit // No pixel -> exit
subf r8,r12,r8 // r8 <- line delta after updating pointer (source):
subf r8,r12,r8 // subtract the pixel count three times because
subf r8,r12,r8 // each source pixel is 3 bytes
//
copy2415_10:
mtctr r12
#if USE_DCBZ
addi r10,r9,-1 // r10 <- starting cache line address which can be dcbz'ed minus 32
add r11,r9,r5 // r11 <- one byte after last byte
addi r11,r11,-31 // r11 <- ending cache line address which can be dcbz'ed
copy2415_15:
addi r10,r10,32
cmplw r10,r11 // no more cache line to dcbz?
bge copy2415_20
dcbz 0,r10
b copy2415_15
#endif
copy2415_20:
lbz r10,0(r4)
rlwinm r10,r10,29,27,31
lbz r11,1(r4)
rlwimi r10,r11,2,22,26
lbz r11,2(r4)
rlwimi r10,r11,7,17,21
sth r10,0(r9)
addi r4,r4,3
addi r9,r9,2
bdnz copy2415_20
add r4,r8,r4
add r9,r7,r9
addic. r6,r6,-1
bne copy2415_10
//
#if (! FULLCACHE)
add r7,r5,r7 // restore target delta
bl flushcopy_00
#endif
mtlr r0
//
copy2415_exit:
//
SPECIAL_EXIT(RectCopy24to15)
//
//*************************************************************************************************
SPECIAL_ENTRY(RectCopy15to16)
//
// Input Parameters:
// r3: The pointer to the parameter structure as follows.
// PARAM1 [00] : Target address
// PARAM2 [04] : Source address
// PARAM3 [08] : Number of bytes to copy per line
// PARAM4 [12] : Number of lines to copy
// PARAM5 [16] : Target line increments byte per line
// PARAM6 [20] : Source line increments byte per line
// PARAM7 [24] : Maximum number of cache lines to flush
// PARAM8 [28] : Maximum number of display lines to flush
// PARAM9 [32] : Operation control flag
// bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
// bit 2 (TTOUCHBIT): Target Touch flag using "dcbz" 0:No Touch, 1:Touch
// PARAM10 [36] : Register save area 1
// PARAM11 [40] : Register save area 2
// PARAM12 [44] : Register save area 3
// PARAM13 [48] : Register save area 4
// PARAM14 [52] : Register save area 5
// PARAM15 [56] : Register save area 6
// r4: Pointer to the palette (not used)
//
// Register usage:
// r0: Save LR
// r4: Updating source address
// r5: Number of bytes to copy per line (target)
// r6: Updating remaining number of lines to copy
// r7: Target increment bytes per line (changed for pre-calculated value)
// r8: Source increment bytes per line (changed for pre-calculated value)
// r9: Updating target address
// r10: Work register
// r11: Work register
// r12: Pixel count
// CTR: Used for counter
//
// Restrictions:
// Copy width is assumed to be equal to or shorter than target delta.
// Target is always cached VRAM and the source is always DRAM.
//
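// (Note: a minimal C sketch, not part of the driver, of the 5:5:5 to
// 5:6:5 widening done by the copy1516_20 loop below: blue stays put,
// green and red move up one bit, and the green MSB is replicated into
// the new green LSB:)
//
//     unsigned short out = (unsigned short)
//         ((in & 0x001F) | ((in & 0x7FE0) << 1) | ((in >> 4) & 0x0020));
//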
|
|
mflr r0 // Save return address
//
PROLOGUE_END(RectCopy15to16)
//
lwz r6,PARAM4(r3) // r6 <- number of lines to copy
and. r6,r6,r6 // Any lines to copy?
beq- copy1516_exit // No -> exit
lwz r9,PARAM1(r3) // r9 <- target address
lwz r4,PARAM2(r3) // r4 <- source address
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
lwz r7,PARAM5(r3) // r7 <- target byte distance between lines
lwz r8,PARAM6(r3) // r8 <- source byte distance between lines
subf r7,r5,r7 // r7 <- line delta after updating pointer (target)
subf r8,r5,r8 // r8 <- line delta after updating pointer (source)
srawi. r12,r5,1 // r12 <- pixel count
beq- copy1516_exit // No pixel -> exit
//
copy1516_10:
mtctr r12
#if USE_DCBZ
addi r10,r9,-1 // r10 <- starting cache line address which can be dcbz'ed minus 32
add r11,r9,r5 // r11 <- one byte after last byte
addi r11,r11,-31 // r11 <- ending cache line address which can be dcbz'ed
copy1516_15:
addi r10,r10,32
cmplw r10,r11 // no more cache line to dcbz?
bge copy1516_20
dcbz 0,r10
b copy1516_15
#endif
copy1516_20:
lhz r10,0(r4)
rlwinm r11,r10,0,27,31
rlwimi r11,r10,28,26,26
rlwimi r11,r10,1,16,25
sth r11,0(r9)
addi r4,r4,2
addi r9,r9,2
bdnz copy1516_20
add r4,r8,r4
add r9,r7,r9
addic. r6,r6,-1
bne copy1516_10
//
#if (! FULLCACHE)
add r7,r5,r7 // restore target delta
bl flushcopy_00
#endif
mtlr r0
//
copy1516_exit:
//
SPECIAL_EXIT(RectCopy15to16)
//
//*************************************************************************************************
SPECIAL_ENTRY(RectCopy15to32)
//
// Input Parameters:
// r3: The pointer to the parameter structure as follows.
// PARAM1 [00] : Target address
// PARAM2 [04] : Source address
// PARAM3 [08] : Number of bytes to copy per line
// PARAM4 [12] : Number of lines to copy
// PARAM5 [16] : Target line increments byte per line
// PARAM6 [20] : Source line increments byte per line
// PARAM7 [24] : Maximum number of cache lines to flush
// PARAM8 [28] : Maximum number of display lines to flush
// PARAM9 [32] : Operation control flag
// bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
// bit 2 (TTOUCHBIT): Target Touch flag using "dcbz" 0:No Touch, 1:Touch
// PARAM10 [36] : Register save area 1
// PARAM11 [40] : Register save area 2
// PARAM12 [44] : Register save area 3
// PARAM13 [48] : Register save area 4
// PARAM14 [52] : Register save area 5
// PARAM15 [56] : Register save area 6
// r4: Pointer to the palette (not used)
//
// Register usage:
// r0: Save LR
// r4: Updating source address
// r5: Number of bytes to copy per line (target)
// r6: Updating remaining number of lines to copy
// r7: Target increment bytes per line (changed for pre-calculated value)
// r8: Source increment bytes per line (changed for pre-calculated value)
// r9: Updating target address
// r10: Work register
// r11: Work register
// r12: Pixel count
// CTR: Used for counter
//
// Restrictions:
// Copy width is assumed to be equal to or shorter than target delta.
// Target is always cached VRAM and the source is always DRAM.
//
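// (Note: a minimal C sketch, not part of the driver, of the 5:5:5 to
// 8:8:8 expansion done by the copy1532_20 loop below: each 5-bit
// component is widened to 8 bits with its top 3 bits replicated into
// the low bits, so full-scale 0x1F maps to 0xFF:)
//
//     unsigned int r = (in >> 10) & 0x1F;
//     unsigned int g = (in >>  5) & 0x1F;
//     unsigned int b =  in        & 0x1F;
//     unsigned int out = (((r << 3) | (r >> 2)) << 16)
//                      | (((g << 3) | (g >> 2)) <<  8)
//                      |  ((b << 3) | (b >> 2));
//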
|
|
mflr r0 // Save return address
//
PROLOGUE_END(RectCopy15to32)
//
lwz r6,PARAM4(r3) // r6 <- number of lines to copy
and. r6,r6,r6 // Any lines to copy?
beq- copy1532_exit // No -> exit
lwz r9,PARAM1(r3) // r9 <- target address
lwz r4,PARAM2(r3) // r4 <- source address
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
lwz r7,PARAM5(r3) // r7 <- target byte distance between lines
lwz r8,PARAM6(r3) // r8 <- source byte distance between lines
subf r7,r5,r7 // r7 <- line delta after updating pointer (target)
srawi. r12,r5,2 // r12 <- pixel count
beq- copy1532_exit // No pixel -> exit
subf r8,r12,r8 // r8 <- line delta after updating pointer (source)
subf r8,r12,r8 // by subtracting the pixel count twice (2 source bytes per pixel)
//
copy1532_10:
mtctr r12
#if USE_DCBZ
addi r10,r9,-1 // r10 <- starting cache line address which can be dcbz'ed minus 32
add r11,r9,r5 // r11 <- one byte after last byte
addi r11,r11,-31 // r11 <- ending cache line address which can be dcbz'ed
copy1532_15:
addi r10,r10,32
cmplw r10,r11 // no more cache line to dcbz?
bge copy1532_20
dcbz 0,r10
b copy1532_15
#endif
copy1532_20:
lhz r10,0(r4)
rlwinm r11,r10,9,8,12
rlwimi r11,r10,4,13,15
rlwimi r11,r10,6,16,20
rlwimi r11,r10,1,21,23
rlwimi r11,r10,3,24,28
rlwimi r11,r10,30,29,31
stw r11,0(r9)
addi r4,r4,2
addi r9,r9,4
bdnz copy1532_20
add r4,r8,r4
add r9,r7,r9
addic. r6,r6,-1
bne copy1532_10
//
#if (! FULLCACHE)
add r7,r5,r7 // restore target delta
bl flushcopy_00
#endif
mtlr r0
//
copy1532_exit:
//
SPECIAL_EXIT(RectCopy15to32)
//
//*************************************************************************************************
SPECIAL_ENTRY(RectCopy8to8)
//
// Input Parameters:
// r3: The pointer to the parameter structure as follows.
// PARAM1 [00] : Target address
// PARAM2 [04] : Source address
// PARAM3 [08] : Number of bytes to copy per line
// PARAM4 [12] : Number of lines to copy
// PARAM5 [16] : Target line increments byte per line
// PARAM6 [20] : Source line increments byte per line
// PARAM7 [24] : Maximum number of cache lines to flush
// PARAM8 [28] : Maximum number of display lines to flush
// PARAM9 [32] : Operation control flag
// bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
// bit 2 (TTOUCHBIT): Target Touch flag using "dcbz" 0:No Touch, 1:Touch
// PARAM10 [36] : Register save area 1
// PARAM11 [40] : Register save area 2
// PARAM12 [44] : Register save area 3
// PARAM13 [48] : Register save area 4
// PARAM14 [52] : Register save area 5
// PARAM15 [56] : Register save area 6
// r4: Pointer to the palette
//
// Register usage:
// r0: Save LR
// r4: Updating source address
// r5: Number of pixels to copy per line
// r6: Updating remaining number of lines to copy
// r7: Target increment bytes per line (changed for pre-calculated value)
// r8: Source increment bytes per line (changed for pre-calculated value)
// r9: Updating target address
// r10: Work register
// r11: Palette pointer
// r12: Work register
// CTR: Used for counter
//
// Restrictions:
// Copy width is assumed to be equal to or shorter than target delta.
// Target is always cached VRAM and the source is always DRAM.
//
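// (Note: a minimal C sketch, not part of the driver, of the palette
// translation done by the copy0808_20 loop below. The palette entries
// are ULONGs, so the 8-bit index is scaled by 4 before the byte load;
// at 8 BPP only the low byte of each entry is consumed:)
//
//     unsigned char *pal = (unsigned char *)palette;
//     dst[i] = pal[(unsigned int)src[i] * 4];
//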
|
|
mflr r0 // Save return address
//
PROLOGUE_END(RectCopy8to8)
//
lwz r6,PARAM4(r3) // r6 <- number of lines to copy
and. r6,r6,r6 // Any lines to copy?
beq- copy0808_exit // No -> exit
mr r11,r4 // r11 <- pointer to the ULONG palette
lwz r9,PARAM1(r3) // r9 <- target address
lwz r4,PARAM2(r3) // r4 <- source address
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
lwz r7,PARAM5(r3) // r7 <- target byte distance between lines
lwz r8,PARAM6(r3) // r8 <- source byte distance between lines
and. r5,r5,r5 // Any pixel to copy?
beq- copy0808_exit // No -> exit
subf r7,r5,r7 // r7 <- line delta after updating pointer (target)
subf r8,r5,r8 // r8 <- line delta after updating pointer (source)
//
copy0808_10:
mtctr r5
#if USE_DCBZ
addi r10,r9,-1 // r10 <- starting cache line address which can be dcbz'ed minus 32
add r12,r9,r5 // r12 <- one byte after last byte
addi r12,r12,-31 // r12 <- ending cache line address which can be dcbz'ed
copy0808_15:
addi r10,r10,32
cmplw r10,r12 // no more cache line to dcbz?
bge copy0808_20
dcbz 0,r10
b copy0808_15
#endif
copy0808_20:
lbz r10,0(r4) // r10 <- 8 bit index to the palette
rlwinm r10,r10,2,22,29
lbzx r10,r10,r11
stb r10,0(r9)
addi r4,r4,1
addi r9,r9,1
bdnz copy0808_20
add r4,r8,r4
add r9,r7,r9
addic. r6,r6,-1
bne copy0808_10
//
#if (! FULLCACHE)
add r7,r5,r7 // restore target delta
bl flushcopy_00
#endif
mtlr r0
//
copy0808_exit:
//
SPECIAL_EXIT(RectCopy8to8)
//
//*************************************************************************************************
SPECIAL_ENTRY(RectCopy8to16)
//
// Input Parameters:
// r3: The pointer to the parameter structure as follows.
// PARAM1 [00] : Target address
// PARAM2 [04] : Source address
// PARAM3 [08] : Number of bytes to copy per line
// PARAM4 [12] : Number of lines to copy
// PARAM5 [16] : Target line increments byte per line
// PARAM6 [20] : Source line increments byte per line
// PARAM7 [24] : Maximum number of cache lines to flush
// PARAM8 [28] : Maximum number of display lines to flush
// PARAM9 [32] : Operation control flag
// bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
// bit 2 (TTOUCHBIT): Target Touch flag using "dcbz" 0:No Touch, 1:Touch
// PARAM10 [36] : Register save area 1
// PARAM11 [40] : Register save area 2
// PARAM12 [44] : Register save area 3
// PARAM13 [48] : Register save area 4
// PARAM14 [52] : Register save area 5
// PARAM15 [56] : Register save area 6
// r4: Pointer to the palette
//
// Register usage:
// r0: Pixel count
// r4: Updating source address
// r5: Number of bytes to copy per line (target)
// r6: Updating remaining number of lines to copy
// r7: Target increment bytes per line (changed for pre-calculated value)
// r8: Source increment bytes per line (changed for pre-calculated value)
// r9: Updating target address
// r10: Work register
// r11: Palette pointer
// r12: Work register
// r31: Save LR
// CTR: Used for counter
//
// Restrictions:
// Copy width is assumed to be equal to or shorter than target delta.
// Target is always cached VRAM and the source is always DRAM.
//
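// (Note: the copy0816_20 loop below is the same table walk as in
// RectCopy8to8, but with a halfword load: the index is still scaled by
// 4 because the palette entries remain ULONG, and the low halfword of
// each entry supplies the 16 BPP pixel. A minimal C sketch,
// illustrative only:)
//
//     unsigned short *e = (unsigned short *)((char *)palette + idx * 4);
//     *dst16++ = *e;    /* low halfword of the ULONG palette entry */
//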
|
|
stw r31,SLACK1(sp)
mflr r31
//
PROLOGUE_END(RectCopy8to16)
//
lwz r6,PARAM4(r3) // r6 <- number of lines to copy
and. r6,r6,r6 // Any lines to copy?
beq- copy0816_exit // No -> exit
mr r11,r4 // r11 <- pointer to the ULONG palette
lwz r9,PARAM1(r3) // r9 <- target address
lwz r4,PARAM2(r3) // r4 <- source address
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
lwz r7,PARAM5(r3) // r7 <- target byte distance between lines
lwz r8,PARAM6(r3) // r8 <- source byte distance between lines
subf r7,r5,r7 // r7 <- line delta after updating pointer (target)
srawi. r0,r5,1 // r0 <- pixel count
beq- copy0816_exit // No pixel -> exit
subf r8,r0,r8 // r8 <- line delta after updating pointer (source)
//
copy0816_10:
mtctr r0
#if USE_DCBZ
addi r10,r9,-1 // r10 <- starting cache line address which can be dcbz'ed minus 32
add r12,r9,r5 // r12 <- one byte after last byte
addi r12,r12,-31 // r12 <- ending cache line address which can be dcbz'ed
copy0816_15:
addi r10,r10,32
cmplw r10,r12 // no more cache line to dcbz?
bge copy0816_20
dcbz 0,r10
b copy0816_15
#endif
copy0816_20:
lbz r10,0(r4) // r10 <- 8 bit index to the palette
rlwinm r10,r10,2,22,29
lhzx r10,r10,r11
sth r10,0(r9)
addi r4,r4,1
addi r9,r9,2
bdnz copy0816_20
add r4,r8,r4
add r9,r7,r9
addic. r6,r6,-1
bne copy0816_10
//
#if (! FULLCACHE)
add r7,r5,r7 // restore target delta
bl flushcopy_00
#endif
//
copy0816_exit:
mtlr r31
lwz r31,SLACK1(sp)
//
SPECIAL_EXIT(RectCopy8to16)
//
//*************************************************************************************************
SPECIAL_ENTRY(RectCopy8to32)
//
// Input Parameters:
// r3: The pointer to the parameter structure as follows.
// PARAM1 [00] : Target address
// PARAM2 [04] : Source address
// PARAM3 [08] : Number of bytes to copy per line
// PARAM4 [12] : Number of lines to copy
// PARAM5 [16] : Target line increments byte per line
// PARAM6 [20] : Source line increments byte per line
// PARAM7 [24] : Maximum number of cache lines to flush
// PARAM8 [28] : Maximum number of display lines to flush
// PARAM9 [32] : Operation control flag
// bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
// bit 2 (TTOUCHBIT): Target Touch flag using "dcbz" 0:No Touch, 1:Touch
// PARAM10 [36] : Register save area 1
// PARAM11 [40] : Register save area 2
// PARAM12 [44] : Register save area 3
// PARAM13 [48] : Register save area 4
// PARAM14 [52] : Register save area 5
// PARAM15 [56] : Register save area 6
// r4: Pointer to the palette
//
// Register usage:
// r0: Pixel count
// r4: Updating source address
// r5: Number of bytes to copy per line (target)
// r6: Updating remaining number of lines to copy
// r7: Target increment bytes per line (changed for pre-calculated value)
// r8: Source increment bytes per line (changed for pre-calculated value)
// r9: Updating target address
// r10: Work register
// r11: Palette pointer
// r12: Work register
// r31: LR save
// CTR: Used for counter
//
// Restrictions:
// Copy width is assumed to be equal to or shorter than target delta.
// Target is always cached VRAM and the source is always DRAM.
//
stw r31,SLACK1(sp)
mflr r31
//
PROLOGUE_END(RectCopy8to32)
//
lwz r6,PARAM4(r3) // r6 <- number of lines to copy
and. r6,r6,r6 // Any lines to copy?
beq- copy0832_exit // No -> exit
mr r11,r4 // r11 <- pointer to the ULONG palette
lwz r9,PARAM1(r3) // r9 <- target address
lwz r4,PARAM2(r3) // r4 <- source address
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line
lwz r7,PARAM5(r3) // r7 <- target byte distance between lines
lwz r8,PARAM6(r3) // r8 <- source byte distance between lines
subf r7,r5,r7 // r7 <- line delta after updating pointer (target)
srawi. r0,r5,2 // r0 <- pixel count
beq- copy0832_exit // No pixel -> exit
subf r8,r0,r8 // r8 <- line delta after updating pointer (source)
//
copy0832_10:
mtctr r0
#if USE_DCBZ
addi r10,r9,-1 // r10 <- starting cache line address which can be dcbz'ed minus 32
add r12,r9,r5 // r12 <- one byte after last byte
addi r12,r12,-31 // r12 <- ending cache line address which can be dcbz'ed
copy0832_15:
addi r10,r10,32
cmplw r10,r12 // no more cache line to dcbz?
bge copy0832_20
dcbz 0,r10
b copy0832_15
#endif
copy0832_20:
lbz r10,0(r4) // r10 <- 8 bit index to the palette
rlwinm r10,r10,2,22,29
lwzx r10,r10,r11
stw r10,0(r9)
addi r4,r4,1
addi r9,r9,4
bdnz copy0832_20
add r4,r8,r4
add r9,r7,r9
addic. r6,r6,-1
bne copy0832_10
//
#if (! FULLCACHE)
add r7,r5,r7 // restore target delta
bl flushcopy_00
#endif
//
copy0832_exit:
mtlr r31
lwz r31,SLACK1(sp)
//
SPECIAL_EXIT(RectCopy8to32)
//
//*************************************************************************************************
SPECIAL_ENTRY(Stretch32)
//
// Input Parameters:
// r3: The pointer to the parameter structure as follows.
// PARAM1 [00] : Target address
// PARAM2 [04] : Source address
// PARAM3 [08] : Number of bytes to copy per line
// PARAM4 [12] : Number of lines to copy
// PARAM5 [16] : Target line increments byte per line
// PARAM6 [20] : Source line increments byte per line
// PARAM7 [24] : Maximum number of cache lines to flush
// PARAM8 [28] : Maximum number of display lines to flush
// PARAM9 [32] : Operation control flag
// bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
// bit 2 (TTOUCHBIT): Target Touch flag using "dcbz" 0:No Touch, 1:Touch
// PARAM10 [36] : Register save area 1
// PARAM11 [40] : Register save area 2
// PARAM12 [44] : Register save area 3
// PARAM13 [48] : Register save area 4
// PARAM14 [52] : Register save area 5
// PARAM15 [56] : Register save area 6
//
// Register usage:
// r0: Pixel count
// r4: Updating source address
// r5: Number of bytes to copy per line (target)
// r6: Updating remaining number of lines to copy
// r7: Target increment bytes per line (changed for pre-calculated value)
// r8: Source increment bytes per line (changed for pre-calculated value)
// r9: Updating target address
// r10: Work register
// r11: Number of bytes to copy per line (source)
// r12: Work register
// r31: Save LR
// CTR: Used for counter
//
// Restrictions:
// Copy width is assumed to be equal to or shorter than target delta.
// Target is always cached VRAM and the source is always DRAM.
// This is a routine to copy 32 BPP source to 32 BPP target with
// 200% stretching. The target rectangle is assumed to be exactly
// twice the source rectangle. RECT clipped area can be supported, but
// top left position has to be in the clipping area in that case.
//
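// (Note: a minimal C sketch, not part of the driver, of the 200%
// stretch below: every source pixel is stored twice, and every source
// line is emitted twice -- the second pass rewinds the source pointer
// by the source line width (r11) before moving on:)
//
//     for (pass = 0; pass < 2; pass++) {      /* each line twice      */
//         for (x = 0; x < srcPixels; x++) {
//             *dst++ = src[x];                /* each pixel twice     */
//             *dst++ = src[x];
//         }
//         /* dst then advances by the target line delta, as in the   */
//         /* assembly; src advances only after the second pass       */
//     }
//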
|
|
stw r31,SLACK1(sp)
|
|
mflr r31
|
|
//
|
|
PROLOGUE_END(Stretch32)
|
|
//
|
|
lwz r6,PARAM4(r3) // r6 <- number of lines to copy
|
|
and. r6,r6,r6 // Any lines to copy?
|
|
beq- stretch32_exit // No -> exit
|
|
lwz r9,PARAM1(r3) // r9 <- target address
|
|
lwz r4,PARAM2(r3) // r4 <- source address
|
|
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line (target)
|
|
lwz r7,PARAM5(r3) // r7 <- target byte distance between lines
|
|
lwz r8,PARAM6(r3) // r8 <- source byte distance between lines
|
|
subf r7,r5,r7 // r7 <- line delta after updating pointer (target)
|
|
srawi. r11,r5,1 // r11 <- bytes to copy per line (source)
|
|
beq- stretch32_exit // No pixel -> exit
|
|
andi. r11,r11,0xfffc // Clear LS 2 bit for odd pixel target
|
|
subf r8,r11,r8 // r8 <- line delta after updating pointer (source)
|
|
srawi r0,r5,2 // r0 <- target pixel count
|
|
//
|
|
stretch32_10:
|
|
mtctr r0
|
|
#if USE_DCBZ
|
|
addi r10,r9,-1 // r10 <- starting cache line address which can be dcbz'ed minus 32
|
|
add r12,r9,r5 // r12 <- one byte after last byte
|
|
addi r12,r12,-31 // r12 <- ending cache line address which can be dcbz'ed
|
|
stretch32_15:
|
|
addi r10,r10,32
|
|
cmplw r10,r12 // no more cache line to dcbz?
|
|
bge stretch32_20
|
|
dcbz 0,r10
|
|
b stretch32_15
|
|
#endif
|
|
stretch32_20:
|
|
lwz r10,0(r4) // r10 <- source pixel
|
|
stw r10,0(r9)
|
|
addi r9,r9,4
|
|
bdz stretch32_22
|
|
stw r10,0(r9) // stretching pixel to 200%
|
|
addi r4,r4,4
|
|
addi r9,r9,4
|
|
bdnz stretch32_20
|
|
stretch32_22:
|
|
subf r4,r11,r4 // seek back source
|
|
add r9,r7,r9 // seek forward target
|
|
addic. r6,r6,-1
|
|
beq- stretch32_50
|
|
mtctr r0
|
|
#if USE_DCBZ
|
|
addi r10,r9,-1 // r10 <- starting cache line address which can be dcbz'ed minus 32
|
|
add r12,r9,r5 // r12 <- one byte after last byte
|
|
addi r12,r12,-31 // r12 <- ending cache line address which can be dcbz'ed
|
|
stretch32_25:
|
|
addi r10,r10,32
|
|
cmplw r10,r12 // no more cache line to dcbz?
|
|
bge stretch32_30
|
|
dcbz 0,r10
|
|
b stretch32_25
|
|
#endif
|
|
stretch32_30:
|
|
lwz r10,0(r4) // r10 <- source pixel
|
|
stw r10,0(r9)
|
|
addi r9,r9,4
|
|
bdz stretch32_32
|
|
stw r10,0(r9) // stretching pixel to 200%
|
|
addi r4,r4,4
|
|
addi r9,r9,4
|
|
bdnz stretch32_30
|
|
stretch32_32:
|
|
add r4,r8,r4
|
|
add r9,r7,r9
|
|
addic. r6,r6,-1
|
|
bne stretch32_10
|
|
//
|
|
stretch32_50:
|
|
#if (! FULLCACHE)
|
|
add r7,r5,r7 // restore target delta
|
|
bl flushcopy_00
|
|
#endif
|
|
//
|
|
stretch32_exit:
|
|
mtlr r31
|
|
lwz r31,SLACK1(sp)
|
|
//
|
|
SPECIAL_EXIT(Stretch32)
|
|
//
|
|
//*************************************************************************************************
|
|
SPECIAL_ENTRY(Stretch16)
|
|
//
|
|
// Input Parameters:
|
|
// r3: The pointer to the parameter structure as follows.
|
|
// PARAM1 [00] : Target address
|
|
// PARAM2 [04] : Source address
|
|
// PARAM3 [08] : Number of bytes to copy per line
|
|
// PARAM4 [12] : Number of lines to copy
|
|
// PARAM5 [16] : Target line increments byte per line
|
|
// PARAM6 [20] : Source line increments byte per line
|
|
// PARAM7 [24] : Maximum number of cache lines to flush
|
|
// PARAM8 [28] : Maximum number of display lines to flush
|
|
// PARAM9 [32] : Operation control flag
|
|
// bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
|
|
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
|
|
// bit 2 (TTOUCHBIT): Target Touch flag using "dcbz" 0:No Touch, 1:Touch
|
|
// PARAM10 [36] : Register save area 1
|
|
// PARAM11 [40] : Register save area 2
|
|
// PARAM12 [44] : Register save area 3
|
|
// PARAM13 [48] : Register save area 4
|
|
// PARAM14 [52] : Register save area 5
|
|
// PARAM15 [56] : Register save area 6
|
|
//
|
|
// Register usage:
|
|
// r0: Pixel count -> Save LR
|
|
// r4: Updating source address
|
|
// r5: Number of bytes to copy per line (target)
|
|
// r6: Updating remaining number of lines to copy
|
|
// r7: Target increment bytes per line (changed for pre caluculated value)
|
|
// r8: Source increment bytes per line (changed for pre caluculated value)
|
|
// r9: Updating target address
|
|
// r10: Work register
|
|
// r11: Number of bytes to copy per line (target)
|
|
// r12: Work register
|
|
// CTR: Used for counter
|
|
//
|
|
// Restrictions:
|
|
// Copy width is assumed to be equal or shorter than target delta.
|
|
// Target is always cached VRAM and the source is always DRAM.
|
|
// This is a routine to copy 16 BPP source to 16 BPP target with
|
|
// 200% stretching. The target rectangle is assumed that exactly
|
|
// twice of source rectangle. RECT clipped area can be supported, but
|
|
// top left position has to be in the clipping area in that case.
|
|
//
|
|
stw r31,SLACK1(sp)
|
|
mflr r31
|
|
//
|
|
PROLOGUE_END(Stretch16)
|
|
//
|
|
lwz r6,PARAM4(r3) // r6 <- number of lines to copy
|
|
and. r6,r6,r6 // Any lines to copy?
|
|
beq- stretch16_exit // No -> exit
|
|
lwz r9,PARAM1(r3) // r9 <- target address
|
|
lwz r4,PARAM2(r3) // r4 <- source address
|
|
lwz r5,PARAM3(r3) // r5 <- bytes to copy per line (target)
|
|
lwz r7,PARAM5(r3) // r7 <- target byte distance between lines
|
|
lwz r8,PARAM6(r3) // r8 <- source byte distance between lines
|
|
subf r7,r5,r7 // r7 <- line delta after updating pointer (target)
|
|
srawi. r11,r5,1 // r11 <- bytes to copy per line (source)
|
|
beq- stretch16_exit // No pixel -> exit
|
|
andi. r11,r11,0xfffe // Clear LS 1 bit for odd pixel target adjustment
|
|
subf r8,r11,r8 // r8 <- line delta after updating pointer (source)
|
|
srawi r0,r5,1 // r0 <- pixel count (target)
|
|
//
stretch16_10:
mtctr r0
#if USE_DCBZ
addi r10,r9,-1 // r10 <- first dcbz candidate address minus 32
add r12,r9,r5 // r12 <- one byte after last byte
addi r12,r12,-31 // r12 <- ending cache line address which can be dcbz'ed
stretch16_15:
addi r10,r10,32
cmplw r10,r12 // no more cache line to dcbz?
bge stretch16_20
dcbz 0,r10
b stretch16_15
#endif
stretch16_20:
lhz r10,0(r4) // r10 <- source pixel
sth r10,0(r9)
addi r9,r9,2
bdz stretch16_22
sth r10,0(r9)
addi r4,r4,2
addi r9,r9,2
bdnz stretch16_20
stretch16_22:
subf r4,r11,r4 // seek back source
add r9,r7,r9 // seek forward target
addic. r6,r6,-1
beq- stretch16_50
mtctr r0
#if USE_DCBZ
addi r10,r9,-1 // r10 <- first dcbz candidate address minus 32
add r12,r9,r5 // r12 <- one byte after last byte
addi r12,r12,-31 // r12 <- ending cache line address which can be dcbz'ed
stretch16_25:
addi r10,r10,32
cmplw r10,r12 // no more cache line to dcbz?
bge stretch16_30
dcbz 0,r10
b stretch16_25
#endif
stretch16_30:
lhz r10,0(r4) // r10 <- source pixel
sth r10,0(r9)
addi r9,r9,2
bdz stretch16_32
sth r10,0(r9)
addi r4,r4,2
addi r9,r9,2
bdnz stretch16_30
stretch16_32:
add r4,r8,r4
add r9,r7,r9
addic. r6,r6,-1
bne stretch16_10
//
stretch16_50:
#if (! FULLCACHE)
add r7,r5,r7 // restore target delta
bl flushcopy_00
#endif
//
stretch16_exit:
mtlr r31
lwz r31,SLACK1(sp)
//
SPECIAL_EXIT(Stretch16)
//
#if PAINT_NEW_METHOD
//
//*************************************************************************************************
SPECIAL_ENTRY(LineFill)
//
// Input Parameters:
// r3 : Target address
// r4 : The pointer to the solid brush data (double word)
// r5 : Number of bytes to fill
// r6 : Cache control
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
//
// r4 is pointing to the following data
// PARAM1 [00] : First word of dword solid brush to use (duplicated brush)
// PARAM2 [04] : Second word of dword solid brush to use (same as the first word)
//
// Register usage:
// r0: Saved return address
// r7: Start address (cache aligned)
// r8: Word brush data
// r9: Work register
// r10: Work register
// CTR: Used for loop counter and linking
// f1: Solid dword brush to be used for the fill operation
//
// Restrictions:
// Target memory has to be cachable.
//
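// In essence LineFill is a solid fill: store the replicated brush pattern
// into r5 bytes at r3, then flush the dirtied cache lines if requested.
// Word-level C sketch (illustrative only; the head/tail alignment work and
// the dcbz/stfd cache-line path below are what the real code adds):
//
//	void LineFill_sketch(ULONG *dst, ULONG pat, int words)
//	{
//	    for (int i = 0; i < words; i++)
//	        dst[i] = pat;	// pat = brush word replicated across the fill
//	}
//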
mflr r0 // Save return address
//
PROLOGUE_END(LineFill)
//
mr r7,r3 // r7 <- saved start address
rlwinm r7,r7,0,0,26 // r7 <- 32 byte aligned start address
lwz r8,PARAM1(r4) // Load brush data to r8
cmplwi r5,MINLENGTH_FILL // Is it wide enough to use 32 byte inner loop?
bge Lfill_100 // Yes -> use long logic
//
cmplwi r5,6 // More than 6 bytes?
bgt Lfill_20 // Yes -> use medium logic
bl Lfill_10 // No -> use short logic
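// Note: the "bl" above never returns normally -- it is used only to load
// LR with the address of the jump table that follows, so Lfill_10 can
// fetch the __ShortLnFillProcS base with mflr (no TOC access needed).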
__ShortLnFillProcS:
.ualong __Lfillshort_0
.ualong __Lfillshort_0
.ualong __Lfillshort_0
.ualong __Lfillshort_0
.ualong __Lfillshort_1
.ualong __Lfillshort_1
.ualong __Lfillshort_1
.ualong __Lfillshort_1
.ualong __Lfillshort_2_0
.ualong __Lfillshort_2_1
.ualong __Lfillshort_2_2
.ualong __Lfillshort_2_3
.ualong __Lfillshort_3_0
.ualong __Lfillshort_3_1
.ualong __Lfillshort_3_2
.ualong __Lfillshort_3_3
.ualong __Lfillshort_4_0
.ualong __Lfillshort_4_1
.ualong __Lfillshort_4_2
.ualong __Lfillshort_4_3
.ualong __Lfillshort_5_0
.ualong __Lfillshort_5_1
.ualong __Lfillshort_5_2
.ualong __Lfillshort_5_3
.ualong __Lfillshort_6_0
.ualong __Lfillshort_6_1
.ualong __Lfillshort_6_2
.ualong __Lfillshort_6_3
//
__Lfillshort_0:
blr
__Lfillshort_1:
stb r8,0(r3)
addi r3,r3,1
b flush_line
__Lfillshort_2_0:
__Lfillshort_2_2:
sth r8,0(r3)
addi r3,r3,2
b flush_line
__Lfillshort_2_1:
__Lfillshort_2_3:
stb r8,0(r3)
stb r8,1(r3)
addi r3,r3,2
b flush_line
__Lfillshort_3_0:
__Lfillshort_3_2:
sth r8,0(r3)
stb r8,2(r3)
addi r3,r3,3
b flush_line
__Lfillshort_3_1:
__Lfillshort_3_3:
stb r8,0(r3)
sth r8,1(r3)
addi r3,r3,3
b flush_line
__Lfillshort_4_0:
stw r8,0(r3)
addi r3,r3,4
b flush_line
__Lfillshort_4_1:
__Lfillshort_4_3:
stb r8,0(r3)
sth r8,1(r3)
stb r8,3(r3)
addi r3,r3,4
b flush_line
__Lfillshort_4_2:
sth r8,0(r3)
sth r8,2(r3)
addi r3,r3,4
b flush_line
__Lfillshort_5_0:
stw r8,0(r3)
stb r8,4(r3)
addi r3,r3,5
b flush_line
__Lfillshort_5_1:
stb r8,0(r3)
sth r8,1(r3)
sth r8,3(r3)
addi r3,r3,5
b flush_line
__Lfillshort_5_2:
sth r8,0(r3)
sth r8,2(r3)
stb r8,4(r3)
addi r3,r3,5
b flush_line
__Lfillshort_5_3:
stb r8,0(r3)
stw r8,1(r3)
addi r3,r3,5
b flush_line
__Lfillshort_6_0:
stw r8,0(r3)
sth r8,4(r3)
addi r3,r3,6
b flush_line
__Lfillshort_6_1:
stb r8,0(r3)
sth r8,1(r3)
sth r8,3(r3)
stb r8,5(r3)
addi r3,r3,6
b flush_line
__Lfillshort_6_2:
sth r8,0(r3)
stw r8,2(r3)
addi r3,r3,6
b flush_line
__Lfillshort_6_3:
stb r8,0(r3)
stw r8,1(r3)
stb r8,5(r3)
addi r3,r3,6
b flush_line
//
// Short fill <= 6 bytes
//
Lfill_10:
mflr r10 // r10 <- __ShortLnFillProcS table address
rlwinm r9,r5,4,25,27 // bit 25~27 of r9 <- width (0~6)
rlwimi r9,r3,2,28,29 // bit 28~29 of r9 <- mod 4 of target address
lwzx r9,r10,r9 // r9 <- subroutine to call
mtctr r9
mtlr r0 // Restore return address
bctr // and jump to corresponding fill routine
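// The computed index above selects one of the 28 entry points. In C terms
// (a sketch; "table" is __ShortLnFillProcS viewed as an array of pointers):
//
//	proc = table[width * 4 + (target_address & 3)];
//
// i.e. four alignment variants for each width 0..6.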
//
// 63 > width > 6 -- medium process
//
Lfill_20:
andi. r10,r3,0x01 // Word alignment 1 or 3?
beq Lfill_30
stb r8,0(r3)
addi r3,r3,1
addi r5,r5,-1
Lfill_30:
andi. r10,r3,0x02 // Word alignment 2?
beq Lfill_40
sth r8,0(r3)
addi r3,r3,2
addi r5,r5,-2
Lfill_40:
srawi r10,r5,2 // r10 <- inner loop count
Lfill_50:
stw r8,0(r3)
addi r3,r3,4
addic. r10,r10,-1
bne Lfill_50
andi. r10,r5,0x02 // Remaining half word?
beq Lfill_60
sth r8,0(r3)
addi r3,r3,2
Lfill_60:
andi. r10,r5,0x01 // Remaining byte?
beq Lfill_70
stb r8,0(r3)
addi r3,r3,1
Lfill_70:
mtlr r0 // Restore return address
b flush_line
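// The medium path above is the usual align-then-bulk pattern: peel a byte
// and/or halfword until r3 is word aligned, fill whole words, then finish
// the halfword/byte tail. Rough C sketch (assumes the brush is replicated
// so any byte/halfword lane of "pat" is valid at its alignment):
//
//	BYTE *p = dst;
//	while (cb && ((ULONG)p & 3)) { *p++ = (BYTE)pat; cb--; }	/* head */
//	for (; cb >= 4; cb -= 4, p += 4) *(ULONG *)p = pat;	/* bulk words */
//	while (cb--) *p++ = (BYTE)pat;	/* tail */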
//
// width >= 63 -- long process
//
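// Strategy for the long path: peel bytes until r3 is 32-byte aligned, then
// each inner-loop pass owns one full cache line -- optionally dcbz'ed so the
// line is allocated and zeroed without being read from memory -- and fills
// it with four 8-byte stfd stores. Conceptual sketch (dcbz itself has no
// portable C equivalent; dst is a double*, fpat the 8-byte brush):
//
//	for (; lines--; dst += 4) {
//	    /* dcbz: zero-allocate the 32-byte line, no memory read */
//	    dst[0] = fpat; dst[1] = fpat; dst[2] = fpat; dst[3] = fpat;
//	}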
Lfill_100:
lfd f1,PARAM1(r4) // f1 <- FPR brush
andi. r10,r3,0x01 // Word alignment 1 or 3?
beq Lfill_110
stb r8,0(r3)
addi r3,r3,1
addi r5,r5,-1
Lfill_110:
andi. r10,r3,0x02 // Word alignment 2?
beq Lfill_120
sth r8,0(r3)
addi r3,r3,2
addi r5,r5,-2
Lfill_120:
andi. r10,r3,0x1c // r10 <- number of bytes to fill to make cache line alignment
beq Lfill_130
stw r8,0(r3)
addi r3,r3,4
addi r5,r5,-4
b Lfill_120
Lfill_130:
srawi r10,r5,5 // r10 <- innermost loop (32 byte) count to fill
mtctr r10
Lfill_140:
#if USE_DCBZ
dcbz 0,r3 // Clear cache line
#endif
stfd f1,0(r3) // Fill 32 bytes of data
stfd f1,8(r3)
stfd f1,16(r3)
stfd f1,24(r3)
addi r3,r3,32 // Increment target pointer
bdnz Lfill_140
//
andi. r10,r5,0x1c // r10 <- remaining bytes that can be filled by word fill
beq Lfill_160
Lfill_150:
stw r8,0(r3)
addi r3,r3,4
addic. r10,r10,-4
bne Lfill_150
Lfill_160:
andi. r10,r5,0x02 // Remaining half word to fill?
beq Lfill_170
sth r8,0(r3)
addi r3,r3,2
Lfill_170:
andi. r10,r5,0x01 // Remaining byte to fill?
beq Lfill_180
stb r8,0(r3)
addi r3,r3,1
Lfill_180:
mtlr r0 // Restore return address
//
flush_line:
#if (! FULLCACHE)
andis. r6,r6,TFLUSHBIT // Need to flush target cache?
beq- flush_line_exit // No -> exit
flush_line_10:
dcbf 0,r7 // Yes -> flush cache
addi r7,r7,32
cmplw r7,r3 // over end address?
blt flush_line_10
flush_line_exit:
#endif
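// flush_line walks dcbf from the saved 32-byte-aligned start (r7) up to the
// final target pointer (r3), writing back and invalidating each dirtied
// line. Conceptually:
//
//	for (p = start & ~31; p < end; p += 32)
//	    dcbf(p);	/* write back & invalidate one 32-byte cache line */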
SPECIAL_EXIT(LineFill)
//
//*************************************************************************************************
SPECIAL_ENTRY(LineXor)
//
// Input Parameters:
// r3 : Target address
// r4 : Solid brush
// r5 : Number of bytes to xor
// r6 : Cache control
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
//
// Register usage:
// r0: Saved return address
// r7: Start address (cache aligned)
// r8: Work register
// r9: Work register
// r10: Work register
// r11: Work register
// CTR: Used for loop counter and linking
//
// Restrictions:
// Target memory has to be cachable.
//
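// LineXor is the xor counterpart of LineFill: dst ^= brush over r5 bytes.
// Because xor is self-inverting, drawing the same span twice restores the
// original pixels. Word-level C sketch (illustrative only):
//
//	void LineXor_sketch(ULONG *dst, ULONG pat, int words)
//	{
//	    for (int i = 0; i < words; i++)
//	        dst[i] ^= pat;	// read-modify-write, unlike the pure-store fill
//	}
//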
mflr r0 // Save return address
//
PROLOGUE_END(LineXor)
//
mr r7,r3 // r7 <- saved start address
rlwinm r7,r7,0,0,26 // r7 <- 32 byte aligned start address
cmplwi r5,MINLENGTH_FILL // Is it wide enough to use 32 byte inner loop?
bge Lxor_100 // Yes -> use long logic
//
cmplwi r5,6 // More than 6 bytes?
bgt Lxor_20 // Yes -> use medium logic
bl Lxor_10 // No -> use short logic
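// As in LineFill, this "bl" exists only to load LR with the address of the
// jump table that follows; Lxor_10 recovers __ShortLnXorProcS via mflr.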
__ShortLnXorProcS:
.ualong __Lxorshort_0
.ualong __Lxorshort_0
.ualong __Lxorshort_0
.ualong __Lxorshort_0
.ualong __Lxorshort_1
.ualong __Lxorshort_1
.ualong __Lxorshort_1
.ualong __Lxorshort_1
.ualong __Lxorshort_2_0
.ualong __Lxorshort_2_1
.ualong __Lxorshort_2_2
.ualong __Lxorshort_2_3
.ualong __Lxorshort_3_0
.ualong __Lxorshort_3_1
.ualong __Lxorshort_3_2
.ualong __Lxorshort_3_3
.ualong __Lxorshort_4_0
.ualong __Lxorshort_4_1
.ualong __Lxorshort_4_2
.ualong __Lxorshort_4_3
.ualong __Lxorshort_5_0
.ualong __Lxorshort_5_1
.ualong __Lxorshort_5_2
.ualong __Lxorshort_5_3
.ualong __Lxorshort_6_0
.ualong __Lxorshort_6_1
.ualong __Lxorshort_6_2
.ualong __Lxorshort_6_3
//
__Lxorshort_0:
blr
__Lxorshort_1:
lbz r9,0(r3)
xor r9,r9,r4
stb r9,0(r3)
addi r3,r3,1
b flush_line
__Lxorshort_2_0:
__Lxorshort_2_2:
lhz r9,0(r3)
xor r9,r9,r4
sth r9,0(r3)
addi r3,r3,2
b flush_line
__Lxorshort_2_1:
__Lxorshort_2_3:
lbz r9,0(r3)
lbz r10,1(r3)
xor r9,r9,r4
xor r10,r10,r4
stb r9,0(r3)
stb r10,1(r3)
addi r3,r3,2
b flush_line
__Lxorshort_3_0:
__Lxorshort_3_2:
lhz r9,0(r3)
lbz r10,2(r3)
xor r9,r9,r4
xor r10,r10,r4
sth r9,0(r3)
stb r10,2(r3)
addi r3,r3,3
b flush_line
__Lxorshort_3_1:
__Lxorshort_3_3:
lbz r9,0(r3)
lhz r10,1(r3)
xor r9,r9,r4
xor r10,r10,r4
stb r9,0(r3)
sth r10,1(r3)
addi r3,r3,3
b flush_line
__Lxorshort_4_0:
lwz r9,0(r3)
xor r9,r9,r4
stw r9,0(r3)
addi r3,r3,4
b flush_line
__Lxorshort_4_1:
__Lxorshort_4_3:
lbz r9,0(r3)
lhz r10,1(r3)
lbz r11,3(r3)
xor r9,r9,r4
xor r10,r10,r4
xor r11,r11,r4
stb r9,0(r3)
sth r10,1(r3)
stb r11,3(r3)
addi r3,r3,4
b flush_line
__Lxorshort_4_2:
lhz r9,0(r3)
lhz r10,2(r3)
xor r9,r9,r4
xor r10,r10,r4
sth r9,0(r3)
sth r10,2(r3)
addi r3,r3,4
b flush_line
__Lxorshort_5_0:
lwz r9,0(r3)
lbz r10,4(r3)
xor r9,r9,r4
xor r10,r10,r4
stw r9,0(r3)
stb r10,4(r3)
addi r3,r3,5
b flush_line
__Lxorshort_5_1:
lbz r9,0(r3)
lhz r10,1(r3)
lhz r11,3(r3)
xor r9,r9,r4
xor r10,r10,r4
xor r11,r11,r4
stb r9,0(r3)
sth r10,1(r3)
sth r11,3(r3)
addi r3,r3,5
b flush_line
__Lxorshort_5_2:
lhz r9,0(r3)
lhz r10,2(r3)
lbz r11,4(r3)
xor r9,r9,r4
xor r10,r10,r4
xor r11,r11,r4
sth r9,0(r3)
sth r10,2(r3)
stb r11,4(r3)
addi r3,r3,5
b flush_line
__Lxorshort_5_3:
lbz r9,0(r3)
lwz r10,1(r3)
xor r9,r9,r4
xor r10,r10,r4
stb r9,0(r3)
stw r10,1(r3)
addi r3,r3,5
b flush_line
__Lxorshort_6_0:
lwz r9,0(r3)
lhz r10,4(r3)
xor r9,r9,r4
xor r10,r10,r4
stw r9,0(r3)
sth r10,4(r3)
addi r3,r3,6
b flush_line
__Lxorshort_6_1:
lbz r8,0(r3)
lhz r9,1(r3)
lhz r10,3(r3)
lbz r11,5(r3)
xor r8,r8,r4
xor r9,r9,r4
xor r10,r10,r4
xor r11,r11,r4
stb r8,0(r3)
sth r9,1(r3)
sth r10,3(r3)
stb r11,5(r3)
addi r3,r3,6
b flush_line
__Lxorshort_6_2:
lhz r9,0(r3)
lwz r10,2(r3)
xor r9,r9,r4
xor r10,r10,r4
sth r9,0(r3)
stw r10,2(r3)
addi r3,r3,6
b flush_line
__Lxorshort_6_3:
lbz r9,0(r3)
lwz r10,1(r3)
lbz r11,5(r3)
xor r9,r9,r4
xor r10,r10,r4
xor r11,r11,r4
stb r9,0(r3)
stw r10,1(r3)
stb r11,5(r3)
addi r3,r3,6
b flush_line
//
// Short xor <= 6 bytes
//
Lxor_10:
mflr r10 // r10 <- __ShortLnXorProcS table address
rlwinm r9,r5,4,25,27 // bit 25~27 of r9 <- width (0~6)
rlwimi r9,r3,2,28,29 // bit 28~29 of r9 <- mod 4 of target address
lwzx r9,r10,r9 // r9 <- subroutine to call
mtctr r9
mtlr r0 // Restore return address
bctr // and jump to corresponding xor routine
//
// 63 > width > 6 -- medium process
//
Lxor_20:
andi. r10,r3,0x01 // Word alignment 1 or 3?
beq Lxor_30
lbz r9,0(r3)
xor r9,r9,r4
stb r9,0(r3)
addi r3,r3,1
addi r5,r5,-1
Lxor_30:
andi. r10,r3,0x02 // Word alignment 2?
beq Lxor_40
lhz r9,0(r3)
xor r9,r9,r4
sth r9,0(r3)
addi r3,r3,2
addi r5,r5,-2
Lxor_40:
srawi r10,r5,2 // r10 <- inner loop count
Lxor_50:
lwz r9,0(r3)
xor r9,r9,r4
stw r9,0(r3)
addi r3,r3,4
addic. r10,r10,-1
bne Lxor_50
andi. r10,r5,0x02 // Remaining half word?
beq Lxor_60
lhz r9,0(r3)
xor r9,r9,r4
sth r9,0(r3)
addi r3,r3,2
Lxor_60:
andi. r10,r5,0x01 // Remaining byte?
beq Lxor_70
lbz r9,0(r3)
xor r9,r9,r4
stb r9,0(r3)
addi r3,r3,1
Lxor_70:
mtlr r0 // Restore return address
b flush_line
//
// width >= 63 -- long process
//
Lxor_100:
andi. r10,r3,0x01 // Word alignment 1 or 3?
beq Lxor_110
lbz r9,0(r3)
xor r9,r9,r4
stb r9,0(r3)
addi r3,r3,1
addi r5,r5,-1
Lxor_110:
andi. r10,r3,0x02 // Word alignment 2?
beq Lxor_120
lhz r9,0(r3)
xor r9,r9,r4
sth r9,0(r3)
addi r3,r3,2
addi r5,r5,-2
Lxor_120:
andi. r10,r3,0x1c // r10 <- number of bytes to xor to make cache line alignment
beq Lxor_130
lwz r9,0(r3)
xor r9,r9,r4
stw r9,0(r3)
addi r3,r3,4
addi r5,r5,-4
b Lxor_120
Lxor_130:
srawi r10,r5,5 // r10 <- innermost loop (32 byte) count to xor
mtctr r10
Lxor_140:
lwz r8,0(r3)
lwz r9,4(r3)
lwz r10,8(r3)
lwz r11,12(r3)
xor r8,r8,r4
xor r9,r9,r4
xor r10,r10,r4
xor r11,r11,r4
stw r8,0(r3)
stw r9,4(r3)
stw r10,8(r3)
stw r11,12(r3)
lwz r8,16(r3)
lwz r9,20(r3)
lwz r10,24(r3)
lwz r11,28(r3)
xor r8,r8,r4
xor r9,r9,r4
xor r10,r10,r4
xor r11,r11,r4
stw r8,16(r3)
stw r9,20(r3)
stw r10,24(r3)
stw r11,28(r3)
addi r3,r3,32 // Increment target pointer
bdnz Lxor_140
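// Each Lxor_140 pass covers exactly one 32-byte cache line as two groups of
// four word-sized load/xor/store triples, batching the loads ahead of the
// dependent stores. Equivalent C per line (a sketch):
//
//	for (int i = 0; i < 8; i++)
//	    line[i] ^= pat;	/* line = (ULONG *)cache_line_base */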
//
andi. r10,r5,0x1c // r10 <- remaining bytes that can be xored by word xor
beq Lxor_160
Lxor_150:
lwz r9,0(r3)
xor r9,r9,r4
stw r9,0(r3)
addi r3,r3,4
addic. r10,r10,-4
bne Lxor_150
Lxor_160:
andi. r10,r5,0x02 // Remaining half word to xor?
beq Lxor_170
lhz r9,0(r3)
xor r9,r9,r4
sth r9,0(r3)
addi r3,r3,2
Lxor_170:
andi. r10,r5,0x01 // Remaining byte to xor?
beq Lxor_180
lbz r9,0(r3)
xor r9,r9,r4
stb r9,0(r3)
addi r3,r3,1
Lxor_180:
mtlr r0 // Restore return address
b flush_line
//
SPECIAL_EXIT(LineXor)
#endif // PAINT_NEW_METHOD
//