Windows NT 4.0 source code leak
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

1226 lines
42 KiB

//
// Copyright (c) 1995 FirePower Systems, Inc.
// DO NOT DISTRIBUTE without permission
//
// $RCSfile: asmfunc.s $
// $Revision: 1.1 $
// $Date: 1996/03/08 01:16:37 $
// $Locker: $
//
//
// Copyright (c) 1994 FirePower Systems, Inc.
//
// Module Name:
// asmfunc.s
//
// Abstract:
// This module includes several asmmebler functions to be used
// in PSIDISP.DLL display driver for PowerPro & PowerTop. These
// functions are used only for INVETIGATION - not needed for
// release product.
//
// Author:
// Neil Ogura: 9-7-1994
//
// Environment:
// User mode.
//
// Revision History:
//
//--
#include "ksppc.h"
#include "ladj.h"
// Conditional compiling flag for cache control - testing purpose only
// for release version all should be TRUE and cache control is done by parameter
#define TGTTOUCH 1
#define TGTFLUSH 1
#define SRCFLUSH 1
// This flag is used to select new copy method which Dave Stewart discovered
#define NEWMETHOD 1
#if TGTTOUCH
#define T_TOUCH dcbz r7,r9
#else
#define T_TOUCH
#endif
// Cache Flush control bit for memcpy2 & memset2 parameter MS half word
#define SFLUSHBIT 0x8000
#define TFLUSHBIT 0x4000
#define TTOUCHBIT 0x2000
// Maximum L1 cache size to flush -- must be lass than 16 bits value
#define MAXFLUSH 32*1024
#define MINLENGTH 64
#define MINDISTANCE 29
//
LEAF_ENTRY(memcpy2)
//
// Input Parameters:
// r3: Target address (unchanged for return value)
// r4: Source address
// r5: Move length in bytes
// r6: Cache flush flag
// bit 0 (SFLUSHBIT): Source Flush flag 0:No Flush, 1:Flush
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
// bit 2 (TTOUCHBIT): Source and Target touch flag 0:No Touch, 1:Touch
// bit 16 ~ 31: Is used to keep size to flush (move length or MAXFLUSH whichever smaller)
// inside this routine.
//
// Register usage:
// r7: Cache touch offset
// r8: Temporary work (local loop counter, etc.)
// r9: Target address
// r10 ~ r12: Used for data move
// CTR: Used for loop counter
//
#if TGTFLUSH || SRCFLUSH
rlwimi r6,r5,0,16,31 // Retrieve bit 16 ~ 31 of r5 into r6
cmplwi r5,MAXFLUSH // Size exceeds maximum L1 cache size?
ble lab05 // No -> Flush size is same as original length (now in r6)
andis. r6,r6,0xffff // Yes -> Clear bit 16 ~ 31 of r6
ori r6,r6,MAXFLUSH // and set MAXFLUSH in but 16 ~ 31
lab05:
#endif // TGTFLUSH || SRCFLUSH
mr r9,r3 // Move target address to r9 (to return r3 unchanged)
cmplw r4,r9 // Which direction to move?
blt srclow // SRC lower -> move from the end to top
cmplwi r5,4 // Less than 4 bytes?
blt lastmv1 // YES -> do special short move
andi. r8,r9,0x3 // TGT word alignment check
beq lab15 // Word aligned target -> proceed without adjustment
subfic r8,r8,4 // Not word aligned -> move unaligned bytes first
lbz r10,0(r4)
stb r10,0(r9)
cmpwi r8,2
blt lab10
lbz r10,1(r4)
stb r10,1(r9)
beq lab10
lbz r10,2(r4)
stb r10,2(r9)
lab10: add r4,r4,r8 // Adjust source pointer
add r9,r9,r8 // target pointer
subf r5,r8,r5 // and length
lab15: li r7,4 // Source cache touch offset
andi. r8,r4,0x03 // SRC and TGT aligned check
beq+ wdalgn1 // Word aligned -> easy move
cmpwi r8,2 // Half word aligned?
beq+ hwalgn1 // Yes -> half word align move
blt lftsft1 // No -> check shift direction
//
// Case1: Need 1 byte right shift (or 3 bytes left shift)
//
#if TGTTOUCH
cmplwi r5,MINLENGTH // Less than MINLENGTH bytes?
blt lab35 // Yes -> do 4 bytes unit move
subf r8,r9,r4 // Check distance between source & target
cmpwi r8,MINDISTANCE // Too close to touch target? (may destroy uncopied source)
blt lab35 // Yes -> do 4 bytes unit move
andis. r8,r6,TTOUCHBIT // Touch source and target cache?
beq lab35 // No -> do 4 bytes unit move (no cache control)
andi. r8,r9,0x1c // r8 <- number of bytes to move to make 32 byte aligned target
lbz r10,0(r4) // Load the first byte in LS r10
addi r4,r4,-3 // Adjust source
addi r9,r9,-4 // and target pointer ro make update load
beq lab25 // Target is 32 bytes aligned -> skip pre-move
subfic r8,r8,32 // r8 <- bytes to move to make 32 byte alignment
subf r5,r8,r5 // Adjust length to move in advance
lab20: lwzu r11,4(r4) // Load next word
rlwimi r10,r11,8,0,23 // Insert LS 3 bytes in r11 to MS 3 bytes in r10
stwu r10,4(r9) // Store
rlwinm r10,r11,8,24,31 // Move MS 1 byte in r11 to LS position in r10
addic. r8,r8,-4
bne lab20
lab25: srawi. r8,r5,5 // r8 <- number of 32 bytes units
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
mtctr r8 // Use CTR as a counter for 8 word units to move
#else // TGTTOUCH
srawi. r8,r5,5 // r8 <- number of 32 bytes units
beq lab35 // less than 32 bytes to move
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
lbz r10,0(r4) // Load the first byte in LS r10
addi r4,r4,-3 // Adjust source
addi r9,r9,-4 // and target pointer ro make update load
mtctr r8 // Use CTR as a counter for 8 word units to move
#endif // TGTTOUCH
lab30:
T_TOUCH // Touch next target cache line
lwzu r11,4(r4) // Load following
lwzu r12,4(r4) // two words in r11 & r12
rlwimi r10,r11,8,0,23 // Insert LS 3 bytes in r11 to MS 3 bytes in r10
rlwinm r11,r11,8,24,31 // Move MS 1 byte in r11 to LS position
rlwimi r11,r12,8,0,23 // Insert LS 3 bytes in r12 to MS 3 bytes in r11
stwu r10,4(r9) // Store r10
stwu r11,4(r9) // Store r11
rlwinm r10,r12,8,24,31 // Move MS 1 bytes in r12 to LS byte in r10
lwzu r11,4(r4) // Repeat this 4 times to process 32 bytes
lwzu r12,4(r4)
rlwimi r10,r11,8,0,23
rlwinm r11,r11,8,24,31
rlwimi r11,r12,8,0,23
stwu r10,4(r9)
stwu r11,4(r9)
rlwinm r10,r12,8,24,31
lwzu r11,4(r4)
lwzu r12,4(r4)
rlwimi r10,r11,8,0,23
rlwinm r11,r11,8,24,31
rlwimi r11,r12,8,0,23
stwu r10,4(r9)
stwu r11,4(r9)
rlwinm r10,r12,8,24,31
lwzu r11,4(r4)
lwzu r12,4(r4)
rlwimi r10,r11,8,0,23
rlwinm r11,r11,8,24,31
rlwimi r11,r12,8,0,23
stwu r10,4(r9)
stwu r11,4(r9)
rlwinm r10,r12,8,24,31
bdnz lab30 // End of main loop
srawi. r8,r5,2 // Check if more than 4 bytes left to move
bne+ lab36 // More than 4 bytes -> continue to move by 4 byte unit
addi r4,r4,3 // Less than 4 bytes -> adjust pointer
addi r9,r9,4 // and proceed to lastmv
b lastmv1
lab35: srawi. r8,r5,2 // Check if more than 4 bytes left to move
beq lastmv1 // No -> just proceed to lastmv
lbz r10,0(r4) // Load first byte
addi r4,r4,-3 // Adjust source
addi r9,r9,-4 // and target pointer to make update word access
lab36: lwzu r11,4(r4) // Load next word
rlwimi r10,r11,8,0,23 // Insert LS 3 bytes in r11 to MS 3 bytes in r10
stwu r10,4(r9) // Store
rlwinm r10,r11,8,24,31 // Move MS 1 byte in r11 to LS position in r10
addic. r8,r8,-1
bne lab36
addi r4,r4,3 // Adjust source and target pointer
addi r9,r9,4 // to point the next byte to move
b lastmv1 // then proceed to lastmv
//
// Case2: Need 1 byte left shift
//
lftsft1:
#if TGTTOUCH
cmplwi r5,MINLENGTH // Less than MINLENGTH bytes?
blt lab55 // Yes -> do 4 bytes unit move
subf r8,r9,r4 // Check distance between source & target
cmpwi r8,MINDISTANCE // Too close to touch target (may destroy uncopied source)?
blt lab55 // Yes -> do 4 bytes unit move
andis. r8,r6,TTOUCHBIT // Touch source and target cache?
beq lab55 // No -> do 4 bytes unit move (no cache control)
andi. r8,r9,0x1c // r8 <- number of bytes to move to make 32 byte aligned target
addi r4,r4,-1 // Adjust source pointer to make update word access
lwz r10,0(r4) // Load needed three bytes in MS r10
addi r9,r9,-4 // Adjust target pointer to make update word access
beq lab45 // Target is 32 bytes aligned -> skip pre-move
subfic r8,r8,32 // r8 <- bytes to move to make 32 byte alignment
subf r5,r8,r5 // Adjust length to move in advance
lab40: rlwinm r11,r10,24,8,31 // Move MS 3 bytes in r10 to LS bytes in r11
lwzu r10,4(r4) // Load following word
rlwimi r11,r10,24,0,7 // Insert LS 1 bytes in r10 to MS 1 byte in r11
stwu r11,4(r9) // Store r11
addic. r8,r8,-4
bne lab40
lab45: srawi. r8,r5,5 // r8 <- number of 32 bytes units
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
mtctr r8 // Use CTR as a counter for 8 word units to move
#else // TGTTOUCH
srawi. r8,r5,5 // r8 <- number of 32 bytes units
beq lab55 // less than 32 bytes to move
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
addi r4,r4,-1 // Adjust source pointer to make update word access
lwz r10,0(r4) // Load needed three bytes in MS r10
addi r9,r9,-4 // Adjust target pointer to make update word access
mtctr r8 // Use CTR as a counter for 8 word units to move
#endif // TGTTOUCH
lab50:
T_TOUCH // Touch next target cache line
rlwinm r11,r10,24,8,31 // Move MS 3 bytes in r10 to LS bytes in r11
lwzu r12,4(r4) // Load following
lwzu r10,4(r4) // two words in r12 & r10
rlwimi r11,r12,24,0,7 // Insert LS 1 bytes in r12 to MS 1 byte in r11
rlwinm r12,r12,24,8,31 // Move MS 3 byte in r12 to LS position
rlwimi r12,r10,24,0,7 // Insert LS 1 bytes in r10 to MS 1 byte in r12
stwu r11,4(r9) // Store r11
stwu r12,4(r9) // Store r12
rlwinm r11,r10,24,8,31 // Repeat this 4 times to process 32 bytes
lwzu r12,4(r4)
lwzu r10,4(r4)
rlwimi r11,r12,24,0,7
rlwinm r12,r12,24,8,31
rlwimi r12,r10,24,0,7
stwu r11,4(r9)
stwu r12,4(r9)
rlwinm r11,r10,24,8,31
lwzu r12,4(r4)
lwzu r10,4(r4)
rlwimi r11,r12,24,0,7
rlwinm r12,r12,24,8,31
rlwimi r12,r10,24,0,7
stwu r11,4(r9)
stwu r12,4(r9)
rlwinm r11,r10,24,8,31
lwzu r12,4(r4)
lwzu r10,4(r4)
rlwimi r11,r12,24,0,7
rlwinm r12,r12,24,8,31
rlwimi r12,r10,24,0,7
stwu r11,4(r9)
stwu r12,4(r9)
bdnz lab50 // End of main loop
srawi. r8,r5,2 // Check if more than 4 bytes left to move
bne+ lab56 // More than 4 bytes -> continue to move by 4 byte unit
addi r4,r4,1 // Less than 4 bytes -> adjust pointer
addi r9,r9,4 // and proceed to lastmv
b lastmv1
lab55: srawi. r8,r5,2 // Check if more than 4 bytes left to move
beq lastmv1 // No -> just proceed to lastmv
addi r4,r4,-1 // Adjust source pointer to make update word access
lwz r10,0(r4) // Load needed three bytes in MS r10
addi r9,r9,-4 // Adjust target pointer to make update word access
lab56: rlwinm r11,r10,24,8,31 // Move MS 3 bytes in r10 to LS bytes in r11
lwzu r10,4(r4) // Load following word
rlwimi r11,r10,24,0,7 // Insert LS 1 bytes in r10 to MS 1 byte in r11
stwu r11,4(r9) // Store r11
addic. r8,r8,-1
bne lab56
addi r4,r4,1 // Adjust source and target pointer
addi r9,r9,4 // to point the next byte to move
b lastmv1 // then proceed to lastmv
//
// Case3: Need 2 byte shift
//
hwalgn1:
#if TGTTOUCH
cmplwi r5,MINLENGTH // Less than MINLENGTH bytes?
blt lab75 // Yes -> do 4 bytes unit move
subf r8,r9,r4 // Check distance between source & target
cmpwi r8,MINDISTANCE // Too close to touch target (may destroy uncopied source)?
blt lab75 // Yes -> do 4 bytes unit move
andis. r8,r6,TTOUCHBIT // Touch source and target cache?
beq lab75 // No -> do 4 bytes unit move (no cache control)
andi. r8,r9,0x1c // r8 <- number of bytes to move to make 32 byte aligned target
lhz r10,0(r4) // Load needed two bytes in r10
addi r4,r4,-2 // Adjust source
addi r9,r9,-4 // and target pointer to make update word load
beq lab65 // target is 32 bytes aligned
subfic r8,r8,32 // r8 <- bytes to move to make 32 byte alignment
subf r5,r8,r5 // Adjust length to move in advance
lab60: lwzu r11,4(r4) // Load following word in r11
rlwimi r10,r11,16,0,15 // Insert LS 2 bytes in r11 to MS 2 bytes in r10
stwu r10,4(r9) // Store r10
rlwinm r10,r11,16,16,31 // Move MS 2 bytes in r10 to LS position
addic. r8,r8,-4
bne lab60
lab65: srawi. r8,r5,5 // r8 <- number of 32 bytes units
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
mtctr r8 // Use CTR as a counter for 8 word units to move
#else // TGTTOUCH
srawi. r8,r5,5 // r8 <- number of 32 bytes units
beq lab75 // less than 32 bytes to move
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
lhz r10,0(r4) // Load needed two bytes in r10
addi r4,r4,-2 // Adjust source
addi r9,r9,-4 // and target pointer to make update word load
mtctr r8 // Use CTR as a counter for 8 word units to move
#endif // TGTTOUCH
lab70:
T_TOUCH // Touch next target cache line
lwzu r11,4(r4) // Load following two word in r11
lwzu r12,4(r4) // and r12
rlwimi r10,r11,16,0,15 // Insert LS 2 bytes in r11 to MS 2 bytes in r10
rlwinm r11,r11,16,16,31 // Move MS 2 bytes in r11 to MS 2 bytes in r11
rlwimi r11,r12,16,0,15 // Insert LS 2 bytes in r12 to MS 2 bytes in r11
stwu r10,4(r9) // Store r10
stwu r11,4(r9) // and r11
rlwinm r10,r12,16,16,31 // Move MS 2 bytes in r12 to LS 2 bytes in r10
lwzu r11,4(r4) // Repeat this 4 times to process 32 bytes
lwzu r12,4(r4)
rlwimi r10,r11,16,0,15
rlwinm r11,r11,16,16,31
rlwimi r11,r12,16,0,15
stwu r10,4(r9)
stwu r11,4(r9)
rlwinm r10,r12,16,16,31
lwzu r11,4(r4)
lwzu r12,4(r4)
rlwimi r10,r11,16,0,15
rlwinm r11,r11,16,16,31
rlwimi r11,r12,16,0,15
stwu r10,4(r9)
stwu r11,4(r9)
rlwinm r10,r12,16,16,31
lwzu r11,4(r4)
lwzu r12,4(r4)
rlwimi r10,r11,16,0,15
rlwinm r11,r11,16,16,31
rlwimi r11,r12,16,0,15
stwu r10,4(r9)
stwu r11,4(r9)
rlwinm r10,r12,16,16,31
bdnz lab70 // End of main loop
srawi. r8,r5,2 // Check if more than 4 bytes left to move
bne+ lab76 // More than 4 bytes -> continue to move by 4 byte unit
addi r4,r4,2 // Less than 4 bytes -> adjust pointer
addi r9,r9,4 // and proceed to lastmv
b lastmv1
lab75: srawi. r8,r5,2 // Check if more than 4 bytes left to move
beq lastmv1 // No -> just proceed to lastmv
lhz r10,0(r4) // Load needed two bytes in r10
addi r4,r4,-2 // Adjust source
addi r9,r9,-4 // and target pointer to make update word load
lab76: lwzu r11,4(r4) // Load following word in r11
rlwimi r10,r11,16,0,15 // Insert LS 2 bytes in r11 to MS 2 bytes in r10
stwu r10,4(r9) // Store r10
rlwinm r10,r11,16,16,31 // Move MS 2 bytes in r11 to LS position
addic. r8,r8,-1
bne lab76
addi r4,r4,2 // Adjust source and target pointer
addi r9,r9,4 // to point the next byte to move
b lastmv1 // then proceed to lastmv
//
// Case4: No need for shift (source & target aligned)
//
#if NEWMETHOD
wdalgn1:
#if TGTTOUCH
cmplwi r5,MINLENGTH+96 // Less than MINLENGTH bytes?
blt lab95 // Yes -> do 4 bytes unit move
subf r8,r9,r4 // Check distance between source & target
cmpwi r8,MINDISTANCE+96 // Too close to touch target (may destroy uncopied source)?
blt lab95 // Yes -> do 4 bytes unit move
andis. r8,r6,TTOUCHBIT // Touch source and target cache?
beq lab95 // No -> do 4 bytes unit move (no cache control)
andi. r8,r9,0x1c // r8 <- number of bytes to move to make 32 byte aligned target
addi r9,r9,-4 // Adjust source
addi r4,r4,-4 // and target pointer to make updated access
beq lab85 // Target is 32 bytes aligned -> skip pre-move
subfic r8,r8,32 // r8 <- bytes to move to make 32 byte alignment
subf r5,r8,r5 // Adjust length to move in advance
lab80: lwzu r11,4(r4) // Load next word
stwu r11,4(r9) // Store
addic. r8,r8,-4
bne lab80
lab85: srawi. r8,r5,7 // r8 <- number of 128 bytes units
rlwinm r5,r5,0,25,31 // r5 <- remaining length to move after this loop is done
mtctr r8 // Use CTR as a counter for 8 word units to move
#else // TGTTOUCH
b lab95
#endif // TGTTOUCH
lab90:
// first we zero 4 cache lines at the target. Supposedly single cache line aligned by here
// this has been checked and seems to work as expected
li r10,4 // r9 comes in pointing at last moved target, need to add 4
dcbz r10,r9 // create target line 0
li r10,4+32
dcbz r10,r9 // create target line 1
li r10,4+32+32
dcbz r10,r9 // create target line 2
li r10,4+32+32+32
dcbz r10,r9 // create target line 3
lwzu r11,4(r4) // Load and store 8 times (32 bytes)
li r10,32 // the intent here is to start a non-cache-blocking prefetch of the next line
dcbt r10,r4 // immediately after the load
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4) // Load and store 8 times (32 bytes)
dcbt r10,r4 // immediately after the load
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4) // Load and store 8 times (32 bytes)
dcbt r10,r4 // immediately after the load
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4) // Load and store 8 times (32 bytes)
dcbt r10,r4 // immediately after the load
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
// now we flush the old target from the last time thru the loop
// this should fill the write buffer and should come after the reads for
// best performance
li r10,-4 // this could be 0, rather than -4 ne?
dcbf r10,r9 // since r9 is still at last moved target -1
li r10,-4-32
dcbf r10,r9 // target line -2
li r10,-4-32-32
dcbf r10,r9 // target line -3
li r10,-4-32-32-32
dcbf r10,r9 // target line -4
//
bdnz lab90 // End of main loop
srawi. r8,r5,2 // Check if more than 4 bytes left to move
bne+ lab96 // More than 4 bytes -> continue to move by 4 byte unit
addi r4,r4,4 // Less than 4 bytes -> adjust pointer
addi r9,r9,4 // and proceed to lastmv
b lastmv1
lab95: srawi. r8,r5,2 // Check if more than 4 bytes left to move
beq lastmv1 // No -> just proceed to lastmv
addi r4,r4,-4 // Adjust source pointer to make update word access
addi r9,r9,-4 // Adjust target pointer to make update word access
lab96: lwzu r11,4(r4)
stwu r11,4(r9)
addic. r8,r8,-1
bne lab96
addi r4,r4,4 // Adjust source and target pointer
addi r9,r9,4 // to point the next byte to move
#else // NEWMETHOD
wdalgn1:
#if TGTTOUCH
cmplwi r5,MINLENGTH // Less than MINLENGTH bytes?
blt lab95 // Yes -> do 4 bytes unit move
subf r8,r9,r4 // Check distance between source & target
cmpwi r8,MINDISTANCE // Too close to touch target (may destroy uncopied source)?
blt lab95 // Yes -> do 4 bytes unit move
andis. r8,r6,TTOUCHBIT // Touch source and target cache?
beq lab95 // No -> do 4 bytes unit move (no cache control)
andi. r8,r9,0x1c // r8 <- number of bytes to move to make 32 byte aligned target
addi r9,r9,-4 // Adjust source
addi r4,r4,-4 // and target pointer to make updated access
beq lab85 // Target is 32 bytes aligned -> skip pre-move
subfic r8,r8,32 // r8 <- bytes to move to make 32 byte alignment
subf r5,r8,r5 // Adjust length to move in advance
lab80: lwzu r11,4(r4) // Load next word
stwu r11,4(r9) // Store
addic. r8,r8,-4
bne lab80
lab85: srawi. r8,r5,5 // r8 <- number of 32 bytes units
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
mtctr r8 // Use CTR as a counter for 8 word units to move
#else // TGTTOUCH
srawi. r8,r5,5 // r8 <- number of 32 bytes units
beq lab95 // less than 32 bytes to move
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
addi r9,r9,-4 // Adjust source
addi r4,r4,-4 // and target pointer to make updated access
mtctr r8 // Use CTR as a counter for 8 word units to move
#endif // TGTTOUCH
li r10,-28
b lab91
lab90:
dcbf r10,r4 // Flush previous source cache
lab91:
T_TOUCH // Touch next target cache line
lwzu r11,4(r4) // Load and store 8 times (32 bytes)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
lwzu r11,4(r4)
lwzu r12,4(r4)
stwu r11,4(r9)
stwu r12,4(r9)
bdnz lab90 // End of main loop
srawi. r8,r5,2 // Check if more than 4 bytes left to move
bne+ lab96 // More than 4 bytes -> continue to move by 4 byte unit
addi r4,r4,4 // Less than 4 bytes -> adjust pointer
addi r9,r9,4 // and proceed to lastmv
b lastmv1
lab95: srawi. r8,r5,2 // Check if more than 4 bytes left to move
beq lastmv1 // No -> just proceed to lastmv
addi r4,r4,-4 // Adjust source pointer to make update word access
addi r9,r9,-4 // Adjust target pointer to make update word access
lab96: lwzu r11,4(r4)
stwu r11,4(r9)
addic. r8,r8,-1
bne lab96
addi r4,r4,4 // Adjust source and target pointer
addi r9,r9,4 // to point the next byte to move
#endif // NEWMETHOD
//
// Final process -> move remaining bytes up tp 3 bytes
//
#if TGTFLUSH || SRCFLUSH
lastmv1:
andi. r8,r5,0x3 // Get length reminder of 4 in r8
beq exit10 // No more byte to move -> exit1 to flush cache
lbz r10,0(r4)
stb r10,0(r9)
cmpwi r8,2
blt exit10
lbz r10,1(r4)
stb r10,1(r9)
beq exit10
lbz r10,2(r4)
stb r10,2(r9)
exit10:
#if TGTFLUSH
andis. r10,r6,TFLUSHBIT // Need to flush target cache?
beq exit15 // No -> check source flush
add r9,r9,r8 // r9 <- pointing to after last stored byte
andi. r10,r6,0xffff // r10 <- length to flush
beq exit
subf r10,r10,r9 // r10 <- pointing to the first byte
rlwinm r10,r10,0,0,26 // r10 <- 32 byte aligned start address
addi r9,r9,-1 // r9 <- pointing to the last byte
rlwinm r9,r9,0,0,26 // r9 <- 32 byte aligned end address
flush10:
dcbf 0,r9 // Flush cached data
addi r9,r9,-32
cmplw r9,r10 // Exceeding end address?
bge flush10
exit15:
#endif // TGTFLUSH
#if SRCFLUSH
andis. r10,r6,SFLUSHBIT // Need to flush source cache?
beq exit // No -> exit
add r4,r4,r8 // r4 <- pointing to after last source byte
andi. r10,r6,0xffff // r10 <- length to flush
beq exit
subf r10,r10,r4 // r10 <- pointing to the first byte
rlwinm r10,r10,0,0,26 // r10 <- 32 byte aligned start address
addi r4,r4,-1 // r4 <- pointing to the last byte
rlwinm r4,r4,0,0,26 // r4 <- 32 byte aligned end address
flush15:
dcbf 0,r4 // Flush cached data
addi r4,r4,-32
cmplw r4,r10 // Exceeding end address?
bge flush15
b exit
#endif // SRCFLUSH
#else // TGTFLUSH || SRCFLUSH
lastmv1:
andi. r8,r5,0x3 // Get length reminder of 4 in r8
beq exit // No more byte to move -> exit
lbz r10,0(r4)
stb r10,0(r9)
cmpwi r8,2
blt exit
lbz r10,1(r4)
stb r10,1(r9)
beq exit
lbz r10,2(r4)
stb r10,2(r9)
b exit
#endif // TGTFLUSH || SRCFLUSH
//
// SRC address is lower --> Move from end to top
//
srclow: add r9,r9,r5 // End target pointer
add r4,r4,r5 // End source pointer
cmplwi r5,4 // Less than 4 bytes?
blt lastmv2 // YES -> do special short move
andi. r8,r9,0x3 // TGT word alignment check
beq lab115 // Word aligned target -> proceed without adjustment
lbz r10,-1(r4)
stb r10,-1(r9)
cmpwi r8,2
blt lab110
lbz r10,-2(r4)
stb r10,-2(r9)
beq lab110
lbz r10,-3(r4)
stb r10,-3(r9)
lab110: subf r4,r8,r4 // Adjust source pointer
subf r9,r8,r9 // target pointer
subf r5,r8,r5 // and length
lab115: li r7,-4 // Source cache touch offset
andi. r8,r4,0x03 // SRC and TGT aligned check
beq+ wdalgn2 // Word aligned -> easy move
cmpwi r8,2 // Half word aligned?
beq+ hwalgn2 // Yes -> half word align move
bgt lftsft2 // No -> check shift direction
//
// Case1: Need 1 byte right shift (or 3 bytes left shift)
//
#if TGTTOUCH
cmplwi r5,MINLENGTH // Less than MINLENGTH bytes?
blt lab135 // Yes -> do 4 bytes unit move
subf r8,r4,r9 // Check distance between source & target
cmpwi r8,MINDISTANCE // Too close to touch target (may destroy uncopied source)?
blt lab135 // Yes -> do 4 bytes unit move
andis. r8,r6,TTOUCHBIT // Touch source and target cache?
beq lab135 // No -> do 4 bytes unit move (no cache control)
andi. r8,r9,0x1c // r8 <- number of bytes to move to make 32 byte aligned target
lbzu r10,-1(r4) // Load first byte and
beq lab125 // Target is 32 bytes aligned -> skip pre-move
subf r5,r8,r5 // Adjust length to move in advance
lab120: rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
lwzu r10,-4(r4) // Load preceeding word
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
stwu r11,-4(r9) // Store r11
addic. r8,r8,-4
bne lab120
lab125: srawi. r8,r5,5 // r8 <- number of 32 bytes units
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
mtctr r8 // Use CTR as a counter for 8 word units to move
#else // TGTTOUCH
srawi. r8,r5,5 // r8 <- number of 32 bytes units
beq lab135 // less than 32 bytes to move
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
lbzu r10,-1(r4) // Load first byte and
mtctr r8 // Use CTR as a counter for 8 word units to move
#endif // TGTTOUCH
lab130:
T_TOUCH // Touch next target cache line
rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
lwzu r12,-4(r4) // Load preceeding
lwzu r10,-4(r4) // two words in r12 & r10
rlwimi r11,r12,24,8,31 // Insert MS 3 bytes in r12 to LS 3 bytes in r11
rlwinm r12,r12,24,0,7 // Move LS 1 byte in r12 to MS position
rlwimi r12,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r12
stwu r11,-4(r9) // Store r11
stwu r12,-4(r9) // Store r12
rlwinm r11,r10,24,0,7 // Repeat this 4 times to process 32 bytes
lwzu r12,-4(r4)
lwzu r10,-4(r4)
rlwimi r11,r12,24,8,31
rlwinm r12,r12,24,0,7
rlwimi r12,r10,24,8,31
stwu r11,-4(r9)
stwu r12,-4(r9)
rlwinm r11,r10,24,0,7
lwzu r12,-4(r4)
lwzu r10,-4(r4)
rlwimi r11,r12,24,8,31
rlwinm r12,r12,24,0,7
rlwimi r12,r10,24,8,31
stwu r11,-4(r9)
stwu r12,-4(r9)
rlwinm r11,r10,24,0,7
lwzu r12,-4(r4)
lwzu r10,-4(r4)
rlwimi r11,r12,24,8,31
rlwinm r12,r12,24,0,7
rlwimi r12,r10,24,8,31
stwu r11,-4(r9)
stwu r12,-4(r9)
bdnz lab130 // End of main loop
srawi. r8,r5,2 // Check if more than 4 bytes left to move
bne+ lab136 // More than 4 bytes -> continue to move by 4 byte unit
addi r4,r4,1 // Less than 4 bytes -> adjust pointer
b lastmv2
lab135: srawi. r8,r5,2 // Check if more than 4 bytes left to move
beq lastmv2 // No -> just proceed to lastmv
lbzu r10,-1(r4) // Load first byte
lab136: rlwinm r11,r10,24,0,7 // Move LS 1 bytes in r10 to MS byte in r11
lwzu r10,-4(r4) // Load preceeding word
rlwimi r11,r10,24,8,31 // Insert MS 3 bytes in r10 to LS 3 bytes in r11
stwu r11,-4(r9) // Store r11
addic. r8,r8,-1
bne lab136
addi r4,r4,1 // Adjust source pointer
b lastmv2 // then proceed to lastmv
//
// Case2: Need 1 byte left shift
//
lftsft2:
#if TGTTOUCH
cmplwi r5,MINLENGTH // Less than MINLENGTH bytes?
blt lab155 // Yes -> do 4 bytes unit move
subf r8,r4,r9 // Check distance between source & target
cmpwi r8,MINDISTANCE // Too close to touch target (may destroy uncopied source)?
blt lab155 // Yes -> do 4 bytes unit move
andis. r8,r6,TTOUCHBIT // Touch source and target cache?
beq lab155 // No -> do 4 bytes unit move (no cache control)
andi. r8,r9,0x1c // r8 <- number of bytes to move to make 32 byte aligned target
addi r4,r4,1 // Adjust source pointer to make update word access
lwzu r10,-4(r4) // Load needed three bytes in LS r10
beq lab145 // Target is 32 bytes aligned -> skip pre-move
subf r5,r8,r5 // Adjust length to move in advance
lab140: rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
lwzu r10,-4(r4) // Load preceeding word
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
stwu r11,-4(r9) // Store r11
addic. r8,r8,-4
bne lab140
lab145: srawi. r8,r5,5 // r8 <- number of 32 bytes units
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
mtctr r8 // Use CTR as a counter for 8 word units to move
#else // TGTTOUCH
srawi. r8,r5,5 // r8 <- number of 32 bytes units
beq lab155 // less than 32 bytes to move
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
addi r4,r4,1 // Adjust source pointer to make update word access
lwzu r10,-4(r4) // Load needed three bytes in LS r10
mtctr r8 // Use CTR as a counter for 8 word units to move
#endif // TGTTOUCH
lab150:
T_TOUCH // Touch next target cache line
rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
lwzu r12,-4(r4) // Load preceeding
lwzu r10,-4(r4) // two words in r12 & r10
rlwimi r11,r12,8,24,31 // Insert MS 1 bytes in r12 to LS 1 byte in r11
rlwinm r12,r12,8,0,23 // Move LS 3 byte in r12 to MS position
rlwimi r12,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r12
stwu r11,-4(r9) // Store r11
stwu r12,-4(r9) // Store r12
rlwinm r11,r10,8,0,23 // Repeat this 4 times to process 32 bytes
lwzu r12,-4(r4)
lwzu r10,-4(r4)
rlwimi r11,r12,8,24,31
rlwinm r12,r12,8,0,23
rlwimi r12,r10,8,24,31
stwu r11,-4(r9)
stwu r12,-4(r9)
rlwinm r11,r10,8,0,23
lwzu r12,-4(r4)
lwzu r10,-4(r4)
rlwimi r11,r12,8,24,31
rlwinm r12,r12,8,0,23
rlwimi r12,r10,8,24,31
stwu r11,-4(r9)
stwu r12,-4(r9)
rlwinm r11,r10,8,0,23
lwzu r12,-4(r4)
lwzu r10,-4(r4)
rlwimi r11,r12,8,24,31
rlwinm r12,r12,8,0,23
rlwimi r12,r10,8,24,31
stwu r11,-4(r9)
stwu r12,-4(r9)
bdnz lab150 // End of main loop
srawi. r8,r5,2 // Check if more than 4 bytes left to move
bne+ lab156 // More than 4 bytes -> continue to move by 4 byte unit
addi r4,r4,3 // Less than 4 bytes -> adjust pointer
b lastmv2
lab155: srawi. r8,r5,2 // Check if more than 4 bytes left to move
beq lastmv2 // No -> just proceed to lastmv
addi r4,r4,1 // Adjust source pointer to make update word access
lwzu r10,-4(r4) // Load needed three bytes in MS r10
lab156: rlwinm r11,r10,8,0,23 // Move LS 3 bytes in r10 to MS bytes in r11
lwzu r10,-4(r4) // Load preceeding word
rlwimi r11,r10,8,24,31 // Insert MS 1 bytes in r10 to LS 1 byte in r11
stwu r11,-4(r9) // Store r11
addic. r8,r8,-1
bne lab156
addi r4,r4,3 // Adjust source pointer
b lastmv2
//
// Case3: Need 2 byte shift
//
hwalgn2:
#if TGTTOUCH
cmplwi r5,MINLENGTH // Less than MINLENGTH bytes?
blt lab175 // Yes -> do 4 bytes unit move
subf r8,r4,r9 // Check distance between source & target
cmpwi r8,MINDISTANCE // Too close to touch target (may destroy uncopied source)?
blt lab175 // Yes -> do 4 bytes unit move
andis. r8,r6,TTOUCHBIT // Touch source and target cache?
beq lab175 // No -> do 4 bytes unit move (no cache control)
andi. r8,r9,0x1c // r8 <- number of bytes to move to make 32 byte aligned target
lhzu r10,-2(r4) // Load needed two bytes in r11
beq lab165 // Target is 32 bytes aligned -> skip pre-move
subf r5,r8,r5 // Adjust length to move in advance
lab160: rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
lwzu r10,-4(r4) // Load preceeding word in r10
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
stwu r11,-4(r9) // Store r11
addic. r8,r8,-4
bne lab160
lab165: srawi. r8,r5,5 // r8 <- number of 32 bytes units
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
mtctr r8 // Use CTR as a counter for 8 word units to move
#else // TGTTOUCH
srawi. r8,r5,5 // r8 <- number of 32 bytes units
beq lab175 // less than 32 bytes to move
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
lhzu r10,-2(r4) // Load needed two bytes in r11
mtctr r8 // Use CTR as a counter for 8 word units to move
#endif // TGTTOUCH
lab170:
T_TOUCH // Touch next target cache line
rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
lwzu r12,-4(r4) // Load preceeding two word in r12
lwzu r10,-4(r4) // and r10
rlwimi r11,r12,16,16,31 // Insert MS 2 bytes in r12 to LS 2 bytes in r11
rlwinm r12,r12,16,0,15 // Move LS 2 bytes in r12 to MS position
rlwimi r12,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r12
stwu r11,-4(r9) // Store r11
stwu r12,-4(r9) // and r12
rlwinm r11,r10,16,0,15 // Repeat this 4 times to process 32 bytes
lwzu r12,-4(r4)
lwzu r10,-4(r4)
rlwimi r11,r12,16,16,31
rlwinm r12,r12,16,0,15
rlwimi r12,r10,16,16,31
stwu r11,-4(r9)
stwu r12,-4(r9)
rlwinm r11,r10,16,0,15
lwzu r12,-4(r4)
lwzu r10,-4(r4)
rlwimi r11,r12,16,16,31
rlwinm r12,r12,16,0,15
rlwimi r12,r10,16,16,31
stwu r11,-4(r9)
stwu r12,-4(r9)
rlwinm r11,r10,16,0,15
lwzu r12,-4(r4)
lwzu r10,-4(r4)
rlwimi r11,r12,16,16,31
rlwinm r12,r12,16,0,15
rlwimi r12,r10,16,16,31
stwu r11,-4(r9)
stwu r12,-4(r9)
bdnz lab170 // End of main loop
srawi. r8,r5,2 // Check if more than 4 bytes left to move
bne+ lab176 // More than 4 bytes -> continue to move by 4 byte unit
addi r4,r4,2 // No -> adjust pointer and proceed to lastmv
b lastmv2
lab175: srawi. r8,r5,2 // Check if more than 4 bytes left to move
beq lastmv2 // No -> just proceed to lastmv
lhzu r10,-2(r4) // Load needed two bytes in r11
lab176: rlwinm r11,r10,16,0,15 // Move LS 2 bytes in r10 to MS 2 bytes in r11
lwzu r10,-4(r4) // Load preceeding word in r10
rlwimi r11,r10,16,16,31 // Insert MS 2 bytes in r10 to LS 2 bytes in r11
stwu r11,-4(r9) // Store r11
addic. r8,r8,-1
bne lab176
addi r4,r4,2 // Adjust source pointer
b lastmv2 // then proceed to lastmv
//
// Case4: No need for shift (source & target aligned)
//
wdalgn2:
#if TGTTOUCH
cmplwi r5,MINLENGTH // Less than MINLENGTH bytes?
blt lab195 // Yes -> do 4 bytes unit move
subf r8,r4,r9 // Check distance between source & target
cmpwi r8,MINDISTANCE // Too close to touch target (may destroy uncopied source)?
blt lab195 // Yes -> do 4 bytes unit move
andis. r8,r6,TTOUCHBIT // Touch source and target cache?
beq lab195 // No -> do 4 bytes unit move (no cache control)
andi. r8,r9,0x1c // r8 <- number of bytes to move to make 32 byte aligned target
beq lab185 // Target is 32 bytes aligned -> skip pre-move
subf r5,r8,r5 // Adjust length to move in advance
lab180: lwzu r11,-4(r4)
stwu r11,-4(r9)
addic. r8,r8,-4
bne lab180
lab185: srawi. r8,r5,5 // r8 <- number of 32 bytes units
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
mtctr r8 // Use CTR as a counter for 8 word units to move
#else // TGTTOUCH
srawi. r8,r5,5 // r8 <- number of 32 bytes units
beq lab195 // less than 32 bytes to move
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
mtctr r8 // Use CTR as a counter for 8 word units to move
#endif // TGTTOUCH
lab190:
T_TOUCH // Touch next target cache line
lwzu r11,-4(r4) // Load and store 8 times (32 bytes)
lwzu r12,-4(r4)
stwu r11,-4(r9)
stwu r12,-4(r9)
lwzu r11,-4(r4)
lwzu r12,-4(r4)
stwu r11,-4(r9)
stwu r12,-4(r9)
lwzu r11,-4(r4)
lwzu r12,-4(r4)
stwu r11,-4(r9)
stwu r12,-4(r9)
lwzu r11,-4(r4)
lwzu r12,-4(r4)
stwu r11,-4(r9)
stwu r12,-4(r9)
bdnz lab190 // End of main loop
srawi. r8,r5,2 // Check if more than 4 bytes left to move
bne+ lab196 // More than 4 bytes -> continue to move by 4 byte unit
b lastmv2
lab195: srawi. r8,r5,2 // Check if more than 4 bytes left to move
beq lastmv2 // No -> just proceed to lastmv
lab196: lwzu r11,-4(r4)
stwu r11,-4(r9)
addic. r8,r8,-1
bne lab196
//
// Final process -> move remaining bytes up tp 3 bytes
//
#if TGTFLUSH || SRCFLUSH
lastmv2:
andi. r8,r5,0x3 // Get length reminder of 4 in r8
beq exit20 // No more byte to move -> exit1 to flush cache
lbz r10,-1(r4)
stb r10,-1(r9)
cmpwi r8,2
blt exit20
lbz r10,-2(r4)
stb r10,-2(r9)
beq exit20
lbz r10,-3(r4)
stb r10,-3(r9)
exit20:
#if TGTFLUSH
andis. r10,r6,TFLUSHBIT // Need to flush target cache?
beq exit25 // No -> check source flush
subf r9,r8,r9 // r9 <- pointing to the first byte
andi. r10,r6,0xffff // r10 <- length to flush
beq exit
add r10,r9,r10 // r10 <- pointing one byte after last
addi r10,r10,-1 // r10 <- pointing to the last byte
rlwinm r10,r10,0,0,26 // r10 <- 32 byte aligned end address
rlwinm r9,r9,0,0,26 // r9 <- 32 byte aligned start address
flush20:
dcbf 0,r9 // Flush cached data
addi r9,r9,32
cmplw r9,r10 // Exceeding end address?
ble flush20
exit25:
#endif // TGTFLUSH
#if SRCFLUSH
andis. r10,r6,SFLUSHBIT // Need to flush source cache?
beq exit // No -> exit
subf r4,r8,r4 // r4 <- pointing to the first byte
andi. r10,r6,0xffff // r10 <- length to flush
beq exit
add r10,r4,r10 // r10 <- pointing one byte after last
addi r10,r10,-1 // r10 <- pointing to the last byte
rlwinm r10,r10,0,0,26 // r10 <- 32 byte aligned end address
rlwinm r4,r4,0,0,26 // r4 <- 32 byte aligned start address
flush25:
dcbf 0,r4 // Flush cached data
addi r4,r4,32
cmplw r4,r10 // Exceeding end address?
ble flush25
#endif // SRCFLUSH
#else // TGTFLUSH || SRCFLUSH
lastmv2:
andi. r8,r5,0x3 // Get length reminder of 4 in r8
beq exit // No more bytes to move -> return
lbz r10,-1(r4)
stb r10,-1(r9)
cmpwi r8,2
blt exit
lbz r10,-2(r4)
stb r10,-2(r9)
beq exit
lbz r10,-3(r4)
stb r10,-3(r9)
#endif // TGTFLUSH
exit:
LEAF_EXIT(memcpy2)
//
LEAF_ENTRY(memset2)
//
// Input Parameters:
// r3: Target address (unchanged for return value)
// r4: Byte data to set
// r5: Set length in bytes
// r6: Cache flush flag
// bit 1 (TFLUSHBIT): Target Flush flag 0:No Flush, 1:Flush
// bit 2 (TTOUCHBIT): Source and Target touch flag 0:No Touch, 1:Touch
//
// Register usage:
// r7: Cache touch offset
// r8: Temporary work (local loop counter, etc.)
// r9: Target address
// r10: Expanded data
// r11: Work register
// r12: Used to keep size to flush (move length or MAXFLUSH whichever smaller)
// CTR: Used for loop counter
//
// Restrictions:
// If the target memory is NON-Cachable, set TTOUCHBIT and TFLUSHBIT to zero.
//
and. r5,r5,r5 // Any bytes to set?
beq exits // No -> exit
mr r12,r5 // Keep length in r12
cmplwi r5,MAXFLUSH // Size exceeds maximum L1 cache size?
ble labs05 // No -> Flush size is same as original length (now in r12)
li r12,0
ori r12,r12,MAXFLUSH // Yes -> Use MAXFLUSH
labs05:
mr r9,r3 // Move target address to r9 (to return r3 unchanged)
cmplwi r5,4 // Less than 4 bytes?
blt lastset // YES -> do special short move
andi. r8,r9,0x3 // TGT word alignment check
beq labs15 // Word aligned target -> proceed without adjustment
subfic r8,r8,4 // Not word aligned -> move unaligned bytes first
stb r4,0(r9)
cmpwi r8,2
blt labs10
stb r4,1(r9)
beq labs10
stb r4,2(r9)
labs10: add r9,r9,r8 // Update target pointer
subf r5,r8,r5 // and length
labs15: li r7,4 // Cache touch offset
li r8,8 // Amount of shift
slw r10,r4,r8 // r10 <- r4<<8
or r11,r10,r4 // r11 <- r10 | r4
li r8,16
slw r10,r11,r8 // r10 <- r11<<16
or r10,r10,r11 // r10 <- Four repeated byte of LS r4
cmplwi r5,MINLENGTH // Less than MINLENGTH bytes?
blt labs95 // Yes -> do 4 bytes unit move
andis. r8,r6,TTOUCHBIT // Touch source and target cache?
beq labs95 // No -> do 4 bytes unit move (no cache control)
andi. r8,r9,0x1c // r8 <- number of bytes to move to make 32 byte aligned target
addi r9,r9,-4 // Adjust target pointer to make updated access
beq labs85 // Target is 32 bytes aligned -> skip pre-move
subfic r8,r8,32 // r8 <- bytes to move to make 32 byte alignment
subf r5,r8,r5 // Adjust length to move in advance
labs80: stwu r10,4(r9) // Store
addic. r8,r8,-4
bne labs80
labs85: srawi. r8,r5,5 // r8 <- number of 32 bytes units
rlwinm r5,r5,0,27,31 // r5 <- remaining length to move after this loop is done
mtctr r8 // Use CTR as a counter for 8 word units to move
and. r10,r10,r10 // Storing zero?
bne labs90 // No -> need to store
labs86: // Yes -> Just "dcbz" target
dcbz r7,r9
addi r9,r9,32
bdnz labs86
b labs94
labs90:
dcbz r7,r9 // Touch next target cache line
stwu r10,4(r9)
stwu r10,4(r9)
stwu r10,4(r9)
stwu r10,4(r9)
stwu r10,4(r9)
stwu r10,4(r9)
stwu r10,4(r9)
stwu r10,4(r9)
bdnz labs90 // End of main loop
labs94:
srawi. r8,r5,2 // Check if more than 4 bytes left to move
bne+ labs96 // More than 4 bytes -> continue to move by 4 byte unit
addi r9,r9,4 // Less than 4 bytes -> update pointer and proceed to lastset
b lastset
labs95: srawi. r8,r5,2 // Check if more than 4 bytes left to move
beq lastset // No -> just proceed to lastmv
addi r9,r9,-4 // Adjust target pointer to make update word access
labs96: stwu r10,4(r9)
addic. r8,r8,-1
bne labs96
addi r9,r9,4 // Adjust target pointer to point the next byte to set
//
// Final process -> store remaining bytes up tp 3 bytes
//
lastset:
andi. r8,r5,0x3 // Get length reminder of 4 in r8
beq exits1 // No more byte to move -> exit1 to flush cache
stb r4,0(r9)
cmpwi r8,2
blt exits1
stb r4,1(r9)
beq exits1
stb r4,2(r9)
exits1:
andis. r10,r6,TFLUSHBIT // Need to flush target cache?
beq exits // No -> just exit
add r9,r9,r8 // r9 <- pointing to after last stored byte
subf r10,r12,r9 // r10 <- pointing to the first byte
rlwinm r10,r10,0,0,26 // r10 <- 32 byte aligned start address
addi r9,r9,-1 // r9 <- pointing to the last byte
rlwinm r9,r9,0,0,26 // r9 <- 32 byte aligned end address
flushs:
dcbf 0,r9 // Flush cached data
addi r9,r9,-32
cmplw r9,r10 // Exceeding end address?
bge flushs
exits:
LEAF_EXIT(memset2)
//
LEAF_ENTRY(flush)
//
// Input Parameters:
// r3: Target address
// r4: Area length
//
add r5,r3,r4
addi r5,r5,-1 // r5 <- end address
rlwinm r3,r3,0,0,26 // r3 <- 32 byte aligned start line
rlwinm r5,r5,0,0,26 // r5 <- 32 byte aligned last line
floop:
dcbf 0,r3 // Flush cached data
addi r3,r3,32
cmplw r3,r5 // Exceeding end address?
ble floop
LEAF_EXIT(flush)
//
LEAF_ENTRY(memcmp2)
li r10,0
loopx2: lbzx r9,r10,r4
lbzx r8,r10,r3
cmp 0,0,r8,r9
bne exitlpx2
addi r10,r10,1
cmp 0,0,r10,r5
bne loopx2
li r3,-1
b exitx2
exitlpx2:
mr r3,r10
exitx2:
LEAF_EXIT(memcmp2)
//
LEAF_ENTRY(noop)
LEAF_EXIT(noop)