Windows NT 4.0 source code leak
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

848 lines
32 KiB

// TITLE("Pattern Tiler")
//++
//
// Copyright (c) 1992 Microsoft Corporation
//
// Module Name:
//
// tiler.s
//
// Abstract:
//
// This module implements code to copy a pattern to a target surface.
//
// N.B. The code is written to optimally write to a frame buffer display
// surface. This means there is an occasional movement of data to
// floating point registers so that 8-byte writes to the display
// can be performed.
//
// Author:
//
// Donald Sidoroff (donalds) 2-Feb-1992
//
// Rewritten by:
//
// David N. Cutler (davec) 4-May-1992
//
// Environment:
//
// User mode only.
//
// Revision History:
//
//--
#include "ksmips.h"
#include "gdimips.h"
.extern Gdip64bitDisabled 4
SBTTL("rop P, Aligned")
//++
//
// VOID
// vFetchAndCopy (
// IN PFETCHFRAME pff
// )
//
// Routine Description:
//
// This routine repeatedly tiles one scan line of an aligned pattern.
//
// Arguments:
//
// pff (a0) - Supplies a pointer to a fetch frame.
//
// Return Value:
//
// None.
//
//--
LEAF_ENTRY(vCopyPattern)
ALTERNATE_ENTRY(vFetchAndCopy)
lw t0,ff_pvTrg(a0) // get starting target address
lw t1,ff_pvPat(a0) // get base pattern address
lw t2,ff_xPat(a0) // get pattern offset in bytes
lw t3,ff_cxPat(a0) // get fill count in longwords
lw t4,ff_culFill(a0) // compute ending target address
sll a1,t4,2 // a1 = fill size in bytes
addu t4,a1,t0 // t4 = ending target address
addu t5,t2,t1 // compute current pattern address
subu v0,t3,8 // check if pattern is exactly 8 bytes
bne zero,v0,10f // if ne, pattern is not 8 bytes
lw v0,0(t5) // get low part of 8-byte pattern
lw v1,4(t5) // get high part of 8-byte pattern
beq zero,t2,CopyPattern // if eq, zero offset value
lw v1,0(t1) // nonzero offset - high word wraps to pattern base
b CopyPattern // finish in common code
//
// The pattern is not 8 bytes in width or cannot be moved 8 bytes at a time.
//
// N.B. noreorder is in effect below - the instruction immediately after
// each branch is in the branch delay slot and executes whether or not
// the branch is taken.
//
.set noreorder
.set noat
10: lw v0,0(t5) // get 4-byte pattern value
addu t3,t3,t1 // compute ending pattern address
20: addu t0,t0,4 // advance target pointer
sw v0,-4(t0) // store pattern in target
beq t0,t4,30f // if eq, end of target
addu t5,t5,4 // advance pixel offset (delay slot)
subu t6,t5,t3 // check if at end of pattern
bne zero,t6,20b // if ne, not at end of pattern
lw v0,0(t5) // get 4-byte pattern value (delay slot)
move t5,t1 // wrap to starting pattern address
b 20b //
lw v0,0(t5) // reload first pattern word (delay slot)
.set at
.set reorder
30: j ra // return
SBTTL("rop P, Unaligned")
//++
//
// VOID
// vFetchShiftAndCopy (
// IN PFETCHFRAME pff
// )
//
// Routine Description:
//
// This routine repeatedly tiles one line of an unaligned pattern
// using rop (P).
//
// Arguments:
//
// pff (a0) - Supplies a pointer to a fetch frame.
//
// Return Value:
//
// None.
//
//--
ALTERNATE_ENTRY(vFetchShiftAndCopy)
lw t0,ff_pvTrg(a0) // get starting target address
lw t1,ff_pvPat(a0) // get base pattern address
lw t2,ff_xPat(a0) // get pattern offset in bytes
lw t3,ff_cxPat(a0) // get pattern width in pixels
lw t4,ff_culFill(a0) // compute ending target address
sll a1,t4,2 // a1 = fill size in bytes
addu t4,a1,t0 // t4 = ending target address
addu t5,t2,t1 // compute current pattern address
subu v0,t3,8 // check if pattern is exactly 8 bytes
bne zero,v0,10f // if ne, pattern is not 8 bytes
lwr v0,0(t5) // get low part of 8-byte pattern
lwl v0,3(t5) // (unaligned load via lwr/lwl pair)
lwr v1,4(t5) // get high part of 8-byte pattern
lwl v1,3 - 4(t5) // high bytes wrap back to start of pattern
b CopyPattern // finish in common code
//
// The pattern is not 8 bytes in width or cannot be moved 8 bytes at a time.
//
// N.B. noreorder is in effect below - the instruction immediately after
// each branch is in the branch delay slot and executes whether or not
// the branch is taken.
//
.set noreorder
.set noat
10: lwr v0,0(t5) // get low bytes of pattern
lwl v0,3(t5) // get high bytes of pattern
addu t0,t0,4 // advance target pointer
sw v0,-4(t0) // store pattern in target
beq t0,t4,20f // if eq, end of target
addu t2,t2,4 // advance pixel offset (delay slot)
subu t6,t2,t3 // check if at end of pattern
bltz t6,10b // if ltz, not at end of pattern
addu t5,t2,t1 // compute address of pattern (delay slot)
move t2,t6 // wrap - set residual offset in pattern
b 10b //
addu t5,t2,t1 // compute address of pattern (delay slot)
.set at
.set reorder
20: j ra // return
SBTTL("rop Pn, Aligned")
//++
//
// VOID
// vFetchNotAndCopy (
// IN PFETCHFRAME pff
// )
//
// Routine Description:
//
// This routine repeatedly tiles one line of an aligned pattern.
//
// Arguments:
//
// pff (a0) - Supplies a pointer to a fetch frame.
//
// Return Value:
//
// None.
//
//--
ALTERNATE_ENTRY(vFetchNotAndCopy)
lw t0,ff_pvTrg(a0) // get starting target address
lw t1,ff_pvPat(a0) // get base pattern address
lw t2,ff_xPat(a0) // get pattern offset in bytes
lw t3,ff_cxPat(a0) // get pattern width in pixels
lw t4,ff_culFill(a0) // compute ending target address
sll a1,t4,2 // a1 = fill size in bytes
addu t4,a1,t0 // t4 = ending target address
addu t5,t2,t1 // compute current pattern address
subu v0,t3,8 // check if pattern is exactly 8 bytes
bne zero,v0,20f // if ne, pattern is not 8 bytes
lw v0,0(t5) // get low part of 8-byte pattern
lw v1,4(t5) // get high part of 8-byte pattern
beq zero,t2,10f // if eq, zero offset value
lw v1,0(t1) // nonzero offset - high word wraps to pattern base
10: nor v0,v0,zero // complement pattern (rop Pn)
nor v1,v1,zero //
b CopyPattern // finish in common code
//
// The pattern is not 8 bytes in width or cannot be moved 8 bytes at a time.
//
// N.B. noreorder is in effect below - the instruction immediately after
// each branch is in the branch delay slot and executes whether or not
// the branch is taken.
//
.set noreorder
.set noat
20: lw v0,0(t5) // get 4-byte pattern value
addu t3,t3,t1 // compute ending pattern address
30: addu t0,t0,4 // advance target pointer
nor v0,v0,zero // complement pattern
sw v0,-4(t0) // store pattern in target
beq t0,t4,40f // if eq, end of target
addu t5,t5,4 // advance pattern address (delay slot)
subu t6,t5,t3 // check if at end of pattern
bne zero,t6,30b // if ne, not at end of pattern
lw v0,0(t5) // get 4-byte pattern value (delay slot)
move t5,t1 // wrap to starting pattern address
b 30b //
lw v0,0(t5) // reload first pattern word (delay slot)
.set at
.set reorder
40: j ra // return
SBTTL("rop Pn, Unaligned")
//++
//
// VOID
// vFetchShiftNotAndCopy (
// IN PFETCHFRAME pff
// )
//
// Routine Description:
//
// This routine repeatedly tiles one line of an unaligned pattern
// using rop (Pn).
//
// Arguments:
//
// pff (a0) - Supplies a pointer to a fetch frame.
//
// Return Value:
//
// None.
//
//--
ALTERNATE_ENTRY(vFetchShiftNotAndCopy)
lw t0,ff_pvTrg(a0) // get starting target address
lw t1,ff_pvPat(a0) // get base pattern address
lw t2,ff_xPat(a0) // get pattern offset in bytes
lw t3,ff_cxPat(a0) // get pattern width in pixels
lw t4,ff_culFill(a0) // compute ending target address
sll a1,t4,2 // a1 = fill size in bytes
addu t4,a1,t0 // t4 = ending target address
addu t5,t2,t1 // compute current pattern address
subu v0,t3,8 // check if pattern is exactly 8 bytes
bne zero,v0,10f // if ne, pattern is not 8 bytes
lwr v0,0(t5) // get low part of 8-byte pattern
lwl v0,3(t5) // (unaligned load via lwr/lwl pair)
lwr v1,4(t5) // get high part of 8-byte pattern
lwl v1,3 - 4(t5) // high bytes wrap back to start of pattern
nor v0,v0,zero // complement pattern (rop Pn)
nor v1,v1,zero //
b CopyPattern // finish in common code
//
// The pattern is not 8 bytes in width or cannot be moved 8 bytes at a time.
//
// N.B. noreorder is in effect below - the instruction immediately after
// each branch is in the branch delay slot and executes whether or not
// the branch is taken.
//
.set noreorder
.set noat
10: lwr v0,0(t5) // get low bytes of pattern
lwl v0,3(t5) // get high bytes of pattern
addu t0,t0,4 // advance target pointer
nor v0,v0,zero // complement pattern
sw v0,-4(t0) // store pattern in target
beq t0,t4,20f // if eq, end of target
addu t2,t2,4 // advance pixel offset (delay slot)
subu t6,t2,t3 // check if at end of pattern
bltz t6,10b // if ltz, not at end of pattern
addu t5,t2,t1 // compute address of pattern (delay slot)
move t2,t6 // wrap - set residual offset in pattern
b 10b //
addu t5,t2,t1 // compute address of pattern (delay slot)
.set at
.set reorder
20: j ra // return
SBTTL("Copy Pattern")
//++
//
// Routine Description:
//
// This routine contains common code for copying an 8-byte pattern to
// a target surface.
//
// Arguments:
//
// a1 - Supplies the size of the fill in bytes.
// v0 and v1 - Supplies the 8-byte pattern to copy (low word in v0,
// high word in v1).
// t0 - Supplies the starting target surface address.
// t4 - Supplies the ending target surface address.
//
// Return Value:
//
// None.
//
//--
CopyPattern: //
//
// If the fill size is not an even multiple of 8 bytes, then move one
// longword and swap the pattern value (so the 8-byte pattern stays
// phase-aligned with the target).
//
and t8,a1,0x4 // check if even multiple of 8 bytes
beq zero,t8,10f // if eq, even multiple of 8 bytes
sw v0,0(t0) // store low 4 bytes of pattern
addu t0,t0,4 // advance target address
subu a1,a1,4 // reduce size of fill operation
beq zero,a1,200f // if eq, no more to move
move t8,v0 // swap 8-byte pattern value
move v0,v1 //
move v1,t8 //
//
// Many system platforms do not support 64 bit access to video memory. For
// these platforms, data is moved 32-bits at a time.
//
10: lbu t7,Gdip64bitDisabled // get 64-bit disable flag
bne zero,t7,140f // if ne, 64-bit access is disabled
//
// If the target buffer is 8-byte aligned, then move the pattern value to
// the target 32 bytes at a time by moving any intervening 8-byte blocks
// first. Otherwise, move a single longword, move any intervening 8-byte
// blocks, move 32-byte blocks, and then move a single longword at the end.
//
and t8,t0,0x4 // isolate target alignment bits
bne zero,t8,70f // if ne, target not aligned
//
// Move 8-byte pattern value to target 32 bytes at a time.
//
// N.B. noreorder is in effect - instructions after branches are delay
// slots. beql/bnel are branch-likely: their delay-slot store executes
// only when the branch is taken.
//
.set noreorder
.set noat
dsll v0,v0,32 // pack 8-byte pattern into 64-bit v0
dsrl v0,v0,32 // (zero-extend low word)
dsll v1,v1,32 // (shift high word into place)
or v0,v0,v1 //
and t8,a1,0x18 // check if even multiple of 32 bytes
beq zero,t8,30f // if eq, even multiple of 32 bytes
subu t4,t4,32 // compute ending segment address
subu a1,a1,t8 // reduce size of fill operation
beq zero,a1,40f // if eq, only alignment part to move
addu t0,t0,t8 // advance target address
xor t8,t8,0x18 // check if 24 bytes need to be moved
beql zero,t8,20f // if eq, 24 bytes to move
sd v0,-24(t0) // store first 8 bytes of 24 bytes
and t8,t8,0x10 // check if 8 bytes to move
bnel zero,t8,30f // if ne, only 8 bytes to move
sd v0,-8(t0) // store 8-bytes of pattern
20: sd v0,-16(t0) // store last 16 bytes of 16 or 24 bytes
sd v0,-8(t0) //
30: sd v0,0(t0) // store 8 byte pattern value 4 times
sd v0,8(t0) //
sd v0,16(t0) //
sd v0,24(t0) //
bne t0,t4,30b // if ne, more to move
addu t0,t0,32 // advance target address (delay slot)
.set at
.set reorder
j ra // return
.set noreorder
.set noat
40: xor t8,t8,0x18 // check if 24 bytes need to be moved
beql zero,t8,50f // if eq, 24 bytes to move
sd v0,-24(t0) // store first 8 bytes of 24 bytes
and t8,t8,0x10 // check if 8 bytes to move
bnel zero,t8,60f // if ne, only 8 bytes to move
sd v0,-8(t0) // store 8-bytes of pattern
50: sd v0,-16(t0) // store last 16 bytes of 16 or 24 bytes
sd v0,-8(t0) //
.set at
.set reorder
60: j ra // return
//
// Align the target to an 8-byte boundary, move any intervening 8-byte blocks,
// move the pattern to the target 32 bytes at a time, and move the remaining
// longword at the end.
//
70: sw v0,0(t0) // store low 4 bytes of pattern
addu t0,t0,4 // advance target address
subu a1,a1,8 // reduce size of fill
beq zero,a1,120f // if eq, nothing in the middle
.set noreorder
.set noat
dsll v1,v1,32 // pack 8-byte pattern into 64-bit v1
dsrl v1,v1,32 // (zero-extend high word, now low)
dsll v0,v0,32 // (shift low word into place)
or v1,v0,v1 //
and t8,a1,0x18 // check if even multiple of 32 bytes
beq zero,t8,90f // if eq, even multiple of 32 bytes
subu t4,t4,32 + 4 // compute ending segment address (4 extra for alignment word)
subu a1,a1,t8 // reduce size of fill operation
beq zero,a1,100f // if eq, only alignment part to move
addu t0,t0,t8 // advance target address
xor t8,t8,0x18 // check if 24 bytes need to be moved
beql zero,t8,80f // if eq, 24 bytes to move
sd v1,-24(t0) // store first 8 bytes of 24 bytes
and t8,t8,0x10 // check if 8 bytes to move
bnel zero,t8,90f // if ne, only 8 bytes to move
sd v1,-8(t0) // store 8-bytes of pattern
80: sd v1,-16(t0) // store last 16 bytes of 16 or 24 bytes
sd v1,-8(t0) //
90: sd v1,0(t0) // store 8 byte pattern value 4 times
sd v1,8(t0) //
sd v1,16(t0) //
sd v1,24(t0) //
bne t0,t4,90b // if ne, more to move
addu t0,t0,32 // advance target address (delay slot)
.set at
.set reorder
sw v1,0(t0) // store high bytes of pattern
j ra // return
.set noreorder
.set noat
100: xor t8,t8,0x18 // check if 24 bytes need to be moved
beql zero,t8,110f // if eq, 24 bytes to move
sd v1,-24(t0) // store first 8 bytes of 24 bytes
and t8,t8,0x10 // check if 8 bytes to move
bnel zero,t8,120f // if ne, only 8 bytes to move
sd v1,-8(t0) // store 8-bytes of pattern
110: sd v1,-16(t0) // store last 16 bytes of 16 or 24 bytes
sd v1,-8(t0) //
.set at
.set reorder
120: sw v1,0(t0) // store high 4 bytes of pattern
j ra // return
//
// Move 8-byte pattern value to target 8 bytes at a time using 32-bit
// operations.
//
.set noreorder
.set noat
140: and t8,a1,0x8 // check if even multiple of 16 bytes
beq zero,t8,160f // if eq, even multiple of 16 bytes
subu t4,t4,8 // compute ending segment address
150: sw v0,0(t0) // store 8-byte pattern value
sw v1,4(t0) //
bne t0,t4,150b // if ne, more to move
addu t0,t0,8 // advance target address (delay slot)
.set at
.set reorder
j ra // return
//
// Move 8-byte pattern value to target 16 bytes at a time using 32-bit
// operations.
//
.set noreorder
.set noat
160: subu t4,t4,8 // compute ending segment address
170: sw v0,0(t0) // store 8-byte pattern value
sw v1,4(t0) //
sw v0,8(t0) // store 8-byte pattern value
sw v1,12(t0) //
bne t0,t4,170b // if ne, more to move
addu t0,t0,16 // advance target address (delay slot)
.set at
.set reorder
200: j ra // return
.end vCopyPattern
SBTTL("rop DPx, Aligned")
//++
//
// VOID
// vFetchAndMerge (
// IN PFETCHFRAME pff
// )
//
// Routine Description:
//
// This routine repeatedly tiles one line of an aligned pattern.
//
// Arguments:
//
// pff (a0) - Supplies a pointer to a fetch frame.
//
// Return Value:
//
// None.
//
//--
LEAF_ENTRY(vMergePattern)
ALTERNATE_ENTRY(vFetchAndMerge)
lw t0,ff_pvTrg(a0) // get starting target address
lw t1,ff_pvPat(a0) // get base pattern address
lw t2,ff_xPat(a0) // get pattern offset in bytes
lw t3,ff_cxPat(a0) // get pattern width in pixels
lw t4,ff_culFill(a0) // compute ending target address
sll a1,t4,2 // a1 = fill size in bytes
addu t4,a1,t0 // t4 = ending target address
addu t5,t2,t1 // compute current pattern address
subu v0,t3,8 // check if pattern is exactly 8 bytes
bne zero,v0,10f // if ne, pattern is not 8 bytes
lw v0,0(t5) // get low part of 8-byte pattern
lw v1,4(t5) // get high part of 8-byte pattern
beq zero,t2,MergePattern // if eq, zero offset value
lw v1,0(t1) // nonzero offset - high word wraps to pattern base
b MergePattern // finish in common code
//
// The pattern is not 8 bytes in width or cannot be moved 8 bytes at a time.
//
// N.B. noreorder is in effect below - the instruction immediately after
// each branch is in the branch delay slot and executes whether or not
// the branch is taken.
//
.set noreorder
.set noat
10: lw v0,0(t5) // get 4-byte pattern value
addu t3,t3,t1 // compute ending pattern address
20: lw v1,0(t0) // get 4-byte target value
addu t0,t0,4 // advance target pointer
xor v0,v1,v0 // compute exclusive or with pattern (rop DPx)
sw v0,-4(t0) // store pattern in target
beq t0,t4,30f // if eq, end of target
addu t5,t5,4 // advance pixel offset (delay slot)
subu t6,t5,t3 // check if at end of pattern
bne zero,t6,20b // if ne, not at end of pattern
lw v0,0(t5) // get 4-byte pattern value (delay slot)
move t5,t1 // wrap to starting pattern address
b 20b //
lw v0,0(t5) // reload first pattern word (delay slot)
.set at
.set reorder
30: j ra // return
SBTTL("rop DPx, Unaligned")
//++
//
// VOID
// vFetchShiftAndMerge (
// IN PFETCHFRAME pff
// )
//
// Routine Description:
//
// This routine repeatedly tiles one line of an unaligned pattern
// using rop (DPx).
//
// Arguments:
//
// pff (a0) - Supplies a pointer to a fetch frame.
//
// Return Value:
//
// None.
//
//--
ALTERNATE_ENTRY(vFetchShiftAndMerge)
lw t0,ff_pvTrg(a0) // get starting target address
lw t1,ff_pvPat(a0) // get base pattern address
lw t2,ff_xPat(a0) // get pattern offset in bytes
lw t3,ff_cxPat(a0) // get pattern width in pixels
lw t4,ff_culFill(a0) // compute ending target address
sll a1,t4,2 // a1 = fill size in bytes
addu t4,a1,t0 // t4 = ending target address
addu t5,t2,t1 // compute current pattern address
subu v0,t3,8 // check if pattern is exactly 8 bytes
bne zero,v0,10f // if ne, pattern is not 8 bytes
lwr v0,0(t5) // get low part of 8-byte pattern
lwl v0,3(t5) // (unaligned load via lwr/lwl pair)
lwr v1,4(t5) // get high part of 8-byte pattern
lwl v1,3 - 4(t5) // high bytes wrap back to start of pattern
b MergePattern // finish in common code
//
// The pattern is not 8 bytes in width or cannot be moved 8 bytes at a time.
//
// N.B. noreorder is in effect below - the instruction immediately after
// each branch is in the branch delay slot and executes whether or not
// the branch is taken.
//
.set noreorder
.set noat
10: lw v1,0(t0) // get 4-byte target value
lwr v0,0(t5) // get low bytes of pattern
lwl v0,3(t5) // get high bytes of pattern
addu t0,t0,4 // advance target pointer
xor v0,v1,v0 // compute exclusive or with pattern (rop DPx)
sw v0,-4(t0) // store pattern in target
beq t0,t4,20f // if eq, end of target
addu t2,t2,4 // advance pixel offset (delay slot)
subu t6,t2,t3 // check if at end of pattern
bltz t6,10b // if ltz, not at end of pattern
addu t5,t2,t1 // compute address of pattern (delay slot)
move t2,t6 // wrap - set residual offset in pattern
b 10b //
addu t5,t2,t1 // compute address of pattern (delay slot)
.set at
.set reorder
20: j ra // return
SBTTL("Merge Pattern")
//++
//
// Routine Description:
//
// This routine contains common code for merging (exclusive or-ing) an
// 8-byte pattern with a target surface.
//
// Arguments:
//
// a1 - Supplies the size of the fill in bytes.
// v0 and v1 - Supplies the 8-byte pattern to copy (low word in v0,
// high word in v1).
// t0 - Supplies the starting target surface address.
// t4 - Supplies the ending target surface address.
//
// Return Value:
//
// None.
//
//--
MergePattern: //
//
// If the fill size is not an even multiple of 8 bytes, then merge one
// longword and swap the pattern value (so the 8-byte pattern stays
// phase-aligned with the target).
//
and t8,a1,0x4 // check if even multiple of 8 bytes
beq zero,t8,10f // if eq, even multiple of 8 bytes
lw t6,0(t0) // get 4-byte target value
addu t0,t0,4 // advance target address
xor t6,t6,v0 // compute exclusive or with pattern
sw t6,-4(t0) // store low 4 bytes of pattern
subu a1,a1,4 // reduce size of fill operation
beq zero,a1,160f // if eq, no more to move
move t8,v0 // swap 8-byte pattern value
move v0,v1 //
move v1,t8 //
//
// Many system platforms do not support 64 bit access to video memory. For
// these platforms, data is moved 32-bits at a time.
//
10: lbu t7,Gdip64bitDisabled // get 64-bit disable flag
bne zero,t7,110f // if ne, 64-bit access is disabled
//
// If the target buffer is 8-byte aligned, then merge the pattern value with
// the target 8 bytes at a time. Otherwise, merge a single longword, merge any
// intervening 8-byte blocks, and then merge a single longword at the end.
//
and t8,t0,0x4 // isolate target alignment bits
bne zero,t8,30f // if ne, target alignment problem
//
// Merge 8-byte pattern value with target.
//
// N.B. noreorder is in effect - the instruction immediately after each
// branch is in the branch delay slot and executes whether or not the
// branch is taken.
//
.set noreorder
.set noat
dsll v0,v0,32 // pack 8-byte pattern into 64-bit v0
dsrl v0,v0,32 // (zero-extend low word)
dsll v1,v1,32 // (shift high word into place)
or v0,v0,v1 //
and a2,a1,32 - 1 // isolate residual number of bytes
subu a2,a1,a2 // compute 32-byte block count
beq zero,a2,17f // if eq, no 32-byte block to merge
subu a1,a1,a2 // compute residual number of bytes (delay slot)
addu a2,a2,t0 // compute ending segment address
subu a2,a2,32 //
//
// Merge 8-byte pattern value with target 32 bytes at a time.
//
13: ld t1,0(t0) // get 8-byte target values
ld t2,8(t0) //
ld t3,16(t0) //
ld t5,24(t0) //
xor t1,t1,v0 // compute exclusive or with pattern
xor t2,t2,v0 //
xor t3,t3,v0 //
xor t5,t5,v0 //
sd t1,0(t0) // store 8-byte pattern values
sd t2,8(t0) //
sd t3,16(t0) //
sd t5,24(t0) //
bne t0,a2,13b // if ne, more to move
addu t0,t0,32 // advance target address (delay slot)
.set at
.set reorder
beq zero,a1,160f // if eq, no residual 8-byte blocks
//
// Merge 8-byte pattern value with target 8 bytes at a time.
//
.set noreorder
.set noat
17: subu t4,t4,8 // compute ending segment address
20: ld t1,0(t0) // get 8-byte target value
xor t1,t1,v0 // compute exclusive or with pattern
sd t1,0(t0) // store 8-byte pattern value
bne t0,t4,20b // if ne, more to move
addu t0,t0,8 // advance target address (delay slot)
.set at
.set reorder
j ra // return
//
// Align the target to an 8-byte boundary, merge any intervening 8-byte blocks,
// and merge the remaining longword at the end.
//
30: lw t6,0(t0) // get 4-byte target value
addu t0,t0,4 // advance target address
xor t6,t6,v0 // compute exclusive or with pattern
sw t6,-4(t0) // store low 4 bytes of pattern
subu a1,a1,8 // reduce size of fill
beq zero,a1,50f // if eq, nothing in the middle
//
// Merge 8-byte pattern value with target.
//
.set noreorder
.set noat
dsll v1,v1,32 // pack 8-byte pattern into 64-bit v1
dsrl v1,v1,32 // (zero-extend high word, now low)
dsll v0,v0,32 // (shift low word into place)
or v1,v0,v1 //
and a2,a1,32 - 1 // isolate residual number of bytes
subu a2,a1,a2 // compute 32-byte block count
beq zero,a2,37f // if eq, no 32-byte block to merge
subu a1,a1,a2 // compute residual number of bytes (delay slot)
addu a2,a2,t0 // compute ending segment address
subu a2,a2,32 //
//
// Merge 8-byte pattern value with target 32 bytes at a time.
//
33: ld t1,0(t0) // get 8-byte target values
ld t2,8(t0) //
ld t3,16(t0) //
ld t5,24(t0) //
xor t1,t1,v1 // compute exclusive or with pattern
xor t2,t2,v1 //
xor t3,t3,v1 //
xor t5,t5,v1 //
sd t1,0(t0) // store 8-byte pattern values
sd t2,8(t0) //
sd t3,16(t0) //
sd t5,24(t0) //
bne t0,a2,33b // if ne, more to move
addu t0,t0,32 // advance target address (delay slot)
.set at
.set reorder
beq zero,a1,50f // if eq, no residual 8-byte blocks
//
// Merge 8-byte pattern value with target 8 bytes at a time.
//
.set noreorder
.set noat
37: subu t4,t4,12 // compute ending segment address (8 loop bias + 4 alignment word)
40: ld t1,0(t0) // get 8-byte target value
xor t1,t1,v1 // compute exclusive or with pattern
sd t1,0(t0) // store 8-byte pattern value
bne t0,t4,40b // if ne, more to move
addu t0,t0,8 // advance target address (delay slot)
.set at
.set reorder
50: lw t6,0(t0) // get 4-byte target value
xor t6,t6,v1 // compute exclusive or with pattern
sw t6,0(t0) // store high bytes of pattern
j ra // return
//
// Merge 8-byte pattern value with target using 32-bit operations.
//
.set noreorder
.set noat
110: subu t4,t4,8 // compute ending segment address
120: lw t6,0(t0) // get 8-byte target value
lw t7,4(t0) //
xor t6,t6,v0 // compute exclusive or with pattern
xor t7,t7,v1 //
sw t6,0(t0) // store 8-byte pattern value
sw t7,4(t0) //
bne t0,t4,120b // if ne, more to move
addu t0,t0,8 // advance target address (delay slot)
.set at
.set reorder
160: j ra // return
.end vMergePattern