Windows NT 4.0 source code leak
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

148 lines
4.2 KiB

/* ------------------------------------------------------------------ */
/* | Copyright Unpublished, MIPS Computer Systems, Inc. All Rights | */
/* | Reserved. This software contains proprietary and confidential | */
/* | information of MIPS and its suppliers. Use, disclosure or | */
/* | reproduction is prohibited without the prior express written | */
/* | consent of MIPS. | */
/* ------------------------------------------------------------------ */
/* strcpy.s 1.2 */
/* This function is an assembly-code replacement for the libc function
* strcpy. It uses the MIPS special instructions "lwl", "lwr", "swl",
* and "swr", which handle unaligned words.
* The standard C version of this function is a 5-instruction loop,
* working one byte at a time:
* Copy string s2 to s1. s1 must be large enough.
* return s1
* char *strcpy(s1, s2)
* register char *s1, *s2;
* {
* register char *os1;
* os1 = s1;
* while (*s1++ = *s2++);
* return(os1);
* }
* A better C version is 4 cycles/byte. Loop is unrolled once.
* char *
* strcpy(s1, s2)
* register char *s1, *s2;
* {
* register char *os1 = s1;
* while (1) {
* register unsigned c;
* c = s2[0];
* s2 += 2;
* s1[0] = c;
* if (c == 0) break;
* c = s2[1-2];
* s1 += 2;
* s1[1-2] = c;
* if (c == 0) break;
* }
* return(os1);
* }
* This function starts with an unrolled loop, which uses 5
* instructions per byte (including the store bytes at the end) for
* the first few bytes.
* After filling a word, the first word or portion of a word is saved
* using a "swl" instruction. If the start of destination string is at
* a word boundary, this leaves the result valid in the cache. Because
* this replaces up to 4 store byte instructions, we are still near 3
* instructions per byte, but there is only one write.
* The inner loop moves 4 bytes in 16 cycles, an average of 4 cycles
* per byte. This is 1 cycle faster than the standard C code, the
* same speed as the unrolled version, and it also leaves the result
* valid in the cache.
* Finally, when a zero byte is found, the end of the string is stored
* using store byte instructions. This adds one instruction per byte
* for as much as three bytes, but eliminates the up to four cycles of
* overhead we counted before.
* The end result is that this function is never slower than the C
* function, is faster by up to 30% in instruction count, uses up to
* 75% fewer writes, and leaves most of the result valid in the cache.
* There is one caveat to consider: this function is written in
* assembler code, and as such, cannot be merged using the U-code
* loader. */
/* Craig Hansen - 3-September-86 */
#include <kxmips.h>
/* It turns out better to think of lwl/lwr and swl/swr as
   smaller-vs-bigger address rather than left-vs-right.
   LWS/SWS access the part of an unaligned word that lies at the smaller
   address, LWB/SWB the part that lies at the bigger address.  Which
   hardware instruction plays which role depends on byte order, so the
   pairing is selected here; that selection is what makes the code that
   uses these names endian-independent.  Little-endian stays the default
   when none of the big-endian macros is defined. */
#if defined(MIPSEB) || defined(_MIPSEB) || defined(__MIPSEB__)
#define LWS lwl
#define LWB lwr
#define SWS swl
#define SWB swr
#else
#define LWS lwr
#define LWB lwl
#define SWS swr
#define SWB swl
#endif
.text
LEAF_ENTRY(strcpy)
//
// char *strcpy(char *s1, const char *s2)
//
// Copies the zero-terminated string at a1 to a0, terminator included.
//
// In:    a0 = destination
//        a1 = source
// Out:   v0 = original destination pointer
// Uses:  t0-t7 as scratch; a0 is advanced while copying
//
// The body is assembled under .set noreorder, so the instruction that
// follows each branch or jump is its delay slot and executes whether or
// not the branch is taken.  The nops after lb keep the loaded byte from
// being read by the very next instruction.  Instruction order is therefore
// significant and must be preserved.
//
// Pointer arithmetic uses the non-trapping forms (subu/addu/addiu) so that
// no address computation can raise an integer overflow exception.
//
        .set    noreorder

        move    v0, a0                  # a copy of destination address is returned

// First word.  The source is examined a byte at a time:
//   t0,t1,t2,t3 = four consecutive source bytes
//   t4          = the same four bytes gathered as one unaligned word
//   t5          = address of the last byte of the dest word containing a0
//   t6          = t5 - a0 = number of chars in 1st word of dest, minus one
//   t7          = a1 + t6, so 1(t7) is the source byte that pairs with
//                 the first byte of the next destination word

        lb      t0, 0(a1)
        ori     t5, a0, 3               # get an early start
        beq     t0, 0, $doch0
        subu    t6, t5, a0              # (delay slot) number of char in 1st word of dest - 1
        lb      t1, 1(a1)
        addu    t7, a1, t6              # offset starting point for source string
        beq     t1, 0, $doch1
        nop
        lb      t2, 2(a1)
        nop
        beq     t2, 0, $doch2
        LWS     t4, 0(a1)               # (delay slot) safe: always in same word as 0(a1)
        lb      t3, 3(a1)
        LWB     t4, 3(a1)               # fill out word
        beq     t3, 0, $doch3
        SWS     t4, 0(a0)               # (delay slot) store entire or part word
        addiu   a0, t5, 1-4             # adjust destination ptr to the start of that word

// Inner loop: one aligned destination word per pass.  t7 and a0 are each
// advanced by 4 before the remaining bytes are read, which is why the
// displacements below are written as n+1-4.

1:      lb      t0, 1(t7)
        addiu   t7, t7, 4
        beq     t0, 0, $doch0
        addiu   a0, a0, 4               # (delay slot) a0 = next destination word
        lb      t1, 1+1-4(t7)
        nop
        beq     t1, 0, $doch1
        nop
        lb      t2, 2+1-4(t7)
        nop
        beq     t2, 0, $doch2
        LWS     t4, 0+1-4(t7)           # (delay slot) smaller-address part of the word
        lb      t3, 3+1-4(t7)
        LWB     t4, 3+1-4(t7)           # bigger-address part of the word
        bne     t3, 0, 1b
        sw      t4, 0(a0)               # (delay slot) whole word stored on both paths
        j       ra                      # fourth byte was the terminator, already stored
        nop

// store four bytes using swl/swr
// Reached only from the first word: SWS in the branch delay slot has
// written the smaller-address part, SWB writes the remainder.
$doch3: j       ra
        SWB     t4, 3(a0)               # (delay slot)

// store up to three bytes, a byte at a time.
// Entry point depends on which byte was the terminator; execution falls
// through so that the terminator and every byte before it are stored.
$doch2: sb      t2, 2(a0)
$doch1: sb      t1, 1(a0)
$doch0: j       ra
        sb      t0, 0(a0)               # (delay slot)
        .end    strcpy