windows-nt-4.0/private/crt32/string/mips/strcpym.s


								/* ------------------------------------------------------------------ */

								/* | Copyright Unpublished, MIPS Computer Systems, Inc.  All Rights | */

								/* | Reserved.  This software contains proprietary and confidential | */

								/* | information of MIPS and its suppliers.  Use, disclosure or     | */

								/* | reproduction is prohibited without the prior express written   | */

								/* | consent of MIPS.                                               | */

								/* ------------------------------------------------------------------ */

								/*  strcpy.s 1.2 */


								/* This function is an assembly-code replacement for the libc function

								 * strcpy.  It uses the MIPS special instructions "lwl", "lwr", "swl",

								 * and "swr", which handle unaligned words.


								 * The standard C version of this function is a 5-instruction loop,

								 * working one byte at a time:


								 * Copy string s2 to s1.  s1 must be large enough.

								 * return s1

								 *	char *strcpy(s1, s2)

								 *	register char *s1, *s2;

								 *	{

								 *		register char *os1;

								 *		os1 = s1;

								 *		while (*s1++ = *s2++);

								 *		return(os1);

								 *	}


								 * A better C version is 4 cycles/byte. Loop is unrolled once.

								 * char *

								 * strcpy(s1, s2)

								 * register char *s1, *s2;

								 * {

								 * 	register char *os1 = s1;

								 * 	while (1) {

								 * 		register unsigned c;

								 * 		c = s2[0];

								 * 		s2 += 2;

								 * 		s1[0] = c;

								 * 		if (c == 0) break;

								 * 		c = s2[1-2];

								 * 		s1 += 2;

								 * 		s1[1-2] = c;

								 * 		if (c == 0) break;

								 * 	}

								 * 	return(os1);

								 * }


								 * This function starts with an unrolled loop, which uses 5

								 * instructions per byte (including the store bytes at the end) for

								 * the first few bytes.


								 * After filling a word, the first word or portion of a word is saved

								 * using a "swl" instruction. If the start of destination string is at

								 * a word boundary, this leaves the result valid in the cache. Because

								 * this replaces up to 4 store byte instructions, we are still near 3

								 * instructions per byte, but there is only one write.


								 * The inner loop moves 4 bytes in 16 cycles, an average of 4 cycles

								 * per byte.  This is 1 cycle faster than the standard C code, the

								 * same speed as the unrolled version, and it also leaves the result

								 * valid in the cache.


								 * Finally, when a zero byte is found, the end of the string is stored

								 * using store byte instructions.  This adds one instruction per byte

								 * for as much as three bytes, but eliminates the up to four cycles of

								 * overhead we counted before.


								 * The end result is that this function is never slower than the C

								 * function, is faster by up to 30% in instruction count, uses up to

								 * 75% fewer writes, and leaves most of the result valid in the cache.


								 * There is one caveat to consider: this function is written in

								 * assembler code, and as such, cannot be merged using the U-code

								 * loader. */


								/* Craig Hansen - 3-September-86 */


								#include <kxmips.h>


								/* It turns out better to think of lwl/lwr and swl/swr as

								   smaller-vs-bigger address rather than left-vs-right.

								   Such a representation makes the code endian-independent.

								   Naming used by the code in this file:
								     LWS / SWS  load/store the part of an unaligned word that starts
								                at the Smaller address (used with offset 0).
								     LWB / SWB  load/store the part that ends at the Bigger address
								                (used with offset 3).

								   Only one mapping is defined here, with no conditional: lwr/swr
								   take the smaller address and lwl/swl the bigger one.  Per the
								   MIPS instruction set that is the little-endian pairing; a
								   big-endian build would need the two assignments swapped. */


								#define LWS lwr

								#define LWB lwl

								#define SWS swr

								#define SWB swl


								.text

								//
								// char *strcpy(char *s1, const char *s2)
								//
								// Copies the NUL-terminated string at a1 into the buffer at a0,
								// terminator included, and returns the original destination address.
								//
								//   In:       a0 = destination, a1 = source
								//   Out:      v0 = destination as passed in
								//   Written:  v0, a0, t0-t7 (a1 is left unchanged)
								//
								// Everything below is assembled under .set noreorder: the instruction
								// after each branch or jump is its delay slot and executes whether or
								// not the branch is taken.  Several stores and pointer updates rely
								// on that placement, so instruction order must not be changed.
								//
								// Pointer arithmetic uses addu/addiu/subu.  The add/addi/sub forms
								// raise an integer overflow exception when a result crosses
								// 0x7fffffff, which has no meaning for addresses.
								//
								LEAF_ENTRY(strcpy)
								.set noreorder
									// a0/ destination
									// a1/ source
									move	v0, a0		# a copy of destination address is returned

									// start up first word
									// adjust pointers so that a0 points to next word
									// t7 = a1 adjusted by same amount minus one
									// t0,t1,t2,t3 are filled with 4 consecutive bytes
									// t4 is filled with the same 4 bytes in a single word
									//
									// t5 = address of the last byte of the destination word holding a0
									// t6 = t5 - a0 = (bytes of that word at or above a0) - 1, range 0..3
									lb	t0, 0(a1)
									ori	t5, a0, 3	# get an early start
									beq	t0, 0, $doch0
									subu	t6, t5, a0	# number of char in 1st word of dest - 1
									lb	t1, 1(a1)
									addu	t7, a1, t6	# offset starting point for source string
									beq	t1, 0, $doch1
									nop
									lb	t2, 2(a1)
									nop			// fills the slot between the load and its use
									beq	t2, 0, $doch2
									LWS	t4, 0(a1)	# safe: always in same word as 0(a1)
									lb	t3, 3(a1)
									LWB	t4, 3(a1)	# fill out word
									beq	t3, 0, $doch3
									SWS	t4, 0(a0)	# store entire or part word
									// All four bytes were non-zero.  The SWS above has written the
									// t6+1 bytes that belong to the first destination word; step a0
									// back to the start of that word so the loop can advance it by
									// whole words.
									addiu	a0, t5, 1-4	# adjust destination ptr

									// inner loop
									//
									// On entry to each pass 1(t7) is the next source byte to copy and
									// a0+4 is the word-aligned destination for it.  Both pointers are
									// advanced before the remaining bytes are examined, hence the
									// "-4" in the offsets that follow.
								1:	lb	t0, 1(t7)
									addiu	t7, t7, 4
									beq	t0, 0, $doch0
									addiu	a0, a0, 4	// delay slot: a0 now addresses the word being filled
									lb	t1, 1+1-4(t7)
									nop
									beq	t1, 0, $doch1
									nop
									lb	t2, 2+1-4(t7)
									nop
									beq	t2, 0, $doch2
									LWS	t4, 0+1-4(t7)	// delay slot: begin assembling the 4 source bytes
									lb	t3, 3+1-4(t7)
									LWB	t4, 3+1-4(t7)
									bne	t3, 0, 1b
									sw	t4, 0(a0)	// delay slot: aligned store, runs on loop and on exit
									// Fell through: the fourth byte was the terminator and the sw
									// above has already stored it with the three bytes before it.
									j	ra
									nop

									// store four bytes using swl/swr
									// Reached only from the first-word code.  The SWS in that branch
									// delay slot stored the low-address part; the SWB here stores the
									// rest, ending at a0+3 where the terminator goes.
								$doch3:	j	ra
									SWB	t4, 3(a0)

									// store up to three bytes, a byte at a time.
									// Entry points fall through into one another, so entering at
									// $dochN stores bytes N down to 0; the last store is in the
									// delay slot of the return jump.
								$doch2:	sb	t2, 2(a0)
								$doch1:	sb	t1, 1(a0)
								$doch0:	j	ra
									sb	t0, 0(a0)

								.end strcpy