windows-nt-4.0/private/crt32/string/ppc/strlenp.s


								//      TITLE("strlen")

								//++

								//

								// Copyright (c) 1994  IBM Corporation

								//

								// Module Name:

								//

								//    strlen.s

								//

								// Routine Description:

								//

								//    This function returns the length of a string excluding the

								//    terminating null.

								//

								//    The algorithm used here merits some explanation.  It turns

								//    out to be quite fast on a 32 bit processor but it is known

								//    to be extremely fast on a 64 bit processor.  I have seen this

								//    algorithm used elsewhere but the best description I have seen

								//    for it is in a document entitled "Hacker's Delight" by

								//    Henry S. Warren, Jr.   IBM Thomas J. Watson Research Center.

								//

								//    I have found (by experimentation) that it is faster to just

								//    get on with it and do the first few bytes as single bytes

								//    rather than trying to work out the alignment up front.

								//

								//    Once word aligned, we can process the string a word at a time.

								//    The hard part is to figure out if the word contains a zero byte.

								//

								//    Given x where x is the word being examined, consider

								//

								//    y = x & 0x7f7f7f7f        we have reduced each byte in x to a

								//                              value of 7f or less.

								//

								//    y += 0x7f7f7f7f           each byte whose lower 7 bits were non

								//                              zero now has its left most bit set.

								//

								//    y |= x                    each byte whose value was 0x80 now

								//                              also has its upper bit set.

								//

								//    y |= 0x7f7f7f7f           each non zero byte now has the value

								//                              0xff.  (Each zero byte is now 0x7f).

								//

								//    Note the last two "or" operations can be rewritten as

								//

								//    x |= 0x7f7f7f7f

								//

								//    y |= x

								//

								//    making the two operations independent, which allows them to run in

								//    parallel on a superscalar machine with multiple boolean functional

								//    units.

								//

								//    This value can now be checked for equality with -1, or its

								//    complement with 0.  If equal, then this word contains no zero

								//    bytes.

								//

								//    The complement is more interesting.  On a big endian machine, a

								//    count leading zeroes on the complement gives you 8 times the number

								//    of non-zero bytes before the zero byte.  Little endian, no such

								//    luxury.  Given that a non-zero complement means one or more bytes

								//    are zero, we only need check the first three (starting at the

								//    right).  On PowerPC we can easily do this by moving the complement

								//    to the condition register (after suitably rotating so we do not

								//    overwrite any non-volatile condition register fields) then testing

								//    one bit for each byte.

								//

								//    I have found this method to be about the same speed (no slower) for

								//    very short strings and almost (not quite) twice as fast as checking

								//    each byte individually for long strings.

								//

								//

								// Author:

								//

								//    Peter L Johnston   ([email protected]) 15-Aug-1994

								//

								// Environment:

								//

								//    User or Kernel mode.

								//

								// Revision History:

								//

								// Arguments:

								//

								//    addr (r.3) - A pointer to the string

								//

								// Return Value:

								//

								//    length (r.3)  - length of the string excluding the terminating null

								//


								#include <kxppc.h>


								        LEAF_ENTRY(strlen)


								//
								// strlen - return the number of bytes preceding the terminating null.
								//
								// Register usage in this routine:
								//
								//    r.3  - string address on entry; reused as the running length and
								//           holds the return value.
								//    r.4  - the byte or word most recently loaded from the string.
								//    r.6  - the constant 0x7f7f7f7f (word phase only).
								//    r.7  - address cursor into the string.
								//    r.8  - scratch: alignment test result, then the zero-byte mask.
								//    cr.0 - result of the alignment test (andi.) and of the zero-byte
								//           test (not.).
								//    cr.1 - result of the "is this byte zero" compare in the byte phase.
								//
								// CR fields 1, 5 and 7 are overwritten by the mtcrf at label bytes.
								//
								// The word phase loads whole aligned words, so it may read up to three
								// bytes that lie beyond the terminator but never beyond the aligned
								// word that contains it.
								//

								//

								// Do the first 1, 2, 3 or 4 bytes individually while we try to figure

								// out the real alignment.

								//

								// The byte compare is directed to cr.1 so that the alignment result
								// left in cr.0 by andi. is still available to the branch that follows
								// the conditional return.  The length in r.3 is deliberately one less
								// than the number of bytes consumed; this is corrected at label wds.
								//


								        lbz     r.4, 0(r.3)             // get first byte

								        addi    r.7, r.3, 1             // r.7 = address of the second byte

								        cmpwi   cr.1, r.4, 0            // first byte zero?

								        andi.   r.8, r.7, 0x3           // cr.0 = alignment of next byte

								        li      r.3, 0                  // initialize length (r.3 no longer
								                                        // holds the address from here on)

								        beqlr   cr.1                    // return 0 if first byte is zero

								        beq     wds                     // switch to word oriented count if

								                                        // now word aligned.

								nxbyte: lbz     r.4, 0(r.7)             // get next byte

								        addi    r.7, r.7, 1             // bump address

								        cmpwi   cr.1, r.4, 0            // byte equal zero?

								        andi.   r.8, r.7, 0x3           // cr.0 = alignment of new address

								        addi    r.3, r.3, 1             // bump length

								        beqlr   cr.1                    // return if byte is zero

								        bne     nxbyte                  // loop until r.7 is word aligned


								//

								// We can look at the rest on a word by word basis

								//

								// On arrival r.7 is word aligned, every byte before r.7 was non-zero,
								// and r.3 = (r.7 - start of string) - 1.
								//


								wds:    lwz     r.4, 0(r.7)             // get first word


								        lis     r.6, 0x7f7f             // setup magic constant

								        ori     r.6, r.6, 0x7f7f        // r.6 = 0x7f7f7f7f


								        addi    r.3, r.3, 1             // count was one short by here; r.3 is
								                                        // now the count of bytes before r.7


								//

								// See introductory comments for an explanation of the following.

								//

								// Loop invariant at chkwd: r.4 holds the word at 0(r.7) and r.3 is the
								// number of string bytes that precede that word.
								//


								chkwd:  and     r.8, r.4, r.6           // y = x & 0x7f7f7f7f

								        or      r.4, r.4, r.6           // x = x | 0x7f7f7f7f

								        add     r.8, r.8, r.6           // y += 0x7f7f7f7f

								        or      r.8, r.8, r.4           // y |= x

								        not.    r.8, r.8                // if complement = 0 then no zero bytes

								        bne     bytes                   // non-zero means one of the bytes IS
								                                        // zero; go find out which one


								        lwzu    r.4, 4(r.7)             // get next word; r.7 is updated to
								                                        // r.7 + 4 by the update form

								        addi    r.3, r.3, 4             // bump count

								        b       chkwd


								//

								// When we get here, we encountered a word that contains a zero byte. As

								// a result of the above algorithm, the zero byte is now represented within

								// the word as 0x80.  All non-zero bytes are now 0x00.

								//

								// We have up to 4 bits in r.8 (if we were big endian we could use count

								// leading zeros for this), we'll slam them in the condition register and

								// look at them individually.  We know at least one of them is set so we

								// only bother checking the first three.

								//

								// Rotating left by 20 moves the marker bit of the right most (least
								// significant) byte to CR bit 4, that of the 2nd byte from the right to
								// CR bit 28 and that of the 3rd to CR bit 20.  The marker of the left
								// most byte would land in bit 12, which the rlwinm mask clears.  The
								// mtcrf mask 0x45 then copies only CR fields 1, 5 and 7 from r.8.
								//
								// The bytes are tested from the right because the header comments
								// treat the word as little endian: the right most byte is the one at
								// the lowest address.  If none of the three tested bits is set, the
								// terminator must be the left most byte and the length is r.3 + 3.
								//


								bytes:  rlwinm  r.8, r.8, 20, 0xff000fff// posn so dont use non-volatile

								        mtcrf   0x45, r.8               // fields in CR

								        btlr    4                       // jif right most is terminator

								        addi    r.3, r.3, 1             // bump count

								        btlr    28                      // jif 2nd from right

								        addi    r.3, r.3, 1             // bump

								        btlr    20                      // jif 3rd from right

								        addi    r.3, r.3, 1             // bump; terminator is left most byte


								        blr                             // return length in r.3


								        LEAF_EXIT(strlen)