|
|
;---------------------------Module-Header------------------------------; ; Module Name: math.asm ; ; Fast math routines. ; ; Created: 11/1/1996 ; Author: Otto Berkes [ottob] ; ; Copyright (c) 1996 Microsoft Corporation ;----------------------------------------------------------------------; .386
.model small,pascal
assume cs:FLAT,ds:FLAT,es:FLAT,ss:FLAT assume fs:nothing,gs:nothing
.xlist include gli386.inc .list
PROFILE = 0 include profile.inc
;
;
; We're trying to solve:
;
;       1/sqrt(x)
;
; which in IEEE float is:
;
;       1/sqrt(M * 2^[E-127])
;
; To simplify, substitute e = [E-127]
;
; We can simplify this by pulling a large portion of the exponent out
; by using only that portion of the exponent divisible by two (so we can
; pull it out of the sqrt term):
;
;       1/sqrt(M * 2^(2*[e div 2]) * 2^[e MOD 2])
;
; which is:
;
;       1/ (2^[e div 2] * sqrt(M * 2^[e MOD 2]))
;
; or
;
;       (2^[e div 2])^(-1) * 1/sqrt(M * 2^[e MOD 2])
;
;               =
;       2^-[e div 2] * 1/sqrt(M * 2^[e MOD 2])
;
; substitute back for e = [E-127]:
;
;       2^-[(E - 127) div 2] * 1/sqrt(M * 2^[(E - 127) MOD 2])
;
;               =
;       2^-[(E div 2) - 63] * 1/sqrt(M * 2^[(E - 1) MOD 2])
;
;               =
;       2^[63 - (E div 2)] * 1/sqrt(M * 2^[(E - 1) MOD 2])
;
; As a floating-point number, 2^[63 - (E div 2)] is just the exponent value:
;
;       [63 - (E div 2)] + 127
;
; or
;       [(127+63) - (E div 2)]
;
; Remembering to account for the implicit '1' in the mantissa of IEEE floating-
; point numbers, the range of (M * 2^[(E - 1) MOD 2]) is 0.800000 to
; 0.ffffff*2, which is the interval [0.5, 2.0).  We can use the fact that this
; is a relatively small range, and therefore can use a table lookup near the
; actual value.  The table will contain values for the piece-wise approximation
; to the curve 1/sqrt(M * 2^[(E - 1) MOD 2]) using an acceptable interval.
; These values can then be used to approximate the desired inverse square root
; value.  At this point, all that remains is to apply the correct exponent
; for the number, which is simply [(127+63) - (E div 2)] from the above
; equations.
;
; To do the piecewise-linear approximation, we can store a table of values at
; the appropriate intervals, and the deltas between them.  However, this
; will require calculating the difference between the interval value and
; x.  We can do a bit better by using slope-intercept (y = mx + b), so the
; table will store (m, b).
;
; With a 512-entry table, we will get at least 16 bits of precision.  This
; result was obtained using simulations.
.data
; The following 'C' code generates the table below:

;#define SQRT_TAB_SIZE_LOG2 9 // 512-entry table
;
;#define MANTISSA_SIZE 24
;#define SQRT_TAB_SIZE (1 << SQRT_TAB_SIZE_LOG2)
;#define SQRT_INC (1 << (MANTISSA_SIZE - SQRT_TAB_SIZE_LOG2))
;#define CASTFIX(a) (*((LONG *)&(a)))
;
;void genTable()
;{
;    int i;
;    float x;
;    float xNext;
;    float y;
;    float yNext;
;    float xInterval;
;
;    // We will start our table with the case where the exponent is even.
;
;    CASTFIX(x) = 0x3f000000;
;
;    // We will use the current and next values to generate the piece-wise
;    // data for the curve.  The interval between 'current' and 'next' is
;    // based on the smallest change possible in the floating-point value
;    // that also represents a difference of one table-lookup entry.
;
;    // When we switch to the odd-exponent case (at 1.0), we have to adjust
;    // for the fact that the effective interval between successive values
;    // is doubled.
;
;    CASTFIX(xNext) = CASTFIX(x) + SQRT_INC;
;    y = (float)1.0 / sqrt((double)x);
;
;    // Calculate 1.0 / (piece-wise approximation interval).
;
;    xInterval = xNext - x;
;
;    xInterval = (float)1.0 / xInterval;
;
;    // Now, generate the table:
;
;    for (i = 0; i < SQRT_TAB_SIZE; i++) {
;        float m;
;        float b;
;
;        // We increment our floating-point values using integer operations
;        // to ensure accuracy:
;
;        CASTFIX(xNext) = CASTFIX(x) + SQRT_INC;
;
;        // Find next point on curve:
;
;        yNext = (float)1.0 / sqrt((double)xNext);
;
;        // Test for odd-exponent case:
;
;        if (CASTFIX(x) == 0x3f800000)
;            xInterval *= (float)0.5;
;
;        m = (yNext - y) * xInterval;
;        b = y - (m * x);
;
;        printf("\t\tdd\t0%8xh, 0%8xh\n", CASTFIX(m), CASTFIX(b));
;
;        y = yNext;
;        x = xNext;
;    }
;}
invSqrtTab dd 0bfb47e00h, 04007a1fah dd 0bfb37000h, 040075e36h dd 0bfb26600h, 040071b31h dd 0bfb16000h, 04006d8ech dd 0bfb05800h, 0400695e4h dd 0bfaf5800h, 0400654a4h dd 0bfae5600h, 0400612a2h dd 0bfad5800h, 04005d165h dd 0bfac5e00h, 0400590f1h dd 0bfab6400h, 04005503eh dd 0bfaa6e00h, 040051058h dd 0bfa97800h, 04004d033h dd 0bfa88800h, 040049163h dd 0bfa79600h, 0400451d0h dd 0bfa6aa00h, 040041396h dd 0bfa5be00h, 04003d522h dd 0bfa4d400h, 0400396fah dd 0bfa3ee00h, 0400359a8h dd 0bfa30800h, 040031c1dh dd 0bfa22400h, 04002dee2h dd 0bfa14400h, 04002a282h dd 0bfa06600h, 040026674h dd 0bf9f8800h, 040022a30h dd 0bf9eae00h, 04001eecah dd 0bf9dd400h, 04001b32eh dd 0bf9cfc00h, 0400177e8h dd 0bf9c2800h, 040013d86h dd 0bf9b5400h, 0400102efh dd 0bf9a8400h, 04000c93fh dd 0bf99b400h, 040008f5bh dd 0bf98e600h, 0400055d2h dd 0bf981800h, 040001c16h dd 0bf975000h, 03fffc7abh dd 0bf968600h, 03fff55a6h dd 0bf95c000h, 03ffee580h dd 0bf94fc00h, 03ffe761ah dd 0bf943800h, 03ffe0652h dd 0bf937400h, 03ffd9628h dd 0bf92b600h, 03ffd290eh dd 0bf91f800h, 03ffcbb95h dd 0bf913a00h, 03ffc4dbdh dd 0bf907e00h, 03ffbe0afh dd 0bf8fc600h, 03ffb7597h dd 0bf8f0c00h, 03ffb08f8h dd 0bf8e5800h, 03ffa9f80h dd 0bf8da000h, 03ffa3354h dd 0bf8cee00h, 03ff9ca56h dd 0bf8c3c00h, 03ff960ffh dd 0bf8b8a00h, 03ff8f74fh dd 0bf8adc00h, 03ff88fa8h dd 0bf8a2e00h, 03ff827aah dd 0bf898000h, 03ff7bf55h dd 0bf88d600h, 03ff75911h dd 0bf882e00h, 03ff6f3adh dd 0bf878400h, 03ff68cbfh dd 0bf86de00h, 03ff627eah dd 0bf863600h, 03ff5c18ah dd 0bf859400h, 03ff55e81h dd 0bf84f000h, 03ff4f9edh dd 0bf845000h, 03ff4977dh dd 0bf83ae00h, 03ff43381h dd 0bf831000h, 03ff3d1aeh dd 0bf827200h, 03ff36f8ch dd 0bf81d400h, 03ff30d1bh dd 0bf813a00h, 03ff2acdbh dd 0bf809e00h, 03ff24b0dh dd 0bf800600h, 03ff1eb75h dd 0bf7edc00h, 03ff18b91h dd 0bf7db000h, 03ff12ca5h dd 0bf7c8400h, 03ff0cd6eh dd 0bf7b5c00h, 03ff06f32h dd 0bf7a3400h, 03ff010ach dd 0bf791000h, 03fefb324h dd 0bf77f000h, 03fef569ch dd 0bf76d000h, 03feef9cch dd 0bf75b000h, 03fee9cb4h dd 
0bf749400h, 03fee40a0h dd 0bf737c00h, 03fede592h dd 0bf726800h, 03fed8b8ch dd 0bf714c00h, 03fed2ea3h dd 0bf704000h, 03fecd6b3h dd 0bf6f2800h, 03fec7a8dh dd 0bf6e1c00h, 03fec2217h dd 0bf6d1000h, 03febc95eh dd 0bf6c0400h, 03feb7062h dd 0bf6afc00h, 03feb1878h dd 0bf69f400h, 03feac04ch dd 0bf68ec00h, 03fea67deh dd 0bf67ec00h, 03fea11deh dd 0bf66e800h, 03fe9ba45h dd 0bf65e800h, 03fe963c5h dd 0bf64ec00h, 03fe90e60h dd 0bf63f000h, 03fe8b8bch dd 0bf62f400h, 03fe862d9h dd 0bf620000h, 03fe80f73h dd 0bf610400h, 03fe7b912h dd 0bf601000h, 03fe76532h dd 0bf5f2000h, 03fe71276h dd 0bf5e2c00h, 03fe6be1ch dd 0bf5d3c00h, 03fe66ae8h dd 0bf5c5000h, 03fe618dch dd 0bf5b6000h, 03fe5c530h dd 0bf5a7800h, 03fe57414h dd 0bf598c00h, 03fe52157h dd 0bf58a800h, 03fe4d12fh dd 0bf57c000h, 03fe47f65h dd 0bf56dc00h, 03fe42ecbh dd 0bf55f800h, 03fe3ddf8h dd 0bf551800h, 03fe38e58h dd 0bf543800h, 03fe33e80h dd 0bf535c00h, 03fe2efdeh dd 0bf527c00h, 03fe29f96h dd 0bf51a000h, 03fe25086h dd 0bf50c800h, 03fe202b0h dd 0bf4ff000h, 03fe1b4a4h dd 0bf4f1c00h, 03fe167d5h dd 0bf4e4400h, 03fe1195dh dd 0bf4d7000h, 03fe0cc24h dd 0bf4c9c00h, 03fe07eb6h dd 0bf4bcc00h, 03fe0328ah dd 0bf4afc00h, 03fdfe62ah dd 0bf4a3000h, 03fdf9b0fh dd 0bf496000h, 03fdf4e47h dd 0bf489800h, 03fdf0441h dd 0bf47c800h, 03fdeb711h dd 0bf470400h, 03fde6e24h dd 0bf463c00h, 03fde2388h dd 0bf457400h, 03fddd8bah dd 0bf44b000h, 03fdd8f3ah dd 0bf43ec00h, 03fdd4589h dd 0bf432800h, 03fdcfba7h dd 0bf426800h, 03fdcb317h dd 0bf41a800h, 03fdc6a57h dd 0bf40e800h, 03fdc2167h dd 0bf402c00h, 03fdbd9cdh dd 0bf3f6c00h, 03fdb907dh dd 0bf3eb400h, 03fdb4a0dh dd 0bf3dfc00h, 03fdb036fh dd 0bf3d4000h, 03fdabb19h dd 0bf3c8800h, 03fda741fh dd 0bf3bd400h, 03fda2e83h dd 0bf3b2000h, 03fd9e8bah dd 0bf3a6800h, 03fd9a136h dd 0bf39b400h, 03fd95b13h dd 0bf390800h, 03fd917e3h dd 0bf385000h, 03fd8cfd5h dd 0bf37a400h, 03fd88c4fh dd 0bf36f800h, 03fd8489eh dd 0bf364400h, 03fd8019ah dd 0bf359c00h, 03fd7bf28h dd 0bf34f000h, 03fd77af6h dd 0bf344400h, 03fd73699h dd 0bf339c00h, 03fd6f3a9h 
dd 0bf32f400h, 03fd6b08fh dd 0bf324c00h, 03fd66d4bh dd 0bf31a800h, 03fd62b78h dd 0bf310000h, 03fd5e7e0h dd 0bf305c00h, 03fd5a5bbh dd 0bf2fb800h, 03fd5636dh dd 0bf2f1800h, 03fd52295h dd 0bf2e7400h, 03fd4dff5h dd 0bf2dd800h, 03fd4a06eh dd 0bf2d3400h, 03fd45d7ch dd 0bf2c9800h, 03fd41da7h dd 0bf2bf800h, 03fd3dc07h dd 0bf2b6000h, 03fd39d89h dd 0bf2ac000h, 03fd35b99h dd 0bf2a2800h, 03fd31ccfh dd 0bf298c00h, 03fd2dc37h dd 0bf28f400h, 03fd29d21h dd 0bf285c00h, 03fd25de5h dd 0bf27c400h, 03fd21e83h dd 0bf273000h, 03fd1e0a7h dd 0bf269800h, 03fd1a0f9h dd 0bf260400h, 03fd162d3h dd 0bf257000h, 03fd12488h dd 0bf24e000h, 03fd0e7c8h dd 0bf244c00h, 03fd0a933h dd 0bf23bc00h, 03fd06c2bh dd 0bf232800h, 03fd02d4ch dd 0bf229c00h, 03fcff1b0h dd 0bf220c00h, 03fcfb43ch dd 0bf218000h, 03fcf785ah dd 0bf20f400h, 03fcf3c55h dd 0bf206400h, 03fcefe75h dd 0bf1fdc00h, 03fcec3e3h dd 0bf1f4c00h, 03fce85bbh dd 0bf1ec800h, 03fce4ca0h dd 0bf1e3c00h, 03fce0fech dd 0bf1db400h, 03fcdd4d2h dd 0bf1d2c00h, 03fcd9996h dd 0bf1ca800h, 03fcd5ff7h dd 0bf1c2000h, 03fcd2477h dd 0bf1b9800h, 03fcce8d5h dd 0bf1b1800h, 03fccb095h dd 0bf1a9400h, 03fcc7672h dd 0bf1a0c00h, 03fcc3a6ah dd 0bf199000h, 03fcc038fh dd 0bf190800h, 03fcbc743h dd 0bf188c00h, 03fcb902ah dd 0bf180800h, 03fcb5562h dd 0bf178c00h, 03fcb1e0bh dd 0bf170c00h, 03fcae4cbh dd 0bf168c00h, 03fcaab6bh dd 0bf161000h, 03fca73b7h dd 0bf159400h, 03fca3be4h dd 0bf151800h, 03fca03f2h dd 0bf149800h, 03fc9ca12h dd 0bf142400h, 03fc99582h dd 0bf13a400h, 03fc95b62h dd 0bf133000h, 03fc92698h dd 0bf12b400h, 03fc8ee0bh dd 0bf123c00h, 03fc8b733h dd 0bf11c400h, 03fc8803dh dd 0bf114c00h, 03fc84929h dd 0bf10d800h, 03fc813ceh dd 0bf106400h, 03fc7de56h dd 0bf0fec00h, 03fc7a6e8h dd 0bf0f7800h, 03fc77136h dd 0bf0f0400h, 03fc73b67h dd 0bf0e9000h, 03fc7057bh dd 0bf0e2000h, 03fc6d14fh dd 0bf0dac00h, 03fc69b29h dd 0bf0d3c00h, 03fc666c5h dd 0bf0ccc00h, 03fc63245h dd 0bf0c5800h, 03fc5fbc8h dd 0bf0bec00h, 03fc5c8f2h dd 0bf0b7c00h, 03fc5941eh dd 0bf0b0c00h, 03fc55f2eh dd 0bf0aa000h, 
03fc52c07h dd 0bf0a3000h, 03fc4f6dfh dd 0bf09c400h, 03fc4c382h dd 0bf095c00h, 03fc491f2h dd 0bf08ec00h, 03fc45c76h dd 0bf088000h, 03fc428c8h dd 0bf081800h, 03fc3f6eah dd 0bf07b000h, 03fc3c4f2h dd 0bf074000h, 03fc38f06h dd 0bf06dc00h, 03fc35ec8h dd 0bf067400h, 03fc32c82h dd 0bf060800h, 03fc2f832h dd 0bf05a400h, 03fc2c7a9h dd 0bf053c00h, 03fc29515h dd 0bf04d800h, 03fc2645ah dd 0bf047000h, 03fc23192h dd 0bf040800h, 03fc1feb0h dd 0bf03a800h, 03fc1cfa0h dd 0bf034000h, 03fc19c8ah dd 0bf02dc00h, 03fc16b52h dd 0bf027c00h, 03fc13bfah dd 0bf021800h, 03fc10a90h dd 0bf01b400h, 03fc0d90dh dd 0bf015000h, 03fc0a771h dd 0bf00f400h, 03fc079b6h dd 0bf009000h, 03fc047e8h dd 0bf003000h, 03fc01800h dd 0beff4000h, 03fbfd000h dd 0befdc400h, 03fbf70a1h dd 0befc4c00h, 03fbf11e5h dd 0befad800h, 03fbeb3ceh dd 0bef96400h, 03fbe555ah dd 0bef7f800h, 03fbdf893h dd 0bef68e00h, 03fbd9bf4h dd 0bef52600h, 03fbd3f7eh dd 0bef3c200h, 03fbce3b6h dd 0bef26200h, 03fbc889eh dd 0bef10600h, 03fbc2e38h dd 0beefac00h, 03fbbd400h dd 0beee5400h, 03fbb79f8h dd 0beed0200h, 03fbb212eh dd 0beebb200h, 03fbac896h dd 0beea6600h, 03fba70b9h dd 0bee91a00h, 03fba1889h dd 0bee7d400h, 03fb9c1a0h dd 0bee69000h, 03fb96aeeh dd 0bee54e00h, 03fb91474h dd 0bee41200h, 03fb8bf48h dd 0bee2d400h, 03fb86942h dd 0bee19e00h, 03fb8151ah dd 0bee06600h, 03fb7c018h dd 0bedf3400h, 03fb76c6ch dd 0bede0400h, 03fb71900h dd 0bedcd600h, 03fb6c5d4h dd 0bedbac00h, 03fb67379h dd 0beda8400h, 03fb62161h dd 0bed95e00h, 03fb5cf8eh dd 0bed83a00h, 03fb57e00h dd 0bed71a00h, 03fb52d48h dd 0bed5fc00h, 03fb4dcd8h dd 0bed4e000h, 03fb48cb0h dd 0bed3c800h, 03fb43d64h dd 0bed2b000h, 03fb3edd2h dd 0bed19c00h, 03fb39f1eh dd 0bed08a00h, 03fb350b8h dd 0becf7c00h, 03fb30333h dd 0bece6c00h, 03fb2b4d7h dd 0becd6200h, 03fb267f3h dd 0becc5a00h, 03fb21b61h dd 0becb5200h, 03fb1ce8dh dd 0beca4e00h, 03fb182a2h dd 0bec94c00h, 03fb1370ch dd 0bec84a00h, 03fb0eb36h dd 0bec74e00h, 03fb0a0e4h dd 0bec65200h, 03fb05652h dd 0bec55800h, 03fb00c1ah dd 0bec45e00h, 03fafc1a4h dd 
0bec36a00h, 03faf78bah dd 0bec27600h, 03faf2f93h dd 0bec18400h, 03faee6c9h dd 0bec09600h, 03fae9ef8h dd 0bebfa600h, 03fae5650h dd 0bebeba00h, 03fae0ea2h dd 0bebdd000h, 03fadc756h dd 0bebce800h, 03fad806ch dd 0bebc0000h, 03fad3948h dd 0bebb1e00h, 03facf3c3h dd 0beba3a00h, 03facad67h dd 0beb95800h, 03fac6770h dd 0beb87a00h, 03fac2280h dd 0beb79c00h, 03fabdd57h dd 0beb6c000h, 03fab9897h dd 0beb5e600h, 03fab5440h dd 0beb50e00h, 03fab1054h dd 0beb43600h, 03faacc32h dd 0beb36200h, 03faa891eh dd 0beb28e00h, 03faa45d6h dd 0beb1bc00h, 03faa02fah dd 0beb0ec00h, 03fa9c08eh dd 0beb01e00h, 03fa97e92h dd 0beaf5000h, 03fa93c63h dd 0beae8600h, 03fa8fb4ah dd 0beadba00h, 03fa8b959h dd 0beacf400h, 03fa87927h dd 0beac2a00h, 03fa83776h dd 0beab6600h, 03fa7f788h dd 0beaaa200h, 03fa7b76ah dd 0bea9e000h, 03fa777c2h dd 0bea91e00h, 03fa737e9h dd 0bea85e00h, 03fa6f889h dd 0bea7a000h, 03fa6b9a2h dd 0bea6e400h, 03fa67b36h dd 0bea62800h, 03fa63c9ch dd 0bea56e00h, 03fa5fe7ch dd 0bea4b400h, 03fa5c02fh dd 0bea3fe00h, 03fa5830bh dd 0bea34600h, 03fa5450dh dd 0bea29400h, 03fa508e8h dd 0bea1de00h, 03fa4cb3ch dd 0bea12c00h, 03fa48ebeh dd 0bea07c00h, 03fa452c2h dd 0be9fcc00h, 03fa4169ah dd 0be9f1e00h, 03fa3daf5h dd 0be9e7000h, 03fa39f25h dd 0be9dc400h, 03fa363dah dd 0be9d1a00h, 03fa32915h dd 0be9c7000h, 03fa2ee26h dd 0be9bc800h, 03fa2b3beh dd 0be9b2000h, 03fa2792ch dd 0be9a7a00h, 03fa23f22h dd 0be99d600h, 03fa205a4h dd 0be993200h, 03fa1cbfch dd 0be989000h, 03fa192dfh dd 0be97ec00h, 03fa158e5h dd 0be974e00h, 03fa120e2h dd 0be96ae00h, 03fa0e802h dd 0be961000h, 03fa0afb1h dd 0be957200h, 03fa07738h dd 0be94d800h, 03fa04006h dd 0be943a00h, 03fa0073eh dd 0be93a200h, 03f9fd078h dd 0be930a00h, 03f9f998ch dd 0be927000h, 03f9f61c1h dd 0be91da00h, 03f9f2b43h dd 0be914400h, 03f9ef4a0h dd 0be90b000h, 03f9ebe92h dd 0be901a00h, 03f9e87a3h dd 0be8f8a00h, 03f9e52c3h dd 0be8ef600h, 03f9e1c46h dd 0be8e6600h, 03f9de71eh dd 0be8dd600h, 03f9db1d2h dd 0be8d4600h, 03f9d7c62h dd 0be8cb800h, 03f9d478ch dd 0be8c2c00h, 03f9d1352h 
dd 0be8b9e00h, 03f9cde36h dd 0be8b1400h, 03f9caa76h dd 0be8a8a00h, 03f9c7694h dd 0be8a0000h, 03f9c428eh dd 0be897600h, 03f9c0e67h dd 0be88f000h, 03f9bdba1h dd 0be886800h, 03f9ba7f7h dd 0be87e200h, 03f9b74eeh dd 0be875e00h, 03f9b4287h dd 0be86d800h, 03f9b0f3bh dd 0be865600h, 03f9add56h dd 0be85d200h, 03f9aaa8ch dd 0be855200h, 03f9a792ch dd 0be84d000h, 03f9a46e6h dd 0be844e00h, 03f9a1480h dd 0be83d000h, 03f99e387h dd 0be835200h, 03f99b26eh dd 0be82d400h, 03f998136h dd 0be825600h, 03f994fdfh dd 0be81da00h, 03f991f31h dd 0be816000h, 03f98ef2eh dd 0be80e400h, 03f98be42h dd 0be806a00h, 03f988e01h dd 0be7fe000h, 03f985da2h dd 0be7ef400h, 03f982ebch dd 0be7e0000h, 03f97fe20h dd 0be7d1400h, 03f97cefeh dd 0be7c2400h, 03f979ef2h dd 0be7b3c00h, 03f977063h dd 0be7a5400h, 03f9741b7h dd 0be796800h, 03f971220h dd 0be788400h, 03f96e408h dd 0be779c00h, 03f96b506h dd 0be76b800h, 03f9686b6h dd 0be75d800h, 03f96591ah dd 0be74f400h, 03f962a90h dd 0be741400h, 03f95fcbch dd 0be733400h, 03f95cecch dd 0be725800h, 03f95a193h dd 0be717c00h, 03f95743eh dd 0be70a400h, 03f9547a1h dd 0be6fc800h, 03f951a15h dd 0be6ef000h, 03f94ed42h dd 0be6e1800h, 03f94c054h dd 0be6d4000h, 03f94934bh dd 0be6c7000h, 03f9467d3h dd 0be6b9c00h, 03f943b6ah dd 0be6ac800h, 03f940ee8h dd 0be69f800h, 03f93e322h dd 0be692800h, 03f93b742h dd 0be685c00h, 03f938c20h dd 0be678c00h, 03f93600ch dd 0be66c000h, 03f9334b8h dd 0be65f800h, 03f930a24h dd 0be652c00h, 03f92de9ch dd 0be646400h, 03f92b3d6h dd 0be639c00h, 03f9288f7h dd 0be62d400h, 03f925dffh dd 0be621000h, 03f9233cah dd 0be615000h, 03f920a5ah dd 0be608800h, 03f91df18h dd 0be5fc800h, 03f91b578h dd 0be5f0800h, 03f918bc0h dd 0be5e4800h, 03f9161f0h dd 0be5d8800h, 03f913808h dd 0be5ccc00h, 03f910ee8h dd 0be5c0c00h, 03f90e4d0h dd 0be5b5400h, 03f90bc62h dd 0be5a9800h, 03f9092fbh dd 0be59e000h, 03f906a5fh dd 0be592800h, 03f9041ach dd 0be587000h, 03f9018e2h dd 0be57b800h, 03f8ff001h dd 0be570400h, 03f8fc7edh dd 0be565000h, 03f8f9fc2h dd 0be559c00h, 03f8f7782h dd 0be54e800h, 
03f8f4f2ah dd 0be543800h, 03f8f27a2h dd 0be538800h, 03f8f0004h dd 0be52d800h, 03f8ed850h dd 0be522c00h, 03f8eb16eh dd 0be517c00h, 03f8e898eh dd 0be50d000h, 03f8e6280h dd 0be502400h, 03f8e3b5dh dd 0be4f7800h, 03f8e1424h dd 0be4ecc00h, 03f8decd6h dd 0be4e2800h, 03f8dc748h dd 0be4d7c00h, 03f8d9fcfh dd 0be4cd800h, 03f8d7a18h dd 0be4c3000h, 03f8d5360h dd 0be4b8800h, 03f8d2c92h dd 0be4ae800h, 03f8d078ah dd 0be4a4000h, 03f8ce094h dd 0be49a000h, 03f8cbb64h dd 0be48fc00h, 03f8c9531h dd 0be485c00h, 03f8c6fd9h dd 0be47bc00h, 03f8c4a6dh dd 0be471c00h, 03f8c24edh dd 0be467c00h, 03f8bff59h dd 0be45e000h, 03f8bdaa2h dd 0be454000h, 03f8bb4e6h dd 0be44a800h, 03f8b90fah dd 0be440800h, 03f8b6b16h dd 0be437000h, 03f8b4704h dd 0be42d800h, 03f8b22dfh dd 0be423c00h, 03f8afdb3h dd 0be41a400h, 03f8ad968h dd 0be410c00h, 03f8ab50ah dd 0be407800h, 03f8a918eh dd 0be3fe000h, 03f8a6d0ah dd 0be3f4c00h, 03f8a496ah dd 0be3eb400h, 03f8a24c0h dd 0be3e2400h, 03f8a01f2h dd 0be3d9000h, 03f89de1ah dd 0be3d0000h, 03f89bb28h dd 0be3c6c00h, 03f89972bh dd 0be3bd800h, 03f89731ch dd 0be3b4c00h, 03f8950eeh dd 0be3abc00h, 03f892db4h dd 0be3a3000h, 03f890b62h dd 0be399c00h, 03f88e709h dd 0be391400h, 03f88c591h dd 0be388400h, 03f88a20fh dd 0be37fc00h, 03f888075h dd 0be377000h, 03f885dcch dd 0be36e400h, 03f883b12h dd 0be365800h, 03f881847h dd 0be35d400h, 03f87f768h dd 0be354800h, 03f87d47ah
.code
;; Assembly-time constants shared by the table-driven 1/sqrt code below.
;; Bit positions refer to an IEEE single: bit 31 sign, bits 30..23
;; exponent, bits 22..0 stored mantissa.

SQRT_TAB_LOG2           equ 9           ;; log2 of the number of lookup-table
                                        ;; entries (2^9 = 512)

MANTISSA_SIZE           equ 24          ;; number of mantissa bits in fp value,
                                        ;; counting the hidden leading one

                                        ;; number of represented mantissa bits
                                        ;; (one less than total due to hidden
                                        ;; leading one).
MANTISSA_BITS           equ (MANTISSA_SIZE - 1)

ELEMENT_SIZE_LOG2       equ 3           ;; log2 of each table entry (8 bytes)

                                        ;; shift required to get bits in value
                                        ;; in the correct place to use as an
                                        ;; index for the table lookup; the
                                        ;; result is already scaled by the
                                        ;; entry size (23 - 8 - 3 = 12)
EXPONENT_SHIFT          equ (MANTISSA_BITS - (SQRT_TAB_LOG2 - 1)\
                                - ELEMENT_SIZE_LOG2)

                                        ;; mask value for clamping to [.5..2):
                                        ;; keeps the stored mantissa plus the
                                        ;; low exponent bit (low 24 bits)
CLAMP_MASK              equ ((1 SHL (MANTISSA_BITS+1)) - 1)

                                        ;; mask for the stored mantissa bits
                                        ;; (low 23 bits)
MANTISSA_MASK           equ ((1 SHL MANTISSA_BITS) - 1)

                                        ;; mask for sign/exponent bits
                                        ;; (everything above the mantissa)
EXPONENT_MASK           equ (-1 AND (NOT MANTISSA_MASK))

                                        ;; mask for table lookup: keeps a
                                        ;; 9-bit entry number in bits 3..11,
                                        ;; i.e. a byte offset that is a
                                        ;; multiple of 8 (0ff8h)
TABLE_MASK              equ ((1 SHL (SQRT_TAB_LOG2 + ELEMENT_SIZE_LOG2)) - 1) \
                                AND (NOT((1 SHL ELEMENT_SIZE_LOG2) - 1))

                                        ;; bias used to represent clamped
                                        ;; value: the bit pattern of 0.5, OR'd
                                        ;; over the bits kept by CLAMP_MASK
EXPONENT_BIAS_EVEN      equ 3f000000h

                                        ;; bias value used for final exponent
                                        ;; computation: (127+63) placed one bit
                                        ;; above the exponent field with all
                                        ;; lower bits set (0beffffffh).  The
                                        ;; code subtracts the input from this
                                        ;; and shifts right by one.
LARGE_EXPONENT_BIAS     equ (((127 + 127/2) SHL (MANTISSA_BITS+1)) OR CLAMP_MASK)
;; Frame-relative operands for @__FastInvSqrt@4.  They are only meaningful
;; after the 'push ebp / mov ebp, esp' prologue:
;;   x   - the dword at 8[ebp], i.e. the first stack slot above the
;;         return address (the caller-supplied value)
;;   num - scratch dword inside the 8 bytes reserved by 'sub esp, 8'

x       equ DWORD PTR 8[ebp]
num     equ DWORD PTR -8[ebp]

;----------------------------------------------------------------------
; @__FastInvSqrt@4
;
; Table-driven approximation of 1/sqrt(x) for an IEEE single-precision
; value, using the (m, b) pairs stored in invSqrtTab.
;
; In:     x      one dword on the stack (released by 'ret 4')
; Out:    ST(0)  the approximation, left on the x87 stack
; Uses:   eax, flags.  ecx is saved and restored.
;
; The routine contains no branches: the bit pattern of x is used as-is,
; so there is no special handling for zero, negative, denormal, infinite
; or NaN inputs.
;
; NOTE(review): presumably declared in C as
;     float __fastcall __FastInvSqrt(float x);
; confirm against the C-side prototype.
;
; NOTE(review): integer and x87 instructions are interleaved, presumably
; for instruction scheduling -- keep the order unless re-measured.
;----------------------------------------------------------------------

@__FastInvSqrt@4 PROC NEAR

        push    ebp
        mov     ebp, esp
        sub     esp, 8                  ;; room for num
        push    ecx                     ;; preserved for the caller

        mov     eax, x
        mov     ecx, eax
        shr     ecx, EXPONENT_SHIFT     ;; ecx is table index (low exponent
                                        ;; bit + 8 frac. bits), pre-scaled
        and     eax, CLAMP_MASK         ;; clamp number to [0.5, 2.0): keep
                                        ;; mantissa and low exponent bit
        and     ecx, TABLE_MASK         ;; (8 bytes)/(table entry)
        or      eax, EXPONENT_BIAS_EVEN ;; re-adjust exponent for clamped number
        mov     num, eax
        fld     num                     ;; ST(0) = clamped value
        fmul    [invSqrtTab+ecx]        ;; find mx
        mov     eax, LARGE_EXPONENT_BIAS;; (127+63), one bit above the
                                        ;; exponent field, low bits all set
        sub     eax, x                  ;; bias - x
        fadd    [invSqrtTab+ecx+4]      ;; get mx + b
        shr     eax, 1                  ;; divide exponent by 2
        and     eax, EXPONENT_MASK      ;; mask exponent: eax is now a pure
                                        ;; power of two
        mov     num, eax
        fmul    num                     ;; now adjust for exponent

        pop     ecx
        mov     esp, ebp
        pop     ebp
        ret     4                       ;; pop the 4-byte argument

@__FastInvSqrt@4 endp
;; Frame-relative scratch slots for @__glNormalize@8, both inside the
;; 8 bytes reserved by 'sub esp, 8':
;;   len - sum of squares of the source components (squared length)
;;   num - temporary used to move bit patterns between eax and the FPU

len     equ DWORD PTR -4[ebp]
num     equ DWORD PTR -8[ebp]

;----------------------------------------------------------------------
; @__glNormalize@8
;
; Scales a 3-component single-precision vector by an approximation of
; 1/sqrt(x*x + y*y + z*z), using the same invSqrtTab lookup sequence as
; @__FastInvSqrt@4 (inlined here).
;
; In:     ecx    address of the 3-dword destination
;         edx    address of the 3-dword source
; Out:    [ecx], [ecx+4], [ecx+8] written (except in the case below)
; Uses:   eax, flags.  edi is saved and restored.  Every value pushed
;         on the x87 stack is popped again before returning.
;         No stack arguments ('ret 0').
;
; Special cases, decided on the bit pattern of the sum of squares:
;   all-zero bits       - three zero dwords are stored to the destination
;   equal __FLOAT_ONE   - source is copied unchanged; nothing is written
;                         when ecx == edx
;
; In the general path all three source components are loaded before
; the first store, so ecx == edx (in-place) is handled.
;
; NOTE(review): __FLOAT_ONE is not defined in the visible text
; (presumably from gli386.inc, the bit pattern of 1.0f) -- confirm.
; NOTE(review): presumably declared in C as
;     void __fastcall __glNormalize(float *dst, const float *src);
; confirm against the C-side prototype.
;----------------------------------------------------------------------

@__glNormalize@8 PROC NEAR

        push    ebp
        mov     ebp, esp
        sub     esp, 8                  ;; room for len and num

        ;; Sum of squares.  Trailing comments show the x87 stack, top
        ;; first; x/y/z here stand for the squared components.

        fld     DWORD PTR [edx]
        fmul    DWORD PTR [edx]         ;; x
        fld     DWORD PTR [edx+4]
        fmul    DWORD PTR [edx+4]       ;; y x
        fld     DWORD PTR [edx+8]
        fmul    DWORD PTR [edx+8]       ;; z y x
        fxch    ST(2)                   ;; x y z
        faddp   ST(1), ST               ;; xy z
        faddp   ST(1), ST               ;; xyz
        fstp    len                     ;; x87 stack is empty again
        mov     eax, len                ;; inspect the result as raw bits
        test    eax, eax
        jne     notZeroLen

        ;; Sum of squares is zero: store zeros (eax == 0) and return.

        mov     [ecx], eax
        mov     [ecx+4], eax
        mov     [ecx+8], eax
        mov     esp, ebp
        pop     ebp
        ret     0

notZeroLen:

        ;; Sum of squares equals __FLOAT_ONE: no scaling, copy the
        ;; source as raw dwords unless source and destination coincide.

        cmp     eax, __FLOAT_ONE
        jne     notOneLen
        cmp     ecx, edx
        je      normExit
        mov     eax, [edx]
        mov     [ecx], eax
        mov     eax, [edx+4]
        mov     [ecx+4], eax
        mov     eax, [edx+8]
        mov     [ecx+8], eax
        mov     esp, ebp
        pop     ebp
        ret     0

notOneLen:

        ;; eax already has length (the bit pattern of len, the sum of
        ;; squares)

        push    edi                     ;; preserved for the caller
        mov     edi, eax
        shr     edi, EXPONENT_SHIFT     ;; edi is table index (low exponent
                                        ;; bit + 8 frac. bits), pre-scaled
        and     eax, CLAMP_MASK         ;; clamp number to [0.5, 2.0): keep
                                        ;; mantissa and low exponent bit
        and     edi, TABLE_MASK         ;; (8 bytes)/(table entry)
        or      eax, EXPONENT_BIAS_EVEN ;; re-adjust exponent for clamped number
        mov     num, eax
        fld     num                     ;; ST(0) = clamped value
        fmul    [invSqrtTab+edi]        ;; find mx
        mov     eax, LARGE_EXPONENT_BIAS;; (127+63), one bit above the
                                        ;; exponent field, low bits all set
        sub     eax, len                ;; bias - len
        fadd    [invSqrtTab+edi+4]      ;; get mx + b
        shr     eax, 1                  ;; divide exponent by 2
        and     eax, EXPONENT_MASK      ;; mask exponent: eax is now a pure
                                        ;; power of two
        mov     num, eax
        fmul    num                     ;; now adjust for exponent

        ;; ST(0) = inv, the approximate 1/sqrt(len).  Scale each
        ;; component; x/y/z below are the scaled components.

        fld     DWORD PTR [edx]         ;; 1/sqrt(len) on stack
        fmul    ST, ST(1)               ;; x inv
        fld     DWORD PTR [edx+4]
        fmul    ST, ST(2)               ;; y x inv
        fld     DWORD PTR [edx+8]
        fmul    ST, ST(3)               ;; z y x inv
        fxch    ST(2)                   ;; x y z inv
        fstp    DWORD PTR [ecx]
        fstp    DWORD PTR [ecx+4]
        fstp    DWORD PTR [ecx+8]
        fstp    ST(0)                   ;; pop inv
        pop     edi
        mov     esp, ebp
        pop     ebp
        ret     0

normExit:

        ;; Already unit length and in place: nothing to store.

        mov     esp, ebp
        pop     ebp
        ret     0

@__glNormalize@8 ENDP
END
|