|
|
;/* inffas32.asm is a hand tuned assembler version of inffast.c -- fast decoding
; *
; * inffas32.asm is derived from inffas86.c, with translation of assembly code
; *
; * Copyright (C) 1995-2003 Mark Adler
; * For conditions of distribution and use, see copyright notice in zlib.h
; *
; * Copyright (C) 2003 Chris Anderson <[email protected]>
; * Please use the copyright conditions above.
; *
; * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
; * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
; * the moment. I have successfully compiled and tested this code with gcc2.96,
; * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
; * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
; * enabled. I will attempt to merge the MMX code into this version. Newer
; * versions of this and inffast.S can be found at
; * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
; *
; * 2005 : modification by Gilles Vollant
; */
; For Visual C++ 4.x and higher and ML 6.x and higher
;   ml.exe is in directory \MASM611C of Win95 DDK
;   ml.exe is also distributed in http://www.masm32.com/masmdl.htm
;    and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/
;
;
;   compile with command line option
;   ml /coff /Zi /c /Flinffas32.lst inffas32.asm
;   if you define NO_GUNZIP (see inflate.h), compile with
;   ml /coff /Zi /c /Flinffas32.lst /DNO_GUNZIP inffas32.asm
; zlib1222sup is 0 for zlib 1.2.2.1 and lower
; zlib1222sup is 8 for zlib 1.2.2.2 and later (with the addition of dmax and
;                  head in inflate_state in inflate.h)
; It is added to every inflate_state member offset defined further down.
zlib1222sup             equ     8

; Values stored into state->mode when the decode loop stops on end-of-block
; (INFLATE_MODE_TYPE) or on a corrupt stream (INFLATE_MODE_BAD).
; The numbers differ depending on whether gzip support is compiled in:
; GUNZIP defined, or NO_GUNZIP not defined, selects 11/26; otherwise 3/17.
IFDEF GUNZIP
  INFLATE_MODE_TYPE     equ     11
  INFLATE_MODE_BAD      equ     26
ELSE
  IFNDEF NO_GUNZIP
    INFLATE_MODE_TYPE   equ     11
    INFLATE_MODE_BAD    equ     26
  ELSE
    INFLATE_MODE_TYPE   equ     3
    INFLATE_MODE_BAD    equ     17
  ENDIF
ENDIF
; 75 "inffast.S"
;FILE "inffast.S"

;;;GLOBAL _inflate_fast

;;;SECTION .text

; Target: 32-bit x86, flat memory model, MMX instructions enabled.
        .586p
        .mmx

        name    inflate_fast_x86
        .MODEL  FLAT

_DATA                   segment
; Run-time dispatch flag for the decode loop:
;   1 (initial) = CPU not probed yet
;   2           = use the MMX loop
;   3           = use the integer loop
; NOTE(review): this is process-global and written on first call without
; synchronisation; concurrent first calls store the same value.
inflate_fast_use_mmx:
        dd      1
_DATA                   ends            ; close the data segment before opening _TEXT

_TEXT                   segment
PUBLIC _inflate_fast
ALIGN 4
; Public entry point: hops over the constant data that follows.
_inflate_fast:
        jmp     inflate_fast_entry

ALIGN 4
; Identification string embedded in the object code (not referenced by label).
        db      'Fast decoding Code from Chris Anderson', 0

; NUL-terminated error texts; their addresses are stored at offset 24 of the
; stream structure by the error exits.
ALIGN 4
invalid_literal_length_code_msg:
        db      'invalid literal/length code', 0

ALIGN 4
invalid_distance_code_msg:
        db      'invalid distance code', 0

ALIGN 4
invalid_distance_too_far_msg:
        db      'invalid distance too far back', 0

; inflate_fast_mask[n] = (1 << n) - 1 for n = 0..32 (33 dwords).
; Indexed by a bit count in the MMX loop to isolate the low n bits.
ALIGN 4
inflate_fast_mask:
        dd      000000000h, 000000001h, 000000003h, 000000007h
        dd      00000000Fh, 00000001Fh, 00000003Fh, 00000007Fh
        dd      0000000FFh, 0000001FFh, 0000003FFh, 0000007FFh
        dd      000000FFFh, 000001FFFh, 000003FFFh, 000007FFFh
        dd      00000FFFFh, 00001FFFFh, 00003FFFFh, 00007FFFFh
        dd      0000FFFFFh, 0001FFFFFh, 0003FFFFFh, 0007FFFFFh
        dd      000FFFFFFh, 001FFFFFFh, 003FFFFFFh, 007FFFFFFh
        dd      00FFFFFFFh, 01FFFFFFFh, 03FFFFFFFh, 07FFFFFFFh
        dd      0FFFFFFFFh
; Byte offsets of the inflate_state members accessed by this file.
; zlib1222sup shifts them for zlib versions that added fields in front.
mode_state       equ    0                       ;/* state->mode     */
wsize_state      equ    (32+zlib1222sup)        ;/* state->wsize    */
write_state      equ    (36+4+zlib1222sup)      ;/* state->write    */
window_state     equ    (40+4+zlib1222sup)      ;/* state->window   */
hold_state       equ    (44+4+zlib1222sup)      ;/* state->hold     */
bits_state       equ    (48+4+zlib1222sup)      ;/* state->bits     */
lencode_state    equ    (64+4+zlib1222sup)      ;/* state->lencode  */
distcode_state   equ    (68+4+zlib1222sup)      ;/* state->distcode */
lenbits_state    equ    (72+4+zlib1222sup)      ;/* state->lenbits  */
distbits_state   equ    (76+4+zlib1222sup)      ;/* state->distbits */

;;SECTION .text
; 205 "inffast.S"
;GLOBAL inflate_fast_use_mmx

;SECTION .data

; GLOBAL inflate_fast_use_mmx:object
;.size inflate_fast_use_mmx, 4
; 226 "inffast.S"
;SECTION .text
ALIGN 4
;-----------------------------------------------------------------------
; inflate_fast_entry -- body of _inflate_fast (32-bit, arguments on stack,
;                       plain 'ret': the caller removes the arguments)
;
; In:    arg1 = pointer to the stream structure (strm)
;        arg2 = dword (called "start" below)
; Saves: edi, esi, ebp, ebx and EFLAGS (restored at L_done).
;
; After the prologue (4 pushes + pushfd + 64 bytes of locals) the return
; address is at [esp+84], arg1 at [esp+88] and arg2 at [esp+92].
;
; Stream structure fields used (names assume zlib's z_stream -- confirm
; against zlib.h):  +0 next_in, +4 avail_in, +12 next_out, +16 avail_out,
;                   +24 msg, +28 state
;
; Local frame:
;   [esp+0]   lmask   = (1 << state->lenbits)  - 1
;   [esp+4]   dmask   = (1 << state->distbits) - 1
;   [esp+8]   lcode   = state->lencode
;   [esp+12]  dcode   = state->distcode
;   [esp+16]  end     = next_out + avail_out - 257
;   [esp+20]  last    = next_in  + avail_in  - 11
;   [esp+24]  len     (current match length, integer loop only)
;   [esp+28]  buf     12-byte scratch copy of the input tail
;   [esp+40]  beg     = next_out - (start - avail_out)
;   [esp+44]  in      (next_in; also saves esi during copies)
;   [esp+48]  write   = state->write
;   [esp+52]  wsize   = state->wsize
;   [esp+56]  window  = state->window
;   [esp+60]  out     = next_out
;
; Registers on leaving this block: esi = in, ebp = hold, ebx = bits.
;-----------------------------------------------------------------------
inflate_fast_entry:
        push edi
        push esi
        push ebp
        push ebx
        pushfd
        sub esp,64
        cld                             ; string instructions must count upwards

        mov esi, [esp+88]               ; esi = strm
        mov edi, [esi+28]               ; edi = strm->state

        mov edx, [esi+4]                ; avail_in
        mov eax, [esi+0]                ; next_in

        add edx,eax
        sub edx,11                      ; edx = last = next_in + avail_in - 11

        mov [esp+44],eax                ; in
        mov [esp+20],edx                ; last

        mov ebp, [esp+92]               ; start
        mov ecx, [esi+16]               ; avail_out
        mov ebx, [esi+12]               ; next_out

        sub ebp,ecx
        neg ebp
        add ebp,ebx                     ; ebp = beg = out - (start - avail_out)

        sub ecx,257
        add ecx,ebx                     ; ecx = end = out + avail_out - 257

        mov [esp+60],ebx                ; out
        mov [esp+40],ebp                ; beg
        mov [esp+16],ecx                ; end
; 285 "inffast.S"
        mov eax, [edi+lencode_state]
        mov ecx, [edi+distcode_state]

        mov [esp+8],eax                 ; lcode
        mov [esp+12],ecx                ; dcode

        mov eax,1
        mov ecx, [edi+lenbits_state]
        shl eax,cl
        dec eax
        mov [esp+0],eax                 ; lmask

        mov eax,1
        mov ecx, [edi+distbits_state]
        shl eax,cl
        dec eax
        mov [esp+4],eax                 ; dmask

        mov eax, [edi+wsize_state]
        mov ecx, [edi+write_state]
        mov edx, [edi+window_state]

        mov [esp+52],eax                ; wsize
        mov [esp+48],ecx                ; write
        mov [esp+56],edx                ; window

        mov ebp, [edi+hold_state]       ; ebp = bit accumulator
        mov ebx, [edi+bits_state]       ; ebx = number of valid bits in ebp
; 321 "inffast.S"
        mov esi, [esp+44]               ; esi = in
        mov ecx, [esp+20]               ; ecx = last
        cmp ecx,esi
        ja L_align_long                 ; last > in: decode straight from the caller's buffer

        ; last <= in: copy the avail_in remaining bytes into the 12-byte
        ; stack buffer, zero-fill the rest, and decode from there instead.
        add ecx,11
        sub ecx,esi                     ; ecx = last + 11 - in = avail_in
        mov eax,12
        sub eax,ecx                     ; eax = padding byte count
        lea edi, [esp+28]
        rep movsb
        mov ecx,eax
        xor eax,eax
        rep stosb
        lea esi, [esp+28]               ; in = buf
        mov [esp+20],esi                ; last = buf; L_update_next_in tests for this value
        jmp L_is_aligned

; Feed single bytes into hold until the input pointer is 4-byte aligned.
L_align_long:
        test esi,3
        jz L_is_aligned
        xor eax,eax
        mov al, [esi]
        inc esi
        mov ecx,ebx
        add ebx,8
        shl eax,cl                      ; position the new byte above the valid bits
        or ebp,eax
        jmp L_align_long
L_is_aligned:
        mov edi, [esp+60]               ; edi = out (output write pointer)
; 366 "inffast.S"
; Dispatch on inflate_fast_use_mmx: 2 -> MMX loop, above 2 -> integer loop,
; below 2 -> probe the CPU once, store 2 or 3, then dispatch again.
L_check_mmx:
        cmp dword ptr [inflate_fast_use_mmx],2
        je L_init_mmx
        ja L_do_loop

        push eax
        push ebx
        push ecx
        push edx
        pushfd
        mov eax, [esp]                  ; eax = current EFLAGS
        xor dword ptr [esp],0200000h    ; flip bit 21 (ID flag) in the saved copy

        popfd
        pushfd
        pop edx
        xor edx,eax
        jz L_dont_use_mmx               ; bit could not be changed: no CPUID
        xor eax,eax
        cpuid                           ; leaf 0: vendor string in ebx:edx:ecx
        cmp ebx,0756e6547h              ; 'Genu'
        jne L_dont_use_mmx
        cmp ecx,06c65746eh              ; 'ntel'
        jne L_dont_use_mmx
        cmp edx,049656e69h              ; 'ineI'
        jne L_dont_use_mmx
        mov eax,1
        cpuid                           ; leaf 1: signature in eax, features in edx
        shr eax,8
        and eax,15                      ; eax = bits 11..8 of the signature (family)
        cmp eax,6
        jne L_dont_use_mmx              ; MMX loop is only selected for family 6
        test edx,0800000h               ; feature bit 23 (MMX)
        jz L_dont_use_mmx
        mov dword ptr [inflate_fast_use_mmx],2
        jmp L_check_mmx_pop
L_dont_use_mmx:
        mov dword ptr [inflate_fast_use_mmx],3
L_check_mmx_pop:
        pop edx
        pop ecx
        pop ebx
        pop eax
        jmp L_check_mmx
; 426 "inffast.S"

;-----------------------------------------------------------------------
; Integer decode loop.
;   esi = in, edi = out, ebp = hold (bit accumulator), bl = bits in hold.
; A table entry is one dword: al = op, ah = bits to consume, high word = val.
;-----------------------------------------------------------------------
ALIGN 4
L_do_loop:
; 437 "inffast.S"
        cmp bl,15
        ja L_get_length_code            ; more than 15 bits buffered: no refill

        xor eax,eax
        lodsw                           ; fetch 16 more input bits
        mov cl,bl
        add bl,16
        shl eax,cl
        or ebp,eax

L_get_length_code:
        mov edx, [esp+0]                ; lmask
        mov ecx, [esp+8]                ; lcode
        and edx,ebp
        mov eax, [ecx+edx*4]            ; eax = lcode[hold & lmask]

L_dolen:

        mov cl,ah
        sub bl,ah
        shr ebp,cl                      ; drop the code's bits from hold

        test al,al
        jnz L_test_for_length_base      ; op != 0: not a literal

        shr eax,16
        stosb                           ; op == 0: store the low byte of val

L_while_test:

        cmp [esp+16],edi
        jbe L_break_loop                ; end <= out: stop

        cmp [esp+20],esi
        ja L_do_loop                    ; last > in: keep decoding
        jmp L_break_loop
; Length code (integer loop).  In: eax = table entry (al = op, high word = val).
; Out (at L_save_len): [esp+24] = val + extra bits = match length.
L_test_for_length_base:
; 502 "inffast.S"
        mov edx,eax
        shr edx,16                      ; edx = val (length base)
        mov cl,al                       ; cl = op

        test al,16
        jz L_test_for_second_level_length   ; op bit 4 clear: not a length base
        and cl,15                       ; cl = number of extra bits
        jz L_save_len
        cmp bl,cl
        jae L_add_bits_to_len           ; enough bits already buffered

        mov ch,cl                       ; keep the extra-bit count across the refill
        xor eax,eax
        lodsw
        mov cl,bl
        add bl,16
        shl eax,cl
        or ebp,eax
        mov cl,ch

L_add_bits_to_len:
        mov eax,1
        shl eax,cl
        dec eax                         ; eax = (1 << extra) - 1
        sub bl,cl
        and eax,ebp
        shr ebp,cl
        add edx,eax                     ; len = base + extra-bit value

L_save_len:
        mov [esp+24],edx                ; len
; Distance code and match copy (integer loop).
;   esi = in, edi = out, ebp = hold, bl = bits, [esp+24] = len.
L_decode_distance:
; 549 "inffast.S"
        cmp bl,15
        ja L_get_distance_code

        xor eax,eax
        lodsw                           ; refill hold with 16 bits
        mov cl,bl
        add bl,16
        shl eax,cl
        or ebp,eax

L_get_distance_code:
        mov edx, [esp+4]                ; dmask
        mov ecx, [esp+12]               ; dcode
        and edx,ebp
        mov eax, [ecx+edx*4]            ; eax = dcode[hold & dmask]

L_dodist:
        mov edx,eax
        shr edx,16                      ; edx = val (distance base)
        mov cl,ah
        sub bl,ah
        shr ebp,cl                      ; drop the code's bits
; 584 "inffast.S"
        mov cl,al                       ; cl = op

        test al,16
        jz L_test_for_second_level_dist
        and cl,15                       ; cl = number of extra bits
        jz L_check_dist_one
        cmp bl,cl
        jae L_add_bits_to_dist

        mov ch,cl                       ; keep the extra-bit count across the refill
        xor eax,eax
        lodsw
        mov cl,bl
        add bl,16
        shl eax,cl
        or ebp,eax
        mov cl,ch

L_add_bits_to_dist:
        mov eax,1
        shl eax,cl
        dec eax
        sub bl,cl
        and eax,ebp
        shr ebp,cl
        add edx,eax                     ; edx = dist
        ; falls through to L_check_window

L_check_window:
; 625 "inffast.S"
        mov [esp+44],esi                ; save in; esi becomes the copy source
        mov eax,edi
        sub eax, [esp+40]               ; eax = out - beg

        cmp eax,edx
        jb L_clip_window                ; dist reaches back before beg

        mov ecx, [esp+24]               ; len
        mov esi,edi
        sub esi,edx                     ; from = out - dist

        ; First three bytes by hand, remaining len - 3 with rep movsb.
        sub ecx,3
        mov al, [esi]
        mov [edi],al
        mov al, [esi+1]
        mov dl, [esi+2]
        add esi,3
        mov [edi+1],al
        mov [edi+2],dl
        add edi,3
        rep movsb

        mov esi, [esp+44]               ; restore in
        jmp L_while_test
ALIGN 4
; Distance code with no extra bits (integer loop).  edx = dist.
; When dist == 1 and out != beg the match is a run of the previous output
; byte, written with rep stosb; every other case goes to L_check_window.
L_check_dist_one:
        cmp edx,1
        jne L_check_window
        cmp [esp+40],edi
        je L_check_window               ; out == beg

        dec edi                         ; step back to the previous output byte
        mov ecx, [esp+24]               ; len
        mov al, [edi]
        sub ecx,3

        ; Three copies by hand, remaining len - 3 with rep stosb.
        mov [edi+1],al
        mov [edi+2],al
        mov [edi+3],al
        add edi,4
        rep stosb

        jmp L_while_test
ALIGN 4
; Length entry whose op has bit 4 clear (integer loop).
; In: al = cl = op, edx = val.  With op bit 6 also clear the entry links to
; another table entry: index = val + (hold & ((1 << cl) - 1)).
L_test_for_second_level_length:

        test al,64
        jnz L_test_for_end_of_block

        mov eax,1
        shl eax,cl
        dec eax
        and eax,ebp
        add eax,edx
        mov edx, [esp+8]                ; lcode
        mov eax, [edx+eax*4]
        jmp L_dolen

ALIGN 4
; Same for a distance entry; op bit 6 set means an invalid distance code.
L_test_for_second_level_dist:

        test al,64
        jnz L_invalid_distance_code

        mov eax,1
        shl eax,cl
        dec eax
        and eax,ebp
        add eax,edx
        mov edx, [esp+12]               ; dcode
        mov eax, [edx+eax*4]
        jmp L_dodist
ALIGN 4
; Match starts before beg, so part of it comes from the sliding window
; (integer loop).
; In:  eax = out - beg, edx = dist, edi = out, [esp+24] = len
; Out: falls into / jumps to L_do_copy1 with esi = source, eax = bytes left.
; NOTE(review): the only bound tested here is dist <= wsize; whether the
; referenced window bytes have actually been written is not checked -- confirm
; against the C inflate_fast for the targeted zlib version.
L_clip_window:
; 721 "inffast.S"
        mov ecx,eax
        mov eax, [esp+52]               ; wsize
        neg ecx
        mov esi, [esp+56]               ; window

        cmp eax,edx
        jb L_invalid_distance_too_far   ; wsize < dist

        add ecx,edx                     ; ecx = dist - (out - beg) = bytes taken from window
        cmp dword ptr [esp+48],0
        jne L_wrap_around_window        ; write != 0

        ; write == 0: source is the tail of the window.
        sub eax,ecx
        add esi,eax                     ; from = window + wsize - ecx
; 749 "inffast.S"
        mov eax, [esp+24]               ; len
        cmp eax,ecx
        jbe L_do_copy1                  ; whole match lies inside the window

        sub eax,ecx                     ; bytes left after the window part
        rep movsb
        mov esi,edi
        sub esi,edx                     ; continue from out - dist
        jmp L_do_copy1

L_wrap_around_window:
; 793 "inffast.S"
        mov eax, [esp+48]               ; write
        cmp ecx,eax
        jbe L_contiguous_in_window

        ; Source starts near the end of the window and wraps to its start.
        add esi, [esp+52]
        add esi,eax
        sub esi,ecx                     ; from = window + wsize + write - ecx
        sub ecx,eax                     ; bytes before the wrap

        mov eax, [esp+24]               ; len
        cmp eax,ecx
        jbe L_do_copy1

        sub eax,ecx
        rep movsb
        mov esi, [esp+56]               ; from = window
        mov ecx, [esp+48]               ; write
        cmp eax,ecx
        jbe L_do_copy1

        sub eax,ecx
        rep movsb
        mov esi,edi
        sub esi,edx                     ; continue from out - dist
        jmp L_do_copy1

L_contiguous_in_window:
; 836 "inffast.S"
        add esi,eax
        sub esi,ecx                     ; from = window + write - ecx

        mov eax, [esp+24]               ; len
        cmp eax,ecx
        jbe L_do_copy1

        sub eax,ecx
        rep movsb
        mov esi,edi
        sub esi,edx                     ; continue from out - dist
        ; falls through to L_do_copy1
; Final copy of a clipped match (integer loop): eax = byte count, esi = source.
L_do_copy1:
; 862 "inffast.S"
        mov ecx,eax
        rep movsb

        mov esi, [esp+44]               ; restore in
        jmp L_while_test
; 878 "inffast.S"

ALIGN 4
; Switch to the MMX register assignment:
;   mm0 = hold, ebp = bits, mm1 = shift count still to be applied to mm0,
;   mm4/mm3 = lmask (working copy / spare), mm5/mm2 = dmask (working copy /
;   spare), ebx = lcode.
L_init_mmx:
        emms

        movd mm0,ebp                    ; hold -> mm0
        mov ebp,ebx                     ; bits -> ebp
; 896 "inffast.S"
        movd mm4,dword ptr [esp+0]      ; lmask
        movq mm3,mm4
        movd mm5,dword ptr [esp+4]      ; dmask
        movq mm2,mm5
        pxor mm1,mm1                    ; nothing to shift out yet
        mov ebx, [esp+8]                ; lcode
        jmp L_do_loop_mmx
ALIGN 4
; MMX decode loop.
;   esi = in, edi = out, mm0 = hold, ebp = bits, ebx = lcode,
;   mm1 = number of bits consumed by the previous step; they are shifted out
;   of mm0 at the start of the next step.
L_do_loop_mmx:
        psrlq mm0,mm1                   ; drop the bits consumed last time

        cmp ebp,32
        ja L_get_length_code_mmx        ; more than 32 bits buffered: no refill

        movd mm6,ebp
        movd mm7,dword ptr [esi]        ; fetch 32 more input bits
        add esi,4
        psllq mm7,mm6                   ; place them above the valid bits
        add ebp,32
        por mm0,mm7

L_get_length_code_mmx:
        pand mm4,mm0
        movd eax,mm4                    ; eax = hold & lmask
        movq mm4,mm3                    ; reload the lmask working copy
        mov eax, [ebx+eax*4]            ; eax = lcode[index]

L_dolen_mmx:
        movzx ecx,ah                    ; bits used by this code
        movd mm1,ecx
        sub ebp,ecx

        test al,al
        jnz L_test_for_length_base_mmx  ; op != 0: not a literal

        shr eax,16
        stosb                           ; op == 0: store the low byte of val

L_while_test_mmx:

        cmp [esp+16],edi
        jbe L_break_loop                ; end <= out: stop

        cmp [esp+20],esi
        ja L_do_loop_mmx                ; last > in: keep decoding
        jmp L_break_loop
; Length base plus distance decode (MMX loop).
; In: eax = length table entry.  Leaves edx = len and ebx = dist.
L_test_for_length_base_mmx:

        mov edx,eax
        shr edx,16                      ; edx = val (length base)

        test al,16
        jz L_test_for_second_level_length_mmx
        and eax,15                      ; eax = number of extra bits
        jz L_decode_distance_mmx

        psrlq mm0,mm1                   ; drop the length code's bits
        movd mm1,eax                    ; the extra bits get dropped next
        movd ecx,mm0
        sub ebp,eax
        and ecx, [inflate_fast_mask+eax*4]  ; low 'extra' bits of hold
        add edx,ecx                     ; len = base + extra-bit value

L_decode_distance_mmx:
        psrlq mm0,mm1

        cmp ebp,32
        ja L_get_dist_code_mmx

        movd mm6,ebp
        movd mm7,dword ptr [esi]        ; refill hold with 32 bits
        add esi,4
        psllq mm7,mm6
        add ebp,32
        por mm0,mm7

L_get_dist_code_mmx:
        mov ebx, [esp+12]               ; dcode
        pand mm5,mm0
        movd eax,mm5                    ; eax = hold & dmask
        movq mm5,mm2                    ; reload the dmask working copy
        mov eax, [ebx+eax*4]            ; eax = dcode[index]

L_dodist_mmx:

        movzx ecx,ah                    ; bits used by this code
        mov ebx,eax
        shr ebx,16                      ; ebx = val (distance base)
        sub ebp,ecx
        movd mm1,ecx

        test al,16
        jz L_test_for_second_level_dist_mmx
        and eax,15                      ; eax = number of extra bits
        jz L_check_dist_one_mmx

L_add_bits_to_dist_mmx:
        psrlq mm0,mm1
        movd mm1,eax
        movd ecx,mm0
        sub ebp,eax
        and ecx, [inflate_fast_mask+eax*4]
        add ebx,ecx                     ; ebx = dist
; Match copy (MMX loop).  In: edx = len, ebx = dist, edi = out, esi = in.
; ebx is reloaded with lcode before returning to the loop.
L_check_window_mmx:
        mov [esp+44],esi                ; save in; esi becomes the copy source
        mov eax,edi
        sub eax, [esp+40]               ; eax = out - beg

        cmp eax,ebx
        jb L_clip_window_mmx            ; dist reaches back before beg

        mov ecx,edx                     ; len
        mov esi,edi
        sub esi,ebx                     ; from = out - dist

        ; First three bytes by hand, remaining len - 3 with rep movsb.
        sub ecx,3
        mov al, [esi]
        mov [edi],al
        mov al, [esi+1]
        mov dl, [esi+2]
        add esi,3
        mov [edi+1],al
        mov [edi+2],dl
        add edi,3
        rep movsb

        mov esi, [esp+44]               ; restore in
        mov ebx, [esp+8]                ; restore lcode
        jmp L_while_test_mmx

ALIGN 4
; Distance code with no extra bits.  When dist == 1 and out != beg the match
; is a run of the previous output byte; otherwise use the general copy.
L_check_dist_one_mmx:
        cmp ebx,1
        jne L_check_window_mmx
        cmp [esp+40],edi
        je L_check_window_mmx           ; out == beg

        dec edi                         ; step back to the previous output byte
        mov ecx,edx                     ; len
        mov al, [edi]
        sub ecx,3

        ; Three copies by hand, remaining len - 3 with rep stosb.
        mov [edi+1],al
        mov [edi+2],al
        mov [edi+3],al
        add edi,4
        rep stosb

        mov ebx, [esp+8]                ; restore lcode
        jmp L_while_test_mmx
ALIGN 4
; Length entry whose op has bit 4 clear (MMX loop).
; In: al = op, edx = val, ebx = lcode.  With op bit 6 also clear the entry
; links to another one: index = val + (hold & inflate_fast_mask[op & 15]).
L_test_for_second_level_length_mmx:
        test al,64
        jnz L_test_for_end_of_block

        and eax,15
        psrlq mm0,mm1                   ; drop the bits of the first-level code
        movd ecx,mm0
        and ecx, [inflate_fast_mask+eax*4]
        add ecx,edx
        mov eax, [ebx+ecx*4]
        jmp L_dolen_mmx

ALIGN 4
; Same for a distance entry (ebx = val); op bit 6 set means an invalid
; distance code.
L_test_for_second_level_dist_mmx:
        test al,64
        jnz L_invalid_distance_code

        and eax,15
        psrlq mm0,mm1
        movd ecx,mm0
        and ecx, [inflate_fast_mask+eax*4]
        mov eax, [esp+12]               ; dcode
        add ecx,ebx
        mov eax, [eax+ecx*4]
        jmp L_dodist_mmx
ALIGN 4
; Match starts before beg, so part of it comes from the sliding window
; (MMX loop).
; In:  eax = out - beg, ebx = dist, edx = len, edi = out
; Out: falls into / jumps to L_do_copy1_mmx with esi = source, edx = bytes left.
L_clip_window_mmx:

        mov ecx,eax
        mov eax, [esp+52]               ; wsize
        neg ecx
        mov esi, [esp+56]               ; window

        cmp eax,ebx
        jb L_invalid_distance_too_far   ; wsize < dist

        add ecx,ebx                     ; ecx = dist - (out - beg) = bytes taken from window
        cmp dword ptr [esp+48],0
        jne L_wrap_around_window_mmx    ; write != 0

        ; write == 0: source is the tail of the window.
        sub eax,ecx
        add esi,eax                     ; from = window + wsize - ecx

        cmp edx,ecx
        jbe L_do_copy1_mmx              ; whole match lies inside the window

        sub edx,ecx                     ; bytes left after the window part
        rep movsb
        mov esi,edi
        sub esi,ebx                     ; continue from out - dist
        jmp L_do_copy1_mmx

L_wrap_around_window_mmx:

        mov eax, [esp+48]               ; write
        cmp ecx,eax
        jbe L_contiguous_in_window_mmx

        ; Source starts near the end of the window and wraps to its start.
        add esi, [esp+52]
        add esi,eax
        sub esi,ecx                     ; from = window + wsize + write - ecx
        sub ecx,eax                     ; bytes before the wrap

        cmp edx,ecx
        jbe L_do_copy1_mmx

        sub edx,ecx
        rep movsb
        mov esi, [esp+56]               ; from = window
        mov ecx, [esp+48]               ; write
        cmp edx,ecx
        jbe L_do_copy1_mmx

        sub edx,ecx
        rep movsb
        mov esi,edi
        sub esi,ebx                     ; continue from out - dist
        jmp L_do_copy1_mmx

L_contiguous_in_window_mmx:

        add esi,eax
        sub esi,ecx                     ; from = window + write - ecx

        cmp edx,ecx
        jbe L_do_copy1_mmx

        sub edx,ecx
        rep movsb
        mov esi,edi
        sub esi,ebx                     ; continue from out - dist
        ; falls through to L_do_copy1_mmx
; Final copy of a clipped match (MMX loop): edx = byte count, esi = source.
L_do_copy1_mmx:

        mov ecx,edx
        rep movsb

        mov esi, [esp+44]               ; restore in
        mov ebx, [esp+8]                ; restore lcode
        jmp L_while_test_mmx
; 1174 "inffast.S"

;-----------------------------------------------------------------------
; Loop exits shared by the integer and MMX paths.
; Each stub sets ecx = message address (0 for none) and edx = new mode value,
; then L_update_stream_state stores them through the stream pointer.
;-----------------------------------------------------------------------
L_invalid_distance_code:

        mov ecx, invalid_distance_code_msg
        mov edx,INFLATE_MODE_BAD
        jmp L_update_stream_state

; Length entry with op bit 6 set: bit 5 set means end of block, otherwise
; the code is invalid.
L_test_for_end_of_block:

        test al,32
        jz L_invalid_literal_length_code

        xor ecx,ecx                     ; no message
        mov edx,INFLATE_MODE_TYPE
        jmp L_update_stream_state

L_invalid_literal_length_code:

        mov ecx, invalid_literal_length_code_msg
        mov edx,INFLATE_MODE_BAD
        jmp L_update_stream_state

L_invalid_distance_too_far:

        mov esi, [esp+44]               ; esi held the window pointer; restore in
        mov ecx, invalid_distance_too_far_msg
        mov edx,INFLATE_MODE_BAD
        ; falls through to L_update_stream_state

L_update_stream_state:

        mov eax, [esp+88]               ; strm
        test ecx,ecx
        jz L_skip_msg
        mov [eax+24],ecx                ; stream offset 24 = message pointer
L_skip_msg:
        mov eax, [eax+28]               ; strm->state
        mov [eax+mode_state],edx        ; state->mode = edx
        jmp L_break_loop
ALIGN 4
;-----------------------------------------------------------------------
; Common exit: hand unused whole bytes back to the input, store the stream
; and state fields, restore the saved registers and return.
; In: esi = in, edi = out; integer path: ebp = hold, ebx = bits;
;     MMX path: mm0 = hold (mm1 bits still to be dropped), ebp = bits.
;-----------------------------------------------------------------------
L_break_loop:
; 1243 "inffast.S"
        cmp dword ptr [inflate_fast_use_mmx],2
        jne L_update_next_in

        mov ebx,ebp                     ; MMX path keeps the bit count in ebp

L_update_next_in:
; 1266 "inffast.S"
        mov eax, [esp+88]               ; eax = strm
        mov ecx,ebx
        mov edx, [eax+28]               ; edx = strm->state
        shr ecx,3                       ; whole bytes still held in the accumulator
        sub esi,ecx                     ; give them back to the input
        shl ecx,3
        sub ebx,ecx                     ; bits = bits mod 8
        mov [eax+12],edi                ; stream offset 12 = out
        mov [edx+bits_state],ebx
        mov ecx,ebx

        lea ebx, [esp+28]
        cmp [esp+20],ebx
        jne L_buf_not_used              ; last != buf: input was read in place

        ; Input was read from the stack buffer: translate esi back into the
        ; caller's buffer and rebuild last from the stream fields.
        sub esi,ebx                     ; offset consumed inside buf
        mov ebx, [eax+0]
        mov [esp+20],ebx
        add esi,ebx                     ; in = original input pointer + offset
        mov ebx, [eax+4]
        sub ebx,11
        add [esp+20],ebx                ; last = input pointer + input count - 11

L_buf_not_used:
        mov [eax+0],esi                 ; stream offset 0 = in

        mov ebx,1
        shl ebx,cl
        dec ebx                         ; ebx = (1 << bits) - 1

        cmp dword ptr [inflate_fast_use_mmx],2
        jne L_update_hold

        psrlq mm0,mm1                   ; apply the pending shift
        movd ebp,mm0                    ; hold back into ebp

        emms                            ; leave MMX state

L_update_hold:

        and ebp,ebx                     ; keep only the 'bits' valid low bits
        mov [edx+hold_state],ebp

        ; stream offset 4 = 11 + (last - in); the two arms below apply the
        ; subtraction in either order depending on which pointer is larger.
        mov ebx, [esp+20]
        cmp ebx,esi
        jbe L_last_is_smaller

        sub ebx,esi
        add ebx,11
        mov [eax+4],ebx
        jmp L_fixup_out
L_last_is_smaller:
        sub esi,ebx
        neg esi
        add esi,11
        mov [eax+4],esi

L_fixup_out:

        ; stream offset 16 = 257 + (end - out), computed the same way.
        mov ebx, [esp+16]
        cmp ebx,edi
        jbe L_end_is_smaller

        sub ebx,edi
        add ebx,257
        mov [eax+16],ebx
        jmp L_done
L_end_is_smaller:
        sub edi,ebx
        neg edi
        add edi,257
        mov [eax+16],edi

L_done:
        add esp,64                      ; release the local frame
        popfd
        pop ebx
        pop ebp
        pop esi
        pop edi
        ret

_TEXT   ends
end
|