Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6725 | siemargl | 1 | ;=========================================================================== |
2 | ; Copyright (c) 1990-2007 Info-ZIP. All rights reserved. |
||
3 | ; |
||
4 | ; See the accompanying file LICENSE, version 2000-Apr-09 or later |
||
5 | ; (the contents of which are also included in zip.h) for terms of use. |
||
6 | ; If, for some reason, all these files are missing, the Info-ZIP license |
||
7 | ; also may be found at: ftp://ftp.info-zip.org/pub/infozip/license.html |
||
8 | ;=========================================================================== |
||
9 | ; crc_i386.asm, optimized CRC calculation function for Zip and UnZip, |
||
10 | ; created by Paul Kienitz and Christian Spieler. Last revised 07 Jan 2007. |
||
11 | ; |
||
12 | ; Revised 06-Oct-96, Scott Field (sfield@microsoft.com) |
||
13 | ; fixed to assemble with masm by not using .model directive which makes |
||
14 | ; assumptions about segment alignment. Also, |
||
15 | ; avoid using loop, and j[e]cxz where possible. Use mov + inc, rather |
||
16 | ; than lodsb, and other misc. changes resulting in the following performance |
||
17 | ; increases: |
||
18 | ; |
||
19 | ; unrolled loops NO_UNROLLED_LOOPS |
||
20 | ; *8 >8 <8 *8 >8 <8 |
||
21 | ; |
||
22 | ; +54% +42% +35% +82% +52% +25% |
||
23 | ; |
||
24 | ; first item in each table is input buffer length, even multiple of 8 |
||
25 | ; second item in each table is input buffer length, > 8 |
||
26 | ; third item in each table is input buffer length, < 8 |
||
27 | ; |
||
28 | ; Revised 02-Apr-97, Chr. Spieler, based on Rodney Brown (rdb@cmutual.com.au) |
||
29 | ; Incorporated Rodney Brown's 32-bit-reads optimization as found in the |
||
30 | ; UNIX AS source crc_i386.S. This new code can be disabled by defining |
||
31 | ; the macro symbol NO_32_BIT_LOADS. |
||
32 | ; |
||
33 | ; Revised 12-Oct-97, Chr. Spieler, based on Rodney Brown (rdb@cmutual.com.au) |
||
34 | ; Incorporated Rodney Brown's additional tweaks for 32-bit-optimized CPUs |
||
35 | ; (like the Pentium Pro, Pentium II, and probably some Pentium clones). |
||
36 | ; This optimization is controlled by the macro symbol __686 and is disabled |
||
37 | ; by default. (This default is based on the assumption that most users |
||
38 | ; do not yet work on a Pentium Pro or Pentium II machine ...) |
||
39 | ; |
||
40 | ; Revised 25-Mar-98, Cosmin Truta (cosmint@cs.ubbcluj.ro) |
||
41 | ; Working without .model directive caused tasm32 version 5.0 to produce |
||
42 | ; bad object code. The optimized alignments can be optionally disabled |
||
43 | ; by defining NO_ALIGN, thus allowing the use of .model flat. There is no need |
||
44 | ; to define this macro if using other versions of tasm. |
||
45 | ; |
||
46 | ; Revised 16-Jan-2005, Cosmin Truta (cosmint@cs.ubbcluj.ro) |
||
47 | ; Enabled the 686 build by default, because there are hardly any pre-686 CPUs |
||
48 | ; in serious use nowadays. (See the 12-Oct-97 note above.) |
||
49 | ; |
||
50 | ; Revised 03-Jan-2006, Chr. Spieler |
||
51 | ; Enlarged unrolling loops to "do 16 bytes per turn"; optimized access to |
||
52 | ; data buffer in loop body (adjust pointer only once in loop body and use |
||
53 | ; offsets to access each item); added additional support for the "unfolded |
||
54 | ; tables" optimization variant (enabled by IZ_CRCOPTIM_UNFOLDTBL). |
||
55 | ; |
||
56 | ; Revised 07-Jan-2007, Chr. Spieler |
||
57 | ; Recognize additional conditional flag CRC_TABLE_ONLY that prevents |
||
58 | ; compilation of the crc32() function. |
||
59 | ; |
||
60 | ; FLAT memory model assumed. |
||
61 | ; |
||
62 | ; Loop unrolling can be disabled by defining the macro NO_UNROLLED_LOOPS. |
||
63 | ; This results in shorter code at the expense of reduced performance. |
||
64 | ; |
||
65 | ;============================================================================== |
||
66 | ; |
||
67 | ; Do NOT assemble this source if external crc32 routine from zlib gets used, |
||
68 | ; or only the precomputed CRC_32_Table is needed. |
||
69 | ; |
||
70 | IFNDEF USE_ZLIB |
||
71 | IFNDEF CRC_TABLE_ONLY |
||
72 | ; |
||
73 | .386p |
||
74 | name crc_i386 |
||
75 | |||
76 | IFDEF NO_ALIGN |
||
77 | .model flat |
||
78 | ENDIF |
||
79 | |||
80 | IFNDEF PRE_686 |
||
81 | IFNDEF __686 |
||
82 | __686 EQU 1 ; optimize for Pentium Pro, Pentium II and compatible CPUs |
||
83 | ENDIF |
||
84 | ENDIF |
||
85 | |||
86 | extrn _get_crc_table:near ; ZCONST ulg near *get_crc_table(void); |
||
87 | |||
88 | ; |
||
89 | IFNDEF NO_STD_STACKFRAME |
||
90 | ; Use a `standard' stack frame (push ebp / mov ebp,esp) on routine entry |
||
91 | ; and exit, so that Arg1..Arg3 are addressed relative to ebp. This option |
||
92 | ; is the default because it results in smaller code. |
||
93 | STD_ENTRY MACRO |
||
94 | push ebp |
||
95 | mov ebp,esp |
||
96 | ENDM |
||
97 | |||
98 | Arg1 EQU 08H[ebp] |
||
99 | Arg2 EQU 0CH[ebp] |
||
100 | Arg3 EQU 10H[ebp] |
||
101 | |||
102 | STD_LEAVE MACRO |
||
103 | pop ebp |
||
104 | ENDM |
||
105 | |||
106 | ELSE ; NO_STD_STACKFRAME: args are addressed via esp; offsets (from 18H) skip the return address and the five registers pushed at _crc32 entry |
||
107 | |||
108 | STD_ENTRY MACRO |
||
109 | ENDM |
||
110 | |||
111 | Arg1 EQU 18H[esp] |
||
112 | Arg2 EQU 1CH[esp] |
||
113 | Arg3 EQU 20H[esp] |
||
114 | |||
115 | STD_LEAVE MACRO |
||
116 | ENDM |
||
117 | |||
118 | ENDIF ; ?NO_STD_STACKFRAME |
||
119 | |||
120 | ; The following macros make up the loop body of the CRC32 cruncher. |
||
121 | ; registers modified: |
||
122 | ; eax : crc value "c" |
||
123 | ; esi : pointer to next data byte (or dword) "buf++"; Do_CRC and Do_CRC_byteof leave it unchanged |
||
124 | ; registers read: |
||
125 | ; edi : pointer to base of crc_table array |
||
126 | ; scratch registers: |
||
127 | ; ebx : index into crc_table array |
||
128 | ; (upper three bytes must be 0 when __686 is undefined, because only bl gets loaded) |
||
129 | IFNDEF __686 ; optimize for 386, 486, Pentium |
||
130 | Do_CRC MACRO |
||
131 | mov bl,al ; tmp = c & 0xFF |
||
132 | shr eax,8 ; c = (c >> 8) |
||
133 | xor eax,[edi+ebx*4] ; ^ table[tmp] |
||
134 | ENDM |
||
135 | ELSE ; __686 : optimize for Pentium Pro, Pentium II and compatible CPUs |
||
136 | Do_CRC MACRO |
||
137 | movzx ebx,al ; tmp = c & 0xFF |
||
138 | shr eax,8 ; c = (c >> 8) |
||
139 | xor eax,[edi+ebx*4] ; ^ table[tmp] |
||
140 | ENDM |
||
141 | ENDIF ; ?__686 |
||
142 | Do_CRC_byte MACRO |
||
143 | xor al, byte ptr [esi] ; c ^= *buf |
||
144 | inc esi ; buf++ |
||
145 | Do_CRC ; c = (c >> 8) ^ table[c & 0xFF] |
||
146 | ENDM |
||
147 | Do_CRC_byteof MACRO ofs |
||
148 | xor al, byte ptr [esi+ofs] ; c ^= *(buf+ofs) |
||
149 | Do_CRC ; c = (c >> 8) ^ table[c & 0xFF] |
||
150 | ENDM |
||
151 | IFNDEF NO_32_BIT_LOADS |
||
152 | IFDEF IZ_CRCOPTIM_UNFOLDTBL |
||
153 | ; edx is used as scratch by the crc update macros below, so the saved length is kept in the Arg3 slot |
||
154 | SavLen EQU Arg3 |
||
155 | |||
156 | UpdCRC_dword MACRO |
||
157 | movzx ebx,al ; tmp = c & 0xFF |
||
158 | mov edx,[edi+ebx*4+3072] ; table[256*3+tmp] |
||
159 | movzx ebx,ah ; tmp = (c>>8) & 0xFF |
||
160 | shr eax,16 ; c >>= 16 |
||
161 | xor edx,[edi+ebx*4+2048] ; ^ table[256*2+tmp] |
||
162 | movzx ebx,al ; tmp = (c>>16) & 0xFF |
||
163 | shr eax,8 ; tmp = (c>>24) |
||
164 | xor edx,[edi+ebx*4+1024] ; ^ table[256*1+tmp] |
||
165 | mov eax,[edi+eax*4] ; ^ table[256*0+tmp] |
||
166 | xor eax,edx ; c = xor of the four table lookups |
||
167 | ENDM |
||
168 | UpdCRC_dword_sh MACRO dwPtrIncr |
||
169 | movzx ebx,al ; tmp = c & 0xFF |
||
170 | mov edx,[edi+ebx*4+3072] ; table[256*3+tmp] |
||
171 | movzx ebx,ah ; tmp = (c>>8) & 0xFF |
||
172 | xor edx,[edi+ebx*4+2048] ; ^ table[256*2+tmp] |
||
173 | shr eax,16 ; c >>= 16 |
||
174 | movzx ebx,al ; tmp = (c>>16) & 0xFF |
||
175 | add esi, 4*dwPtrIncr ; ((ulg *)buf) += dwPtrIncr |
||
176 | shr eax,8 ; tmp = (c>>24) |
||
177 | xor edx,[edi+ebx*4+1024] ; ^ table[256*1+tmp] |
||
178 | mov eax,[edi+eax*4] ; ^ table[256*0+tmp] |
||
179 | xor eax,edx ; c = xor of the four table lookups |
||
180 | ENDM |
||
181 | ELSE ; IZ_CRCOPTIM_UNFOLDTBL |
||
182 | ; edx is not used by the crc update macros in this variant, so it holds the saved length |
||
183 | SavLen EQU edx |
||
184 | |||
185 | UpdCRC_dword MACRO |
||
186 | Do_CRC |
||
187 | Do_CRC |
||
188 | Do_CRC |
||
189 | Do_CRC |
||
190 | ENDM |
||
191 | UpdCRC_dword_sh MACRO dwPtrIncr |
||
192 | Do_CRC |
||
193 | Do_CRC |
||
194 | add esi, 4*dwPtrIncr ; ((ulg *)buf) += dwPtrIncr |
||
195 | Do_CRC |
||
196 | Do_CRC |
||
197 | ENDM |
||
198 | ENDIF ; ?IZ_CRCOPTIM_UNFOLDTBL |
||
199 | Do_CRC_dword MACRO |
||
200 | xor eax, dword ptr [esi] ; c ^= *(ulg *)buf |
||
201 | UpdCRC_dword_sh 1 ; ... ((ulg *)buf)++ |
||
202 | ENDM |
||
203 | Do_CRC_4dword MACRO |
||
204 | xor eax, dword ptr [esi] ; c ^= *(ulg *)buf |
||
205 | UpdCRC_dword |
||
206 | xor eax, dword ptr [esi+4] ; c ^= *((ulg *)buf+1) |
||
207 | UpdCRC_dword |
||
208 | xor eax, dword ptr [esi+8] ; c ^= *((ulg *)buf+2) |
||
209 | UpdCRC_dword |
||
210 | xor eax, dword ptr [esi+12] ; c ^= *((ulg *)buf+3) |
||
211 | UpdCRC_dword_sh 4 ; ... ((ulg *)buf)+=4 |
||
212 | ENDM |
||
213 | ENDIF ; !NO_32_BIT_LOADS |
||
214 | |||
215 | IFNDEF NO_ALIGN |
||
216 | _TEXT segment use32 para public 'CODE' |
||
217 | ELSE |
||
218 | _TEXT segment use32 |
||
219 | ENDIF |
||
220 | assume CS: _TEXT |
||
221 | |||
222 | public _crc32 |
||
223 | _crc32 proc near ; ulg crc32(ulg crc, ZCONST uch *buf, extent len) |
||
224 | STD_ENTRY |
||
225 | push edi |
||
226 | push esi |
||
227 | push ebx |
||
228 | push edx |
||
229 | push ecx |
||
230 | |||
231 | mov esi,Arg2 ; 2nd arg: uch *buf |
||
232 | sub eax,eax ;> if (!buf) |
||
233 | test esi,esi ;> return 0; |
||
234 | jz fine ;> else { |
||
235 | |||
236 | call _get_crc_table |
||
237 | mov edi,eax |
||
238 | mov eax,Arg1 ; 1st arg: ulg crc |
||
239 | IFNDEF __686 |
||
240 | sub ebx,ebx ; ebx=0; make bl usable as a dword |
||
241 | ENDIF |
||
242 | mov ecx,Arg3 ; 3rd arg: extent len |
||
243 | not eax ;> c = ~crc; |
||
244 | |||
245 | test ecx,ecx |
||
246 | IFNDEF NO_UNROLLED_LOOPS |
||
247 | jz bail |
||
248 | IFNDEF NO_32_BIT_LOADS |
||
249 | align_loop: |
||
250 | test esi,3 ; align buf pointer on next |
||
251 | jz SHORT aligned_now ; dword boundary |
||
252 | Do_CRC_byte |
||
253 | dec ecx |
||
254 | jnz align_loop |
||
255 | aligned_now: |
||
256 | ENDIF ; !NO_32_BIT_LOADS |
||
257 | mov SavLen,ecx ; save current len; reloaded below to compute the remainders |
||
258 | shr ecx,4 ; ecx = len / 16 |
||
259 | jz No_Sixteens |
||
260 | IFNDEF NO_ALIGN |
||
261 | ; align loop head at start of 486 internal cache line !! |
||
262 | align 16 |
||
263 | ENDIF |
||
264 | Next_Sixteen: |
||
265 | IFNDEF NO_32_BIT_LOADS |
||
266 | Do_CRC_4dword |
||
267 | ELSE ; NO_32_BIT_LOADS |
||
268 | Do_CRC_byteof 0 |
||
269 | Do_CRC_byteof 1 |
||
270 | Do_CRC_byteof 2 |
||
271 | Do_CRC_byteof 3 |
||
272 | Do_CRC_byteof 4 |
||
273 | Do_CRC_byteof 5 |
||
274 | Do_CRC_byteof 6 |
||
275 | Do_CRC_byteof 7 |
||
276 | Do_CRC_byteof 8 |
||
277 | Do_CRC_byteof 9 |
||
278 | Do_CRC_byteof 10 |
||
279 | Do_CRC_byteof 11 |
||
280 | Do_CRC_byteof 12 |
||
281 | Do_CRC_byteof 13 |
||
282 | Do_CRC_byteof 14 |
||
283 | Do_CRC_byteof 15 |
||
284 | add esi, 16 ; buf += 16 |
||
285 | ENDIF ; ?NO_32_BIT_LOADS |
||
286 | dec ecx |
||
287 | jnz Next_Sixteen |
||
288 | No_Sixteens: |
||
289 | mov ecx,SavLen |
||
290 | and ecx,00000000FH ; ecx = len % 16 |
||
291 | IFNDEF NO_32_BIT_LOADS |
||
292 | shr ecx,2 ; ecx = (len % 16) / 4 |
||
293 | jz SHORT No_Fours |
||
294 | Next_Four: |
||
295 | Do_CRC_dword |
||
296 | dec ecx |
||
297 | jnz Next_Four |
||
298 | No_Fours: |
||
299 | mov ecx,SavLen |
||
300 | and ecx,000000003H ; ecx = len % 4 |
||
301 | ENDIF ; !NO_32_BIT_LOADS |
||
302 | ENDIF ; !NO_UNROLLED_LOOPS |
||
303 | jz SHORT bail ;> if (len) |
||
304 | IFNDEF NO_ALIGN |
||
305 | ; align loop head at start of 486 internal cache line !! |
||
306 | align 16 |
||
307 | ENDIF |
||
308 | loupe: ;> do { |
||
309 | Do_CRC_byte ; c = CRC32(c,*buf++,crctab); |
||
310 | dec ecx ;> } while (--len); |
||
311 | jnz loupe |
||
312 | |||
313 | bail: ;> } |
||
314 | not eax ;> return ~c; |
||
315 | fine: |
||
316 | pop ecx |
||
317 | pop edx |
||
318 | pop ebx |
||
319 | pop esi |
||
320 | pop edi |
||
321 | STD_LEAVE |
||
322 | ret |
||
323 | _crc32 endp |
||
324 | |||
325 | _TEXT ends |
||
326 | ; |
||
327 | ENDIF ; !CRC_TABLE_ONLY |
||
328 | ENDIF ; !USE_ZLIB |
||
329 | ; |
||
330 | end>8 |