Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
1769 | yogev_ezra | 1 | ;// fast life generator: ~2.2 pixel*generations per clock cycle |
2 | |||
; live_shl reg, enable
;   Conditionally shifts a 64-bit MMX operand left by one bit.
;   reg     MMX register to shift in place
;   enable  assembly-time switch: the shift is emitted only when this
;           argument is the symbol `yes`; any other value emits nothing
macro live_shl reg, enable
{
  if enable eq yes
        psllq   reg, 1
  end if
}
||
9 | |||
; live_shr reg, enable
;   Conditionally shifts a 64-bit MMX operand right (logical) by one bit.
;   reg     MMX register to shift in place
;   enable  assembly-time switch: the shift is emitted only when this
;           argument is the symbol `yes`; any other value emits nothing
macro live_shr reg, enable
{
  if enable eq yes
        psrlq   reg, 1
  end if
}
||
16 | |||
; live_zero first, second
;   Clears two MMX registers: `first` is zeroed by XOR with itself and
;   `second` then receives a copy of it.
macro live_zero first, second
{
        pxor    first, first
        movq    second, first
}
||
22 | |||
; live_load carry, sum, third, tmp, shift_edi, shift_esi
;   Loads the three qwords [edi+ecx], [ebx+ecx] and [esi+ecx] and adds them
;   bit position by bit position as three one-bit values:
;     sum   = edi_word xor ebx_word xor esi_word          (low bit of the count)
;     carry = (edi_word and ebx_word)
;             or ((edi_word xor ebx_word) and esi_word)   (high bit of the count)
;   third   leaves holding the (optionally shifted) qword read through esi
;   tmp     scratch register, contents undefined on exit
;   shift_edi  `yes` shifts the qword read through edi left by one bit first
;   shift_esi  `yes` shifts the qword read through esi right by one bit first
;   The instruction order interleaves loads and logic; keep it as is.
macro live_load carry, sum, third, tmp, shift_edi, shift_esi
{
        movq    sum, [edi+ecx]
        movq    carry, [ebx+ecx]
        live_shl sum, shift_edi
        movq    tmp, sum                ; tmp = edi word
        pxor    sum, carry              ; sum = edi xor ebx
        movq    third, [esi+ecx]
        pand    carry, tmp              ; carry = edi and ebx
        live_shr third, shift_esi
        movq    tmp, sum                ; tmp = edi xor ebx
        pxor    sum, third              ; sum = parity of the three words
        pand    tmp, third              ; tmp = (edi xor ebx) and esi
        por     carry, tmp              ; carry = majority of the three words
}
||
38 | |||
; live_operation carry0, sum0, carry1, sum1, carry2, sum2, work_a, work_b, shift
;   Combines three two-bit per-position counts (each given as a carry/sum
;   register pair, as produced by live_load) with the qword at
;   [ebx+shift+16] and stores one result qword to [ebp+shift].
;
;   carry0, sum0   first pair; both are overwritten (carry0 ends up holding
;                  the stored result)
;   carry1, sum1   second pair; only read
;   carry2, sum2   third pair; only read
;   work_a, work_b scratch registers, overwritten
;   shift          register or constant added to ebx and ebp to form the
;                  memory addresses
;
;   NOTE(review): given the file's "life generator" header this is presumably
;   the Game of Life birth/survival rule evaluated on 64 cells at once, with
;   [ebx+shift+16] being the current state of the cells being updated --
;   confirm before relying on that reading.
macro live_operation carry0, sum0, carry1, sum1, carry2, sum2, work_a, work_b, shift
{
        movq    work_b, sum0
        pxor    sum0, sum1              ; sum0 = sum0 xor sum1
        pand    work_b, sum1            ; work_b = carry out of sum0 + sum1
        movq    work_a, carry0
        pxor    carry0, work_b
        pand    work_a, work_b
        movq    work_b, carry0
        pxor    carry0, carry1
        pand    work_b, carry1
        por     work_a, work_b
        movq    work_b, carry0
        pxor    carry0, carry2
        pand    work_b, carry2
        pxor    work_a, work_b
        pxor    carry0, work_a
        movq    work_b, sum0
        por     work_b, sum2
        pxor    sum0, sum2              ; sum0 = parity of the three sum bits
        pxor    work_a, work_b
        por     sum0, [ebx+shift+16]    ; merge in the qword from the ebx block
        pand    carry0, work_a
        pand    carry0, sum0
        movq    [ebp+shift], carry0     ; store the result qword
}
||
65 | |||
; live_cycle shift_edi, shift_esi
;   Walks one block of qwords from high offsets to low, producing one output
;   qword per input qword through live_load / live_operation.
;
;   Expected on entry (all taken from the register uses below):
;     edi, ebx, esi  bases of the three input blocks read by live_load
;     ebp            output base used by live_operation
;     edx            block size in bytes
;     eax            step subtracted from ecx after every qword
;                    (the callers in this file load 16)
;   shift_edi / shift_esi are forwarded to live_load (`yes` or `no`).
;
;   Offsets are visited in two passes: first edx-8, edx-8-eax, ... and, once
;   ecx has reached exactly -8, a second pass starting at edx-16. A zeroed
;   register pair stands in for the missing neighbour before the first and
;   after the last qword. The loop is unrolled four times and the roles of
;   mm0..mm7 rotate with each step, so the argument lists must stay in step.
;   NOTE(review): the unrolling appears to rely on edx being a multiple of 64
;   (the entry point in this file masks edx with `not 63`) -- confirm for
;   any new caller.
;   Clobbers ecx, mm0..mm7 and the flags.
macro live_cycle shift_edi, shift_esi
{
  local row_loop
  local row_enter
  local row_tail
        lea     ecx, [edx-8]            ; offset of the first qword to load
        live_zero mm2, mm3              ; no neighbour beyond the first qword
        live_load mm4, mm5, mm6, mm7, shift_edi, shift_esi
        sub     ecx, eax
        jmp     row_enter
  row_loop:
        live_load mm4, mm5, mm6, mm7, shift_edi, shift_esi
        live_operation mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, ecx
        sub     ecx, eax
  row_enter:
        live_load mm6, mm7, mm0, mm1, shift_edi, shift_esi
        live_operation mm2, mm3, mm4, mm5, mm6, mm7, mm0, mm1, ecx
        sub     ecx, eax
        live_load mm0, mm1, mm2, mm3, shift_edi, shift_esi
        live_operation mm4, mm5, mm6, mm7, mm0, mm1, mm2, mm3, ecx
        sub     ecx, eax
        live_load mm2, mm3, mm4, mm5, shift_edi, shift_esi
        live_operation mm6, mm7, mm0, mm1, mm2, mm3, mm4, mm5, ecx
        sub     ecx, eax
        jnl     row_loop                ; keep going while ecx is not negative
        cmp     cl, -8
        jnz     row_tail                ; second pass already done
        lea     ecx, [edx-16]           ; restart at the top of the second pass
        live_load mm4, mm5, mm6, mm7, shift_edi, shift_esi
        live_operation mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, -8
        sub     ecx, eax
        jmp     row_enter
  row_tail:
        live_zero mm4, mm5              ; no neighbour beyond the last qword
        live_operation mm0, mm1, mm2, mm3, mm4, mm5, mm6, mm7, ecx
}
||
102 | |||
; OneGeneration_Flag12
;   Internal helper of the OneGeneration entry point; handles flag bits 0
;   and 1 by clearing cells in the buffer addressed by ebx.
;
;   In:   eax = number of blocks minus one (eax+1 is used as the block count)
;         edx = distance in bytes between consecutive blocks
;         ebx = base address of the buffer to patch
;         The entry point's stack arguments are read directly. After the
;         `push edi` below, [esp+48] is its 5th argument (flags) and
;         [esp+36] its 2nd argument; inside the push/pop edx pair, [esp+36]
;         is its 1st argument. These offsets require the caller to have
;         pushed exactly one extra dword before the call, as both call sites
;         in this file do.
;   Out:  nothing
;   Clobbers: eax (bit-0 path only), ecx, esi, mm0, flags. edi is preserved.
;
;   Bit 1 set and bit 3 clear: one qword per block is zeroed; its offset is
;     derived from the 2nd argument.
;   Bit 0 set and bit 2 clear: one bit position, derived from the 1st
;     argument, is cleared in every qword of one block.
OneGeneration_Flag12:
        push    edi
        lea     esi, [eax+1]            ; esi = block count
        bt      dword [esp+48], 1
        jnc     OneGeneration_flag2_end
        bt      dword [esp+48], 3
        jc      OneGeneration_flag2_end
        mov     edi, [esp+36]           ; 2nd argument
        shl     edi, 4                  ; 16 bytes per index step
        cmp     edi, edx
        jb      OneGeneration_flag2_uphalf
        sub     edi, edx                ; past the block: wrap into the
        cmp     edi, edx                ; qwords at offsets 8, 24, ...
        jnb     OneGeneration_flag2_end ; still out of range: nothing to do
        add     edi, 8
OneGeneration_flag2_uphalf:
        pxor    mm0, mm0
        add     edi, ebx                ; edi -> qword inside the first block
        mov     ecx, esi                ; one store per block
OneGeneration_flag2_cycle:
        movq    [edi], mm0
        add     edi, edx
        loop    OneGeneration_flag2_cycle
OneGeneration_flag2_end:
        bt      dword [esp+48], 0
        jnc     OneGeneration_flag1_end
        bt      dword [esp+48], 2
        jc      OneGeneration_flag1_end
        push    edx
        mov     eax, [esp+36]           ; 1st argument (offset includes the push)
        xor     edx, edx
        div     esi                     ; eax = bit index, edx = block index
        mov     esi, edx
        pop     edx
        cmp     eax, 64
        jnb     OneGeneration_flag1_end ; bit index outside the qword
        imul    esi, edx
        add     esi, ebx                ; esi -> selected block
        btr     eax, 5                  ; bit index 32..63: use the upper dword
        jnc     OneGeneration_flag1_noadd4
        add     esi, 4
OneGeneration_flag1_noadd4:
        mov     edi, 8                  ; step between qwords
        lea     ecx, [edx-8]            ; start at the last qword of the block
OneGeneration_flag1_cycle:
        btr     dword [esi+ecx], eax
        sub     ecx, edi
        btr     dword [esi+ecx], eax
        sub     ecx, edi
        btr     dword [esi+ecx], eax
        sub     ecx, edi
        btr     dword [esi+ecx], eax
        sub     ecx, edi
        jnl     OneGeneration_flag1_cycle
OneGeneration_flag1_end:
        pop     edi
        ret
||
160 | |||
; @OneGeneration$qqsiipvpxvi
;   Entry point that computes one generation from an input buffer into an
;   output buffer. Five dword arguments are read from the stack and removed
;   by `ret 20`; ebp, ebx, esi and edi are saved and restored.
;   NOTE(review): the decorated name looks like a Borland-style mangling of
;   a stdcall OneGeneration(int, int, void*, const void*, int) -- confirm
;   against the C++ declaration.
;
;   Stack arguments (offsets valid right after the four register pushes):
;     [esp+20]  arg1  must be at least 1; (arg1-1) shr 6 is the number of
;                     blocks minus one
;     [esp+24]  arg2  ((arg2+7) shl 3) and not 63 is the block size in bytes
;     [esp+28]  arg3  output buffer, aligned up to 16 bytes before use
;     [esp+32]  arg4  input buffer, aligned up to 16 bytes before use
;     [esp+36]  arg5  flag bits: 0 and 1 are handled by OneGeneration_Flag12
;                     before the main pass, 3 and 2 by the fix-up passes
;                     after it
;
;   Register roles during the live_cycle expansions:
;     ebx = current input block, edi = previous block, esi = next block,
;     ebp = aligned output block + 16, edx = block size, eax = 16 (step).
;   The first block uses the last block as its "previous" neighbour shifted
;   left by one bit, and the last block uses the first block as its "next"
;   neighbour shifted right by one bit.
@OneGeneration$qqsiipvpxvi:
        push    ebp
        push    ebx
        push    esi
        push    edi
        mov     eax, [esp+20]
        mov     edx, [esp+24]
        mov     ebp, [esp+28]
        mov     ebx, [esp+32]
        dec     eax
        jl      OneGeneration_end       ; arg1 < 1: nothing to do
        shr     eax, 6                  ; eax = block count - 1
        add     ebp, 31
        and     ebp, not 15             ; ebp = aligned output + 16
        add     ebx, 15
        and     ebx, not 15             ; ebx = aligned input
        add     edx, 7
        shl     edx, 3
        and     edx, not 63             ; edx = block size; flags feed the jng
        jng     OneGeneration_end       ; block size <= 0: nothing to do
        test    eax, eax
        jz      OneGeneration_single

; --- two or more blocks ---------------------------------------------------
        mov     edi, edx
        imul    edi, eax
        jo      OneGeneration_end       ; offset of the last block overflowed
        push    eax                     ; kept for the fix-up passes
        add     edi, ebx                ; edi -> last block
        call    OneGeneration_Flag12
        lea     esi, [ebx+edx]          ; esi -> second block
        push    dword [esp]             ; loop counter = block count - 1
        mov     eax, 16
        live_cycle yes,no               ; first block
        jmp     OneGeneration_cycle_fin
OneGeneration_cycle:
        mov     edi, ebx                ; slide the three-block window
        mov     ebx, esi
        add     ebp, edx
        add     esi, edx
        live_cycle no,no                ; interior block
OneGeneration_cycle_fin:
        dec     dword [esp]
        jg      OneGeneration_cycle
        mov     edi, ebx
        pop     ecx                     ; drop the loop counter
        mov     ebx, esi
        mov     esi, edx
        add     ebp, edx
        imul    esi, [esp]              ; (block count - 1) * block size
        neg     esi
        add     esi, ebx                ; esi -> first block again
        live_cycle no,yes               ; last block
        jmp     OneGeneration_flag48

; --- exactly one block: it is its own neighbour on both sides ---------------
OneGeneration_single:
        push    eax
        mov     edi, ebx
        call    OneGeneration_Flag12
        mov     esi, ebx
        mov     eax, 16
        live_cycle yes,yes

; --- fix-up passes on the output buffer -----------------------------------
OneGeneration_flag48:
        pop     ebp
        inc     ebp                     ; ebp = block count
        bt      dword [esp+36], 3
        jnc     OneGeneration_flag8_end
        ; Bit 3: in every block copy the qword with index arg2-2 to index 0
        ; and the qword with index 1 to index arg2-1.
        mov     edi, [esp+24]
        mov     ebx, [esp+28]
        dec     edi
        add     ebx, 15
        shl     edi, 4                  ; edi = offset of index arg2-1
        lea     esi, [edi-16]           ; esi = offset of index arg2-2
        and     ebx, not 15             ; ebx = aligned output
        cmp     edi, edx
        jb      OneGeneration_flag8_uphalf
        sub     edi, edx                ; past the block: wrap into the
        add     edi, 8                  ; qwords at offsets 8, 24, ...
        cmp     esi, edx
        jb      OneGeneration_flag8_uphalf
        sub     esi, edx
        add     esi, 8
OneGeneration_flag8_uphalf:
        mov     ecx, ebp                ; one iteration per block
OneGeneration_flag8_cycle:
        movq    mm0, [ebx+esi]
        movq    [ebx], mm0
        movq    mm0, [ebx+16]
        movq    [ebx+edi], mm0
        add     ebx, edx
        loop    OneGeneration_flag8_cycle
OneGeneration_flag8_end:
        bt      dword [esp+36], 2
        jnc     OneGeneration_flag4_end
        ; Bit 2: for every qword copy the bit of cell arg1-2 into bit 0 of
        ; the first block, and the bit of cell 1 into the bit of cell arg1-1.
        ; A cell index divided by the block count gives the bit index
        ; (quotient) and the block index (remainder).
        mov     eax, [esp+20]
        push    edx
        dec     eax
        xor     edx, edx
        mov     ebx, [esp+32]           ; arg3 (offset includes the push)
        div     ebp                     ; eax = bit, edx = block of cell arg1-1
        add     ebx, 15
        mov     esi, eax
        mov     edi, edx
        and     ebx, not 15             ; ebx = aligned output
        dec     edx                     ; block of cell arg1-2
        jl      OneGeneration_flag4_dec0
        mov     ebp, edx
        jmp     OneGeneration_flag4_after_dec
OneGeneration_flag4_dec0:
        dec     ebp                     ; wrapped: last block, previous bit
        dec     eax
OneGeneration_flag4_after_dec:
        pop     edx
        imul    edi, edx
        imul    ebp, edx
        add     edi, ebx                ; edi -> block holding cell arg1-1
        add     ebp, ebx                ; ebp -> block holding cell arg1-2
        btr     esi, 5                  ; bit index 32..63: use the upper dword
        jnc     OneGeneration_flag4_noadd4f
        add     edi, 4
OneGeneration_flag4_noadd4f:
        btr     eax, 5
        jnc     OneGeneration_flag4_noadd4s
        add     ebp, 4
OneGeneration_flag4_noadd4s:
        mov     ecx, edx
        jmp     OneGeneration_flag4_cycle0_entry
OneGeneration_flag4_cycle0:
        btr     dword [ebx+ecx], 0
OneGeneration_flag4_cycle0_entry:
        sub     ecx, 8
        jl      OneGeneration_flag4_cycle0_end
        bt      dword [ebp+ecx], eax
        jnc     OneGeneration_flag4_cycle0
        bts     dword [ebx+ecx], 0
        jmp     OneGeneration_flag4_cycle0_entry
OneGeneration_flag4_cycle0_end:
        xor     eax, eax                ; cell 1 is bit 0 of the second block...
        cmp     dword [esp+20], 64
        jng     OneGeneration_flag4_single
        add     ebx, edx
        jmp     OneGeneration_flag4_cycle1_entry
OneGeneration_flag4_single:
        inc     eax                     ; ...or bit 1 when there is one block
        jmp     OneGeneration_flag4_cycle1_entry
OneGeneration_flag4_cycle1:
        btr     dword [edi+edx], esi
OneGeneration_flag4_cycle1_entry:
        sub     edx, 8                  ; edx doubles as the qword offset here
        jl      OneGeneration_flag4_end
        bt      dword [ebx+edx], eax
        jnc     OneGeneration_flag4_cycle1
        bts     dword [edi+edx], esi
        jmp     OneGeneration_flag4_cycle1_entry
OneGeneration_flag4_end:
        emms                            ; leave MMX state before returning
OneGeneration_end:
        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret     20                      ; removes the five dword arguments
||
320 |