Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
/*
Copyright (C) 1996-1997 Id Software, Inc.

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

*/
//
// d_draw16.s
// x86 assembly-language horizontal 8-bpp span-drawing code, with 16-pixel
// subdivision.
//
// Syntax: AT&T (GAS), 32-bit x86.  C-visible symbols go through the C()
// name-mangling macro from asm_i386.h.
//
// NOTE(review): the scratch variables referenced below (fp_16, fp_64k, pbase,
// s, t, snext, tnext, sfracf, tfracf, spancountminus1, counttemp, tstep,
// jumptemp, pspantemp, advancetable, zi16stepu/sdivz16stepu/tdivz16stepu,
// reciprocal_table_16, entryvec_table_16) and the espan_t_* structure offsets
// are declared in the included headers / a companion data file, not visible
// in this file — confirm against asm_draw.h and d_ifacea.h.
//

#include "asm_i386.h"
#include "quakeasm.h"
#include "asm_draw.h"
#include "d_ifacea.h"

#if id386

//----------------------------------------------------------------------
// 8-bpp horizontal span drawing code for polygons, with no transparency and
// 16-pixel subdivision.
//
// Assumes there is at least one span in pspans, and that every span
// contains at least one pixel
//----------------------------------------------------------------------

	.data

	.text

// out-of-line, rarely-needed clamping code
//
// Each stub clamps one register to the texture extents.  The "HighOrLow"
// entries are reached from a single ja/unsigned-above test in the main path;
// the sign flag set by the preceding compare distinguishes overflow (jg ->
// clamp to bbextents/bbextentt) from underflow (fall through -> clamp to 0).
// The "Low"/"High" pairs further down clamp against the $4096 (1.0 in the
// 12.20 fixed-point used here — TODO confirm) floor and the extents ceiling.

LClampHigh0:
	movl	C(bbextents),%esi
	jmp	LClampReentry0
LClampHighOrLow0:
	jg	LClampHigh0
	xorl	%esi,%esi
	jmp	LClampReentry0

LClampHigh1:
	movl	C(bbextentt),%edx
	jmp	LClampReentry1
LClampHighOrLow1:
	jg	LClampHigh1
	xorl	%edx,%edx
	jmp	LClampReentry1

LClampLow2:
	movl	$4096,%ebp
	jmp	LClampReentry2
LClampHigh2:
	movl	C(bbextents),%ebp
	jmp	LClampReentry2

LClampLow3:
	movl	$4096,%ecx
	jmp	LClampReentry3
LClampHigh3:
	movl	C(bbextentt),%ecx
	jmp	LClampReentry3

LClampLow4:
	movl	$4096,%eax
	jmp	LClampReentry4
LClampHigh4:
	movl	C(bbextents),%eax
	jmp	LClampReentry4

LClampLow5:
	movl	$4096,%ebx
	jmp	LClampReentry5
LClampHigh5:
	movl	C(bbextentt),%ebx
	jmp	LClampReentry5


// offset of the single argument: 4 saved registers + return address
#define pspans	4+16

//----------------------------------------------------------------------
// void D_DrawSpans16 (espan_t *pspans)  — presumed C prototype; cdecl,
// one stack argument (the head of a NULL-terminated span list).
//
// Perspective-correct 8-bpp texture-mapped span drawer.  1/z, s/z and t/z
// are evaluated with the FPU once per 16-pixel segment; the expensive FDIV
// (z = 64k / (1/z)) for the NEXT segment is started early so it overlaps
// the integer pixel loop for the CURRENT segment.
//
// Register roles inside the pixel loops:
//   %esi = current source texel pointer
//   %edi = current destination pixel pointer
//   %ebx = s fractional accumulator (sfracf)
//   %edx = t fractional accumulator (tfracf)
//   %ebp = s-step fractional part
//   %ecx = carry scratch: sbbl %ecx,%ecx turns the t-step carry into
//          0 or -1, so adcl advancetable+4(,%ecx,4) adds either
//          advancetable[1] (no t carry) or advancetable[0] (t carry,
//          i.e. one extra cachewidth row) plus the s carry — advancing
//          %esi by the right texel delta without a branch.
//----------------------------------------------------------------------
	.align 4
.globl C(D_DrawSpans16)
C(D_DrawSpans16):
	pushl	%ebp				// preserve caller's stack frame
	pushl	%edi
	pushl	%esi				// preserve register variables
	pushl	%ebx

//
// set up scaled-by-16 steps, for 16-long segments; also set up cacheblock
// and span list pointers
//
// TODO: any overlap from rearranging?
	flds	C(d_sdivzstepu)
	fmuls	fp_16
	movl	C(cacheblock),%edx
	flds	C(d_tdivzstepu)
	fmuls	fp_16
	movl	pspans(%esp),%ebx	// point to the first span descriptor
	flds	C(d_zistepu)
	fmuls	fp_16
	movl	%edx,pbase			// pbase = cacheblock
	fstps	zi16stepu
	fstps	tdivz16stepu
	fstps	sdivz16stepu

LSpanLoop:
//
// set up the initial s/z, t/z, and 1/z on the FP stack, and generate the
// initial s and t values
//
// FIXME: pipeline FILD?
	fildl	espan_t_v(%ebx)
	fildl	espan_t_u(%ebx)

	fld		%st(1)			// dv | du | dv
	fmuls	C(d_sdivzstepv)	// dv*d_sdivzstepv | du | dv
	fld		%st(1)			// du | dv*d_sdivzstepv | du | dv
	fmuls	C(d_sdivzstepu)	// du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fld		%st(2)			// du | du*d_sdivzstepu | dv*d_sdivzstepv | du | dv
	fmuls	C(d_tdivzstepu)	// du*d_tdivzstepu | du*d_sdivzstepu |
							//  dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu | du*d_tdivzstepu |
							//  dv*d_sdivzstepv | du | dv
	faddp	%st(0),%st(2)	// du*d_tdivzstepu |
							//  du*d_sdivzstepu + dv*d_sdivzstepv | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fld		%st(3)			// dv | du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fmuls	C(d_tdivzstepv)	// dv*d_tdivzstepv |
							//  du*d_sdivzstepu + dv*d_sdivzstepv |
							//  du*d_tdivzstepu | du | dv
	fxch	%st(1)			// du*d_sdivzstepu + dv*d_sdivzstepv |
							//  dv*d_tdivzstepv | du*d_tdivzstepu | du | dv
	fadds	C(d_sdivzorigin)	// sdivz = d_sdivzorigin + dv*d_sdivzstepv +
							//  du*d_sdivzstepu; stays in %st(2) at end
	fxch	%st(4)			// dv | dv*d_tdivzstepv | du*d_tdivzstepu | du |
							//  s/z
	fmuls	C(d_zistepv)	// dv*d_zistepv | dv*d_tdivzstepv |
							//  du*d_tdivzstepu | du | s/z
	fxch	%st(1)			// dv*d_tdivzstepv | dv*d_zistepv |
							//  du*d_tdivzstepu | du | s/z
	faddp	%st(0),%st(2)	// dv*d_zistepv |
							//  dv*d_tdivzstepv + du*d_tdivzstepu | du | s/z
	fxch	%st(2)			// du | dv*d_tdivzstepv + du*d_tdivzstepu |
							//  dv*d_zistepv | s/z
	fmuls	C(d_zistepu)	// du*d_zistepu |
							//  dv*d_tdivzstepv + du*d_tdivzstepu |
							//  dv*d_zistepv | s/z
	fxch	%st(1)			// dv*d_tdivzstepv + du*d_tdivzstepu |
							//  du*d_zistepu | dv*d_zistepv | s/z
	fadds	C(d_tdivzorigin)	// tdivz = d_tdivzorigin + dv*d_tdivzstepv +
							//  du*d_tdivzstepu; stays in %st(1) at end
	fxch	%st(2)			// dv*d_zistepv | du*d_zistepu | t/z | s/z
	faddp	%st(0),%st(1)	// dv*d_zistepv + du*d_zistepu | t/z | s/z

	flds	fp_64k			// fp_64k | dv*d_zistepv + du*d_zistepu | t/z | s/z
	fxch	%st(1)			// dv*d_zistepv + du*d_zistepu | fp_64k | t/z | s/z
	fadds	C(d_ziorigin)	// zi = d_ziorigin + dv*d_zistepv +
							//  du*d_zistepu; stays in %st(0) at end
							// 1/z | fp_64k | t/z | s/z
//
// calculate and clamp s & t
//
	fdivr	%st(0),%st(1)	// 1/z | z*64k | t/z | s/z

//
// point %edi to the first pixel in the span
//
	movl	C(d_viewbuffer),%ecx
	movl	espan_t_v(%ebx),%eax
	movl	%ebx,pspantemp	// preserve spans pointer

	movl	C(tadjust),%edx
	movl	C(sadjust),%esi
	movl	C(d_scantable)(,%eax,4),%edi	// v * screenwidth
	addl	%ecx,%edi
	movl	espan_t_u(%ebx),%ecx
	addl	%ecx,%edi				// pdest = &pdestspan[scans->u];
	movl	espan_t_count(%ebx),%ecx

//
// now start the FDIV for the end of the span
//
	cmpl	$16,%ecx
	ja		LSetupNotLast1

	decl	%ecx
	jz		LCleanup1		// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1

// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// 1/z | t | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z

	fildl	spancountminus1

	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | spancountminus1
	flds	C(d_zistepu)	// C(d_zistepu) | C(d_tdivzstepu) | spancountminus1
	fmul	%st(2),%st(0)	// C(d_zistepu)*scm1 | C(d_tdivzstepu) | scm1
	fxch	%st(1)			// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch	%st(2)			// scm1 | C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1
	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_zistepu)*scm1 |
							//  C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_sdivzstepu)*scm1 |
							//  C(d_tdivzstepu)*scm1
	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)

	flds	fp_64k
	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to
							//  overlap
	jmp		LFDIVInFlight1

LCleanup1:
// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// 1/z | t | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z
	jmp		LFDIVInFlight1

	.align	4
LSetupNotLast1:
// finish up the s and t calcs
	fxch	%st(1)			// z*64k | 1/z | t/z | s/z

	fld		%st(0)			// z*64k | z*64k | 1/z | t/z | s/z
	fmul	%st(4),%st(0)	// s | z*64k | 1/z | t/z | s/z
	fxch	%st(1)			// z*64k | s | 1/z | t/z | s/z
	fmul	%st(3),%st(0)	// t | s | 1/z | t/z | s/z
	fxch	%st(1)			// s | t | 1/z | t/z | s/z
	fistpl	s				// 1/z | t | t/z | s/z
	fistpl	t				// 1/z | t/z | s/z

	fadds	zi16stepu
	fxch	%st(2)
	fadds	sdivz16stepu
	fxch	%st(2)
	flds	tdivz16stepu
	faddp	%st(0),%st(2)
	flds	fp_64k
	fdiv	%st(1),%st(0)	// z = 1/1/z
							// this is what we've gone to all this trouble to
							//  overlap
LFDIVInFlight1:

	addl	s,%esi
	addl	t,%edx
	movl	C(bbextents),%ebx
	movl	C(bbextentt),%ebp
	cmpl	%ebx,%esi
	ja		LClampHighOrLow0
LClampReentry0:
	movl	%esi,s
	movl	pbase,%ebx
	shll	$16,%esi
	cmpl	%ebp,%edx
	movl	%esi,sfracf
	ja		LClampHighOrLow1
LClampReentry1:
	movl	%edx,t
	movl	s,%esi			// sfrac = scans->sfrac;
	shll	$16,%edx
	movl	t,%eax			// tfrac = scans->tfrac;
	sarl	$16,%esi
	movl	%edx,tfracf

//
// calculate the texture starting address
//
	sarl	$16,%eax
	movl	C(cachewidth),%edx
	imull	%edx,%eax		// (tfrac >> 16) * cachewidth
	addl	%ebx,%esi
	addl	%eax,%esi		// psource = pbase + (sfrac >> 16) +
							//           ((tfrac >> 16) * cachewidth);
//
// determine whether last span or not
//
	cmpl	$16,%ecx
	jna		LLastSegment

//
// not the last segment; do full 16-wide segment
//
LNotLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there
//

// pick up after the FDIV that was left in flight previously

	fld		%st(0)			// duplicate it
	fmul	%st(4),%st(0)	// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)	// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext
	movl	snext,%eax
	movl	tnext,%edx

	movb	(%esi),%bl	// get first source texel
	subl	$16,%ecx		// count off this segments' pixels
	movl	C(sadjust),%ebp
	movl	%ecx,counttemp	// remember count of remaining pixels

	movl	C(tadjust),%ecx
	movb	%bl,(%edi)	// store first dest pixel

	addl	%eax,%ebp
	addl	%edx,%ecx

	movl	C(bbextents),%eax
	movl	C(bbextentt),%edx

	cmpl	$4096,%ebp
	jl		LClampLow2
	cmpl	%eax,%ebp
	ja		LClampHigh2
LClampReentry2:

	cmpl	$4096,%ecx
	jl		LClampLow3
	cmpl	%edx,%ecx
	ja		LClampHigh3
LClampReentry3:

	movl	%ebp,snext
	movl	%ecx,tnext

	subl	s,%ebp
	subl	t,%ecx

//
// set up advancetable
//
// NOTE(review): the original ">>= 16" comments describe the conceptual
// operation; the actual shift count is 20 because steps are kept in 12.20
// fixed point here (the 16-pixel segment length folds 4 bits into the
// fraction) — confirm against the companion 8-pixel version.
	movl	%ecx,%eax
	movl	%ebp,%edx
	sarl	$20,%eax			// tstep >>= 16;
	jz		LZero
	sarl	$20,%edx			// sstep >>= 16;
	movl	C(cachewidth),%ebx
	imull	%ebx,%eax
	jmp		LSetUp1

LZero:
	sarl	$20,%edx			// sstep >>= 16;
	movl	C(cachewidth),%ebx

LSetUp1:

	addl	%edx,%eax		// add in sstep
							// (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%edx
	movl	%eax,advancetable+4	// advance base in t
	addl	%ebx,%eax		// ((tstep >> 16) + 1) * cachewidth +
							//  (sstep >> 16);
	shll	$12,%ebp			// left-justify sstep fractional part
	movl	sfracf,%ebx
	shll	$12,%ecx			// left-justify tstep fractional part
	movl	%eax,advancetable	// advance extra in t

	movl	%ecx,tstep
	addl	%ecx,%edx		// advance tfrac fractional part by tstep frac

	sbbl	%ecx,%ecx		// turn tstep carry into -1 (0 if none)
	addl	%ebp,%ebx		// advance sfrac fractional part by sstep frac
	adcl	advancetable+4(,%ecx,4),%esi	// point to next source texel

// pixels 1..7 of the segment: each step advances tfracf/sfracf, folds the
// carries into %esi via the advancetable select, and stores the texel
// fetched on the PREVIOUS step (load and store are interleaved to hide
// the AGI/load latency)
	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	(%esi),%al
	addl	%ebp,%ebx
	movb	%al,1(%edi)
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,2(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,3(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,4(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,5(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,6(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,7(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi


//
// start FDIV for end of next segment in flight, so it can overlap
//
	movl	counttemp,%ecx
	cmpl	$16,%ecx			// more than one segment after this?
	ja		LSetupNotLast2	// yes

	decl	%ecx
	jz		LFDIVInFlight2	// if only one pixel, no need to start an FDIV
	movl	%ecx,spancountminus1
	fildl	spancountminus1

	flds	C(d_zistepu)	// C(d_zistepu) | spancountminus1
	fmul	%st(1),%st(0)	// C(d_zistepu)*scm1 | scm1
	flds	C(d_tdivzstepu)	// C(d_tdivzstepu) | C(d_zistepu)*scm1 | scm1
	fmul	%st(2),%st(0)	// C(d_tdivzstepu)*scm1 | C(d_zistepu)*scm1 | scm1
	fxch	%st(1)			// C(d_zistepu)*scm1 | C(d_tdivzstepu)*scm1 | scm1
	faddp	%st(0),%st(3)	// C(d_tdivzstepu)*scm1 | scm1
	fxch	%st(1)			// scm1 | C(d_tdivzstepu)*scm1
	fmuls	C(d_sdivzstepu)	// C(d_sdivzstepu)*scm1 | C(d_tdivzstepu)*scm1
	fxch	%st(1)			// C(d_tdivzstepu)*scm1 | C(d_sdivzstepu)*scm1
	faddp	%st(0),%st(3)	// C(d_sdivzstepu)*scm1
	flds	fp_64k			// 64k | C(d_sdivzstepu)*scm1
	fxch	%st(1)			// C(d_sdivzstepu)*scm1 | 64k
	faddp	%st(0),%st(4)	// 64k

	fdiv	%st(1),%st(0)	// this is what we've gone to all this trouble to
							//  overlap
	jmp		LFDIVInFlight2

	.align	4
LSetupNotLast2:
	fadds	zi16stepu
	fxch	%st(2)
	fadds	sdivz16stepu
	fxch	%st(2)
	flds	tdivz16stepu
	faddp	%st(0),%st(2)
	flds	fp_64k
	fdiv	%st(1),%st(0)	// z = 1/1/z
							// this is what we've gone to all this trouble to
							//  overlap
LFDIVInFlight2:
	movl	%ecx,counttemp

// pixels 8..14 of the segment, same interleaved pattern as above
	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,8(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,9(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,10(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,11(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,12(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,13(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,14(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi

	addl	$16,%edi
	movl	%edx,tfracf
	movl	snext,%edx
	movl	%ebx,sfracf
	movl	tnext,%ebx
	movl	%edx,s
	movl	%ebx,t

	movl	counttemp,%ecx		// retrieve count

//
// determine whether last span or not
//
	cmpl	$16,%ecx				// are there multiple segments remaining?
	movb	%al,-1(%edi)			// store 16th pixel of the segment
	ja		LNotLastSegment		// yes

//
// last segment of scan
//
LLastSegment:

//
// advance s/z, t/z, and 1/z, and calculate s & t at end of span and steps to
// get there. The number of pixels left is variable, and we want to land on the
// last pixel, not step one past it, so we can't run into arithmetic problems
//
	testl	%ecx,%ecx
	jz		LNoSteps		// just draw the last pixel and we're done

// pick up after the FDIV that was left in flight previously


	fld		%st(0)			// duplicate it
	fmul	%st(4),%st(0)	// s = s/z * z
	fxch	%st(1)
	fmul	%st(3),%st(0)	// t = t/z * z
	fxch	%st(1)
	fistpl	snext
	fistpl	tnext

	movb	(%esi),%al		// load first texel in segment
	movl	C(tadjust),%ebx
	movb	%al,(%edi)		// store first pixel in segment
	movl	C(sadjust),%eax

	addl	snext,%eax
	addl	tnext,%ebx

	movl	C(bbextents),%ebp
	movl	C(bbextentt),%edx

	cmpl	$4096,%eax
	jl		LClampLow4
	cmpl	%ebp,%eax
	ja		LClampHigh4
LClampReentry4:
	movl	%eax,snext

	cmpl	$4096,%ebx
	jl		LClampLow5
	cmpl	%edx,%ebx
	ja		LClampHigh5
LClampReentry5:

	cmpl	$1,%ecx			// don't bother
	je		LOnlyOneStep	// if two pixels in segment, there's only one step,
							//  of the segment length
	subl	s,%eax
	subl	t,%ebx

	addl	%eax,%eax		// convert to 15.17 format so multiply by 1.31
	addl	%ebx,%ebx		//  reciprocal yields 16.48

	imull	reciprocal_table_16-8(,%ecx,4)	// sstep = (snext - s) /
											//  (spancount-1)
	movl	%edx,%ebp

	movl	%ebx,%eax
	imull	reciprocal_table_16-8(,%ecx,4)	// tstep = (tnext - t) /
											//  (spancount-1)
LSetEntryvec:
//
// set up advancetable
//
// entryvec_table_16[spancount] selects the Entry*_16 stub that draws
// exactly the remaining pixel count, then falls through the shared
// LEntry*_16 chain below.
	movl	entryvec_table_16(,%ecx,4),%ebx
	movl	%edx,%eax
	movl	%ebx,jumptemp		// entry point into code for RET later
	movl	%ebp,%ecx
	sarl	$16,%edx			// tstep >>= 16;
	movl	C(cachewidth),%ebx
	sarl	$16,%ecx			// sstep >>= 16;
	imull	%ebx,%edx

	addl	%ecx,%edx			// add in sstep
								// (tstep >> 16) * cachewidth + (sstep >> 16);
	movl	tfracf,%ecx
	movl	%edx,advancetable+4	// advance base in t
	addl	%ebx,%edx			// ((tstep >> 16) + 1) * cachewidth +
								//  (sstep >> 16);
	shll	$16,%ebp			// left-justify sstep fractional part
	movl	sfracf,%ebx
	shll	$16,%eax			// left-justify tstep fractional part
	movl	%edx,advancetable	// advance extra in t

	movl	%eax,tstep
	movl	%ecx,%edx
	addl	%eax,%edx
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi

	jmp		*jumptemp			// jump to the number-of-pixels handler

//----------------------------------------

LNoSteps:
	movb	(%esi),%al		// load first texel in segment
	subl	$15,%edi			// adjust for hardwired offset
	jmp		LEndSpan


LOnlyOneStep:
	subl	s,%eax
	subl	t,%ebx
	movl	%eax,%ebp
	movl	%ebx,%edx
	jmp		LSetEntryvec

//----------------------------------------
//
// Per-spancount entry stubs.  EntryN_16 draws an N-pixel tail segment:
// it backs %edi up so the hardwired 1..15(%edi) offsets in the shared
// chain land on the right pixels, primes %al with the first texel, then
// falls into the LEntry chain at the matching depth.
//----------------------------------------

.globl	Entry2_16, Entry3_16, Entry4_16, Entry5_16
.globl	Entry6_16, Entry7_16, Entry8_16, Entry9_16
.globl	Entry10_16, Entry11_16, Entry12_16, Entry13_16
.globl	Entry14_16, Entry15_16, Entry16_16

Entry2_16:
	subl	$14,%edi		// adjust for hardwired offsets
	movb	(%esi),%al
	jmp		LEntry2_16

//----------------------------------------

Entry3_16:
	subl	$13,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	jmp		LEntry3_16

//----------------------------------------

Entry4_16:
	subl	$12,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry4_16

//----------------------------------------

Entry5_16:
	subl	$11,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry5_16

//----------------------------------------

Entry6_16:
	subl	$10,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry6_16

//----------------------------------------

Entry7_16:
	subl	$9,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry7_16

//----------------------------------------

Entry8_16:
	subl	$8,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry8_16

//----------------------------------------

Entry9_16:
	subl	$7,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry9_16

//----------------------------------------

Entry10_16:
	subl	$6,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry10_16

//----------------------------------------

Entry11_16:
	subl	$5,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry11_16

//----------------------------------------

Entry12_16:
	subl	$4,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry12_16

//----------------------------------------

Entry13_16:
	subl	$3,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry13_16

//----------------------------------------

Entry14_16:
	subl	$2,%edi		// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry14_16

//----------------------------------------

Entry15_16:
	decl	%edi			// adjust for hardwired offsets
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
	jmp		LEntry15_16

//----------------------------------------

// 16-pixel tail: no %edi adjustment needed; falls straight through the
// whole LEntry chain below
Entry16_16:
	addl	%eax,%edx
	movb	(%esi),%al
	sbbl	%ecx,%ecx
	addl	%ebp,%ebx
	adcl	advancetable+4(,%ecx,4),%esi

	addl	tstep,%edx
	sbbl	%ecx,%ecx
	movb	%al,1(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry15_16:
	sbbl	%ecx,%ecx
	movb	%al,2(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry14_16:
	sbbl	%ecx,%ecx
	movb	%al,3(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry13_16:
	sbbl	%ecx,%ecx
	movb	%al,4(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry12_16:
	sbbl	%ecx,%ecx
	movb	%al,5(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry11_16:
	sbbl	%ecx,%ecx
	movb	%al,6(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry10_16:
	sbbl	%ecx,%ecx
	movb	%al,7(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry9_16:
	sbbl	%ecx,%ecx
	movb	%al,8(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry8_16:
	sbbl	%ecx,%ecx
	movb	%al,9(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry7_16:
	sbbl	%ecx,%ecx
	movb	%al,10(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry6_16:
	sbbl	%ecx,%ecx
	movb	%al,11(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry5_16:
	sbbl	%ecx,%ecx
	movb	%al,12(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
	addl	tstep,%edx
LEntry4_16:
	sbbl	%ecx,%ecx
	movb	%al,13(%edi)
	addl	%ebp,%ebx
	movb	(%esi),%al
	adcl	advancetable+4(,%ecx,4),%esi
LEntry3_16:
	movb	%al,14(%edi)
	movb	(%esi),%al
LEntry2_16:

LEndSpan:

//
// clear s/z, t/z, 1/z from FP stack
//
	fstp	%st(0)
	fstp	%st(0)
	fstp	%st(0)

	movl	pspantemp,%ebx				// restore spans pointer
	movl	espan_t_pnext(%ebx),%ebx	// point to next span
	testl	%ebx,%ebx					// any more spans?
	movb	%al,15(%edi)				// store final pixel of the span
	jnz		LSpanLoop					// more spans

	popl	%ebx				// restore register variables
	popl	%esi
	popl	%edi
	popl	%ebp				// restore the caller's stack frame
	ret

#endif	// id386