/********************************************************************** <BR>
  This file is part of Crack dot Com's free source code release of
  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
  information about compiling & licensing issues visit this URL</a>
  <PRE> If that doesn't help, contact Jonathan Clark at
  golgotha_source@usa.net (Subject should have "GOLG" in it)
***********************************************************************/

9 | #include "software/r1_software_globals.hh"
|
---|
10 | #include "software/inline_fpu.hh"
|
---|
11 | #include "software/amd3d/amd3d.h"
|
---|
12 |
|
---|
13 | extern sw32 had_subdivisions;
|
---|
14 |
|
---|
15 | //instead of using left_s, left_t, right_s, and right_t,
|
---|
16 | //the divides and multiplies are nicely vectorized by the amd3d,
|
---|
17 | //and storing them is a single quad store to an array of 2 floats,
|
---|
18 | //rather than two dword stores to two seperate floats
|
---|
19 |
|
---|
20 | extern sw32 left_s_t[2];
|
---|
21 | extern sw32 right_s_t[2];
|
---|
22 |
|
---|
23 | extern float mmx0[2];
|
---|
24 | extern float mmx1[2];
|
---|
25 | extern float mmx2[2];
|
---|
26 | extern float mmx3[2];
|
---|
27 | extern float mmx4[2];
|
---|
28 | extern float mmx5[2];
|
---|
29 | extern float mmx6[2];
|
---|
30 | extern float mmx7[2];
|
---|
31 |
|
---|
32 | void texture_scanline_perspective_unlit_holy_amd3d(w16 *start_pixel,
|
---|
33 | sw32 start_x,
|
---|
34 | void *_left,//perspective_span *left,
|
---|
35 | sw32 width)
|
---|
36 | {
|
---|
37 | start_pixel = (w16 *)((w8 *)start_pixel + start_x);
|
---|
38 |
|
---|
39 | perspective_span *left = (perspective_span *)_left;
|
---|
40 |
|
---|
41 | _asm
|
---|
42 | {
|
---|
43 | //left_z = 1.f / left->ooz;
|
---|
44 | //left_s = qftoi(left->soz * left_z) + cur_grads.s_adjust;
|
---|
45 | //left_t = qftoi(left->toz * left_z) + cur_grads.t_adjust;
|
---|
46 |
|
---|
47 | //sw32 had_subdivisions = width & (~15);
|
---|
48 | //num_subdivisions = width >> 4;
|
---|
49 | //num_leftover = width & 15;
|
---|
50 |
|
---|
51 | mov edi,dword ptr [left]
|
---|
52 | mov eax,dword ptr [width]
|
---|
53 |
|
---|
54 | movd mm0, dword ptr [edi]perspective_span.ooz
|
---|
55 | mov ebx,eax
|
---|
56 |
|
---|
57 | pfrcp (m1, m0)
|
---|
58 | and eax,15
|
---|
59 |
|
---|
60 | shr ebx,4
|
---|
61 | punpckldq mm0, mm0 //duplicate low 32bits of m0 into high 32 bits of m0
|
---|
62 |
|
---|
63 | pfrcpit1 (m0, m1)
|
---|
64 | mov ecx,dword ptr [width]
|
---|
65 |
|
---|
66 | movq mm2, qword ptr [edi]perspective_span.soz
|
---|
67 | mov dword ptr [num_leftover],eax
|
---|
68 |
|
---|
69 | pfrcpit2 (m0, m1)
|
---|
70 | and ecx,(~15)
|
---|
71 |
|
---|
72 | //mov eax,dword ptr [edi]perspective_span.l
|
---|
73 | mov dword ptr [num_subdivisions],ebx
|
---|
74 |
|
---|
75 | pfmul (m2, m0)
|
---|
76 | mov dword ptr [had_subdivisions],ecx
|
---|
77 |
|
---|
78 | //mov dword ptr [left_l],eax
|
---|
79 | //clear these out
|
---|
80 | mov dword ptr [dsdx_frac],0
|
---|
81 |
|
---|
82 | //high 32 bits of mm2 - toz / ooz (aka t)
|
---|
83 | //low 32 bits of mm2 - soz / ooz (aka s)
|
---|
84 |
|
---|
85 | pf2id (m3, m2)
|
---|
86 | mov dword ptr [dtdx_frac],0
|
---|
87 |
|
---|
88 | //high 32 bits of mm3 - toz / ooz (aka t) - truncated ints
|
---|
89 | //low 32 bits of mm3 - soz / ooz (aka s) - truncated ints
|
---|
90 |
|
---|
91 | paddd mm3, qword ptr [cur_grads]tri_gradients.s_adjust
|
---|
92 |
|
---|
93 | //high 32 bits of mm3 - t + t_adjust
|
---|
94 | //low 32 bits of mm3 - s + s_adjust
|
---|
95 |
|
---|
96 | movq qword ptr [left_s_t], mm3
|
---|
97 | }
|
---|
98 |
|
---|
99 | if (num_subdivisions)
|
---|
100 | {
|
---|
101 | _asm
|
---|
102 | {
|
---|
103 | //ooz_right = left->ooz + (cur_grads.doozdxspan);
|
---|
104 | //soz_right = left->soz + (cur_grads.dsozdxspan);
|
---|
105 | //toz_right = left->toz + (cur_grads.dtozdxspan);
|
---|
106 |
|
---|
107 | //edi still has dword ptr [left]
|
---|
108 | lea ebx,dword ptr [cur_grads]
|
---|
109 | nop
|
---|
110 |
|
---|
111 | movd mm1, dword ptr [edi]perspective_span.ooz
|
---|
112 | mov esi,dword ptr [r1_software_texture_ptr]
|
---|
113 |
|
---|
114 | movd mm3, dword ptr [ebx]tri_gradients.doozdxspan
|
---|
115 | mov eax,dword ptr [left_s_t] //left_s
|
---|
116 |
|
---|
117 | shr esi,1
|
---|
118 | movq mm0, qword ptr [edi]perspective_span.soz
|
---|
119 |
|
---|
120 | pfadd (m1, m3)
|
---|
121 | movq mm2, qword ptr [ebx]tri_gradients.dsozdxspan
|
---|
122 |
|
---|
123 | sar eax,16 //get integral left_s into eax
|
---|
124 | mov edi,dword ptr [start_pixel]
|
---|
125 |
|
---|
126 | pfrcp (m6, m1)
|
---|
127 | movq mm7,mm1
|
---|
128 |
|
---|
129 | pfadd (m0, m2)
|
---|
130 | mov ebx,dword ptr [left_s_t+4] //left_t
|
---|
131 |
|
---|
132 | //calculate the 1st right_z in mm7
|
---|
133 | sar ebx,16 //get integral left_t into ebx
|
---|
134 | punpckldq mm7, mm7 //duplicate high 32bits of mm7 into low 32 bits of mm7
|
---|
135 |
|
---|
136 | pfrcpit1 (m7, m6)
|
---|
137 | mov edx,dword ptr [left_s_t+4] //left_t
|
---|
138 |
|
---|
139 | mov cl,byte ptr [r1_software_twidth_log2]
|
---|
140 | add esi,eax
|
---|
141 |
|
---|
142 | pfrcpit2 (m7, m6)
|
---|
143 |
|
---|
144 | //calculate starting fractional and integral values for s and t
|
---|
145 | //esi = starting_s_coordinate >> 16 + starting_t_coordinate >> 16 << r1_software_twidth_log2
|
---|
146 | //ecx = starting_s_coordinate << 16
|
---|
147 | //edx = starting_t_coordinate << 16
|
---|
148 |
|
---|
149 | //some stuff has been moved up, interleaved w/the mmx code above
|
---|
150 |
|
---|
151 | shl ebx,cl //multiply integral left_t by texture width
|
---|
152 |
|
---|
153 | sal edx,16 //get fractional left_t into edx
|
---|
154 | mov ecx,dword ptr [left_s_t] //left_s
|
---|
155 |
|
---|
156 | sal ecx,16
|
---|
157 | add esi,ebx
|
---|
158 | }
|
---|
159 |
|
---|
160 | while (num_subdivisions)
|
---|
161 | {
|
---|
162 | _asm
|
---|
163 | {
|
---|
164 | //right_s = qftoi(soz_right * right_z);
|
---|
165 | //right_t = qftoi(toz_right * right_z);
|
---|
166 |
|
---|
167 | //soz_right and toz_right are in mm0
|
---|
168 | //right_z is in mm7
|
---|
169 | pfmul (m7, m0)
|
---|
170 |
|
---|
171 | pf2id (m7, m7)
|
---|
172 |
|
---|
173 | movq qword ptr [right_s_t],mm7
|
---|
174 |
|
---|
175 | //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are
|
---|
176 | //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover
|
---|
177 | //in the leftover span, calculate the end of that.
|
---|
178 |
|
---|
179 | //if (num_subdivisions!=1)
|
---|
180 | //{
|
---|
181 | cmp dword ptr [num_subdivisions],1
|
---|
182 | je last_subdivision
|
---|
183 |
|
---|
184 | //ooz_right += (cur_grads.doozdxspan);
|
---|
185 | //soz_right += (cur_grads.dsozdxspan);
|
---|
186 | //toz_right += (cur_grads.dtozdxspan);
|
---|
187 |
|
---|
188 | pfadd (m0, m2)
|
---|
189 | pfadd (m1, m3)
|
---|
190 |
|
---|
191 | jmp proceed_with_mapping
|
---|
192 | //}
|
---|
193 | //else
|
---|
194 | //if (num_leftover > 1)
|
---|
195 | //{
|
---|
196 |
|
---|
197 | last_subdivision:
|
---|
198 | cmp dword ptr [num_leftover],1
|
---|
199 | jle proceed_with_mapping
|
---|
200 |
|
---|
201 | //calculate the right_z for the end of the leftover span
|
---|
202 | //ooz_right += (cur_grads.doozdx * num_leftover);
|
---|
203 | //soz_right += (cur_grads.dsozdx * num_leftover);
|
---|
204 | //toz_right += (cur_grads.dtozdx * num_leftover);
|
---|
205 |
|
---|
206 | movd mm2,dword ptr [num_leftover]
|
---|
207 | movd mm3, dword ptr [cur_grads]tri_gradients.dsozdx
|
---|
208 |
|
---|
209 | pi2fd (m2, m2)
|
---|
210 | movd mm4, dword ptr [cur_grads]tri_gradients.dtozdx
|
---|
211 |
|
---|
212 | pfmul (m3, m2)
|
---|
213 | movd mm5, dword ptr [cur_grads]tri_gradients.doozdx
|
---|
214 |
|
---|
215 | pfmul (m4, m2)
|
---|
216 | pfmul (m5, m2)
|
---|
217 |
|
---|
218 | pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
|
---|
219 |
|
---|
220 | pfadd (m0, m3)
|
---|
221 | pfadd (m1, m5)
|
---|
222 | //}
|
---|
223 |
|
---|
224 | proceed_with_mapping:
|
---|
225 | //cap the right_s and right_t's so that they're valid
|
---|
226 |
|
---|
227 | mov eax,dword ptr [right_s_t] //right_s
|
---|
228 | mov ebx,dword ptr [right_s_t+4] //right_t
|
---|
229 |
|
---|
230 | add eax,dword ptr [cur_grads]tri_gradients.s_adjust
|
---|
231 | add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
|
---|
232 |
|
---|
233 | //cap the right s and t
|
---|
234 | cmp eax,0
|
---|
235 | jge cmp_eax_high
|
---|
236 |
|
---|
237 | mov eax,0
|
---|
238 | jmp cmp_ebx_low
|
---|
239 |
|
---|
240 | cmp_eax_high:
|
---|
241 | cmp eax,dword ptr [s_mask]
|
---|
242 | jle cmp_ebx_low
|
---|
243 |
|
---|
244 | mov eax,dword ptr [s_mask]
|
---|
245 |
|
---|
246 | cmp_ebx_low:
|
---|
247 | cmp ebx,0
|
---|
248 | jge cmp_ebx_high
|
---|
249 |
|
---|
250 | mov ebx,0
|
---|
251 | jmp done_compare
|
---|
252 |
|
---|
253 | cmp_ebx_high:
|
---|
254 | cmp ebx,dword ptr [t_mask]
|
---|
255 | jle done_compare
|
---|
256 |
|
---|
257 | mov ebx,dword ptr [t_mask]
|
---|
258 |
|
---|
259 | done_compare:
|
---|
260 |
|
---|
261 | //store the right_s and right_t
|
---|
262 | //so they can be copied into left_s and left_t at the end of the 16-pixel span
|
---|
263 | //(the cant be copied now because we have to calculate (right_s-left_s)>>4 and (right_t-left_t)>>4
|
---|
264 |
|
---|
265 | //calculate the next right_z in mm7
|
---|
266 | //unfortunately, if the span is a multiple of 16, and this is the last set of 16, it will
|
---|
267 | //calculate an unnecessary z. but its best to have the code here mixed in w/integer ops so
|
---|
268 | //that the amd3d code has something for its executation latencies to sit through
|
---|
269 | movq mm7, mm1
|
---|
270 | pfrcp (m6, m1)
|
---|
271 |
|
---|
272 | mov dword ptr [right_s_t],eax //right_s
|
---|
273 | mov dword ptr [right_s_t+4],ebx //right_t
|
---|
274 |
|
---|
275 | punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
|
---|
276 | sub eax,dword ptr [left_s_t] //left_s
|
---|
277 |
|
---|
278 | sar eax,4
|
---|
279 | push ebp
|
---|
280 |
|
---|
281 | pfrcpit1 (m7, m6)
|
---|
282 | sub ebx,dword ptr [left_s_t+4] //left_t
|
---|
283 |
|
---|
284 | sar ebx,4
|
---|
285 | mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_s-left_s)>>4)<<16
|
---|
286 |
|
---|
287 | pfrcpit2 (m7, m6)
|
---|
288 | nop
|
---|
289 |
|
---|
290 | sar eax,16
|
---|
291 | mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_t-left_t)>>4)<<16
|
---|
292 |
|
---|
293 | sar ebx,16
|
---|
294 | mov cl,byte ptr [r1_software_twidth_log2]
|
---|
295 |
|
---|
296 | shl ebx,cl
|
---|
297 |
|
---|
298 | add eax,ebx
|
---|
299 | mov cl,4 //loop is unrolled to 4 pixels - we want to draw 16, so loop 4 times
|
---|
300 |
|
---|
301 | //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2
|
---|
302 | //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width
|
---|
303 |
|
---|
304 | mov dword ptr [s_t_carry+4],eax
|
---|
305 | add eax,dword ptr [r1_software_texture_width]
|
---|
306 |
|
---|
307 | mov dword ptr [s_t_carry],eax
|
---|
308 |
|
---|
309 | ALIGN 16
|
---|
310 |
|
---|
311 | //high 16 bits of ecx is the fractional s component
|
---|
312 | //high 16 bits of edx is the fractional t component
|
---|
313 |
|
---|
314 | //eax is used to lookup the texel as well as the low 8-bits of the lit texel
|
---|
315 | //ebx is used to lookup the high 8-bits of the lit texel
|
---|
316 | //ebp is used to detect a t-carry as well as lookup the lit texel
|
---|
317 | //cl is the loop count variable
|
---|
318 |
|
---|
319 | looper1:
|
---|
320 | add edi,8
|
---|
321 | add edx,dword ptr [dtdx_frac]
|
---|
322 |
|
---|
323 | sbb ebx,ebx
|
---|
324 | add ecx,dword ptr [dsdx_frac]
|
---|
325 |
|
---|
326 | movzx eax,word ptr [esi*2]
|
---|
327 |
|
---|
328 | adc esi,dword ptr [4+s_t_carry+ebx*4]
|
---|
329 | add edx,dword ptr [dtdx_frac]
|
---|
330 |
|
---|
331 | sbb ebx,ebx
|
---|
332 | and eax,eax //test to see if its zero
|
---|
333 |
|
---|
334 | jz skipped_1_pixel
|
---|
335 | mov word ptr [edi-8],ax //store 1 pixel
|
---|
336 |
|
---|
337 | skipped_1_pixel:
|
---|
338 | movzx eax,word ptr [esi*2]
|
---|
339 | add ecx,dword ptr [dsdx_frac]
|
---|
340 |
|
---|
341 | adc esi,dword ptr [4+s_t_carry+ebx*4]
|
---|
342 | add edx,dword ptr [dtdx_frac]
|
---|
343 |
|
---|
344 | sbb ebx,ebx
|
---|
345 | and eax,eax //test to see if its zero
|
---|
346 |
|
---|
347 | jz skipped_2_pixel
|
---|
348 | mov word ptr [edi-6],ax
|
---|
349 |
|
---|
350 | skipped_2_pixel:
|
---|
351 | movzx eax,word ptr [esi*2]
|
---|
352 | add ecx,dword ptr [dsdx_frac]
|
---|
353 |
|
---|
354 | adc esi,dword ptr [4+s_t_carry+ebx*4]
|
---|
355 | add edx,dword ptr [dtdx_frac]
|
---|
356 |
|
---|
357 | sbb ebx,ebx
|
---|
358 | and eax,eax //test to see if its zero
|
---|
359 |
|
---|
360 | jz skipped_3_pixel
|
---|
361 | mov word ptr [edi-4],ax
|
---|
362 |
|
---|
363 | skipped_3_pixel:
|
---|
364 | movzx eax,word ptr [esi*2]
|
---|
365 | add ecx,dword ptr [dsdx_frac]
|
---|
366 |
|
---|
367 | adc esi,dword ptr [4+s_t_carry+ebx*4]
|
---|
368 | and eax,eax //test to see if its zero
|
---|
369 |
|
---|
370 | jz skipped_4_pixel
|
---|
371 | mov word ptr [edi-2],ax
|
---|
372 |
|
---|
373 | skipped_4_pixel:
|
---|
374 | dec cl
|
---|
375 | jnz looper1
|
---|
376 |
|
---|
377 | pop ebp
|
---|
378 |
|
---|
379 | //store right_s and right_t in left_s and left_t
|
---|
380 | //right_s is what left_s starts at on the next 16 pixel span
|
---|
381 | //right_t is what left_t starts at on the next 16 pixel span
|
---|
382 |
|
---|
383 | mov eax,dword ptr [right_s_t] //right_s
|
---|
384 | mov ebx,dword ptr [right_s_t+4] //right_t
|
---|
385 |
|
---|
386 | mov dword ptr [left_s_t],eax //left_s
|
---|
387 | mov dword ptr [left_s_t+4],ebx //left_t
|
---|
388 | }
|
---|
389 |
|
---|
390 | _asm dec dword ptr [num_subdivisions]
|
---|
391 | }
|
---|
392 |
|
---|
393 | //store these so that the C code below actually works
|
---|
394 | _asm mov dword ptr [start_pixel],edi
|
---|
395 | }
|
---|
396 |
|
---|
397 | if (num_leftover)
|
---|
398 | {
|
---|
399 | if (num_leftover > 1)
|
---|
400 | {
|
---|
401 | if (had_subdivisions==0)
|
---|
402 | {
|
---|
403 | //calculate the right_z for the end of span
|
---|
404 | //ooz_right = left->ooz + (cur_grads.doozdx * num_leftover);
|
---|
405 | //soz_right = left->soz + (cur_grads.dsozdx * num_leftover);
|
---|
406 | //toz_right = left->toz + (cur_grads.dtozdx * num_leftover);
|
---|
407 |
|
---|
408 | _asm
|
---|
409 | {
|
---|
410 | movd mm2,dword ptr [num_leftover]
|
---|
411 | lea ebx,dword ptr [cur_grads]
|
---|
412 |
|
---|
413 | movd mm3, dword ptr [ebx]tri_gradients.dsozdx
|
---|
414 | mov edi,dword ptr [left]
|
---|
415 |
|
---|
416 | movd mm4, dword ptr [ebx]tri_gradients.dtozdx
|
---|
417 | pi2fd (m2, m2)
|
---|
418 |
|
---|
419 | movd mm5, dword ptr [ebx]tri_gradients.doozdx
|
---|
420 | pfmul (m3, m2)
|
---|
421 |
|
---|
422 | movq mm0, qword ptr [edi]perspective_span.soz
|
---|
423 | pfmul (m4, m2)
|
---|
424 |
|
---|
425 | movd mm1, dword ptr [edi]perspective_span.ooz
|
---|
426 | pfmul (m5, m2)
|
---|
427 |
|
---|
428 | pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
|
---|
429 |
|
---|
430 | pfadd (m1, m5) //ooz += doozdx*num_leftover
|
---|
431 | pfadd (m0, m3) //soz += dsozdx*num_leftover AND toz += dtozdx*num_leftover
|
---|
432 |
|
---|
433 | //calculate the z at the right endpoint in mm7
|
---|
434 | movq mm7, mm1
|
---|
435 | pfrcp (m6, m1)
|
---|
436 |
|
---|
437 | punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
|
---|
438 |
|
---|
439 | pfrcpit1 (m7, m6) //terrible stalls. oh well
|
---|
440 |
|
---|
441 | pfrcpit2 (m7, m6)
|
---|
442 | }
|
---|
443 | }
|
---|
444 | else
|
---|
445 | {
|
---|
446 | //the correct ending right_z is already being calculated
|
---|
447 | //(see the if (num_subdivisions!=1) case above
|
---|
448 | }
|
---|
449 |
|
---|
450 | _asm
|
---|
451 | {
|
---|
452 | //calculate starting fractional and integral values for s and t
|
---|
453 |
|
---|
454 | //calculate the right endpoint
|
---|
455 | //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;
|
---|
456 | //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;
|
---|
457 |
|
---|
458 | //soz_right and toz_right are in mm0
|
---|
459 | //right_z is in mm7
|
---|
460 | pfmul (m7, m0) //calculate right_s and right_t
|
---|
461 | mov edi,dword ptr [start_pixel]
|
---|
462 |
|
---|
463 | mov esi,dword ptr [r1_software_texture_ptr]
|
---|
464 | mov eax,dword ptr [left_s_t] //left_s
|
---|
465 |
|
---|
466 | shr esi,1
|
---|
467 | pf2id (m7, m7) //truncate right_s and right_t
|
---|
468 |
|
---|
469 | sar eax,16
|
---|
470 | mov ebx,dword ptr [left_s_t+4] //left_t
|
---|
471 |
|
---|
472 | sar ebx,16
|
---|
473 | movq qword ptr [right_s_t],mm7
|
---|
474 |
|
---|
475 | mov edx,dword ptr [left_s_t+4] //left_t
|
---|
476 | add esi,eax
|
---|
477 |
|
---|
478 | mov cl,byte ptr [r1_software_twidth_log2]
|
---|
479 | shl ebx,cl
|
---|
480 |
|
---|
481 | sal edx,16
|
---|
482 | mov ecx,dword ptr [left_s_t] //left_s
|
---|
483 |
|
---|
484 | sal ecx,16
|
---|
485 | add esi,ebx
|
---|
486 |
|
---|
487 | mov eax,dword ptr [right_s_t] //right_s
|
---|
488 | mov ebx,dword ptr [right_s_t+4] //right_t
|
---|
489 |
|
---|
490 | add eax,dword ptr [cur_grads]tri_gradients.s_adjust
|
---|
491 | add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
|
---|
492 |
|
---|
493 | //cap the right s and t
|
---|
494 | cmp eax,0
|
---|
495 | jge cmp_eax_high_2
|
---|
496 |
|
---|
497 | mov eax,0
|
---|
498 | jmp cmp_ebx_low_2
|
---|
499 |
|
---|
500 | cmp_eax_high_2:
|
---|
501 | cmp eax,dword ptr [s_mask]
|
---|
502 | jle cmp_ebx_low_2
|
---|
503 |
|
---|
504 | mov eax,dword ptr [s_mask]
|
---|
505 |
|
---|
506 | cmp_ebx_low_2:
|
---|
507 | cmp ebx,0
|
---|
508 | jge cmp_ebx_high_2
|
---|
509 |
|
---|
510 | mov ebx,0
|
---|
511 | jmp done_compare_2
|
---|
512 |
|
---|
513 | cmp_ebx_high_2:
|
---|
514 | cmp ebx,dword ptr [t_mask]
|
---|
515 | jle done_compare_2
|
---|
516 |
|
---|
517 | mov ebx,dword ptr [t_mask]
|
---|
518 |
|
---|
519 | done_compare_2:
|
---|
520 |
|
---|
521 | //calculate the deltas (left to right)
|
---|
522 | //temp_dsdx = qftoi((float)(right_s - left_s) * inverse_leftover_lookup[num_leftover]);
|
---|
523 | //temp_dtdx = qftoi((float)(right_t - left_t) * inverse_leftover_lookup[num_leftover]);
|
---|
524 |
|
---|
525 | sub eax,dword ptr [left_s_t] //left_s
|
---|
526 | sub ebx,dword ptr [left_s_t+4] //left_t
|
---|
527 |
|
---|
528 | movd mm0,eax //temp_dsdx
|
---|
529 | push ebp
|
---|
530 |
|
---|
531 | movd mm1,ebx //temp_dtdx
|
---|
532 | mov ebp, dword ptr [num_leftover]
|
---|
533 |
|
---|
534 | pi2fd (m0, m0)
|
---|
535 | movd mm2, dword ptr [inverse_leftover_lookup + ebp*4]
|
---|
536 |
|
---|
537 | pi2fd (m1, m1)
|
---|
538 | pfmul (m0, m2)
|
---|
539 |
|
---|
540 | pfmul (m1, m2) //bad stalls here
|
---|
541 | pf2id (m0, m0)
|
---|
542 |
|
---|
543 | pf2id (m1, m1)
|
---|
544 |
|
---|
545 | movd eax, mm0 //temp_dsdx
|
---|
546 | movd ebx, mm1 //temp_dtdx
|
---|
547 |
|
---|
548 | //calculate the fractional and integral delta vars
|
---|
549 | //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;
|
---|
550 | //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);
|
---|
551 | //dsdx_frac = (temp_dsdx<<16);
|
---|
552 | //dtdx_frac = (temp_dtdx<<16);
|
---|
553 |
|
---|
554 | mov word ptr [dsdx_frac+2],ax
|
---|
555 | mov word ptr [dtdx_frac+2],bx
|
---|
556 |
|
---|
557 | sar eax,16
|
---|
558 | mov dx,word ptr [left_l]
|
---|
559 |
|
---|
560 | sar ebx,16
|
---|
561 | mov cl,byte ptr [r1_software_twidth_log2]
|
---|
562 |
|
---|
563 | shl ebx,cl
|
---|
564 |
|
---|
565 | add eax,ebx
|
---|
566 | mov cl, byte ptr [num_leftover]
|
---|
567 |
|
---|
568 | mov dword ptr [s_t_carry+4],eax
|
---|
569 | add eax,dword ptr [r1_software_texture_width]
|
---|
570 |
|
---|
571 | mov dword ptr [s_t_carry],eax
|
---|
572 |
|
---|
573 | ALIGN 16
|
---|
574 |
|
---|
575 | looper3:
|
---|
576 | movzx eax,word ptr [esi*2]
|
---|
577 | add edx,dword ptr [dtdx_frac]
|
---|
578 |
|
---|
579 | sbb ebp,ebp
|
---|
580 | add edi,2 //the only convenient place for the stepping of edi was way up here
|
---|
581 |
|
---|
582 | add ecx,dword ptr [dsdx_frac]
|
---|
583 |
|
---|
584 | adc esi,dword ptr [4+s_t_carry+ebp*4]
|
---|
585 | and eax,eax
|
---|
586 |
|
---|
587 | jz skip_a_pixel
|
---|
588 | mov word ptr [edi-2],ax
|
---|
589 |
|
---|
590 | skip_a_pixel:
|
---|
591 | dec cl
|
---|
592 | jnz looper3
|
---|
593 |
|
---|
594 | pop ebp
|
---|
595 | }
|
---|
596 | }
|
---|
597 | else
|
---|
598 | {
|
---|
599 | //highly unoptimized single pixel drawer
|
---|
600 | register w16 texel = *(r1_software_texture_ptr + (left_s_t[0]>>16) + ((left_s_t[1]>>16)<<r1_software_twidth_log2));
|
---|
601 |
|
---|
602 | if (texel)
|
---|
603 | *start_pixel = texel;
|
---|
604 | }
|
---|
605 | }
|
---|
606 |
|
---|
607 | return;
|
---|
608 |
|
---|
609 | _asm
|
---|
610 | {
|
---|
611 | dumpmmxregs:
|
---|
612 | movq qword ptr [mmx0],mm0
|
---|
613 | movq qword ptr [mmx1],mm1
|
---|
614 | movq qword ptr [mmx2],mm2
|
---|
615 | movq qword ptr [mmx3],mm3
|
---|
616 | movq qword ptr [mmx4],mm4
|
---|
617 | movq qword ptr [mmx5],mm5
|
---|
618 | movq qword ptr [mmx6],mm6
|
---|
619 | movq qword ptr [mmx7],mm7
|
---|
620 | ret
|
---|
621 | }
|
---|
622 |
|
---|
623 | }
|
---|
624 |
|
---|