/********************************************************************** <BR>
  This file is part of Crack dot Com's free source code release of
  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
  information about compiling & licensing issues visit this URL</a>
  <PRE> If that doesn't help, contact Jonathan Clark at
  golgotha_source@usa.net (Subject should have "GOLG" in it)
***********************************************************************/
8 |
|
---|
9 | #include "software/r1_software_globals.hh"
|
---|
10 | #include "software/inline_fpu.hh"
|
---|
11 | #include "software/amd3d/amd3d.h"
|
---|
12 |
|
---|
13 | w32 *texture_perspective_lit_starter_amd3d()
|
---|
14 | {
|
---|
15 | bogus_label:
|
---|
16 |
|
---|
17 | w32 returnval;
|
---|
18 | _asm
|
---|
19 | {
|
---|
20 | lea eax,bogus_label
|
---|
21 | mov dword ptr [returnval],eax
|
---|
22 | }
|
---|
23 | return (w32 *)returnval;
|
---|
24 | }
|
---|
25 |
|
---|
26 | extern sw32 had_subdivisions;
|
---|
27 | extern w8 last_bh2;
|
---|
28 |
|
---|
29 | //instead of using left_s, left_t, right_s, and right_t,
|
---|
30 | //the divides and multiplies are nicely vectorized by the amd3d,
|
---|
31 | //and storing them is a single quad store to an array of 2 floats,
|
---|
32 | //rather than two dword stores to two seperate floats
|
---|
33 |
|
---|
34 | sw32 left_s_t[2];
|
---|
35 | sw32 right_s_t[2];
|
---|
36 |
|
---|
37 | float mmx0[2];
|
---|
38 | float mmx1[2];
|
---|
39 | float mmx2[2];
|
---|
40 | float mmx3[2];
|
---|
41 | float mmx4[2];
|
---|
42 | float mmx5[2];
|
---|
43 | float mmx6[2];
|
---|
44 | float mmx7[2];
|
---|
45 |
|
---|
46 | void texture_scanline_perspective_lit_amd3d(w16 *start_pixel,
|
---|
47 | sw32 start_x,
|
---|
48 | void *_left,//perspective_span *left,
|
---|
49 | sw32 width)
|
---|
50 | {
|
---|
51 | start_pixel = (w16 *)((w8 *)start_pixel + start_x);
|
---|
52 |
|
---|
53 | perspective_span *left = (perspective_span *)_left;
|
---|
54 |
|
---|
55 | last_bh2 = 0;
|
---|
56 |
|
---|
57 | _asm
|
---|
58 | {
|
---|
59 | //left_z = 1.f / left->ooz;
|
---|
60 | //left_s = qftoi(left->soz * left_z) + cur_grads.s_adjust;
|
---|
61 | //left_t = qftoi(left->toz * left_z) + cur_grads.t_adjust;
|
---|
62 |
|
---|
63 | //sw32 had_subdivisions = width & (~15);
|
---|
64 | //num_subdivisions = width >> 4;
|
---|
65 | //num_leftover = width & 15;
|
---|
66 |
|
---|
67 | mov edi,dword ptr [left]
|
---|
68 | mov eax,dword ptr [width]
|
---|
69 |
|
---|
70 | movd mm0, dword ptr [edi]perspective_span.ooz
|
---|
71 | mov ebx,eax
|
---|
72 |
|
---|
73 | pfrcp (m1, m0)
|
---|
74 | and eax,15
|
---|
75 |
|
---|
76 | shr ebx,4
|
---|
77 | punpckldq mm0, mm0 //duplicate low 32bits of m0 into high 32 bits of m0
|
---|
78 |
|
---|
79 | pfrcpit1 (m0, m1)
|
---|
80 | mov ecx,dword ptr [width]
|
---|
81 |
|
---|
82 | movq mm2, qword ptr [edi]perspective_span.soz
|
---|
83 | mov dword ptr [num_leftover],eax
|
---|
84 |
|
---|
85 | pfrcpit2 (m0, m1)
|
---|
86 | and ecx,(~15)
|
---|
87 |
|
---|
88 | mov eax,dword ptr [edi]perspective_span.l
|
---|
89 | mov dword ptr [num_subdivisions],ebx
|
---|
90 |
|
---|
91 | pfmul (m2, m0)
|
---|
92 | mov dword ptr [had_subdivisions],ecx
|
---|
93 |
|
---|
94 | mov dword ptr [left_l],eax
|
---|
95 | //clear these out
|
---|
96 | mov dword ptr [dsdx_frac],0
|
---|
97 |
|
---|
98 | //high 32 bits of mm2 - toz / ooz (aka t)
|
---|
99 | //low 32 bits of mm2 - soz / ooz (aka s)
|
---|
100 |
|
---|
101 | pf2id (m3, m2)
|
---|
102 | mov dword ptr [dtdx_frac],0
|
---|
103 |
|
---|
104 | //high 32 bits of mm3 - toz / ooz (aka t) - truncated ints
|
---|
105 | //low 32 bits of mm3 - soz / ooz (aka s) - truncated ints
|
---|
106 |
|
---|
107 | paddd mm3, qword ptr [cur_grads]tri_gradients.s_adjust
|
---|
108 |
|
---|
109 | //high 32 bits of mm3 - t + t_adjust
|
---|
110 | //low 32 bits of mm3 - s + s_adjust
|
---|
111 |
|
---|
112 | movq qword ptr [left_s_t], mm3
|
---|
113 | }
|
---|
114 |
|
---|
115 | if (num_subdivisions)
|
---|
116 | {
|
---|
117 | _asm
|
---|
118 | {
|
---|
119 | //ooz_right = left->ooz + (cur_grads.doozdxspan);
|
---|
120 | //soz_right = left->soz + (cur_grads.dsozdxspan);
|
---|
121 | //toz_right = left->toz + (cur_grads.dtozdxspan);
|
---|
122 |
|
---|
123 | //edi still has dword ptr [left]
|
---|
124 | lea ebx,dword ptr [cur_grads]
|
---|
125 | nop
|
---|
126 |
|
---|
127 | movd mm1, dword ptr [edi]perspective_span.ooz
|
---|
128 | mov esi,dword ptr [r1_software_texture_ptr]
|
---|
129 |
|
---|
130 | movd mm3, dword ptr [ebx]tri_gradients.doozdxspan
|
---|
131 | mov eax,dword ptr [left_s_t] //left_s
|
---|
132 |
|
---|
133 | shr esi,1
|
---|
134 | movq mm0, qword ptr [edi]perspective_span.soz
|
---|
135 |
|
---|
136 | pfadd (m1, m3)
|
---|
137 | movq mm2, qword ptr [ebx]tri_gradients.dsozdxspan
|
---|
138 |
|
---|
139 | sar eax,16 //get integral left_s into eax
|
---|
140 | mov edi,dword ptr [start_pixel]
|
---|
141 |
|
---|
142 | pfrcp (m6, m1)
|
---|
143 | movq mm7,mm1
|
---|
144 |
|
---|
145 | pfadd (m0, m2)
|
---|
146 | mov ebx,dword ptr [left_s_t+4] //left_t
|
---|
147 |
|
---|
148 | //calculate the 1st right_z in mm7
|
---|
149 | sar ebx,16 //get integral left_t into ebx
|
---|
150 | punpckldq mm7, mm7 //duplicate high 32bits of mm7 into low 32 bits of mm7
|
---|
151 |
|
---|
152 | pfrcpit1 (m7, m6)
|
---|
153 | mov edx,dword ptr [left_s_t+4] //left_t
|
---|
154 |
|
---|
155 | mov cl,byte ptr [r1_software_twidth_log2]
|
---|
156 | add esi,eax
|
---|
157 |
|
---|
158 | pfrcpit2 (m7, m6)
|
---|
159 |
|
---|
160 |
|
---|
161 |
|
---|
162 | //calculate starting fractional and integral values for s and t
|
---|
163 | //esi = starting_s_coordinate >> 16 + starting_t_coordinate >> 16 << r1_software_twidth_log2
|
---|
164 | //ecx = starting_s_coordinate << 16
|
---|
165 | //edx = starting_t_coordinate << 16
|
---|
166 | //dx = starting_light_value
|
---|
167 |
|
---|
168 | //some stuff has been moved up, interleaved w/the mmx code above
|
---|
169 |
|
---|
170 | shl ebx,cl //multiply integral left_t by texture width
|
---|
171 |
|
---|
172 | sal edx,16 //get fractional left_t into edx
|
---|
173 | mov ecx,dword ptr [left_s_t] //left_s
|
---|
174 |
|
---|
175 | sal ecx,16
|
---|
176 | add esi,ebx
|
---|
177 |
|
---|
178 | mov dx,word ptr [left_l]
|
---|
179 | mov ch,dl //store the initial lighting error from the 1st lighting value
|
---|
180 | //CH MUST not be touched between here and the actual rasterization loop
|
---|
181 | }
|
---|
182 |
|
---|
183 | while (num_subdivisions)
|
---|
184 | {
|
---|
185 | _asm
|
---|
186 | {
|
---|
187 | //right_s = qftoi(soz_right * right_z);
|
---|
188 | //right_t = qftoi(toz_right * right_z);
|
---|
189 |
|
---|
190 | //soz_right and toz_right are in mm0
|
---|
191 | //right_z is in mm7
|
---|
192 | pfmul (m7, m0)
|
---|
193 |
|
---|
194 | pf2id (m7, m7)
|
---|
195 |
|
---|
196 | movq qword ptr [right_s_t],mm7
|
---|
197 |
|
---|
198 | //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are
|
---|
199 | //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover
|
---|
200 | //in the leftover span, calculate the end of that.
|
---|
201 |
|
---|
202 | //if (num_subdivisions!=1)
|
---|
203 | //{
|
---|
204 | cmp dword ptr [num_subdivisions],1
|
---|
205 | je last_subdivision
|
---|
206 |
|
---|
207 | //ooz_right += (cur_grads.doozdxspan);
|
---|
208 | //soz_right += (cur_grads.dsozdxspan);
|
---|
209 | //toz_right += (cur_grads.dtozdxspan);
|
---|
210 |
|
---|
211 | pfadd (m0, m2)
|
---|
212 | pfadd (m1, m3)
|
---|
213 |
|
---|
214 | jmp proceed_with_mapping
|
---|
215 | //}
|
---|
216 | //else
|
---|
217 | //if (num_leftover > 1)
|
---|
218 | //{
|
---|
219 |
|
---|
220 | last_subdivision:
|
---|
221 | cmp dword ptr [num_leftover],1
|
---|
222 | jle proceed_with_mapping
|
---|
223 |
|
---|
224 | //calculate the right_z for the end of the leftover span
|
---|
225 | //ooz_right += (cur_grads.doozdx * num_leftover);
|
---|
226 | //soz_right += (cur_grads.dsozdx * num_leftover);
|
---|
227 | //toz_right += (cur_grads.dtozdx * num_leftover);
|
---|
228 |
|
---|
229 | movd mm2,dword ptr [num_leftover]
|
---|
230 | movd mm3, dword ptr [cur_grads]tri_gradients.dsozdx
|
---|
231 |
|
---|
232 | pi2fd (m2, m2)
|
---|
233 | movd mm4, dword ptr [cur_grads]tri_gradients.dtozdx
|
---|
234 |
|
---|
235 | pfmul (m3, m2)
|
---|
236 | movd mm5, dword ptr [cur_grads]tri_gradients.doozdx
|
---|
237 |
|
---|
238 | pfmul (m4, m2)
|
---|
239 | pfmul (m5, m2)
|
---|
240 |
|
---|
241 | pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
|
---|
242 |
|
---|
243 | pfadd (m0, m3)
|
---|
244 | pfadd (m1, m5)
|
---|
245 | //}
|
---|
246 |
|
---|
247 | proceed_with_mapping:
|
---|
248 | //cap the right_s and right_t's so that they're valid
|
---|
249 |
|
---|
250 | mov eax,dword ptr [right_s_t] //right_s
|
---|
251 | mov ebx,dword ptr [right_s_t+4] //right_t
|
---|
252 |
|
---|
253 | add eax,dword ptr [cur_grads]tri_gradients.s_adjust
|
---|
254 | add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
|
---|
255 |
|
---|
256 | //cap the right s and t
|
---|
257 | cmp eax,0
|
---|
258 | jge cmp_eax_high
|
---|
259 |
|
---|
260 | mov eax,0
|
---|
261 | jmp cmp_ebx_low
|
---|
262 |
|
---|
263 | cmp_eax_high:
|
---|
264 | cmp eax,dword ptr [s_mask]
|
---|
265 | jle cmp_ebx_low
|
---|
266 |
|
---|
267 | mov eax,dword ptr [s_mask]
|
---|
268 |
|
---|
269 | cmp_ebx_low:
|
---|
270 | cmp ebx,0
|
---|
271 | jge cmp_ebx_high
|
---|
272 |
|
---|
273 | mov ebx,0
|
---|
274 | jmp done_compare
|
---|
275 |
|
---|
276 | cmp_ebx_high:
|
---|
277 | cmp ebx,dword ptr [t_mask]
|
---|
278 | jle done_compare
|
---|
279 |
|
---|
280 | mov ebx,dword ptr [t_mask]
|
---|
281 |
|
---|
282 | done_compare:
|
---|
283 |
|
---|
284 | //store the right_s and right_t
|
---|
285 | //so they can be copied into left_s and left_t at the end of the 16-pixel span
|
---|
286 | //(the cant be copied now because we have to calculate (right_s-left_s)>>4 and (right_t-left_t)>>4
|
---|
287 |
|
---|
288 | //calculate the next right_z in mm7
|
---|
289 | //unfortunately, if the span is a multiple of 16, and this is the last set of 16, it will
|
---|
290 | //calculate an unnecessary z. but its best to have the code here mixed in w/integer ops so
|
---|
291 | //that the amd3d code has something for its executation latencies to sit through
|
---|
292 | movq mm7, mm1
|
---|
293 | pfrcp (m6, m1)
|
---|
294 |
|
---|
295 | mov dword ptr [right_s_t],eax //right_s
|
---|
296 | mov dword ptr [right_s_t+4],ebx //right_t
|
---|
297 |
|
---|
298 | punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
|
---|
299 | sub eax,dword ptr [left_s_t] //left_s
|
---|
300 |
|
---|
301 | sar eax,4
|
---|
302 | push ebp
|
---|
303 |
|
---|
304 | pfrcpit1 (m7, m6)
|
---|
305 | sub ebx,dword ptr [left_s_t+4] //left_t
|
---|
306 |
|
---|
307 | sar ebx,4
|
---|
308 | mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_s-left_s)>>4)<<16
|
---|
309 |
|
---|
310 | pfrcpit2 (m7, m6)
|
---|
311 | nop
|
---|
312 |
|
---|
313 | sar eax,16
|
---|
314 | mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_t-left_t)>>4)<<16
|
---|
315 |
|
---|
316 | sar ebx,16
|
---|
317 | mov cl,byte ptr [r1_software_twidth_log2]
|
---|
318 |
|
---|
319 | shl ebx,cl
|
---|
320 |
|
---|
321 | add eax,ebx
|
---|
322 | mov ebx,0 //clear high bits of ebx
|
---|
323 |
|
---|
324 | //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2
|
---|
325 | //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width
|
---|
326 |
|
---|
327 | mov dword ptr [s_t_carry+4],eax
|
---|
328 | add eax,dword ptr [r1_software_texture_width]
|
---|
329 |
|
---|
330 | mov dword ptr [s_t_carry],eax
|
---|
331 | mov cl,4 //loop is unrolled to 4 pixels - we want to draw 16, so loop 4 times
|
---|
332 |
|
---|
333 | mov bh,byte ptr [last_bh2] //setup the initial dither
|
---|
334 | clc //clear the carry bit
|
---|
335 |
|
---|
336 | ALIGN 16
|
---|
337 |
|
---|
338 | //high 16 bits of ecx is the fractional s component
|
---|
339 | //high 16 bits of edx is the fractional t component
|
---|
340 |
|
---|
341 | //eax is used to lookup the texel as well as the low 8-bits of the lit texel
|
---|
342 | //ebx is used to lookup the high 8-bits of the lit texel
|
---|
343 | //ebp is used to detect a t-carry as well as lookup the lit texel
|
---|
344 | //cl is the loop count variable
|
---|
345 | //dx is the lighting value (8 bits integer, 8 bits fraction)
|
---|
346 | //ch is the lighting error
|
---|
347 | //bh is used to dither the lighting (mov bh,0 then add ch,dl then adc bh,dh)
|
---|
348 |
|
---|
349 | looper1:
|
---|
350 | adc bh,dh
|
---|
351 | add edi,8 //the only convenient place for the stepping of edi was way up here
|
---|
352 |
|
---|
353 | movzx eax,word ptr [esi*2]
|
---|
354 | add edx,dword ptr [dtdx_frac]
|
---|
355 |
|
---|
356 | sbb ebp,ebp
|
---|
357 | mov bl,ah
|
---|
358 |
|
---|
359 | add ecx,dword ptr [dsdx_frac]
|
---|
360 | mov ah,bh
|
---|
361 |
|
---|
362 | adc esi,dword ptr [4+s_t_carry+ebp*4]
|
---|
363 | mov ebp,dword ptr [0xDEADBEEF+ctable_size_bytes+ebx*4]
|
---|
364 |
|
---|
365 | add ebp,dword ptr [0xDEADBEEF+eax*4]
|
---|
366 | add edx,dword ptr [dldx_fixed]
|
---|
367 |
|
---|
368 | mov bh,0
|
---|
369 | add ch,dl
|
---|
370 |
|
---|
371 | mov word ptr [edi-8],bp
|
---|
372 | adc bh,dh
|
---|
373 |
|
---|
374 | movzx eax,word ptr [esi*2] //first pixel is finished aroundhere
|
---|
375 | add edx,dword ptr [dtdx_frac]
|
---|
376 |
|
---|
377 | sbb ebp,ebp
|
---|
378 | mov bl,ah
|
---|
379 |
|
---|
380 | add ecx,dword ptr [dsdx_frac]
|
---|
381 | mov ah,bh
|
---|
382 |
|
---|
383 | adc esi,dword ptr [4+s_t_carry+ebp*4]
|
---|
384 | mov ebp,dword ptr [0xDEADBEEF+ctable_size_bytes+ebx*4]
|
---|
385 |
|
---|
386 | add ebp,dword ptr [0xDEADBEEF+eax*4]
|
---|
387 | add edx,dword ptr [dldx_fixed]
|
---|
388 |
|
---|
389 | mov bh,0
|
---|
390 | add ch,dl
|
---|
391 |
|
---|
392 | mov word ptr [edi-6],bp
|
---|
393 | adc bh,dh
|
---|
394 |
|
---|
395 | movzx eax,word ptr [esi*2]
|
---|
396 | add edx,dword ptr [dtdx_frac]
|
---|
397 |
|
---|
398 | sbb ebp,ebp
|
---|
399 | mov bl,ah
|
---|
400 |
|
---|
401 | add ecx,dword ptr [dsdx_frac]
|
---|
402 | mov ah,bh
|
---|
403 |
|
---|
404 | adc esi,dword ptr [4+s_t_carry+ebp*4]
|
---|
405 | mov ebp,dword ptr [0xDEADBEEF+ctable_size_bytes+ebx*4]
|
---|
406 |
|
---|
407 | add ebp,dword ptr [0xDEADBEEF+eax*4]
|
---|
408 | add edx,dword ptr [dldx_fixed]
|
---|
409 |
|
---|
410 | mov bh,0
|
---|
411 | add ch,dl
|
---|
412 |
|
---|
413 | mov word ptr [edi-4],bp
|
---|
414 | adc bh,dh
|
---|
415 |
|
---|
416 | movzx eax,word ptr [esi*2] //first pixel is finished aroundhere
|
---|
417 | add edx,dword ptr [dtdx_frac]
|
---|
418 |
|
---|
419 | sbb ebp,ebp
|
---|
420 | mov bl,ah
|
---|
421 |
|
---|
422 | add ecx,dword ptr [dsdx_frac]
|
---|
423 | mov ah,bh
|
---|
424 |
|
---|
425 | adc esi,dword ptr [4+s_t_carry+ebp*4]
|
---|
426 | mov ebp,dword ptr [0xDEADBEEF+ctable_size_bytes+ebx*4]
|
---|
427 |
|
---|
428 | add ebp,dword ptr [0xDEADBEEF+eax*4]
|
---|
429 | add edx,dword ptr [dldx_fixed]
|
---|
430 |
|
---|
431 | mov bh,0
|
---|
432 | add ch,dl
|
---|
433 |
|
---|
434 | mov word ptr [edi-2],bp
|
---|
435 | dec cl //thank god this doesnt modify the carry flag (the above add ch,dl needs to effect the adc bh,dh at the top of the loop)
|
---|
436 |
|
---|
437 | jnz looper1
|
---|
438 | adc bh,0 //if we're done looping, save the last carry information here
|
---|
439 |
|
---|
440 | pop ebp
|
---|
441 | mov byte ptr [last_bh2],bh //save it
|
---|
442 |
|
---|
443 | //store right_s and right_t in left_s and left_t
|
---|
444 | //right_s is what left_s starts at on the next 16 pixel span
|
---|
445 | //right_t is what left_t starts at on the next 16 pixel span
|
---|
446 |
|
---|
447 | mov eax,dword ptr [right_s_t] //right_s
|
---|
448 | mov ebx,dword ptr [right_s_t+4] //right_t
|
---|
449 |
|
---|
450 | mov dword ptr [left_s_t],eax //left_s
|
---|
451 | mov dword ptr [left_s_t+4],ebx //left_t
|
---|
452 | }
|
---|
453 |
|
---|
454 | _asm dec dword ptr [num_subdivisions]
|
---|
455 | }
|
---|
456 |
|
---|
457 | //store these so that the C code below actually works
|
---|
458 | _asm mov word ptr [left_l],dx
|
---|
459 | _asm mov dword ptr [start_pixel],edi
|
---|
460 | }
|
---|
461 |
|
---|
462 | if (num_leftover)
|
---|
463 | {
|
---|
464 | if (num_leftover > 1)
|
---|
465 | {
|
---|
466 | if (had_subdivisions==0)
|
---|
467 | {
|
---|
468 | //calculate the right_z for the end of span
|
---|
469 | //ooz_right = left->ooz + (cur_grads.doozdx * num_leftover);
|
---|
470 | //soz_right = left->soz + (cur_grads.dsozdx * num_leftover);
|
---|
471 | //toz_right = left->toz + (cur_grads.dtozdx * num_leftover);
|
---|
472 |
|
---|
473 | _asm
|
---|
474 | {
|
---|
475 | movd mm2,dword ptr [num_leftover]
|
---|
476 | lea ebx,dword ptr [cur_grads]
|
---|
477 |
|
---|
478 | movd mm3, dword ptr [ebx]tri_gradients.dsozdx
|
---|
479 | mov edi,dword ptr [left]
|
---|
480 |
|
---|
481 | movd mm4, dword ptr [ebx]tri_gradients.dtozdx
|
---|
482 | pi2fd (m2, m2)
|
---|
483 |
|
---|
484 | movd mm5, dword ptr [ebx]tri_gradients.doozdx
|
---|
485 | pfmul (m3, m2)
|
---|
486 |
|
---|
487 | movq mm0, qword ptr [edi]perspective_span.soz
|
---|
488 | pfmul (m4, m2)
|
---|
489 |
|
---|
490 | movd mm1, dword ptr [edi]perspective_span.ooz
|
---|
491 | pfmul (m5, m2)
|
---|
492 |
|
---|
493 | pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
|
---|
494 |
|
---|
495 | pfadd (m1, m5) //ooz += doozdx*num_leftover
|
---|
496 | pfadd (m0, m3) //soz += dsozdx*num_leftover AND toz += dtozdx*num_leftover
|
---|
497 |
|
---|
498 | //calculate the z at the right endpoint in mm7
|
---|
499 | movq mm7, mm1
|
---|
500 | pfrcp (m6, m1)
|
---|
501 |
|
---|
502 | punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
|
---|
503 |
|
---|
504 | pfrcpit1 (m7, m6) //terrible stalls. oh well
|
---|
505 |
|
---|
506 | pfrcpit2 (m7, m6)
|
---|
507 | }
|
---|
508 | }
|
---|
509 | else
|
---|
510 | {
|
---|
511 | //the correct ending right_z is already being calculated
|
---|
512 | //(see the if (num_subdivisions!=1) case above
|
---|
513 | }
|
---|
514 |
|
---|
515 | _asm
|
---|
516 | {
|
---|
517 | //calculate starting fractional and integral values for s and t
|
---|
518 |
|
---|
519 | //calculate the right endpoint
|
---|
520 | //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;
|
---|
521 | //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;
|
---|
522 |
|
---|
523 | //soz_right and toz_right are in mm0
|
---|
524 | //right_z is in mm7
|
---|
525 | pfmul (m7, m0) //calculate right_s and right_t
|
---|
526 | mov edi,dword ptr [start_pixel]
|
---|
527 |
|
---|
528 | mov esi,dword ptr [r1_software_texture_ptr]
|
---|
529 | mov eax,dword ptr [left_s_t] //left_s
|
---|
530 |
|
---|
531 | shr esi,1
|
---|
532 | pf2id (m7, m7) //truncate right_s and right_t
|
---|
533 |
|
---|
534 | sar eax,16
|
---|
535 | mov ebx,dword ptr [left_s_t+4] //left_t
|
---|
536 |
|
---|
537 | sar ebx,16
|
---|
538 | movq qword ptr [right_s_t],mm7
|
---|
539 |
|
---|
540 | mov edx,dword ptr [left_s_t+4] //left_t
|
---|
541 | add esi,eax
|
---|
542 |
|
---|
543 | mov cl,byte ptr [r1_software_twidth_log2]
|
---|
544 | shl ebx,cl
|
---|
545 |
|
---|
546 | sal edx,16
|
---|
547 | mov ecx,dword ptr [left_s_t] //left_s
|
---|
548 |
|
---|
549 | sal ecx,16
|
---|
550 | add esi,ebx
|
---|
551 |
|
---|
552 | mov eax,dword ptr [right_s_t] //right_s
|
---|
553 | mov ebx,dword ptr [right_s_t+4] //right_t
|
---|
554 |
|
---|
555 | add eax,dword ptr [cur_grads]tri_gradients.s_adjust
|
---|
556 | add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
|
---|
557 |
|
---|
558 | //cap the right s and t
|
---|
559 | cmp eax,0
|
---|
560 | jge cmp_eax_high_2
|
---|
561 |
|
---|
562 | mov eax,0
|
---|
563 | jmp cmp_ebx_low_2
|
---|
564 |
|
---|
565 | cmp_eax_high_2:
|
---|
566 | cmp eax,dword ptr [s_mask]
|
---|
567 | jle cmp_ebx_low_2
|
---|
568 |
|
---|
569 | mov eax,dword ptr [s_mask]
|
---|
570 |
|
---|
571 | cmp_ebx_low_2:
|
---|
572 | cmp ebx,0
|
---|
573 | jge cmp_ebx_high_2
|
---|
574 |
|
---|
575 | mov ebx,0
|
---|
576 | jmp done_compare_2
|
---|
577 |
|
---|
578 | cmp_ebx_high_2:
|
---|
579 | cmp ebx,dword ptr [t_mask]
|
---|
580 | jle done_compare_2
|
---|
581 |
|
---|
582 | mov ebx,dword ptr [t_mask]
|
---|
583 |
|
---|
584 | done_compare_2:
|
---|
585 |
|
---|
586 | //calculate the deltas (left to right)
|
---|
587 | //temp_dsdx = qftoi((float)(right_s - left_s) * inverse_leftover_lookup[num_leftover]);
|
---|
588 | //temp_dtdx = qftoi((float)(right_t - left_t) * inverse_leftover_lookup[num_leftover]);
|
---|
589 |
|
---|
590 | sub eax,dword ptr [left_s_t] //left_s
|
---|
591 | sub ebx,dword ptr [left_s_t+4] //left_t
|
---|
592 |
|
---|
593 | movd mm0,eax //temp_dsdx
|
---|
594 | push ebp
|
---|
595 |
|
---|
596 | movd mm1,ebx //temp_dtdx
|
---|
597 | mov ebp, dword ptr [num_leftover]
|
---|
598 |
|
---|
599 | pi2fd (m0, m0)
|
---|
600 | movd mm2, dword ptr [inverse_leftover_lookup + ebp*4]
|
---|
601 |
|
---|
602 | pi2fd (m1, m1)
|
---|
603 | pfmul (m0, m2)
|
---|
604 |
|
---|
605 | pfmul (m1, m2) //bad stalls here
|
---|
606 | pf2id (m0, m0)
|
---|
607 |
|
---|
608 | pf2id (m1, m1)
|
---|
609 |
|
---|
610 | movd eax, mm0 //temp_dsdx
|
---|
611 | movd ebx, mm1 //temp_dtdx
|
---|
612 |
|
---|
613 | //calculate the fractional and integral delta vars
|
---|
614 | //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;
|
---|
615 | //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);
|
---|
616 | //dsdx_frac = (temp_dsdx<<16);
|
---|
617 | //dtdx_frac = (temp_dtdx<<16);
|
---|
618 |
|
---|
619 | mov word ptr [dsdx_frac+2],ax
|
---|
620 | mov word ptr [dtdx_frac+2],bx
|
---|
621 |
|
---|
622 | sar eax,16
|
---|
623 | mov dx,word ptr [left_l]
|
---|
624 |
|
---|
625 | sar ebx,16
|
---|
626 | mov cl,byte ptr [r1_software_twidth_log2]
|
---|
627 |
|
---|
628 | shl ebx,cl
|
---|
629 |
|
---|
630 | add eax,ebx
|
---|
631 | mov ebx,0 //clear high bits
|
---|
632 |
|
---|
633 | mov dword ptr [s_t_carry+4],eax
|
---|
634 | add eax,dword ptr [r1_software_texture_width]
|
---|
635 |
|
---|
636 | mov dword ptr [s_t_carry],eax
|
---|
637 | mov cl, byte ptr [num_leftover]
|
---|
638 |
|
---|
639 | mov ch,dl //setup the initial lighting error
|
---|
640 | mov bh,byte ptr [last_bh2] //setup the initial dither
|
---|
641 |
|
---|
642 | clc //clear the carry bit
|
---|
643 |
|
---|
644 | ALIGN 16
|
---|
645 |
|
---|
646 | looper3:
|
---|
647 | adc bh,dh
|
---|
648 | add edi,2
|
---|
649 |
|
---|
650 | movzx eax,word ptr [esi*2]
|
---|
651 | add edx,dword ptr [dtdx_frac]
|
---|
652 |
|
---|
653 | sbb ebp,ebp
|
---|
654 | mov bl,ah
|
---|
655 |
|
---|
656 | add ecx,dword ptr [dsdx_frac]
|
---|
657 | mov ah,bh
|
---|
658 |
|
---|
659 | adc esi,dword ptr [4+s_t_carry+ebp*4]
|
---|
660 | mov ebp,dword ptr [0xDEADBEEF+ctable_size_bytes+ebx*4]
|
---|
661 |
|
---|
662 | add ebp,dword ptr [0xDEADBEEF+eax*4]
|
---|
663 | add edx,dword ptr [dldx_fixed]
|
---|
664 |
|
---|
665 | mov bh,0
|
---|
666 | add ch,dl
|
---|
667 |
|
---|
668 | mov word ptr [edi-2],bp
|
---|
669 | dec cl //thank god this doesnt modify the carry flag (the above add ch,dl needs to effect the adc bh,dh at the top of the loop)
|
---|
670 |
|
---|
671 | jnz looper3
|
---|
672 |
|
---|
673 | pop ebp
|
---|
674 | }
|
---|
675 | }
|
---|
676 | else
|
---|
677 | {
|
---|
678 | register w16 texel;
|
---|
679 | register w32 l_lookup;
|
---|
680 |
|
---|
681 | //highly unoptimized single pixel drawer //left_s //left_t
|
---|
682 | texel = *(r1_software_texture_ptr + (left_s_t[0]>>16) + ((left_s_t[1]>>16)<<r1_software_twidth_log2));
|
---|
683 |
|
---|
684 | l_lookup = left_l & (NUM_LIGHT_SHADES<<8);
|
---|
685 |
|
---|
686 | // low bits high bits
|
---|
687 | *start_pixel = (w16)(((w32 *)(0xDEADBEEF))[l_lookup + (texel & 0xFF)] + ((w32 *)(0xDEADBEEF)+ctable_size)[l_lookup + (texel>>8)]);
|
---|
688 | }
|
---|
689 | }
|
---|
690 |
|
---|
691 | return;
|
---|
692 |
|
---|
693 | _asm
|
---|
694 | {
|
---|
695 | dumpmmxregs:
|
---|
696 | movq qword ptr [mmx0],mm0
|
---|
697 | movq qword ptr [mmx1],mm1
|
---|
698 | movq qword ptr [mmx2],mm2
|
---|
699 | movq qword ptr [mmx3],mm3
|
---|
700 | movq qword ptr [mmx4],mm4
|
---|
701 | movq qword ptr [mmx5],mm5
|
---|
702 | movq qword ptr [mmx6],mm6
|
---|
703 | movq qword ptr [mmx7],mm7
|
---|
704 | ret
|
---|
705 | }
|
---|
706 |
|
---|
707 | }
|
---|
708 |
|
---|
709 | w32 *texture_perspective_lit_sentinel_amd3d()
|
---|
710 | {
|
---|
711 | bogus_label:
|
---|
712 |
|
---|
713 | w32 returnval;
|
---|
714 | _asm
|
---|
715 | {
|
---|
716 | lea eax,bogus_label
|
---|
717 | mov dword ptr [returnval],eax
|
---|
718 | }
|
---|
719 | return (w32 *)returnval;
|
---|
720 | }
|
---|
721 |
|
---|
722 | void insert_color_modify_address_low(w32 *address);
|
---|
723 | void insert_color_modify_address_high(w32 *address);
|
---|
724 | extern w32 color_modify_list[];
|
---|
725 | extern sw32 num_color_modifies;
|
---|
726 |
|
---|
727 | void setup_color_modify_perspective_lit_amd3d()
|
---|
728 | {
|
---|
729 | w32 *stop = texture_perspective_lit_sentinel_amd3d();
|
---|
730 |
|
---|
731 | w32 *search = texture_perspective_lit_starter_amd3d();
|
---|
732 | //start searching for 0xDEADBEEF
|
---|
733 | while (search < stop)
|
---|
734 | {
|
---|
735 | //casting craziness
|
---|
736 | search = (w32 *)((w8 *)search + 1);
|
---|
737 | if (*search==0xDEADBEEF)
|
---|
738 | {
|
---|
739 | insert_color_modify_address_low(search);
|
---|
740 | }
|
---|
741 | else
|
---|
742 | if (*search==(0xDEADBEEF + ctable_size_bytes))
|
---|
743 | {
|
---|
744 | insert_color_modify_address_high(search);
|
---|
745 | }
|
---|
746 | }
|
---|
747 | }
|
---|