source: golgotha/src/render/software/amd3d/perspective_map_unlit_asm_amd3d.cc @ 484

Last change on this file since 484 was 80, checked in by Sam Hocevar, 15 years ago
  • Adding the Golgotha source code. Not sure what's going to be interesting in there, but since it's all public domain, there's certainly stuff to pick up.
File size: 17.5 KB
Line 
1/********************************************************************** <BR>
2  This file is part of Crack dot Com's free source code release of
3  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
4  information about compiling & licensing issues visit this URL</a>
5  <PRE> If that doesn't help, contact Jonathan Clark at
6  golgotha_source@usa.net (Subject should have "GOLG" in it)
7***********************************************************************/
8
9#include "software/r1_software_globals.hh"
10#include "software/inline_fpu.hh"
11#include "software/amd3d/amd3d.h"
12
13extern sw32 had_subdivisions; //the scanline routine in this file stores (width & ~15) here
14
15//instead of using left_s, left_t, right_s, and right_t,
16//the divides and multiplies are nicely vectorized by the amd3d,
17//and storing them is a single quad store to an array of 2 floats,
18//rather than two dword stores to two separate floats
19
//element [0] is s and element [1] is t; the scanline routine in this file shifts
//both right by 16 to obtain the integral texel coordinate
20extern sw32 left_s_t[2];
21extern sw32 right_s_t[2];
22
//destinations for mm0..mm7 in the dumpmmxregs code at the bottom of this file
23extern float mmx0[2];
24extern float mmx1[2];
25extern float mmx2[2];
26extern float mmx3[2];
27extern float mmx4[2];
28extern float mmx5[2];
29extern float mmx6[2];
30extern float mmx7[2];
31
//Draws one horizontal span of perspective-corrected, unlit texels: every pixel is
//a 16-bit word copied straight from the texture to the destination.
//
//  start_pixel - base address of the destination scanline
//  start_x     - offset added to start_pixel in BYTES (see the w8 cast on the first line)
//  _left       - really a perspective_span *; its ooz and soz members are read
//                (soz is read as a quad, so t-over-z presumably sits directly after it)
//  width       - number of pixels to draw
//
//The span is split into (width >> 4) runs of 16 pixels plus (width & 15) leftover
//pixels. s and t are computed exactly (one reciprocal of ooz) only at the ends of
//each run and stepped linearly in between.
//
//The pf*/pi2fd "calls" are presumably macros from amd3d.h that emit the 3DNow!
//opcodes - confirm. The code relies on integer registers (esi, edi, ecx, edx) and
//mm0-mm7 keeping their values from one _asm block to the next.
//
//NOTE(review): no femms/emms is executed anywhere in this function; presumably
//the caller clears the MMX state before any x87 code runs - confirm.
32void texture_scanline_perspective_unlit_amd3d(w16 *start_pixel,
33                                              sw32 start_x,
34                                              void *_left,//perspective_span *left,
35                                              sw32 width)
36{
37  start_pixel = (w16 *)((w8 *)start_pixel + start_x);
38
39  perspective_span *left = (perspective_span *)_left;
40 
41  _asm
42  {
43    //left_z = 1.f / left->ooz;
44    //left_s = qftoi(left->soz * left_z) + cur_grads.s_adjust;
45    //left_t = qftoi(left->toz * left_z) + cur_grads.t_adjust;
46   
47    //sw32 had_subdivisions = width & (~15);
48    //num_subdivisions = width >> 4;
49    //num_leftover     = width & 15;
50   
51    mov edi,dword ptr [left]
52    mov eax,dword ptr [width]
53
54    movd mm0, dword ptr [edi]perspective_span.ooz
55    mov ebx,eax
56   
57    pfrcp (m1, m0)
58    and eax,15
59
60    shr ebx,4
61    punpckldq mm0, mm0 //duplicate low 32bits of m0 into high 32 bits of m0
62       
63    pfrcpit1 (m0, m1)
64    mov ecx,dword ptr [width]
65   
66    movq mm2, qword ptr [edi]perspective_span.soz
67    mov dword ptr [num_leftover],eax
68   
69    pfrcpit2 (m0, m1)
70    and ecx,(~15)
71   
72    //mov eax,dword ptr [edi]perspective_span.l
73    mov dword ptr [num_subdivisions],ebx
74
75    pfmul (m2, m0)
76    mov dword ptr [had_subdivisions],ecx
77   
78    //mov dword ptr [left_l],eax
79    //clear these out
80    mov dword ptr [dsdx_frac],0
81
82    //high 32 bits of mm2 - toz / ooz (aka t)
83    //low  32 bits of mm2 - soz / ooz (aka s)
84
85    pf2id (m3, m2)
86    mov dword ptr [dtdx_frac],0
87
88    //high 32 bits of mm3 - toz / ooz (aka t) - truncated ints
89    //low  32 bits of mm3 - soz / ooz (aka s) - truncated ints
90
      //one quad add applies both adjustments: the 8 bytes starting at s_adjust are
      //used as the pair (s_adjust, t_adjust)
91    paddd mm3, qword ptr [cur_grads]tri_gradients.s_adjust
92
93    //high 32 bits of mm3 - t + t_adjust
94    //low  32 bits of mm3 - s + s_adjust
95
96    movq qword ptr [left_s_t], mm3
97  }
98
99  if (num_subdivisions)
100  {
      //setup for the 16-pixel runs: computes the right-hand soz/toz (mm0), ooz (mm1)
      //and its reciprocal (mm7) for the first run, keeps the per-run gradients in
      //mm2/mm3, and loads esi/ecx/edx/edi with the starting texel index, s fraction,
      //t fraction and destination pointer
101    _asm
102    {
103      //ooz_right = left->ooz + (cur_grads.doozdxspan);
104      //soz_right = left->soz + (cur_grads.dsozdxspan);
105      //toz_right = left->toz + (cur_grads.dtozdxspan);
106
107      //edi still has dword ptr [left]
108      lea ebx,dword ptr [cur_grads]
109      nop
110
111      movd mm1, dword ptr [edi]perspective_span.ooz
112      mov esi,dword ptr [r1_software_texture_ptr]
113     
114      movd mm3, dword ptr [ebx]tri_gradients.doozdxspan
115      mov eax,dword ptr [left_s_t] //left_s
116     
117      shr esi,1 //esi becomes a texel index; texels are fetched below as word ptr [esi*2]
118      movq mm0, qword ptr [edi]perspective_span.soz
119     
120      pfadd (m1, m3)
121      movq mm2, qword ptr [ebx]tri_gradients.dsozdxspan
122     
123      sar eax,16   //get integral left_s into eax
124      mov edi,dword ptr [start_pixel]
125     
126      pfrcp (m6, m1)
127      movq mm7,mm1
128     
129      pfadd (m0, m2)
130      mov ebx,dword ptr [left_s_t+4] //left_t     
131     
132      //calculate the 1st right_z in mm7
133      sar ebx,16 //get integral left_t into ebx
134      punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
135     
136      pfrcpit1 (m7, m6)
137      mov edx,dword ptr [left_s_t+4] //left_t
138     
139      mov cl,byte ptr [r1_software_twidth_log2]
140      add esi,eax
141     
142      pfrcpit2 (m7, m6)
143
144      //calculate starting fractional and integral values for s and t
145      //esi = starting_s_coordinate >> 16 + starting_t_coordinate >> 16 << r1_software_twidth_log2
146      //ecx = starting_s_coordinate << 16
147      //edx = starting_t_coordinate << 16
148
149      //some stuff has been moved up, interleaved w/the mmx code above
150     
151      shl ebx,cl //multiply integral left_t by texture width
152     
153      sal edx,16 //get fractional left_t into edx
154      mov ecx,dword ptr [left_s_t] //left_s
155   
156      sal ecx,16
157      add esi,ebx
158    }
159
160    while (num_subdivisions)
161    {
162      _asm
163      {
164        //right_s = qftoi(soz_right * right_z);
165        //right_t = qftoi(toz_right * right_z);
166       
167        //soz_right and toz_right are in mm0
168        //right_z is in mm7
169        pfmul (m7, m0)
170       
171        pf2id (m7, m7)
172
173        movq qword ptr [right_s_t],mm7
174
175      //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are
176      //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover
177      //in the leftover span, calculate the end of that.
178
179      //if (num_subdivisions!=1)
180      //{
181          cmp dword ptr [num_subdivisions],1
182          je  last_subdivision
183       
184          //ooz_right += (cur_grads.doozdxspan);
185          //soz_right += (cur_grads.dsozdxspan);
186          //toz_right += (cur_grads.dtozdxspan);
187         
188          pfadd (m0, m2)
189          pfadd (m1, m3)
190
191          jmp proceed_with_mapping
192      //}
193      //else
194      //if (num_leftover > 1)
195      //{
196
197      last_subdivision:
198          cmp dword ptr [num_leftover],1
199          jle proceed_with_mapping
200       
201          //calculate the right_z for the end of the leftover span
202          //ooz_right += (cur_grads.doozdx * num_leftover);
203          //soz_right += (cur_grads.dsozdx * num_leftover);
204          //toz_right += (cur_grads.dtozdx * num_leftover);
205         
          //mm2/mm3 (the per-run gradients) are overwritten here; this path is only
          //taken on the final run, so they are not needed again
206          movd mm2,dword ptr [num_leftover]
207          movd mm3, dword ptr [cur_grads]tri_gradients.dsozdx
208         
209          pi2fd (m2, m2)
210          movd mm4, dword ptr [cur_grads]tri_gradients.dtozdx
211
212          pfmul (m3, m2)
213          movd mm5, dword ptr [cur_grads]tri_gradients.doozdx
214         
215          pfmul (m4, m2)
216          pfmul (m5, m2)
217
218          pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
219
220          pfadd (m0, m3)
221          pfadd (m1, m5)
222      //}
223           
224      proceed_with_mapping:
225        //cap the right_s and right_t's so that they're valid
226
227        mov eax,dword ptr [right_s_t] //right_s
228        mov ebx,dword ptr [right_s_t+4] //right_t
229       
230        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
231        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
232 
233        //cap the right s and t
        //(signed clamp of eax to [0, s_mask] and of ebx to [0, t_mask])
234        cmp eax,0
235        jge cmp_eax_high
236
237        mov eax,0
238        jmp cmp_ebx_low
239
240      cmp_eax_high:
241        cmp eax,dword ptr [s_mask]
242        jle cmp_ebx_low
243
244        mov eax,dword ptr [s_mask]
245
246      cmp_ebx_low:
247        cmp ebx,0
248        jge cmp_ebx_high
249
250        mov ebx,0
251        jmp done_compare
252     
253      cmp_ebx_high:
254        cmp ebx,dword ptr [t_mask]
255        jle done_compare
256
257        mov ebx,dword ptr [t_mask]
258
259      done_compare:
260
261        //store the right_s and right_t
262        //so they can be copied into left_s and left_t at the end of the 16-pixel span
263        //(they cant be copied now because we have to calculate (right_s-left_s)>>4 and (right_t-left_t)>>4
264       
265        //calculate the next right_z in mm7
266        //unfortunately, if the span is a multiple of 16, and this is the last set of 16, it will
267        //calculate an unnecessary z. but its best to have the code here mixed in w/integer ops so
268        //that the amd3d code has something for its execution latencies to sit through
269        movq mm7, mm1
270        pfrcp (m6, m1)
271
272        mov dword ptr [right_s_t],eax //right_s
273        mov dword ptr [right_s_t+4],ebx //right_t
274
275        punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
276        sub eax,dword ptr [left_s_t] //left_s
277
278        sar eax,4
279        push ebp //ebp is borrowed as the t-carry selector below; no operand between here and the pop is a local of this function
280
281        pfrcpit1 (m7, m6)
282        sub ebx,dword ptr [left_s_t+4] //left_t
283
284        sar ebx,4
285        mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_s-left_s)>>4)<<16
286       
287        pfrcpit2 (m7, m6)
288        nop
289       
290        sar eax,16
291        mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_t-left_t)>>4)<<16
292       
293        sar ebx,16
294        mov cl,byte ptr [r1_software_twidth_log2] //only cl is written; the s fraction in the high 16 bits of ecx is untouched
295       
296        shl ebx,cl
297
298        add eax,ebx
299
300        //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2
301        //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width
302
303        mov dword ptr [s_t_carry+4],eax
304        add eax,dword ptr [r1_software_texture_width]
305               
306        mov dword ptr [s_t_carry],eax       
307        mov cl,4 //loop is unrolled to 4 pixels - we want to draw 16, so loop 4 times
308       
309        ALIGN 16
310
311        //high 16 bits of ecx is the fractional s component
312        //high 16 bits of edx is the fractional t component
313
314        //eax receives the 16-bit texel fetched from [esi*2] (no lighting lookup in this unlit version)
315        //ebx is not used inside this loop
316        //ebp is 0 or -1 after each sbb (t-carry flag) and selects s_t_carry[1] or s_t_carry[0]
317        //cl  is the loop count variable
318
        //per pixel: add edx steps the t fraction and sbb turns its carry into ebp = 0 / -1;
        //add ecx steps the s fraction and the following adc adds that carry to esi together
        //with s_t_carry[1] (no t carry) or s_t_carry[0] (t carry, one extra texture row)
319    looper1:
320        add edx,dword ptr [dtdx_frac]
321        nop
322 
323        sbb ebp,ebp
324        add edi,8 //the only convenient place for the stepping of edi was way up here
325
326        movzx eax,word ptr [esi*2]
327        add ecx,dword ptr [dsdx_frac]
328
329        adc esi,dword ptr [4+s_t_carry+ebp*4]
330        add edx,dword ptr [dtdx_frac]
331       
332        sbb ebp,ebp
333        mov word ptr [edi-8],ax //1
334
335        movzx eax,word ptr [esi*2]
336        add ecx,dword ptr [dsdx_frac]
337
338        adc esi,dword ptr [4+s_t_carry+ebp*4]
339        add edx,dword ptr [dtdx_frac]
340       
341        sbb ebp,ebp
342        mov word ptr [edi-6],ax //2
343
344        movzx eax,word ptr [esi*2]
345        add ecx,dword ptr [dsdx_frac]
346
347        adc esi,dword ptr [4+s_t_carry+ebp*4]
348        add edx,dword ptr [dtdx_frac]
349       
350        sbb ebp,ebp
351        mov word ptr [edi-4],ax //3
352
353        movzx eax,word ptr [esi*2]
354        add ecx,dword ptr [dsdx_frac]
355
356        adc esi,dword ptr [4+s_t_carry+ebp*4]
357        mov word ptr [edi-2],ax //4
358       
359        dec cl //sets ZF for the jnz below and leaves CF alone; the add edx at the top of the loop regenerates the carry anyway (the add ch,dl / adc bh,dh named in the old note do not appear in this loop)
360       
361        jnz looper1
362        pop ebp
363
364        //store right_s and right_t in left_s and left_t
365        //right_s is what left_s starts at on the next 16 pixel span
366        //right_t is what left_t starts at on the next 16 pixel span
367
368        mov eax,dword ptr [right_s_t] //right_s
369        mov ebx,dword ptr [right_s_t+4] //right_t
370
371        mov dword ptr [left_s_t],eax //left_s
372        mov dword ptr [left_s_t+4],ebx //left_t
373      }
374         
375      _asm dec dword ptr [num_subdivisions]
376    }
377   
378    //store these so that the C code below actually works
379    _asm mov dword ptr [start_pixel],edi
380  }
381   
382  if (num_leftover)
383  {       
384    if (num_leftover > 1)
385    {     
386      if (had_subdivisions==0)
387      {
388        //calculate the right_z for the end of span
389        //ooz_right = left->ooz + (cur_grads.doozdx * num_leftover);
390        //soz_right = left->soz + (cur_grads.dsozdx * num_leftover);
391        //toz_right = left->toz + (cur_grads.dtozdx * num_leftover);
392
393        _asm
394        {
395          movd mm2,dword ptr [num_leftover]
396          lea ebx,dword ptr [cur_grads]
397         
398          movd mm3, dword ptr [ebx]tri_gradients.dsozdx
399          mov edi,dword ptr [left]
400
401          movd mm4, dword ptr [ebx]tri_gradients.dtozdx
402          pi2fd (m2, m2)
403         
404          movd mm5, dword ptr [ebx]tri_gradients.doozdx
405          pfmul (m3, m2)
406         
407          movq mm0, qword ptr [edi]perspective_span.soz
408          pfmul (m4, m2)
409
410          movd mm1, dword ptr [edi]perspective_span.ooz
411          pfmul (m5, m2)         
412         
413          pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
414         
415          pfadd (m1, m5) //ooz += doozdx*num_leftover
416          pfadd (m0, m3) //soz += dsozdx*num_leftover AND toz += dtozdx*num_leftover
417
418          //calculate the z at the right endpoint in mm7
419          movq mm7, mm1
420          pfrcp (m6, m1)
421
422          punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
423
424          pfrcpit1 (m7, m6) //terrible stalls. oh well
425       
426          pfrcpit2 (m7, m6)
427        }
428      }
429      else
430      {
431        //the correct ending right_z is already being calculated
432        //(see the if (num_subdivisions!=1) case above
433      }
434
435      _asm
436      {
437        //calculate starting fractional and integral values for s and t           
438       
439        //calculate the right endpoint
440        //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;
441        //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;
442       
443        //soz_right and toz_right are in mm0
444        //right_z is in mm7
445        pfmul (m7, m0) //calculate right_s and right_t
446        mov edi,dword ptr [start_pixel]
447
448        mov esi,dword ptr [r1_software_texture_ptr]
449        mov eax,dword ptr [left_s_t] //left_s
450
451        shr esi,1 //texel index, as in the 16-pixel path
452        pf2id (m7, m7) //truncate right_s and right_t
453       
454        sar eax,16
455        mov ebx,dword ptr [left_s_t+4] //left_t
456           
457        sar ebx,16
458        movq qword ptr [right_s_t],mm7
459
460        mov edx,dword ptr [left_s_t+4] //left_t
461        add esi,eax
462       
463        mov cl,byte ptr [r1_software_twidth_log2]
464        shl ebx,cl
465     
466        sal edx,16
467        mov ecx,dword ptr [left_s_t] //left_s
468     
469        sal ecx,16
470        add esi,ebx
471
472        mov eax,dword ptr [right_s_t] //right_s
473        mov ebx,dword ptr [right_s_t+4] //right_t
474       
475        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
476        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
477 
478        //cap the right s and t
479        cmp eax,0
480        jge cmp_eax_high_2
481
482        mov eax,0
483        jmp cmp_ebx_low_2
484
485      cmp_eax_high_2:
486        cmp eax,dword ptr [s_mask]
487        jle cmp_ebx_low_2
488
489        mov eax,dword ptr [s_mask]
490
491      cmp_ebx_low_2:
492        cmp ebx,0
493        jge cmp_ebx_high_2
494
495        mov ebx,0
496        jmp done_compare_2
497     
498      cmp_ebx_high_2:
499        cmp ebx,dword ptr [t_mask]
500        jle done_compare_2
501
502        mov ebx,dword ptr [t_mask]
503
504      done_compare_2:
505           
506        //calculate the deltas (left to right)
507        //temp_dsdx = qftoi((float)(right_s - left_s) * inverse_leftover_lookup[num_leftover]);
508        //temp_dtdx = qftoi((float)(right_t - left_t) * inverse_leftover_lookup[num_leftover]);
509
510        sub eax,dword ptr [left_s_t] //left_s
511        sub ebx,dword ptr [left_s_t+4] //left_t
512
513        movd mm0,eax //temp_dsdx
514        push ebp //ebp is used as a table index and then as the t-carry selector; no operand between here and the pop is a local of this function
515       
516        movd mm1,ebx //temp_dtdx
517        mov ebp, dword ptr [num_leftover]
518       
519        pi2fd (m0, m0)
520        movd mm2, dword ptr [inverse_leftover_lookup + ebp*4]
521       
522        pi2fd (m1, m1)       
523        pfmul (m0, m2)
524
525        pfmul (m1, m2) //bad stalls here
526        pf2id (m0, m0)
527
528        pf2id (m1, m1)
529
530        movd eax, mm0 //temp_dsdx
531        movd ebx, mm1 //temp_dtdx
532
533        //calculate the fractional and integral delta vars
534        //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;
535        //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);
536        //dsdx_frac    = (temp_dsdx<<16);
537        //dtdx_frac    = (temp_dtdx<<16);
538
539        mov word ptr [dsdx_frac+2],ax
540        mov word ptr [dtdx_frac+2],bx
541
542        sar eax,16
543        mov dx,word ptr [left_l] //NOTE(review): left_l is never written in this function (the store near the top is commented out); this only replaces the low 16 bits of edx, while the t fraction lives in the high 16 - looks like a leftover from a lit variant, confirm
544
545        sar ebx,16
546        mov cl,byte ptr [r1_software_twidth_log2]
547       
548        shl ebx,cl
549
550        add eax,ebx
551        nop //mov ebx,0
552
553        mov dword ptr [s_t_carry+4],eax
554        add eax,dword ptr [r1_software_texture_width]
555       
556        mov dword ptr [s_t_carry],eax
557        mov cl, byte ptr [num_leftover] //loop count in cl; the high 16 bits of ecx still hold the s fraction
558       
559        ALIGN 16
560
        //one pixel per iteration, same add/sbb/adc carry scheme as the 16-pixel loop
561      looper3:
562        movzx eax,word ptr [esi*2]
563        add edx,dword ptr [dtdx_frac]
564       
565        sbb ebp,ebp
566        mov word ptr [edi],ax //1
567
568        add edi,2 //the only convenient place for the stepping of edi was way up here
569        add ecx,dword ptr [dsdx_frac]
570
571        adc esi,dword ptr [4+s_t_carry+ebp*4]
572        dec cl
573
574        jnz looper3
575
576        pop ebp
577      }
578    }
579    else
580    {
581      //highly unoptimized single pixel drawer
582      *start_pixel = *(r1_software_texture_ptr + (left_s_t[0]>>16) + ((left_s_t[1]>>16)<<r1_software_twidth_log2));
583    }
584  }
585 
586  return;
587
    //everything below sits after the return above, so the C code never falls into it.
    //it copies mm0..mm7 into the mmx0..mmx7 globals and executes a bare ret.
    //NOTE(review): presumably a debugging aid meant to be reached with
    //"call dumpmmxregs" from one of the asm blocks above - confirm before relying on it
588  _asm
589  {
590  dumpmmxregs:
591    movq qword ptr [mmx0],mm0
592    movq qword ptr [mmx1],mm1
593    movq qword ptr [mmx2],mm2
594    movq qword ptr [mmx3],mm3
595    movq qword ptr [mmx4],mm4
596    movq qword ptr [mmx5],mm5
597    movq qword ptr [mmx6],mm6
598    movq qword ptr [mmx7],mm7
599    ret
600  }
601
602}
603
Note: See TracBrowser for help on using the repository browser.