source: golgotha/src/render/software/amd3d/perspective_map_unlit_holy_asm_amd3d.cc @ 484

Last change on this file since 484 was 80, checked in by Sam Hocevar, 15 years ago
  • Adding the Golgotha source code. Not sure what's going to be interesting in there, but since it's all public domain, there's certainly stuff to pick up.
File size: 17.9 KB
Line 
1/********************************************************************** <BR>
2  This file is part of Crack dot Com's free source code release of
3  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
4  information about compiling & licensing issues visit this URL</a>
5  <PRE> If that doesn't help, contact Jonathan Clark at
6  golgotha_source@usa.net (Subject should have "GOLG" in it)
7***********************************************************************/
8
9#include "software/r1_software_globals.hh"
10#include "software/inline_fpu.hh"
11#include "software/amd3d/amd3d.h"
12
//globals shared with the scanline routine below; none of them are defined in this file
13extern sw32 had_subdivisions; //set by the routine below to width & ~15 (non-zero if any 16-pixel run was drawn)
14
15//instead of using left_s, left_t, right_s, and right_t,
16//the divides and multiplies are nicely vectorized by the amd3d,
17//and storing them is a single quad store to an array of 2 floats,
18//rather than two dword stores to two separate floats
//(the arrays are declared as sw32 here: the routine below stores pf2id integer results
// into them, with s in element [0] and t in element [1], and reads them back with
// integer shifts by 16 to split integral and fractional parts)
19
20extern sw32 left_s_t[2];
21extern sw32 right_s_t[2];
22
//targets of the dumpmmxregs code at the end of the routine below:
//mmxN receives the full 64 bits of register mmN
23extern float mmx0[2];
24extern float mmx1[2];
25extern float mmx2[2];
26extern float mmx3[2];
27extern float mmx4[2];
28extern float mmx5[2];
29extern float mmx6[2];
30extern float mmx7[2];
31
//----------------------------------------------------------------------------
// texture_scanline_perspective_unlit_holy_amd3d
//
// Draws one horizontal run of 16-bit pixels with perspective-corrected texture
// coordinates. Texels whose value is 0 are skipped (the destination pixel is
// left untouched); no lighting is applied to the texel.
//
//   start_pixel - destination base pointer
//   start_x     - added to start_pixel as a BYTE offset (see the cast below)
//   _left       - really a perspective_span*; ooz and the 8 bytes starting at
//                 soz (soz in the low half, toz in the high half per the
//                 comments below) are read from it
//   width       - pixel count; split into width>>4 runs of 16 pixels
//                 (num_subdivisions) plus width&15 trailing pixels (num_leftover)
//
// The exact s,t (soz/ooz, toz/ooz) are computed only at the endpoints of each
// run; inside a run s and t are stepped linearly with a fractional/integral
// split driven by the carry flag.
//
// The pfrcp/pfrcpit1/pfrcpit2/pfmul/pfadd/pfacc/pf2id/pi2fd lines are written in
// an "op (dst, src)" form - presumably macros supplied by amd3d.h; confirm.
//
// Besides the externs declared above, the routine reads/writes these names that
// are not declared in this file: cur_grads, num_subdivisions, num_leftover,
// dsdx_frac, dtdx_frac, s_t_carry, s_mask, t_mask, left_l,
// inverse_leftover_lookup, r1_software_texture_ptr, r1_software_twidth_log2,
// r1_software_texture_width.
//
// NOTE(review): no femms/emms is executed anywhere in this routine; presumably
//   the caller clears the MMX state before using the FPU - confirm.
// NOTE(review): edi, esi, ecx, edx and mm0-mm7 are expected to keep their values
//   between the separate _asm blocks (across the C if/while tests in between);
//   this depends on the code the compiler generates for those tests - confirm.
//----------------------------------------------------------------------------
32void texture_scanline_perspective_unlit_holy_amd3d(w16 *start_pixel,
33                                                   sw32 start_x,
34                                                   void *_left,//perspective_span *left,
35                                                   sw32 width)
36{
  //start_x is applied as a byte offset: the pointer is cast to w8* before the add
37  start_pixel = (w16 *)((w8 *)start_pixel + start_x);
38
39  perspective_span *left = (perspective_span *)_left;
40 
  //---- setup: split width, compute the left endpoint s,t ----
  //integer and 3DNow! instructions are interleaved throughout this routine
41  _asm
42  {
43    //left_z = 1.f / left->ooz;
44    //left_s = qftoi(left->soz * left_z) + cur_grads.s_adjust;
45    //left_t = qftoi(left->toz * left_z) + cur_grads.t_adjust;
46   
47    //sw32 had_subdivisions = width & (~15);
48    //num_subdivisions = width >> 4;
49    //num_leftover     = width & 15;
50   
51    mov edi,dword ptr [left]
52    mov eax,dword ptr [width]
53
54    movd mm0, dword ptr [edi]perspective_span.ooz
55    mov ebx,eax
56   
    //reciprocal sequence: pfrcp seeds mm1 from ooz, pfrcpit1/pfrcpit2 leave the result in mm0
57    pfrcp (m1, m0)
58    and eax,15 //eax = width & 15 (num_leftover)
59
60    shr ebx,4 //ebx = width >> 4 (num_subdivisions)
61    punpckldq mm0, mm0 //duplicate low 32bits of m0 into high 32 bits of m0
62       
63    pfrcpit1 (m0, m1)
64    mov ecx,dword ptr [width]
65   
    //8-byte load starting at soz; the high half is used as toz (see comments below)
66    movq mm2, qword ptr [edi]perspective_span.soz
67    mov dword ptr [num_leftover],eax
68   
69    pfrcpit2 (m0, m1)
70    and ecx,(~15) //ecx = width & ~15 (had_subdivisions)
71   
72    //mov eax,dword ptr [edi]perspective_span.l
73    mov dword ptr [num_subdivisions],ebx
74
    //mm2 = (soz, toz) * left_z
75    pfmul (m2, m0)
76    mov dword ptr [had_subdivisions],ecx
77   
78    //mov dword ptr [left_l],eax
79    //clear these out
    //(within this routine only the upper 16 bits of dsdx_frac/dtdx_frac are written
    // afterwards, via word stores to +2, so their low words stay 0)
80    mov dword ptr [dsdx_frac],0
81
82    //high 32 bits of mm2 - toz / ooz (aka t)
83    //low  32 bits of mm2 - soz / ooz (aka s)
84
85    pf2id (m3, m2)
86    mov dword ptr [dtdx_frac],0
87
88    //high 32 bits of mm3 - toz / ooz (aka t) - truncated ints
89    //low  32 bits of mm3 - soz / ooz (aka s) - truncated ints
90
    //8-byte add starting at s_adjust: relies on t_adjust being stored right after s_adjust
91    paddd mm3, qword ptr [cur_grads]tri_gradients.s_adjust
92
93    //high 32 bits of mm3 - t + t_adjust
94    //low  32 bits of mm3 - s + s_adjust
95
96    movq qword ptr [left_s_t], mm3
97  }
98
  //---- full 16-pixel runs ----
99  if (num_subdivisions)
100  {
    //set up the right endpoint of the first run (mm0 = soz/toz, mm1 = ooz, mm7 = right_z)
    //and the integer stepping state (esi, ecx, edx, edi) used by the pixel loop
101    _asm
102    {
103      //ooz_right = left->ooz + (cur_grads.doozdxspan);
104      //soz_right = left->soz + (cur_grads.dsozdxspan);
105      //toz_right = left->toz + (cur_grads.dtozdxspan);
106
107      //edi still has dword ptr [left]
      //(loaded in the previous _asm block; assumes the if() test above did not use edi)
108      lea ebx,dword ptr [cur_grads]
109      nop
110
111      movd mm1, dword ptr [edi]perspective_span.ooz
112      mov esi,dword ptr [r1_software_texture_ptr]
113     
      //mm3 = doozdxspan, mm2 (loaded below) = dsozdxspan/dtozdxspan: the per-run gradients
114      movd mm3, dword ptr [ebx]tri_gradients.doozdxspan
115      mov eax,dword ptr [left_s_t] //left_s
116     
      //texture address is halved so texels can be fetched as word ptr [esi*2];
      //esi then acts as a texel index (assumes the texture pointer is even)
117      shr esi,1
118      movq mm0, qword ptr [edi]perspective_span.soz
119     
120      pfadd (m1, m3)
121      movq mm2, qword ptr [ebx]tri_gradients.dsozdxspan
122     
123      sar eax,16   //get integral left_s into eax
      //from here on edi is the destination pixel pointer, no longer the span pointer
124      mov edi,dword ptr [start_pixel]
125     
126      pfrcp (m6, m1)
127      movq mm7,mm1
128     
129      pfadd (m0, m2)
130      mov ebx,dword ptr [left_s_t+4] //left_t     
131     
132      //calculate the 1st right_z in mm7
133      sar ebx,16 //get integral left_t into ebx
134      punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
135     
136      pfrcpit1 (m7, m6)
137      mov edx,dword ptr [left_s_t+4] //left_t
138     
139      mov cl,byte ptr [r1_software_twidth_log2]
140      add esi,eax
141     
142      pfrcpit2 (m7, m6)
143
144      //calculate starting fractional and integral values for s and t
145      //esi = starting_s_coordinate >> 16 + starting_t_coordinate >> 16 << r1_software_twidth_log2
      //      (plus r1_software_texture_ptr >> 1, added above)
146      //ecx = starting_s_coordinate << 16
147      //edx = starting_t_coordinate << 16
148
149      //some stuff has been moved up, interleaved w/the mmx code above
150     
151      shl ebx,cl //multiply integral left_t by texture width
152     
153      sal edx,16 //get fractional left_t into edx
154      mov ecx,dword ptr [left_s_t] //left_s
155   
156      sal ecx,16
157      add esi,ebx
158    }
159
    //one iteration per 16-pixel run; num_subdivisions is decremented by the _asm dec after the block
160    while (num_subdivisions)
161    {
162      _asm
163      {
164        //right_s = qftoi(soz_right * right_z);
165        //right_t = qftoi(toz_right * right_z);
166       
167        //soz_right and toz_right are in mm0
168        //right_z is in mm7
169        pfmul (m7, m0)
170       
171        pf2id (m7, m7)
172
        //stored without s_adjust/t_adjust; those are added after the reload at proceed_with_mapping
173        movq qword ptr [right_s_t],mm7
174
175      //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are
176      //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover
177      //in the leftover span, calculate the end of that.
178
179      //if (num_subdivisions!=1)
180      //{
181          cmp dword ptr [num_subdivisions],1
182          je  last_subdivision
183       
184          //ooz_right += (cur_grads.doozdxspan);
185          //soz_right += (cur_grads.dsozdxspan);
186          //toz_right += (cur_grads.dtozdxspan);
187         
188          pfadd (m0, m2)
189          pfadd (m1, m3)
190
191          jmp proceed_with_mapping
192      //}
193      //else
194      //if (num_leftover > 1)
195      //{
196
197      last_subdivision:
198          cmp dword ptr [num_leftover],1
199          jle proceed_with_mapping
200       
201          //calculate the right_z for the end of the leftover span
202          //ooz_right += (cur_grads.doozdx * num_leftover);
203          //soz_right += (cur_grads.dsozdx * num_leftover);
204          //toz_right += (cur_grads.dtozdx * num_leftover);
205         
          //mm2/mm3 (the per-run gradients) are overwritten here; this path is only
          //taken on the last run, so they are not needed again
206          movd mm2,dword ptr [num_leftover]
207          movd mm3, dword ptr [cur_grads]tri_gradients.dsozdx
208         
209          pi2fd (m2, m2)
210          movd mm4, dword ptr [cur_grads]tri_gradients.dtozdx
211
212          pfmul (m3, m2)
213          movd mm5, dword ptr [cur_grads]tri_gradients.doozdx
214         
215          pfmul (m4, m2)
216          pfmul (m5, m2)
217
218          pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
219
220          pfadd (m0, m3)
221          pfadd (m1, m5)
222      //}
223           
224      proceed_with_mapping:
225        //cap the right_s and right_t's so that they're valid
        //(after adding the adjusts they are clamped to [0, s_mask] and [0, t_mask]
        // with signed compares)
226
227        mov eax,dword ptr [right_s_t] //right_s
228        mov ebx,dword ptr [right_s_t+4] //right_t
229       
230        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
231        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
232 
233        //cap the right s and t
234        cmp eax,0
235        jge cmp_eax_high
236
237        mov eax,0
238        jmp cmp_ebx_low
239
240      cmp_eax_high:
241        cmp eax,dword ptr [s_mask]
242        jle cmp_ebx_low
243
244        mov eax,dword ptr [s_mask]
245
246      cmp_ebx_low:
247        cmp ebx,0
248        jge cmp_ebx_high
249
250        mov ebx,0
251        jmp done_compare
252     
253      cmp_ebx_high:
254        cmp ebx,dword ptr [t_mask]
255        jle done_compare
256
257        mov ebx,dword ptr [t_mask]
258
259      done_compare:
260
261        //store the right_s and right_t
262        //so they can be copied into left_s and left_t at the end of the 16-pixel span
263        //(they can't be copied now because we have to calculate (right_s-left_s)>>4 and (right_t-left_t)>>4)
264       
265        //calculate the next right_z in mm7
266        //unfortunately, if the span is a multiple of 16, and this is the last set of 16, it will
267        //calculate an unnecessary z. but its best to have the code here mixed in w/integer ops so
268        //that the amd3d code has something for its execution latencies to sit through
269        movq mm7, mm1
270        pfrcp (m6, m1)
271
272        mov dword ptr [right_s_t],eax //right_s
273        mov dword ptr [right_s_t+4],ebx //right_t
274
275        punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
276        sub eax,dword ptr [left_s_t] //left_s
277
278        sar eax,4 //per-pixel s step = (right_s-left_s)/16
        //ebp is saved here and restored after looper1; looper1 itself does not touch it
279        push ebp
280
281        pfrcpit1 (m7, m6)
282        sub ebx,dword ptr [left_s_t+4] //left_t
283
284        sar ebx,4 //per-pixel t step = (right_t-left_t)/16
285        mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_s-left_s)>>4)<<16
286       
287        pfrcpit2 (m7, m6)
288        nop
289       
290        sar eax,16
291        mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_t-left_t)>>4)<<16
292       
293        sar ebx,16
294        mov cl,byte ptr [r1_software_twidth_log2]
295       
296        shl ebx,cl
297
298        add eax,ebx
        //cl shares ecx with the s fraction (upper 16 bits); the low word of dsdx_frac
        //is left at 0 by this routine, so adding it to ecx does not disturb the counter
299        mov cl,4 //loop is unrolled to 4 pixels - we want to draw 16, so loop 4 times
300
301        //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2
302        //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width
303
304        mov dword ptr [s_t_carry+4],eax
305        add eax,dword ptr [r1_software_texture_width]
306               
307        mov dword ptr [s_t_carry],eax       
308       
309        ALIGN 16
310
311        //high 16 bits of ecx is the fractional s component
312        //high 16 bits of edx is the fractional t component
313
314        //eax holds the texel fetched from word ptr [esi*2]; it is stored only if non-zero
315        //ebx is the t-carry mask: sbb ebx,ebx gives 0 (no carry) or -1 (carry) after the t-fraction
        //    add, and indexes s_t_carry so that -1 picks s_t_carry[0] (includes the texture width)
        //    and 0 picks s_t_carry[1]
316        //the adc into esi also folds in the carry from the s-fraction add on ecx
        //ebp is not used inside this loop
317        //cl  is the loop count variable
318
319    looper1:
        //edi is advanced by 4 pixels (8 bytes) up front; the stores below use offsets -8,-6,-4,-2
320        add edi,8
321        add edx,dword ptr [dtdx_frac]
322       
323        sbb ebx,ebx
324        add ecx,dword ptr [dsdx_frac]
325
        //movzx does not modify flags, so the carry from the add above reaches the adc below
326        movzx eax,word ptr [esi*2]
327       
328        adc esi,dword ptr [4+s_t_carry+ebx*4]
329        add edx,dword ptr [dtdx_frac]
330   
331        sbb ebx,ebx
332        and eax,eax //test to see if its zero
333
334        jz skipped_1_pixel
335        mov word ptr [edi-8],ax    //store 1 pixel
336           
337      skipped_1_pixel:
338        movzx eax,word ptr [esi*2]
339        add ecx,dword ptr [dsdx_frac]
340
341        adc esi,dword ptr [4+s_t_carry+ebx*4]
342        add edx,dword ptr [dtdx_frac]
343   
344        sbb ebx,ebx
345        and eax,eax //test to see if its zero
346
347        jz skipped_2_pixel
348        mov word ptr [edi-6],ax
349
350      skipped_2_pixel:
351        movzx eax,word ptr [esi*2]
352        add ecx,dword ptr [dsdx_frac]
353
354        adc esi,dword ptr [4+s_t_carry+ebx*4]
355        add edx,dword ptr [dtdx_frac]
356   
357        sbb ebx,ebx
358        and eax,eax //test to see if its zero
359
360        jz skipped_3_pixel
361        mov word ptr [edi-4],ax
362     
363     skipped_3_pixel:
364        movzx eax,word ptr [esi*2]
365        add ecx,dword ptr [dsdx_frac]
366
367        adc esi,dword ptr [4+s_t_carry+ebx*4]
368        and eax,eax //test to see if its zero
369
370        jz skipped_4_pixel
371        mov word ptr [edi-2],ax   
372
373      skipped_4_pixel:
374        dec cl
375        jnz looper1
376
377        pop ebp
378
379        //store right_s and right_t in left_s and left_t
380        //right_s is what left_s starts at on the next 16 pixel span
381        //right_t is what left_t starts at on the next 16 pixel span
382
383        mov eax,dword ptr [right_s_t] //right_s
384        mov ebx,dword ptr [right_s_t+4] //right_t
385
386        mov dword ptr [left_s_t],eax //left_s
387        mov dword ptr [left_s_t+4],ebx //left_t
388      }
389         
390      _asm dec dword ptr [num_subdivisions]
391    }
392   
393    //store these so that the C code below actually works
    //(edi is the destination pointer after the last 16-pixel run)
394    _asm mov dword ptr [start_pixel],edi
395  }
396   
  //---- trailing pixels (width & 15) ----
397  if (num_leftover)
398  {       
399    if (num_leftover > 1)
400    {     
401      if (had_subdivisions==0)
402      {
403        //calculate the right_z for the end of span
404        //ooz_right = left->ooz + (cur_grads.doozdx * num_leftover);
405        //soz_right = left->soz + (cur_grads.dsozdx * num_leftover);
406        //toz_right = left->toz + (cur_grads.dtozdx * num_leftover);
407
408        _asm
409        {
410          movd mm2,dword ptr [num_leftover]
411          lea ebx,dword ptr [cur_grads]
412         
413          movd mm3, dword ptr [ebx]tri_gradients.dsozdx
414          mov edi,dword ptr [left]
415
416          movd mm4, dword ptr [ebx]tri_gradients.dtozdx
417          pi2fd (m2, m2)
418         
419          movd mm5, dword ptr [ebx]tri_gradients.doozdx
420          pfmul (m3, m2)
421         
422          movq mm0, qword ptr [edi]perspective_span.soz
423          pfmul (m4, m2)
424
425          movd mm1, dword ptr [edi]perspective_span.ooz
426          pfmul (m5, m2)         
427         
428          pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
429         
430          pfadd (m1, m5) //ooz += doozdx*num_leftover
431          pfadd (m0, m3) //soz += dsozdx*num_leftover AND toz += dtozdx*num_leftover
432
433          //calculate the z at the right endpoint in mm7
434          movq mm7, mm1
435          pfrcp (m6, m1)
436
437          punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
438
439          pfrcpit1 (m7, m6) //terrible stalls. oh well
440       
441          pfrcpit2 (m7, m6)
442        }
443      }
444      else
445      {
446        //the correct ending right_z is already being calculated
447        //(see the if (num_subdivisions!=1) case above)
        //i.e. mm0 and mm7 are expected to still hold the values left by the
        //last_subdivision path of the 16-pixel loop
448      }
449
450      _asm
451      {
452        //calculate starting fractional and integral values for s and t           
453       
454        //calculate the right endpoint
455        //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;
456        //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;
457       
458        //soz_right and toz_right are in mm0
459        //right_z is in mm7
460        pfmul (m7, m0) //calculate right_s and right_t
461        mov edi,dword ptr [start_pixel]
462
463        mov esi,dword ptr [r1_software_texture_ptr]
464        mov eax,dword ptr [left_s_t] //left_s
465
        //same texel-index setup as in the 16-pixel path: esi = (texture ptr >> 1) + s + (t << twidth_log2)
466        shr esi,1
467        pf2id (m7, m7) //truncate right_s and right_t
468       
469        sar eax,16
470        mov ebx,dword ptr [left_s_t+4] //left_t
471           
472        sar ebx,16
473        movq qword ptr [right_s_t],mm7
474
475        mov edx,dword ptr [left_s_t+4] //left_t
476        add esi,eax
477       
478        mov cl,byte ptr [r1_software_twidth_log2]
479        shl ebx,cl
480     
481        sal edx,16 //edx = fractional left_t in the upper 16 bits
482        mov ecx,dword ptr [left_s_t] //left_s
483     
484        sal ecx,16 //ecx = fractional left_s in the upper 16 bits
485        add esi,ebx
486
487        mov eax,dword ptr [right_s_t] //right_s
488        mov ebx,dword ptr [right_s_t+4] //right_t
489       
490        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
491        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
492 
493        //cap the right s and t
        //(clamped to [0, s_mask] and [0, t_mask] with signed compares)
494        cmp eax,0
495        jge cmp_eax_high_2
496
497        mov eax,0
498        jmp cmp_ebx_low_2
499
500      cmp_eax_high_2:
501        cmp eax,dword ptr [s_mask]
502        jle cmp_ebx_low_2
503
504        mov eax,dword ptr [s_mask]
505
506      cmp_ebx_low_2:
507        cmp ebx,0
508        jge cmp_ebx_high_2
509
510        mov ebx,0
511        jmp done_compare_2
512     
513      cmp_ebx_high_2:
514        cmp ebx,dword ptr [t_mask]
515        jle done_compare_2
516
517        mov ebx,dword ptr [t_mask]
518
519      done_compare_2:
520           
521        //calculate the deltas (left to right)
522        //temp_dsdx = qftoi((float)(right_s - left_s) * inverse_leftover_lookup[num_leftover]);
523        //temp_dtdx = qftoi((float)(right_t - left_t) * inverse_leftover_lookup[num_leftover]);
524
525        sub eax,dword ptr [left_s_t] //left_s
526        sub ebx,dword ptr [left_s_t+4] //left_t
527
528        movd mm0,eax //temp_dsdx
        //ebp is used as scratch below (table index, then carry mask) and restored after looper3;
        //no operand between this push and the matching pop names a function parameter or local
529        push ebp
530       
531        movd mm1,ebx //temp_dtdx
532        mov ebp, dword ptr [num_leftover]
533       
534        pi2fd (m0, m0)
535        movd mm2, dword ptr [inverse_leftover_lookup + ebp*4]
536       
537        pi2fd (m1, m1)       
538        pfmul (m0, m2)
539
540        pfmul (m1, m2) //bad stalls here
541        pf2id (m0, m0)
542
543        pf2id (m1, m1)
544
545        movd eax, mm0 //temp_dsdx
546        movd ebx, mm1 //temp_dtdx
547
548        //calculate the fractional and integral delta vars
549        //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;
550        //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);
551        //dsdx_frac    = (temp_dsdx<<16);
552        //dtdx_frac    = (temp_dtdx<<16);
553
554        mov word ptr [dsdx_frac+2],ax
555        mov word ptr [dtdx_frac+2],bx
556
557        sar eax,16
        //loads the low 16 bits of edx from left_l; nothing in this routine reads dx on its own afterwards
        //NOTE(review): looks like a carry-over from a lit variant of this routine - confirm it can go
558        mov dx,word ptr [left_l]
559
560        sar ebx,16
561        mov cl,byte ptr [r1_software_twidth_log2]
562       
563        shl ebx,cl
564
565        add eax,ebx
        //loop counter in cl = num_leftover (2..15 on this path)
566        mov cl, byte ptr [num_leftover]
567
568        mov dword ptr [s_t_carry+4],eax
569        add eax,dword ptr [r1_software_texture_width]
570       
571        mov dword ptr [s_t_carry],eax
572
573        ALIGN 16
574
        //one pixel per iteration; same carry scheme as looper1 but with ebp as the
        //t-carry mask (0 or -1) indexing s_t_carry
575      looper3:
576        movzx eax,word ptr [esi*2]
577        add edx,dword ptr [dtdx_frac]
578
579        sbb ebp,ebp
580        add edi,2 //the only convenient place for the stepping of edi was way up here
581
582        add ecx,dword ptr [dsdx_frac]
583       
584        adc esi,dword ptr [4+s_t_carry+ebp*4]
585        and eax,eax //zero texel: skip the store
586               
587        jz  skip_a_pixel
588        mov word ptr [edi-2],ax
589
590      skip_a_pixel:
591        dec cl
592        jnz looper3
593
594        pop ebp
595      }
596    }
597    else
598    {
599      //highly unoptimized single pixel drawer
      //(exactly one pixel left: fetch the texel at the integral left s,t and write it unless it is 0)
600      register w16 texel = *(r1_software_texture_ptr + (left_s_t[0]>>16) + ((left_s_t[1]>>16)<<r1_software_twidth_log2));
601     
602      if (texel)
603        *start_pixel = texel;
604    }
605  }
606 
607  return;
608
  //not reachable by falling through (it follows the return above): copies mm0..mm7 into
  //the mmx0..mmx7 globals and ends with a bare ret
  //NOTE(review): presumably a debugging aid meant to be called from inside an _asm block - confirm
609  _asm
610  {
611  dumpmmxregs:
612    movq qword ptr [mmx0],mm0
613    movq qword ptr [mmx1],mm1
614    movq qword ptr [mmx2],mm2
615    movq qword ptr [mmx3],mm3
616    movq qword ptr [mmx4],mm4
617    movq qword ptr [mmx5],mm5
618    movq qword ptr [mmx6],mm6
619    movq qword ptr [mmx7],mm7
620    ret
621  }
622
623}
624
Note: See TracBrowser for help on using the repository browser.