source: golgotha/src/render/software/amd3d/perspective_map_unlit_alpha_asm_amd3d.cc @ 484

Last change on this file since 484 was 80, checked in by Sam Hocevar, 15 years ago
  • Adding the Golgotha source code. Not sure what's going to be interesting in there, but since it's all public domain, there's certainly stuff to pick up.
File size: 18.7 KB
Line 
/********************************************************************** <BR>
  This file is part of Crack dot Com's free source code release of
  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
  information about compiling & licensing issues visit this URL</a>
  <PRE> If that doesn't help, contact Jonathan Clark at
  golgotha_source@usa.net (Subject should have "GOLG" in it)
***********************************************************************/
8
9#include "software/r1_software_globals.hh"
10#include "software/inline_fpu.hh"
11#include "software/amd3d/amd3d.h"
12
13extern sw32 had_subdivisions;
14extern w8 last_alpha_accumulated;
15
16//instead of using left_s, left_t, right_s, and right_t,
17//the divides and multiplies are nicely vectorized by the amd3d,
18//and storing them is a single quad store to an array of 2 floats,
19//rather than two dword stores to two seperate floats
20
21extern sw32 left_s_t[2];
22extern sw32 right_s_t[2];
23
24extern float mmx0[2];
25extern float mmx1[2];
26extern float mmx2[2];
27extern float mmx3[2];
28extern float mmx4[2];
29extern float mmx5[2];
30extern float mmx6[2];
31extern float mmx7[2];
32
33void texture_scanline_perspective_unlit_alpha_amd3d(w16 *start_pixel,
34                                                    sw32 start_x,
35                                                    void *_left,//perspective_span *left,
36                                                    sw32 width)
37{
38  start_pixel = (w16 *)((w8 *)start_pixel + start_x);
39
40  perspective_span *left = (perspective_span *)_left;
41 
42  last_alpha_accumulated = 16;
43 
44  _asm
45  {
46    //left_z = 1.f / left->ooz;
47    //left_s = qftoi(left->soz * left_z) + cur_grads.s_adjust;
48    //left_t = qftoi(left->toz * left_z) + cur_grads.t_adjust;
49   
50    //sw32 had_subdivisions = width & (~15);
51    //num_subdivisions = width >> 4;
52    //num_leftover     = width & 15;
53   
54    mov edi,dword ptr [left]
55    mov eax,dword ptr [width]
56
57    movd mm0, dword ptr [edi]perspective_span.ooz
58    mov ebx,eax
59   
60    pfrcp (m1, m0)
61    and eax,15
62
63    shr ebx,4
64    punpckldq mm0, mm0 //duplicate low 32bits of m0 into high 32 bits of m0
65       
66    pfrcpit1 (m0, m1)
67    mov ecx,dword ptr [width]
68   
69    movq mm2, qword ptr [edi]perspective_span.soz
70    mov dword ptr [num_leftover],eax
71   
72    pfrcpit2 (m0, m1)
73    and ecx,(~15)
74   
75    //mov eax,dword ptr [edi]perspective_span.l
76    mov dword ptr [num_subdivisions],ebx
77
78    pfmul (m2, m0)
79    mov dword ptr [had_subdivisions],ecx
80   
81    //mov dword ptr [left_l],eax
82    //clear these out
83    mov dword ptr [dsdx_frac],0
84
85    //high 32 bits of mm2 - toz / ooz (aka t)
86    //low  32 bits of mm2 - soz / ooz (aka s)
87
88    pf2id (m3, m2)
89    mov dword ptr [dtdx_frac],0
90
91    //high 32 bits of mm3 - toz / ooz (aka t) - truncated ints
92    //low  32 bits of mm3 - soz / ooz (aka s) - truncated ints
93
94    paddd mm3, qword ptr [cur_grads]tri_gradients.s_adjust
95
96    //high 32 bits of mm3 - t + t_adjust
97    //low  32 bits of mm3 - s + s_adjust
98
99    movq qword ptr [left_s_t], mm3
100  }
101
102  if (num_subdivisions)
103  {
104    _asm
105    {
106      //ooz_right = left->ooz + (cur_grads.doozdxspan);
107      //soz_right = left->soz + (cur_grads.dsozdxspan);
108      //toz_right = left->toz + (cur_grads.dtozdxspan);
109
110      //edi still has dword ptr [left]
111      lea ebx,dword ptr [cur_grads]
112      nop
113
114      movd mm1, dword ptr [edi]perspective_span.ooz
115      mov esi,dword ptr [r1_software_texture_ptr]
116     
117      movd mm3, dword ptr [ebx]tri_gradients.doozdxspan
118      mov eax,dword ptr [left_s_t] //left_s
119     
120      shr esi,1
121      movq mm0, qword ptr [edi]perspective_span.soz
122     
123      pfadd (m1, m3)
124      movq mm2, qword ptr [ebx]tri_gradients.dsozdxspan
125     
126      sar eax,16   //get integral left_s into eax
127      mov edi,dword ptr [start_pixel]
128     
129      pfrcp (m6, m1)
130      movq mm7,mm1
131     
132      pfadd (m0, m2)
133      mov ebx,dword ptr [left_s_t+4] //left_t     
134     
135      //calculate the 1st right_z in mm7
136      sar ebx,16 //get integral left_t into ebx
137      punpckldq mm7, mm7 //duplicate high 32bits of mm7 into low 32 bits of mm7
138     
139      pfrcpit1 (m7, m6)
140      mov edx,dword ptr [left_s_t+4] //left_t
141     
142      mov cl,byte ptr [r1_software_twidth_log2]
143      add esi,eax
144     
145      pfrcpit2 (m7, m6)
146
147      //calculate starting fractional and integral values for s and t
148      //esi = starting_s_coordinate >> 16 + starting_t_coordinate >> 16 << r1_software_twidth_log2
149      //ecx = starting_s_coordinate << 16
150      //edx = starting_t_coordinate << 16
151
152      //some stuff has been moved up, interleaved w/the mmx code above
153     
154      shl ebx,cl //multiply integral left_t by texture width
155     
156      sal edx,16 //get fractional left_t into edx
157      mov ecx,dword ptr [left_s_t] //left_s
158   
159      sal ecx,16
160      add esi,ebx
161    }
162
163    while (num_subdivisions)
164    {
165      _asm
166      {
167        //right_s = qftoi(soz_right * right_z);
168        //right_t = qftoi(toz_right * right_z);
169       
170        //soz_right and toz_right are in mm0
171        //right_z is in mm7
172        pfmul (m7, m0)
173       
174        pf2id (m7, m7)
175
176        movq qword ptr [right_s_t],mm7
177
178      //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are
179      //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover
180      //in the leftover span, calculate the end of that.
181
182      //if (num_subdivisions!=1)
183      //{
184          cmp dword ptr [num_subdivisions],1
185          je  last_subdivision
186       
187          //ooz_right += (cur_grads.doozdxspan);
188          //soz_right += (cur_grads.dsozdxspan);
189          //toz_right += (cur_grads.dtozdxspan);
190         
191          pfadd (m0, m2)
192          pfadd (m1, m3)
193
194          jmp proceed_with_mapping
195      //}
196      //else
197      //if (num_leftover > 1)
198      //{
199
200      last_subdivision:
201          cmp dword ptr [num_leftover],1
202          jle proceed_with_mapping
203       
204          //calculate the right_z for the end of the leftover span
205          //ooz_right += (cur_grads.doozdx * num_leftover);
206          //soz_right += (cur_grads.dsozdx * num_leftover);
207          //toz_right += (cur_grads.dtozdx * num_leftover);
208         
209          movd mm2,dword ptr [num_leftover]
210          movd mm3, dword ptr [cur_grads]tri_gradients.dsozdx
211         
212          pi2fd (m2, m2)
213          movd mm4, dword ptr [cur_grads]tri_gradients.dtozdx
214
215          pfmul (m3, m2)
216          movd mm5, dword ptr [cur_grads]tri_gradients.doozdx
217         
218          pfmul (m4, m2)
219          pfmul (m5, m2)
220
221          pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
222
223          pfadd (m0, m3)
224          pfadd (m1, m5)
225      //}
226           
227      proceed_with_mapping:
228        //cap the right_s and right_t's so that they're valid
229
230        mov eax,dword ptr [right_s_t] //right_s
231        mov ebx,dword ptr [right_s_t+4] //right_t
232       
233        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
234        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
235 
236        //cap the right s and t
237        cmp eax,0
238        jge cmp_eax_high
239
240        mov eax,0
241        jmp cmp_ebx_low
242
243      cmp_eax_high:
244        cmp eax,dword ptr [s_mask]
245        jle cmp_ebx_low
246
247        mov eax,dword ptr [s_mask]
248
249      cmp_ebx_low:
250        cmp ebx,0
251        jge cmp_ebx_high
252
253        mov ebx,0
254        jmp done_compare
255     
256      cmp_ebx_high:
257        cmp ebx,dword ptr [t_mask]
258        jle done_compare
259
260        mov ebx,dword ptr [t_mask]
261
262      done_compare:
263
264        //store the right_s and right_t
265        //so they can be copied into left_s and left_t at the end of the 16-pixel span
266        //(the cant be copied now because we have to calculate (right_s-left_s)>>4 and (right_t-left_t)>>4
267       
268        //calculate the next right_z in mm7
269        //unfortunately, if the span is a multiple of 16, and this is the last set of 16, it will
270        //calculate an unnecessary z. but its best to have the code here mixed in w/integer ops so
271        //that the amd3d code has something for its executation latencies to sit through
272        movq mm7, mm1
273        pfrcp (m6, m1)
274
275        mov dword ptr [right_s_t],eax //right_s
276        mov dword ptr [right_s_t+4],ebx //right_t
277
278        punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
279        sub eax,dword ptr [left_s_t] //left_s
280
281        sar eax,4
282        push ebp
283
284        pfrcpit1 (m7, m6)
285        sub ebx,dword ptr [left_s_t+4] //left_t
286
287        sar ebx,4
288        mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_s-left_s)>>4)<<16
289       
290        pfrcpit2 (m7, m6)
291        nop
292       
293        sar eax,16
294        mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_t-left_t)>>4)<<16
295       
296        sar ebx,16
297        mov cl,byte ptr [r1_software_twidth_log2]
298       
299        shl ebx,cl
300
301        add eax,ebx
302        mov bh,byte ptr [last_alpha_accumulated]
303
304        //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2
305        //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width
306
307        mov dword ptr [s_t_carry+4],eax
308        add eax,dword ptr [r1_software_texture_width]
309               
310        mov dword ptr [s_t_carry],eax       
311        mov cl,4 //loop is unrolled to 4 pixels - we want to draw 16, so loop 4 times
312       
313        ALIGN 16
314
315        //high 16 bits of ecx is the fractional s component
316        //high 16 bits of edx is the fractional t component
317
318        //eax is used to lookup the texel as well as the low 8-bits of the lit texel
319        //ebx is used to lookup the high 8-bits of the lit texel
320        //ebp is used to detect a t-carry as well as lookup the lit texel
321        //cl  is the loop count variable
322        //bh  is used to dither the alpha
323
324    looper1:
325        movzx eax,word ptr [esi*2]
326        add edx,dword ptr [dtdx_frac]
327   
328        sbb ebp,ebp
329        mov bl,ah
330   
331        and eax,4095
332        add ecx,dword ptr [dsdx_frac]
333
334        adc esi,dword ptr [4+s_t_carry+ebp*4]
335        and bl,240
336   
337        movzx eax,word ptr [alpha_table+eax*2]
338        add bh,bl
339
340        jnc skip_pixel_1
341
342        mov word ptr [edi],ax
343        add bh,16
344
345      skip_pixel_1:
346        movzx eax,word ptr [esi*2]
347        add edx,dword ptr [dtdx_frac]
348   
349        sbb ebp,ebp
350        mov bl,ah
351   
352        and eax,4095
353        add ecx,dword ptr [dsdx_frac]
354
355        adc esi,dword ptr [4+s_t_carry+ebp*4]
356        and bl,240
357   
358        movzx eax,word ptr [alpha_table+eax*2]
359        add bh,bl
360
361        jnc skip_pixel_2
362       
363        mov word ptr [edi+2],ax
364        add bh,16
365
366      skip_pixel_2:
367        movzx eax,word ptr [esi*2]
368        add edx,dword ptr [dtdx_frac]
369   
370        sbb ebp,ebp
371        mov bl,ah
372   
373        and eax,4095
374        add ecx,dword ptr [dsdx_frac]
375
376        adc esi,dword ptr [4+s_t_carry+ebp*4]
377        and bl,240
378   
379        movzx eax,word ptr [alpha_table+eax*2]
380        add bh,bl
381
382        jnc skip_pixel_3
383       
384        mov word ptr [edi+4],ax
385        add bh,16
386
387      skip_pixel_3: 
388        movzx eax,word ptr [esi*2]
389        add edx,dword ptr [dtdx_frac]
390   
391        sbb ebp,ebp
392        mov bl,ah
393   
394        and eax,4095
395        add ecx,dword ptr [dsdx_frac]
396
397        adc esi,dword ptr [4+s_t_carry+ebp*4]
398        and bl,240
399   
400        movzx eax,word ptr [alpha_table+eax*2]
401        add bh,bl
402
403        jnc skip_pixel_4
404
405        mov word ptr [edi+6],ax
406        add bh,16
407
408      skip_pixel_4:
409        add edi,8
410        dec cl //thank god this doesnt modify the carry flag (the above add ch,dl needs to effect the adc bh,dh at the top of the loop)
411       
412        jnz looper1
413        pop ebp
414
415        mov byte ptr [last_alpha_accumulated],bh //save it
416
417        //store right_s and right_t in left_s and left_t
418        //right_s is what left_s starts at on the next 16 pixel span
419        //right_t is what left_t starts at on the next 16 pixel span
420
421        mov eax,dword ptr [right_s_t] //right_s
422        mov ebx,dword ptr [right_s_t+4] //right_t
423
424        mov dword ptr [left_s_t],eax //left_s
425        mov dword ptr [left_s_t+4],ebx //left_t
426      }
427         
428      _asm dec dword ptr [num_subdivisions]
429    }
430   
431    //store these so that the C code below actually works
432    _asm mov dword ptr [start_pixel],edi
433  }
434   
435  if (num_leftover)
436  {       
437    if (num_leftover > 1)
438    {     
439      if (had_subdivisions==0)
440      {
441        //calculate the right_z for the end of span
442        //ooz_right = left->ooz + (cur_grads.doozdx * num_leftover);
443        //soz_right = left->soz + (cur_grads.dsozdx * num_leftover);
444        //toz_right = left->toz + (cur_grads.dtozdx * num_leftover);
445
446        _asm
447        {
448          movd mm2,dword ptr [num_leftover]
449          lea ebx,dword ptr [cur_grads]
450         
451          movd mm3, dword ptr [ebx]tri_gradients.dsozdx
452          mov edi,dword ptr [left]
453
454          movd mm4, dword ptr [ebx]tri_gradients.dtozdx
455          pi2fd (m2, m2)
456         
457          movd mm5, dword ptr [ebx]tri_gradients.doozdx
458          pfmul (m3, m2)
459         
460          movq mm0, qword ptr [edi]perspective_span.soz
461          pfmul (m4, m2)
462
463          movd mm1, dword ptr [edi]perspective_span.ooz
464          pfmul (m5, m2)         
465         
466          pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
467         
468          pfadd (m1, m5) //ooz += doozdx*num_leftover
469          pfadd (m0, m3) //soz += dsozdx*num_leftover AND toz += dtozdx*num_leftover
470
471          //calculate the z at the right endpoint in mm7
472          movq mm7, mm1
473          pfrcp (m6, m1)
474
475          punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
476
477          pfrcpit1 (m7, m6) //terrible stalls. oh well
478       
479          pfrcpit2 (m7, m6)
480        }
481      }
482      else
483      {
484        //the correct ending right_z is already being calculated
485        //(see the if (num_subdivisions!=1) case above
486      }
487
488      _asm
489      {
490        //calculate starting fractional and integral values for s and t           
491       
492        //calculate the right endpoint
493        //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;
494        //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;
495       
496        //soz_right and toz_right are in mm0
497        //right_z is in mm7
498        pfmul (m7, m0) //calculate right_s and right_t
499        mov edi,dword ptr [start_pixel]
500
501        mov esi,dword ptr [r1_software_texture_ptr]
502        mov eax,dword ptr [left_s_t] //left_s
503
504        shr esi,1
505        pf2id (m7, m7) //truncate right_s and right_t
506       
507        sar eax,16
508        mov ebx,dword ptr [left_s_t+4] //left_t
509           
510        sar ebx,16
511        movq qword ptr [right_s_t],mm7
512
513        mov edx,dword ptr [left_s_t+4] //left_t
514        add esi,eax
515       
516        mov cl,byte ptr [r1_software_twidth_log2]
517        shl ebx,cl
518     
519        sal edx,16
520        mov ecx,dword ptr [left_s_t] //left_s
521     
522        sal ecx,16
523        add esi,ebx
524
525        mov eax,dword ptr [right_s_t] //right_s
526        mov ebx,dword ptr [right_s_t+4] //right_t
527       
528        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
529        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
530 
531        //cap the right s and t
532        cmp eax,0
533        jge cmp_eax_high_2
534
535        mov eax,0
536        jmp cmp_ebx_low_2
537
538      cmp_eax_high_2:
539        cmp eax,dword ptr [s_mask]
540        jle cmp_ebx_low_2
541
542        mov eax,dword ptr [s_mask]
543
544      cmp_ebx_low_2:
545        cmp ebx,0
546        jge cmp_ebx_high_2
547
548        mov ebx,0
549        jmp done_compare_2
550     
551      cmp_ebx_high_2:
552        cmp ebx,dword ptr [t_mask]
553        jle done_compare_2
554
555        mov ebx,dword ptr [t_mask]
556
557      done_compare_2:
558           
559        //calculate the deltas (left to right)
560        //temp_dsdx = qftoi((float)(right_s - left_s) * inverse_leftover_lookup[num_leftover]);
561        //temp_dtdx = qftoi((float)(right_t - left_t) * inverse_leftover_lookup[num_leftover]);
562
563        sub eax,dword ptr [left_s_t] //left_s
564        sub ebx,dword ptr [left_s_t+4] //left_t
565
566        movd mm0,eax //temp_dsdx
567        push ebp
568       
569        movd mm1,ebx //temp_dtdx
570        mov ebp, dword ptr [num_leftover]
571       
572        pi2fd (m0, m0)
573        movd mm2, dword ptr [inverse_leftover_lookup + ebp*4]
574       
575        pi2fd (m1, m1)       
576        pfmul (m0, m2)
577
578        pfmul (m1, m2) //bad stalls here
579        pf2id (m0, m0)
580
581        pf2id (m1, m1)
582
583        movd eax, mm0 //temp_dsdx
584        movd ebx, mm1 //temp_dtdx
585
586        //calculate the fractional and integral delta vars
587        //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;
588        //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);
589        //dsdx_frac    = (temp_dsdx<<16);
590        //dtdx_frac    = (temp_dtdx<<16);
591
592        mov word ptr [dsdx_frac+2],ax
593        mov word ptr [dtdx_frac+2],bx
594
595        sar eax,16
596        mov dx,word ptr [left_l]
597
598        sar ebx,16
599        mov cl,byte ptr [r1_software_twidth_log2]
600       
601        shl ebx,cl
602
603        add eax,ebx
604        mov bl,byte ptr [last_alpha_accumulated]
605
606        mov dword ptr [s_t_carry+4],eax
607        add eax,dword ptr [r1_software_texture_width]
608       
609        mov dword ptr [s_t_carry],eax
610        mov cl, byte ptr [num_leftover]
611
612        ALIGN 16
613
614      looper3:
615        movzx eax,word ptr [esi*2]
616        add edx,dword ptr [dtdx_frac]
617
618        sbb ebp,ebp
619        add ecx,dword ptr [dsdx_frac]
620
621        adc esi,dword ptr [4+s_t_carry+ebp*4]
622        add bl,ah
623
624        jnc skip_a_pixel
625
626        and eax,4095
627        mov ax,word ptr [alpha_table+eax*2]
628        mov word ptr [edi],ax
629
630      skip_a_pixel:
631        and bl,240
632        add edi,2
633
634        dec cl
635        jnz looper3
636
637        pop ebp
638      }
639    }
640    else
641    {
642      //highly unoptimized single pixel drawer
643      register w16 texel = *( r1_software_texture_ptr + (left_s_t[0]>>16) + ((left_s_t[1]>>16) << r1_software_twidth_log2) );
644
645      if (texel & (15<<12) == (15<<12))
646      {
647        //*start_pixel = alpha_table[texel & 4095];
648      }
649    }
650  }
651 
652  return;
653
654  _asm
655  {
656  dumpmmxregs:
657    movq qword ptr [mmx0],mm0
658    movq qword ptr [mmx1],mm1
659    movq qword ptr [mmx2],mm2
660    movq qword ptr [mmx3],mm3
661    movq qword ptr [mmx4],mm4
662    movq qword ptr [mmx5],mm5
663    movq qword ptr [mmx6],mm6
664    movq qword ptr [mmx7],mm7
665    ret
666  }
667
668}
669
Note: See TracBrowser for help on using the repository browser.