source: golgotha/src/render/software/perspective_map_lit_asm.cc

Last change on this file was 80, checked in by Sam Hocevar, 11 years ago
  • Adding the Golgotha source code. Not sure what's going to be interesting in there, but since it's all public domain, there's certainly stuff to pick up.
File size: 19.1 KB
Line 
1/********************************************************************** <BR>
2  This file is part of Crack dot Com's free source code release of
3  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
4  information about compiling & licensing issues visit this URL</a>
5  <PRE> If that doesn't help, contact Jonathan Clark at
6  golgotha_source@usa.net (Subject should have "GOLG" in it)
7***********************************************************************/
8
9#include "software/r1_software_globals.hh"
10#include "software/inline_fpu.hh"
11
// Returns the address of bogus_label, i.e. a code address at the top of this
// function's body, obtained with LEA in inline assembly.
// setup_color_modify_perspective_lit() uses the result as the starting point of
// its byte-wise search for 0xDEADBEEF placeholders.
// NOTE(review): this only brackets the scanline code if the compiler/linker emit
// the functions of this file in source order — confirm for the toolchain in use.
12w32 *texture_perspective_lit_starter()
13{
14  bogus_label:
15
16  w32 returnval;
17  _asm
18  {
19    lea eax,bogus_label //eax = address of the label above
20    mov dword ptr [returnval],eax
21  }
22  return (w32 *)returnval; //the code address is handed back as a w32 pointer
23}
24
// Written by texture_scanline_perspective_lit() as (width & ~15): non-zero when the
// current span contained at least one full 16-pixel subdivision. The leftover-pixel
// path reads it to decide whether right_z still has to be computed.
25sw32 had_subdivisions;
// Lighting dither carry handed from one pixel run to the next. It is reset to 0 at the
// start of every call, reloaded into bh before each run and saved again after each
// 16-pixel subdivision.
26static w8 last_bh2;
27
// Draws one horizontal span of a perspective-mapped, lit texture into 16-bit pixels.
//
// Parameters:
//   start_pixel - base of the destination pixels; start_x is added to it below.
//   start_x     - offset added to start_pixel in BYTES (the add goes through a w8 cast).
//   _left       - really a perspective_span *; its ooz, soz, toz and l fields are read.
//   width       - pixel count. It is split into (width >> 4) subdivisions of 16 pixels
//                 plus (width & 15) leftover pixels.
//
// Method, as implemented below:
//   * A divide (1/ooz) is done only at the ends of each subdivision; s and t are
//     stepped linearly in between. s and t are handled as 16.16 fixed point: the
//     integer part is taken with "sar 16", the fraction with "sal 16".
//   * Per pixel, one 16-bit texel is fetched. Its low and high bytes, each combined
//     with the current light level, index two dword tables addressed through the
//     immediates 0xDEADBEEF and 0xDEADBEEF+ctable_size_bytes. The two entries are
//     added and the low 16 bits are stored as the pixel.
//   * setup_color_modify_perspective_lit() searches the compiled code for those two
//     immediates and registers their addresses — presumably so they can be rewritten
//     with the real table address at run time; confirm against the patching code.
//
// Everything referenced here that is not a parameter or a local (cur_grads, left_s,
// left_t, left_l, right_s, right_t, ooz_right, soz_right, toz_right, num_subdivisions,
// num_leftover, dsdx_frac, dtdx_frac, dldx_fixed, s_t_carry, s_mask, t_mask, temp_dsdx,
// temp_dtdx, inverse_leftover_lookup, r1_software_*) is declared outside this file —
// presumably in software/r1_software_globals.hh; verify.
//
// NOTE(review): register contents (esi, edi, ecx, edx) and the x87 stack top are
// carried from one _asm block to the next across intervening C statements. This
// relies on the compiler not touching them in between.
28void texture_scanline_perspective_lit(w16 *start_pixel,
29                                      sw32 start_x,
30                                      void *_left,//perspective_span *left,
31                                      sw32 width)
32{
33  start_pixel = (w16 *)((w8 *)start_pixel + start_x); //start_x is applied as a byte offset
34
35  perspective_span *left = (perspective_span *)_left;
36 
37  last_bh2 = 0; //no dither carry pending at the start of the span
38
39  _asm
40  {
41    //left_z = 1.f / left->ooz;
42    //left_s = qftoi(left->soz * left_z) + cur_grads.s_adjust;
43    //left_t = qftoi(left->toz * left_z) + cur_grads.t_adjust;
44   
45    //sw32 had_subdivisions = width & (~15);
46    //num_subdivisions = width >> 4;
47    //num_leftover     = width & 15;
48   
49    mov esi,dword ptr [left]
50    mov eax,dword ptr [width]
51
52    fld1
53    fdiv qword ptr [esi]perspective_span.ooz //st0 = left_z; ooz is read as a 64-bit value
54
55    mov ebx,eax
56    and eax,15
57
58    shr ebx,4
59    mov ecx,width
60
61    and ecx,(~15)
62    mov dword ptr [num_leftover],eax
63   
64    //don't forget the lighting
65    //left_l = left->l;
66
67    mov eax,dword ptr [esi]perspective_span.l
68    mov dword ptr [num_subdivisions],ebx
69   
70    mov dword ptr [had_subdivisions],ecx
71    mov dword ptr [left_l],eax
72   
73    fld st(0) //duplicate left_z: one copy for s, one for t
74       
75    fmul dword ptr [esi]perspective_span.soz
76    fxch st(1)
77       
78    fmul dword ptr [esi]perspective_span.toz
79    fxch st(1)
80
81    fistp dword ptr [left_s] //both products are popped, leaving the fpu stack as it was on entry
82    fistp dword ptr [left_t]
83
84    mov eax,dword ptr [cur_grads].s_adjust
85    mov ebx,dword ptr [cur_grads].t_adjust
86
87    add eax,dword ptr [left_s]
88    add ebx,dword ptr [left_t]
89
90    mov dword ptr [left_s],eax
91    mov dword ptr [left_t],ebx
92   
93    //clear these out (only their upper 16 bits are written later on; the low 16 bits must stay 0)
94    mov dword ptr [dsdx_frac],0
95    mov dword ptr [dtdx_frac],0
96  }
97
98  if (num_subdivisions)
99  {
100    _asm
101    {
102      //ooz_right = left->ooz + (cur_grads.doozdxspan);
103      //soz_right = left->soz + (cur_grads.dsozdxspan);
104      //toz_right = left->toz + (cur_grads.dtozdxspan);
105
106      mov esi,dword ptr [left]
107      mov edi,dword ptr [start_pixel]
108
109      fld qword ptr [esi]perspective_span.ooz
110      fld dword ptr [esi]perspective_span.soz
111      fld dword ptr [esi]perspective_span.toz
112
113      //t s o
114      fadd dword ptr [cur_grads]tri_gradients.dtozdxspan
115      fxch st(2)
116
117      //o s t
118
119      fadd qword ptr [cur_grads]tri_gradients.doozdxspan
120      fxch st(1)
121
122      //s o t
123
124      fadd dword ptr [cur_grads]tri_gradients.dsozdxspan
125      fxch st(2)
126
127      //t o s
128
129      fstp dword ptr [toz_right]
130      fxch st(1)
131
132      //s o
133
134      fstp dword ptr [soz_right]
135
136      fstp dword ptr [ooz_right] //ooz_right is kept as a 32-bit value from here on
137     
138      //calculate the 1st right_z (left in st0 for the loop below)
139      fld1
140      fdiv dword ptr [ooz_right]
141
142      //calculate starting fractional and integral values for s and t
143      //esi = (r1_software_texture_ptr >> 1) + (starting_s_coordinate >> 16) + ((starting_t_coordinate >> 16) << r1_software_twidth_log2)
144      //ecx = starting_s_coordinate << 16
145      //edx = starting_t_coordinate << 16
146      //dx  = starting_light_value
147
148      mov esi,dword ptr [r1_software_texture_ptr]
149      mov eax,dword ptr [left_s]
150
151      shr esi,1 //texels are fetched as word ptr [esi*2], so esi holds the address divided by 2
152      mov ebx,dword ptr [left_t]
153   
154      sar eax,16
155      mov edx,dword ptr [left_t]
156
157      sar ebx,16
158      add esi,eax
159
160      mov cl,byte ptr [r1_software_twidth_log2]
161      shl ebx,cl
162     
163      sal edx,16
164      mov ecx,dword ptr [left_s]
165   
166      sal ecx,16
167      add esi,ebx
168
169      mov dx,word ptr [left_l]
170      mov ch,dl //store the initial lighting error from the 1st lighting value
171    }
172
      //esi, edi, ecx, edx and the right_z in st0 set up above are consumed by the _asm block inside this loop
173    while (num_subdivisions)
174    {
175      _asm
176      {
177        //right_s = qftoi(soz_right * right_z);
178        //right_t = qftoi(toz_right * right_z);
179       
180        //right_z is in st0
181        fld st(0)
182       
183        fmul dword ptr [soz_right]
184        fxch st(1)
185       
186        fmul dword ptr [toz_right]
187        fxch st(1)
188
189        fistp dword ptr [right_s]
190        fistp dword ptr [right_t]
191
192      //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are
193      //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover
194      //in the leftover span, calculate the end of that.
195
196      //if (num_subdivisions!=1)
197      //{
198          cmp dword ptr [num_subdivisions],1
199          je  last_subdivision
200       
201          //ooz_right += (cur_grads.doozdxspan);
202          //soz_right += (cur_grads.dsozdxspan);
203          //toz_right += (cur_grads.dtozdxspan);
204         
205          fld dword ptr [ooz_right]
206          fadd qword ptr [cur_grads]tri_gradients.doozdxspan
207
208          fld dword ptr [soz_right]
209          fadd dword ptr [cur_grads]tri_gradients.dsozdxspan
210                                   
211          fld dword ptr [toz_right]
212          fadd dword ptr [cur_grads]tri_gradients.dtozdxspan
213
214          fxch st(2) //stack was t s o; after the exchange it is o s t, matching the three stores below
215          fstp dword ptr [ooz_right]
216
217          fstp dword ptr [soz_right]
218
219          fstp dword ptr [toz_right]
220
221          fld1
222          fdiv dword ptr [ooz_right] //next right_z stays in st0; nothing below waits on it until the next fld st(0)
223
224          jmp not_last_subdivision
225      //}
226      //else
227      //if (num_leftover > 1)
228      //{
229
230      last_subdivision:
231          cmp dword ptr [num_leftover],1
232          jle not_last_subdivision //0 or 1 leftover pixels: no further right_z is needed, the fpu stack stays empty
233       
234          //calculate the right_z for the end of the leftover span
235          //ooz_right += (cur_grads.doozdx * num_leftover);
236          //soz_right += (cur_grads.dsozdx * num_leftover);
237          //toz_right += (cur_grads.dtozdx * num_leftover);
238
239          fild dword ptr [num_leftover]
240         
241          //todo: pipeline these fpu ops
242          fld  qword ptr [cur_grads]tri_gradients.doozdx
243          fmul st(0),st(1)
244          fadd dword ptr [ooz_right]
245          fstp dword ptr [ooz_right]
246
247          fld  dword ptr [cur_grads]tri_gradients.dsozdx
248          fmul st(0),st(1)
249          fadd dword ptr [soz_right]
250          fstp dword ptr [soz_right]
251
252          fld  dword ptr [cur_grads]tri_gradients.dtozdx
253          fmul st(0),st(1)
254          fadd dword ptr [toz_right]
255          fstp dword ptr [toz_right]
256
257          fstp st(0) //nifty thing i found, a 1 cycle fpu pop (discards the num_leftover loaded by fild)
258       
259          fld1
260          fdiv dword ptr [ooz_right] //right_z for the leftover run, consumed by the leftover code after the loop
261      //}
262           
263      not_last_subdivision:
264        //cap the right_s and right_t's so that they're valid
265
266        mov eax,dword ptr [right_s]
267        mov ebx,dword ptr [right_t]
268       
269        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
270        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
271 
272        //cap the right s and t (signed clamp to [0, s_mask] and [0, t_mask])
273        cmp eax,0
274        jge cmp_eax_high
275
276        mov eax,0
277        jmp cmp_ebx_low
278
279      cmp_eax_high:
280        cmp eax,dword ptr [s_mask]
281        jle cmp_ebx_low
282
283        mov eax,dword ptr [s_mask]
284
285      cmp_ebx_low:
286        cmp ebx,0
287        jge cmp_ebx_high
288
289        mov ebx,0
290        jmp done_compare
291     
292      cmp_ebx_high:
293        cmp ebx,dword ptr [t_mask]
294        jle done_compare
295
296        mov ebx,dword ptr [t_mask]
297
298      done_compare:
299
300        //store the right_s and right_t
301        //so they can be copied into left_s and left_t at the end of the 16-pixel span
302        //(they can't be copied now because we have to calculate (right_s-left_s)>>4 and (right_t-left_t)>>4)
303       
304        mov dword ptr [right_s],eax
305        mov dword ptr [right_t],ebx
306
307        sub eax,dword ptr [left_s]
308        push ebp //ebp becomes a scratch register until the pop after the loop. NOTE(review): operands in between must not be ebp-relative locals — presumably they are all globals/statics; verify
309
310        sar eax,4
311        sub ebx,dword ptr [left_t]
312       
313        sar ebx,4
314        mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_s-left_s)>>4)<<16
315       
316        sar eax,16
317        mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_t-left_t)>>4)<<16
318       
319        sar ebx,16
320        mov cl,byte ptr [r1_software_twidth_log2]
321       
322        shl ebx,cl
323
324        add eax,ebx
325        mov ebx,0
326
327        //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2
328        //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width
329
330        mov dword ptr [s_t_carry+4],eax
331        add eax,dword ptr [r1_software_texture_width]
332               
333        mov dword ptr [s_t_carry],eax       
334        mov eax,0 //must make sure the high bits of these are zeroed out
335
336        mov cl,4 //loop is unrolled to 4 pixels - we want to draw 16, so loop 4 times
337        mov bh,byte ptr [last_bh2] //get the dither back into bh
338
339        add ch,0 //clear the carry bit. clc is 2 cycles. dumb.
340        ALIGN 16
341
342        //high 16 bits of ecx is the fractional s component
343        //high 16 bits of edx is the fractional t component
344
345        //eax is used to lookup the texel as well as the low 8-bits of the lit texel
346        //ebx is used to lookup the high 8-bits of the lit texel
347        //ebp is used to detect a t-carry as well as lookup the lit texel
348        //cl  is the loop count variable
349        //dx  is the lighting value (8 bits integer, 8 bits fraction)
350        //ch  is the lighting error
351        //bh  is used to dither the lighting (mov bh,0 then add ch,dl then adc bh,dh)
352
        //per pixel: ax = texel, bl = its high byte, ah = bh = dithered light level, so
        //eax = light:low byte and ebx = light:high byte index the two tables.
        //sbb ebp,ebp turns the t-fraction carry into 0 or -1, which selects
        //s_t_carry[1] (no carry) or s_t_carry[0] (carry, one extra texture row);
        //the adc also adds the s-fraction carry produced by add ecx,[dsdx_frac].
353    looper1:
354        adc bh,dh
355        add edi,8 //the only convenient place for the stepping of edi was way up here
356
357        mov ax,word ptr [esi*2]
358        add edx,dword ptr [dtdx_frac]
359 
360        sbb ebp,ebp
361        mov bl,ah
362
363        add ecx,dword ptr [dsdx_frac]
364        mov ah,bh
365
366        adc esi,dword ptr [4+s_t_carry+ebp*4]
367        mov ebp,dword ptr [0xDEADBEEF+ctable_size_bytes+ebx*4]
368       
369        add ebp,dword ptr [0xDEADBEEF+eax*4]
370        add edx,dword ptr [dldx_fixed]
371
372        mov bh,0
373        add ch,dl
374
375        mov word ptr [edi-8],bp
376        adc bh,dh
377
378        mov ax,word ptr [esi*2]  //first pixel is finished around here
379        add edx,dword ptr [dtdx_frac]
380
381        sbb ebp,ebp
382        mov bl,ah
383               
384        add ecx,dword ptr [dsdx_frac]
385        mov ah,bh
386
387        adc esi,dword ptr [4+s_t_carry+ebp*4]
388        mov ebp,dword ptr [0xDEADBEEF+ctable_size_bytes+ebx*4]
389       
390        add ebp,dword ptr [0xDEADBEEF+eax*4]
391        add edx,dword ptr [dldx_fixed]
392             
393        mov bh,0
394        add ch,dl
395
396        mov word ptr [edi-6],bp
397        adc bh,dh
398
399        mov ax,word ptr [esi*2]
400        add edx,dword ptr [dtdx_frac]
401
402        sbb ebp,ebp
403        mov bl,ah
404
405        add ecx,dword ptr [dsdx_frac]
406        mov ah,bh
407
408        adc esi,dword ptr [4+s_t_carry+ebp*4]
409        mov ebp,dword ptr [0xDEADBEEF+ctable_size_bytes+ebx*4]
410       
411        add ebp,dword ptr [0xDEADBEEF+eax*4]
412        add edx,dword ptr [dldx_fixed]
413
414        mov bh,0
415        add ch,dl
416     
417        mov word ptr [edi-4],bp
418        adc bh,dh
419
420        mov ax,word ptr [esi*2]  //third pixel is finished around here
421        add edx,dword ptr [dtdx_frac]
422
423        sbb ebp,ebp
424        mov bl,ah
425
426        add ecx,dword ptr [dsdx_frac]
427        mov ah,bh
428
429        adc esi,dword ptr [4+s_t_carry+ebp*4]
430        mov ebp,dword ptr [0xDEADBEEF+ctable_size_bytes+ebx*4]
431       
432        add ebp,dword ptr [0xDEADBEEF+eax*4]
433        add edx,dword ptr [dldx_fixed]
434       
435        mov bh,0
436        add ch,dl
437
438        mov word ptr [edi-2],bp
439        dec cl //thank god this doesnt modify the carry flag (the above add ch,dl needs to affect the adc bh,dh at the top of the loop)
440     
441        jnz looper1
442        adc bh,0 //if we're done looping, save the last carry information here
443       
444        pop ebp
445        mov byte ptr [last_bh2],bh //save it
446
447        //store right_s and right_t in left_s and left_t
448        //right_s is what left_s starts at on the next 16 pixel span
449        //right_t is what left_t starts at on the next 16 pixel span
450
451        mov eax,dword ptr [right_s]
452        mov ebx,dword ptr [right_t]
453
454        mov dword ptr [left_s],eax
455        mov dword ptr [left_t],ebx
456      }
457         
458      _asm dec dword ptr [num_subdivisions]
459    }
460   
461    //store these so that the C code below actually works
462    _asm mov word ptr [left_l],dx
463    _asm mov dword ptr [start_pixel],edi
464  }
465   
466  if (num_leftover)
467  {       
468    if (num_leftover > 1)
469    {     
470      if (had_subdivisions==0)
471      {
472        //calculate the right_z for the end of span
473        ooz_right = left->ooz + (cur_grads.doozdx * num_leftover);
474        soz_right = left->soz + (cur_grads.dsozdx * num_leftover);
475        toz_right = left->toz + (cur_grads.dtozdx * num_leftover);
476       
477        //calculate the z at the right endpoint (left in st0 for the _asm block below)
478        _asm fld1
479        _asm fdiv dword ptr [ooz_right]
480      }
481      else
482      {
483        //the correct ending right_z is already being calculated
484        //(see the last_subdivision branch in the subdivision loop above; it left right_z in st0)
485      }
486
487      _asm
488      {
489        //calculate starting fractional and integral values for s and t           
490       
491        mov esi,dword ptr [r1_software_texture_ptr]
492        mov eax,dword ptr [left_s]
493
494        shr esi,1
495        mov ebx,dword ptr [left_t]
496   
497        sar eax,16
498        mov edx,dword ptr [left_t]
499
500        sar ebx,16
501        add esi,eax
502
503        mov cl,byte ptr [r1_software_twidth_log2]
504        shl ebx,cl
505     
506        sal edx,16
507        mov ecx,dword ptr [left_s]
508     
509        sal ecx,16
510        add esi,ebx
511       
512        mov edi,dword ptr [start_pixel]
513
514        //calculate the right endpoint
515        //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;
516        //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;
517       
518        //right_z is in st0
519        fld st(0)
520       
521        fmul dword ptr [soz_right]
522        fxch st(1)
523       
524        fmul dword ptr [toz_right]
525        fxch st(1)
526
527        fistp dword ptr [right_s]
528        fistp dword ptr [right_t]
529
530        mov eax,dword ptr [right_s]
531        mov ebx,dword ptr [right_t]
532       
533        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
534        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
535 
536        //cap the right s and t (signed clamp to [0, s_mask] and [0, t_mask])
537        cmp eax,0
538        jge cmp_eax_high_2
539
540        mov eax,0
541        jmp cmp_ebx_low_2
542
543      cmp_eax_high_2:
544        cmp eax,dword ptr [s_mask]
545        jle cmp_ebx_low_2
546
547        mov eax,dword ptr [s_mask]
548
549      cmp_ebx_low_2:
550        cmp ebx,0
551        jge cmp_ebx_high_2
552
553        mov ebx,0
554        jmp done_compare_2
555     
556      cmp_ebx_high_2:
557        cmp ebx,dword ptr [t_mask]
558        jle done_compare_2
559
560        mov ebx,dword ptr [t_mask]
561
562      done_compare_2:
563           
564        //calculate the deltas (left to right)
565        //temp_dsdx = qftoi((float)(right_s - left_s) * inverse_leftover_lookup[num_leftover]);
566        //temp_dtdx = qftoi((float)(right_t - left_t) * inverse_leftover_lookup[num_leftover]);
567       
568        push ebp //ebp is scratch from here until the pop after looper3
569        mov ebp,num_leftover //index into inverse_leftover_lookup
570
571        sub eax,dword ptr [left_s]
572        sub ebx,dword ptr [left_t]
573
574        mov dword ptr [temp_dsdx],eax
575        mov dword ptr [temp_dtdx],ebx
576
577        fild dword ptr [temp_dsdx]
578        fild dword ptr [temp_dtdx]
579
580        fmul dword ptr [inverse_leftover_lookup + ebp*4]
581        fxch st(1)
582
583        fmul dword ptr [inverse_leftover_lookup + ebp*4]
584        fxch st(1)
585
586        fistp dword ptr [temp_dtdx]
587        fistp dword ptr [temp_dsdx]
588       
589        //calculate the fractional and integral delta vars
590        //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;
591        //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);
592        //dsdx_frac    = (temp_dsdx<<16);
593        //dtdx_frac    = (temp_dtdx<<16);
594
595        mov eax,dword ptr [temp_dsdx]
596        mov ebx,dword ptr [temp_dtdx]
597       
598        mov word ptr [dsdx_frac+2],ax
599        mov word ptr [dtdx_frac+2],bx
600
601        sar eax,16
602        mov dx,word ptr [left_l]
603
604        sar ebx,16
605        mov cl,byte ptr [r1_software_twidth_log2]
606       
607        shl ebx,cl
608
609        add eax,ebx
610        mov ebx,0
611
612        mov dword ptr [s_t_carry+4],eax
613        add eax,dword ptr [r1_software_texture_width]
614       
615        mov dword ptr [s_t_carry],eax
616        mov cl, byte ptr [num_leftover] //loop count: one iteration per leftover pixel
617       
618        mov eax,0
619        mov ch,dl //setup the initial lighting error
620
621        mov bh,byte ptr [last_bh2] //setup the initial dither
622        add ch,0 //clear the carry bit
623
624        ALIGN 16
625
        //same per-pixel sequence as looper1, one pixel per iteration instead of four
626      looper3:
627        adc bh,dh
628        add edi,2
629
630        mov ax,word ptr [esi*2]
631        add edx,dword ptr [dtdx_frac]
632
633        sbb ebp,ebp
634        mov bl,ah
635
636        add ecx,dword ptr [dsdx_frac]
637        mov ah,bh
638
639        adc esi,dword ptr [4+s_t_carry+ebp*4]
640        mov ebp,dword ptr [0xDEADBEEF+ctable_size_bytes+ebx*4]
641
642        add ebp,dword ptr [0xDEADBEEF+eax*4]
643        add edx,dword ptr [dldx_fixed]
644       
645        mov bh,0
646        add ch,dl
647
648        mov word ptr [edi-2],bp
649        dec cl //thank god this doesnt modify the carry flag (the above add ch,dl needs to affect the adc bh,dh at the top of the loop)
650
651        jnz looper3
652
653        pop ebp
654      }
655    }
656    else
657    {
658      register w16 texel;
659      register w32 l_lookup;
660
661      //highly unoptimized single pixel drawer (exactly one leftover pixel; no dithering is applied here)
662      texel = *(r1_software_texture_ptr + (left_s>>16) + ((left_t>>16)<<r1_software_twidth_log2));
663         
664      l_lookup = left_l & (NUM_LIGHT_SHADES<<8); //NOTE(review): NUM_LIGHT_SHADES<<8 is used as a bit mask here — confirm its value makes that valid
665 
666      //                                        low bits                                  high bits
667      *start_pixel = (w16)(((w32 *)(0xDEADBEEF))[l_lookup + (texel & 0xFF)] + ((w32 *)(0xDEADBEEF)+ctable_size)[l_lookup + (texel>>8)]);
668    }
669  }
670}
671
// Returns the address of bogus_label, i.e. a code address at the top of this
// function's body, obtained with LEA in inline assembly.
// setup_color_modify_perspective_lit() uses the result as the end of its byte-wise
// search for 0xDEADBEEF placeholders.
// NOTE(review): this only marks the end of the scanline code if the compiler/linker
// emit the functions of this file in source order — confirm for the toolchain in use.
672w32 *texture_perspective_lit_sentinel()
673{
674  bogus_label:
675 
676  w32 returnval;
677  _asm
678  {
679    lea eax,bogus_label //eax = address of the label above
680    mov dword ptr [returnval],eax
681  }
682  return (w32 *)returnval; //the code address is handed back as a w32 pointer
683}
684
// Declarations for the color-modify bookkeeping, defined outside this file.
// setup_color_modify_perspective_lit() passes the two insert functions the addresses
// at which it found the 0xDEADBEEF and 0xDEADBEEF+ctable_size_bytes placeholders.
// color_modify_list and num_color_modifies are declared here but not referenced by
// the code visible in this file.
685void insert_color_modify_address_low(w32 *address);
686void insert_color_modify_address_high(w32 *address);
687extern w32 color_modify_list[];
688extern sw32 num_color_modifies;
689
// Walks the bytes between texture_perspective_lit_starter() and
// texture_perspective_lit_sentinel() one at a time and reads a 32-bit value at every
// byte offset. An address holding 0xDEADBEEF is passed to
// insert_color_modify_address_low(); an address holding 0xDEADBEEF + ctable_size_bytes
// is passed to insert_color_modify_address_high().
// NOTE(review): the pointer is advanced before the first comparison, so the byte at
// the starter address itself is never tested, and the final read starts at `stop`
// itself (up to 4 bytes at and beyond the sentinel address). Looks harmless, but confirm.
690void setup_color_modify_perspective_lit()
691{
692  w32 *stop = texture_perspective_lit_sentinel();
693
694  w32 *search = texture_perspective_lit_starter();
695  //start searching for 0xDEADBEEF
696  while (search < stop)
697  {
698    //casting craziness: step by one byte, because the placeholders are not dword aligned in the code
699    search = (w32 *)((w8 *)search + 1);
700    if (*search==0xDEADBEEF)
701    {
702      insert_color_modify_address_low(search);
703    }
704    else
705    if (*search==(0xDEADBEEF + ctable_size_bytes))
706    {
707      insert_color_modify_address_high(search);
708    }
709  }
710}
Note: See TracBrowser for help on using the repository browser.