source: golgotha/src/render/software/perspective_map_unlit_alpha_asm.cc @ 80

Last change on this file since 80 was 80, checked in by Sam Hocevar, 11 years ago
  • Adding the Golgotha source code. Not sure what's going to be interesting in there, but since it's all public domain, there's certainly stuff to pick up.
File size: 16.6 KB
Line 
1/********************************************************************** <BR>
2  This file is part of Crack dot Com's free source code release of
3  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
4  information about compiling & licensing issues visit this URL</a>
5  <PRE> If that doesn't help, contact Jonathan Clark at
6  golgotha_source@usa.net (Subject should have "GOLG" in it)
7***********************************************************************/
8
9#include "software/r1_software_globals.hh"
10#include "software/inline_fpu.hh"
11
12extern sw32 had_subdivisions;
13static w8 last_alpha_accumulated;
14
15void texture_scanline_perspective_unlit_alpha(w16 *start_pixel,
16                                              sw32 start_x,
17                                              void *_left,//perspective_span *left,
18                                              sw32 width)
19{
20  start_pixel = (w16 *)((w8 *)start_pixel + start_x);
21
22  perspective_span *left = (perspective_span *)_left;
23
24  last_alpha_accumulated = 16;
25
26  _asm
27  {
28    //left_z = 1.f / left->ooz;
29    //left_s = qftoi(left->soz * left_z) + cur_grads.s_adjust;
30    //left_t = qftoi(left->toz * left_z) + cur_grads.t_adjust;
31   
32    //sw32 had_subdivisions = width & (~15);
33    //num_subdivisions = width >> 4;
34    //num_leftover     = width & 15;
35   
36    mov esi,dword ptr [left]
37    mov eax,dword ptr [width]
38
39    fld1
40    fdiv qword ptr [esi]perspective_span.ooz
41
42    mov ebx,eax
43    and eax,15
44
45    shr ebx,4
46    mov ecx,width
47
48    and ecx,(~15)
49    mov dword ptr [num_leftover],eax
50   
51    mov dword ptr [num_subdivisions],ebx
52    mov dword ptr [had_subdivisions],ecx
53   
54    fld st(0)
55       
56    fmul dword ptr [esi]perspective_span.soz
57    fxch st(1)
58       
59    fmul dword ptr [esi]perspective_span.toz
60    fxch st(1)
61
62    fistp dword ptr [left_s]
63    fistp dword ptr [left_t]
64
65    mov eax,dword ptr [cur_grads].s_adjust
66    mov ebx,dword ptr [cur_grads].t_adjust
67
68    add eax,dword ptr [left_s]
69    add ebx,dword ptr [left_t]
70
71    mov dword ptr [left_s],eax
72    mov dword ptr [left_t],ebx
73   
74    //clear these out
75    mov dword ptr [dsdx_frac],0
76    mov dword ptr [dtdx_frac],0
77  }
78
79  if (num_subdivisions)
80  {
81    _asm
82    {
83      //ooz_right = left->ooz + (cur_grads.doozdxspan);
84      //soz_right = left->soz + (cur_grads.dsozdxspan);
85      //toz_right = left->toz + (cur_grads.dtozdxspan);
86
87      mov esi,dword ptr [left]
88      mov edi,dword ptr [start_pixel]
89
90      fld qword ptr [esi]perspective_span.ooz
91      fld dword ptr [esi]perspective_span.soz
92      fld dword ptr [esi]perspective_span.toz
93
94      //t s o
95      fadd dword ptr [cur_grads]tri_gradients.dtozdxspan
96      fxch st(2)
97
98      //o s t
99
100      fadd qword ptr [cur_grads]tri_gradients.doozdxspan
101      fxch st(1)
102
103      //s o t
104
105      fadd dword ptr [cur_grads]tri_gradients.dsozdxspan
106      fxch st(2)
107
108      //t o s
109
110      fstp dword ptr [toz_right]
111      fxch st(1)
112
113      //s o
114
115      fstp dword ptr [soz_right]
116
117      fstp dword ptr [ooz_right]
118     
119      //calculate the 1st right_z
120      fld1
121      fdiv dword ptr [ooz_right]
122
123      //calculate starting fractional and integral values for s and t
124      //esi = starting_s_coordinate >> 16 + starting_t_coordinate >> 16 << r1_software_twidth_log2
125      //ecx = starting_s_coordinate << 16
126      //edx = starting_t_coordinate << 16
127      //dx  = starting_light_value
128
129      mov esi,dword ptr [r1_software_texture_ptr]
130      mov eax,dword ptr [left_s]
131
132      shr esi,1
133      mov ebx,dword ptr [left_t]
134   
135      sar eax,16
136      mov edx,dword ptr [left_t]
137
138      sar ebx,16
139      add esi,eax
140
141      mov cl,byte ptr [r1_software_twidth_log2]
142      shl ebx,cl
143     
144      sal edx,16
145      mov ecx,dword ptr [left_s]
146   
147      sal ecx,16
148      add esi,ebx
149    }
150
151    while (num_subdivisions)
152    {
153      _asm
154      {
155        //right_s = qftoi(soz_right * right_z);
156        //right_t = qftoi(toz_right * right_z);
157       
158        //right_z is in st0
159        fld st(0)
160       
161        fmul dword ptr [soz_right]
162        fxch st(1)
163       
164        fmul dword ptr [toz_right]
165        fxch st(1)
166
167        fistp dword ptr [right_s]
168        fistp dword ptr [right_t]
169
170      //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are
171      //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover
172      //in the leftover span, calculate the end of that.
173
174      //if (num_subdivisions!=1)
175      //{
176          cmp dword ptr [num_subdivisions],1
177          je  last_subdivision
178       
179          //ooz_right += (cur_grads.doozdxspan);
180          //soz_right += (cur_grads.dsozdxspan);
181          //toz_right += (cur_grads.dtozdxspan);
182         
183          fld dword ptr [ooz_right]
184          fadd qword ptr [cur_grads]tri_gradients.doozdxspan
185
186          fld dword ptr [soz_right]
187          fadd dword ptr [cur_grads]tri_gradients.dsozdxspan
188                                   
189          fld dword ptr [toz_right]
190          fadd dword ptr [cur_grads]tri_gradients.dtozdxspan
191
192          fxch st(2)
193          fstp dword ptr [ooz_right]
194
195          fstp dword ptr [soz_right]
196
197          fstp dword ptr [toz_right]
198
199          fld1
200          fdiv dword ptr [ooz_right]
201
202          jmp not_last_subdivision
203      //}
204      //else
205      //if (num_leftover > 1)
206      //{
207
208      last_subdivision:
209          cmp dword ptr [num_leftover],1
210          jle not_last_subdivision
211       
212          //calculate the right_z for the end of the leftover span
213          //ooz_right += (cur_grads.doozdx * num_leftover);
214          //soz_right += (cur_grads.dsozdx * num_leftover);
215          //toz_right += (cur_grads.dtozdx * num_leftover);
216
217          fild dword ptr [num_leftover]
218         
219          //todo: pipeline these fpu ops
220          fld  qword ptr [cur_grads]tri_gradients.doozdx
221          fmul st(0),st(1)
222          fadd dword ptr [ooz_right]
223          fstp dword ptr [ooz_right]
224
225          fld  dword ptr [cur_grads]tri_gradients.dsozdx
226          fmul st(0),st(1)
227          fadd dword ptr [soz_right]
228          fstp dword ptr [soz_right]
229
230          fld  dword ptr [cur_grads]tri_gradients.dtozdx
231          fmul st(0),st(1)
232          fadd dword ptr [toz_right]
233          fstp dword ptr [toz_right]
234
235          fstp st(0) //nifty thing i found, a 1 cycle fpu pop
236       
237          fld1
238          fdiv dword ptr [ooz_right]
239      //}
240           
241      not_last_subdivision:
242        //cap the right_s and right_t's so that they're valid
243
244        mov eax,dword ptr [right_s]
245        mov ebx,dword ptr [right_t]
246       
247        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
248        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
249 
250        //cap the right s and t
251        cmp eax,0
252        jge cmp_eax_high
253
254        mov eax,0
255        jmp cmp_ebx_low
256
257      cmp_eax_high:
258        cmp eax,dword ptr [s_mask]
259        jle cmp_ebx_low
260
261        mov eax,dword ptr [s_mask]
262
263      cmp_ebx_low:
264        cmp ebx,0
265        jge cmp_ebx_high
266
267        mov ebx,0
268        jmp done_compare
269     
270      cmp_ebx_high:
271        cmp ebx,dword ptr [t_mask]
272        jle done_compare
273
274        mov ebx,dword ptr [t_mask]
275
276      done_compare:
277
278        //store the right_s and right_t
279        //so they can be copied into left_s and left_t at the end of the 16-pixel span
280        //(the cant be copied now because we have to calculate (right_s-left_s)>>4 and (right_t-left_t)>>4
281       
282        mov dword ptr [right_s],eax
283        mov dword ptr [right_t],ebx
284
285        sub eax,dword ptr [left_s]
286        push ebp
287
288        sar eax,4
289        sub ebx,dword ptr [left_t]
290       
291        sar ebx,4
292        mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_s-left_s)>>4)<<16
293       
294        sar eax,16
295        mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_t-left_t)>>4)<<16
296       
297        sar ebx,16
298        mov cl,byte ptr [r1_software_twidth_log2]
299       
300        shl ebx,cl
301
302        add eax,ebx
303        mov ebx,0
304
305        //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2
306        //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width
307
308        mov dword ptr [s_t_carry+4],eax
309        add eax,dword ptr [r1_software_texture_width]
310               
311        mov dword ptr [s_t_carry],eax       
312        mov eax,0 //must make sure the high bits of these are zeroed out
313
314        mov cl,4 //loop is unrolled to 4 pixels - we want to draw 16, so loop 4 times
315        mov bh,byte ptr [last_alpha_accumulated]
316
317        ALIGN 16
318
319        //high 16 bits of ecx is the fractional s component
320        //high 16 bits of edx is the fractional t component
321
322        //eax is used to lookup the texel as well as the low 8-bits of the lit texel
323        //ebx is used to lookup the high 8-bits of the lit texel
324        //ebp is used to detect a t-carry as well as lookup the lit texel
325        //cl  is the loop count variable
326
327    looper1:
328        mov ax,word ptr [esi*2]
329        add edx,dword ptr [dtdx_frac]
330   
331        sbb ebp,ebp
332        mov bl,ah
333   
334        and eax,4095
335        add ecx,dword ptr [dsdx_frac]
336
337        adc esi,dword ptr [4+s_t_carry+ebp*4]
338        and bl,240
339   
340        mov ax,word ptr [alpha_table+eax*2]
341        add bh,bl
342
343        jnc skip_pixel_1
344
345        mov word ptr [edi],ax
346        add bh,16
347
348      skip_pixel_1:
349        mov ax,word ptr [esi*2]
350        add edx,dword ptr [dtdx_frac]
351   
352        sbb ebp,ebp
353        mov bl,ah
354   
355        and eax,4095
356        add ecx,dword ptr [dsdx_frac]
357
358        adc esi,dword ptr [4+s_t_carry+ebp*4]
359        and bl,240
360   
361        mov ax,word ptr [alpha_table+eax*2]
362        add bh,bl
363
364        jnc skip_pixel_2
365       
366        mov word ptr [edi+2],ax
367        add bh,16
368
369      skip_pixel_2:
370        mov ax,word ptr [esi*2]
371        add edx,dword ptr [dtdx_frac]
372   
373        sbb ebp,ebp
374        mov bl,ah
375   
376        and eax,4095
377        add ecx,dword ptr [dsdx_frac]
378
379        adc esi,dword ptr [4+s_t_carry+ebp*4]
380        and bl,240
381   
382        mov ax,word ptr [alpha_table+eax*2]
383        add bh,bl
384
385        jnc skip_pixel_3
386       
387        mov word ptr [edi+4],ax
388        add bh,16
389
390      skip_pixel_3: 
391        mov ax,word ptr [esi*2]
392        add edx,dword ptr [dtdx_frac]
393   
394        sbb ebp,ebp
395        mov bl,ah
396   
397        and eax,4095
398        add ecx,dword ptr [dsdx_frac]
399
400        adc esi,dword ptr [4+s_t_carry+ebp*4]
401        and bl,240
402   
403        mov ax,word ptr [alpha_table+eax*2]
404        add bh,bl
405
406        jnc skip_pixel_4
407
408        mov word ptr [edi+6],ax
409        add bh,16
410
411      skip_pixel_4:
412        add edi,8
413        dec cl //thank god this doesnt modify the carry flag (the above add ch,dl needs to effect the adc bh,dh at the top of the loop)
414     
415        jnz looper1
416        pop ebp
417
418        mov byte ptr [last_alpha_accumulated],bh
419
420        //store right_s and right_s in left_s and left_t
421        //right_s is what left_s starts at on the next 16 pixel span
422        //right_t is what left_t starts at on the next 16 pixel span
423
424        mov eax,dword ptr [right_s]
425        mov ebx,dword ptr [right_t]
426
427        mov dword ptr [left_s],eax
428        mov dword ptr [left_t],ebx
429      }
430         
431      _asm dec dword ptr [num_subdivisions]
432    }
433   
434    //store these so that the C code below actually works
435    _asm mov dword ptr [start_pixel],edi
436  }
437   
438  if (num_leftover)
439  {       
440    if (num_leftover > 1)
441    {     
442      if (had_subdivisions==0)
443      {
444        //calculate the right_z for the end of span
445        ooz_right = left->ooz + (cur_grads.doozdx * num_leftover);
446        soz_right = left->soz + (cur_grads.dsozdx * num_leftover);
447        toz_right = left->toz + (cur_grads.dtozdx * num_leftover);
448       
449        //calculate the z at the right endpoint
450        _asm fld1
451        _asm fdiv dword ptr [ooz_right]
452      }
453      else
454      {
455        //the correct ending right_z is already being calculated
456        //(see the if (num_subdivisions!=1) case above
457      }
458
459      _asm
460      {
461        //calculate starting fractional and integral values for s and t           
462       
463        mov esi,dword ptr [r1_software_texture_ptr]
464        mov eax,dword ptr [left_s]
465
466        shr esi,1
467        mov ebx,dword ptr [left_t]
468   
469        sar eax,16
470        mov edx,dword ptr [left_t]
471
472        sar ebx,16
473        add esi,eax
474
475        mov cl,byte ptr [r1_software_twidth_log2]
476        shl ebx,cl
477     
478        sal edx,16
479        mov ecx,dword ptr [left_s]
480     
481        sal ecx,16
482        add esi,ebx
483       
484        mov edi,dword ptr [start_pixel]
485
486        //calculate the right endpoint
487        //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;
488        //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;
489       
490        //right_z is in st0
491        fld st(0)
492       
493        fmul dword ptr [soz_right]
494        fxch st(1)
495       
496        fmul dword ptr [toz_right]
497        fxch st(1)
498
499        fistp dword ptr [right_s]
500        fistp dword ptr [right_t]
501
502        mov eax,dword ptr [right_s]
503        mov ebx,dword ptr [right_t]
504       
505        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
506        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
507 
508        //cap the right s and t
509        cmp eax,0
510        jge cmp_eax_high_2
511
512        mov eax,0
513        jmp cmp_ebx_low_2
514
515      cmp_eax_high_2:
516        cmp eax,dword ptr [s_mask]
517        jle cmp_ebx_low_2
518
519        mov eax,dword ptr [s_mask]
520
521      cmp_ebx_low_2:
522        cmp ebx,0
523        jge cmp_ebx_high_2
524
525        mov ebx,0
526        jmp done_compare_2
527     
528      cmp_ebx_high_2:
529        cmp ebx,dword ptr [t_mask]
530        jle done_compare_2
531
532        mov ebx,dword ptr [t_mask]
533
534      done_compare_2:
535           
536        //calculate the deltas (left to right)
537        //temp_dsdx = qftoi((float)(right_s - left_s) * inverse_leftover_lookup[num_leftover]);
538        //temp_dtdx = qftoi((float)(right_t - left_t) * inverse_leftover_lookup[num_leftover]);
539       
540        push ebp
541        mov ebp,num_leftover
542
543        sub eax,dword ptr [left_s]
544        sub ebx,dword ptr [left_t]
545
546        mov dword ptr [temp_dsdx],eax
547        mov dword ptr [temp_dtdx],ebx
548
549        fild dword ptr [temp_dsdx]
550        fild dword ptr [temp_dtdx]
551
552        fmul dword ptr [inverse_leftover_lookup + ebp*4]
553        fxch st(1)
554
555        fmul dword ptr [inverse_leftover_lookup + ebp*4]
556        fxch st(1)
557
558        fistp dword ptr [temp_dtdx]
559        fistp dword ptr [temp_dsdx]
560       
561        //calculate the fractional and integral delta vars
562        //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;
563        //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);
564        //dsdx_frac    = (temp_dsdx<<16);
565        //dtdx_frac    = (temp_dtdx<<16);
566
567        mov eax,dword ptr [temp_dsdx]
568        mov ebx,dword ptr [temp_dtdx]
569       
570        mov word ptr [dsdx_frac+2],ax
571        mov word ptr [dtdx_frac+2],bx
572
573        sar eax,16
574        nop //mov dx,word ptr [left_l]
575
576        sar ebx,16
577        mov cl,byte ptr [r1_software_twidth_log2]
578       
579        shl ebx,cl
580
581        add eax,ebx
582        nop //mov ebx,0
583
584        mov dword ptr [s_t_carry+4],eax
585        add eax,dword ptr [r1_software_texture_width]
586       
587        mov dword ptr [s_t_carry],eax
588        mov cl, byte ptr [num_leftover]
589       
590        mov eax,0
591        mov bl,byte ptr [last_alpha_accumulated]
592
593        ALIGN 16
594
595      looper3:
596        mov ax,word ptr [esi*2]
597        add edx,dword ptr [dtdx_frac]
598
599        sbb ebp,ebp
600        add ecx,dword ptr [dsdx_frac]
601
602        adc esi,dword ptr [4+s_t_carry+ebp*4]
603        add bl,ah
604
605        jnc skip_a_pixel
606
607        and eax,4095
608        mov ax,word ptr [alpha_table+eax*2]
609        mov word ptr [edi],ax
610
611      skip_a_pixel:
612        and bl,240
613        add edi,2
614
615        dec cl
616        jnz looper3
617
618        pop ebp
619      }
620    }
621    else
622    {
623      //highly unoptimized single pixel drawer
624      register w16 texel = *( r1_software_texture_ptr + (left_s>>16) + ((left_t>>16) << r1_software_twidth_log2) );
625
626      if (texel & (15<<12) == (15<<12))
627      {
628        //*start_pixel = alpha_table[texel & 4095];
629      }
630    }
631  }
632}
Note: See TracBrowser for help on using the repository browser.