source: golgotha/src/render/software/perspective_map_unlit_holy_asm.cc @ 80

Last change on this file since 80 was 80, checked in by Sam Hocevar, 11 years ago
  • Adding the Golgotha source code. Not sure what's going to be interesting in there, but since it's all public domain, there's certainly stuff to pick up.
File size: 15.8 KB
Line 
1/********************************************************************** <BR>
2  This file is part of Crack dot Com's free source code release of
3  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
4  information about compiling & licensing issues visit this URL</a>
5  <PRE> If that doesn't help, contact Jonathan Clark at
6  golgotha_source@usa.net (Subject should have "GOLG" in it)
7***********************************************************************/
8
9#include "software/r1_software_globals.hh"
10#include "software/inline_fpu.hh"
11
// Declared here, defined in another translation unit. The scanline routine
// below stores (width & ~15) into it and later tests it against 0 to decide
// whether the leftover span's right-hand 1/z still has to be computed.
12extern sw32 had_subdivisions;
13
// texture_scanline_perspective_unlit_holy
//
// Draws `width` pixels of one scanline from a 16-bit texture using
// perspective-correct s/t coordinates and no lighting. Texels whose value is 0
// are skipped ("holes"): the destination pixel is left untouched for them.
//
// The run is split into (width >> 4) spans of 16 pixels plus (width & 15)
// leftover pixels. The reciprocal of ooz is evaluated only at span end points;
// between end points s and t are stepped linearly. s and t are handled as
// 16.16 fixed point (integer part = value >> 16, fraction kept in the upper
// 16 bits of ecx / edx).
//
// Parameters:
//   start_pixel - destination base pointer; start_x is added to it as a BYTE
//                 offset before drawing.
//   start_x     - byte offset of the first destination pixel.
//   _left       - cast to perspective_span*; its ooz, soz and toz members are read.
//   width       - number of pixels to draw.
//
// Besides the destination pixels, the routine writes these variables that are
// declared outside this function: num_subdivisions, num_leftover,
// had_subdivisions, left_s, left_t, right_s, right_t, ooz_right, soz_right,
// toz_right, dsdx_frac, dtdx_frac, s_t_carry, temp_dsdx, temp_dtdx.
//
// NOTE: the separate _asm blocks hand state to each other in registers
// (esi, edi, ecx, edx) and on the x87 stack (right_z in st0), with C control
// flow in between. This only works while the compiler leaves those untouched
// between the blocks.
// NOTE(review): ooz, doozdx and doozdxspan are accessed as qword while
// soz/toz and ooz_right are accessed as dword. The struct declarations are
// not visible in this file - confirm the operand sizes match the field types.
14void texture_scanline_perspective_unlit_holy(w16 *start_pixel,
15                                             sw32 start_x,
16                                             void *_left,//perspective_span *left,
17                                             sw32 width)
18{
  // start_x is applied as a byte offset (note the w8* cast), not a pixel index
19  start_pixel = (w16 *)((w8 *)start_pixel + start_x);
20
21  perspective_span *left = (perspective_span *)_left;
22
  // Setup: compute left_s / left_t at the left edge and split width into
  // 16-pixel spans plus leftover. Integer and FPU instructions are interleaved.
23  _asm
24  {
25    //left_z = 1.f / left->ooz;
26    //left_s = qftoi(left->soz * left_z) + cur_grads.s_adjust;
27    //left_t = qftoi(left->toz * left_z) + cur_grads.t_adjust;
28   
29    //sw32 had_subdivisions = width & (~15);
30    //num_subdivisions = width >> 4;
31    //num_leftover     = width & 15;
32   
33    mov esi,dword ptr [left]
34    mov eax,dword ptr [width]
35
    //st0 = 1 / left->ooz (left_z)
36    fld1
37    fdiv qword ptr [esi]perspective_span.ooz
38
    //eax = width & 15, ebx = width >> 4, ecx = width & ~15
39    mov ebx,eax
40    and eax,15
41
42    shr ebx,4
43    mov ecx,width
44
45    and ecx,(~15)
46    mov dword ptr [num_leftover],eax
47   
48    mov dword ptr [num_subdivisions],ebx
49    mov dword ptr [had_subdivisions],ecx
50   
    //duplicate left_z, then form soz*z and toz*z; the fxch's leave soz*z in st0
51    fld st(0)
52       
53    fmul dword ptr [esi]perspective_span.soz
54    fxch st(1)
55       
56    fmul dword ptr [esi]perspective_span.toz
57    fxch st(1)
58
    //both values are popped here, so the x87 stack is back to its entry depth
59    fistp dword ptr [left_s]
60    fistp dword ptr [left_t]
61
62    mov eax,dword ptr [cur_grads].s_adjust
63    mov ebx,dword ptr [cur_grads].t_adjust
64
65    add eax,dword ptr [left_s]
66    add ebx,dword ptr [left_t]
67
68    mov dword ptr [left_s],eax
69    mov dword ptr [left_t],ebx
70   
71    //clear these out
    //(only the upper 16 bits are written later, via word ptr [..+2], so the
    // low 16 bits stay zero and adding them never disturbs cl / dx)
72    mov dword ptr [dsdx_frac],0
73    mov dword ptr [dtdx_frac],0
74  }
75
  // Full 16-pixel spans
76  if (num_subdivisions)
77  {
    // Prime the first span: compute ooz/soz/toz at its right end, start the
    // divide for right_z, and set up the texel index and fractions.
78    _asm
79    {
80      //ooz_right = left->ooz + (cur_grads.doozdxspan);
81      //soz_right = left->soz + (cur_grads.dsozdxspan);
82      //toz_right = left->toz + (cur_grads.dtozdxspan);
83
84      mov esi,dword ptr [left]
85      mov edi,dword ptr [start_pixel]
86
87      fld qword ptr [esi]perspective_span.ooz
88      fld dword ptr [esi]perspective_span.soz
89      fld dword ptr [esi]perspective_span.toz
90
      //the stack-order comments below list st0 first
91      //t s o
92      fadd dword ptr [cur_grads]tri_gradients.dtozdxspan
93      fxch st(2)
94
95      //o s t
96
97      fadd qword ptr [cur_grads]tri_gradients.doozdxspan
98      fxch st(1)
99
100      //s o t
101
102      fadd dword ptr [cur_grads]tri_gradients.dsozdxspan
103      fxch st(2)
104
105      //t o s
106
107      fstp dword ptr [toz_right]
108      fxch st(1)
109
110      //s o
111
112      fstp dword ptr [soz_right]
113
114      fstp dword ptr [ooz_right]
115     
116      //calculate the 1st right_z
      //(left in st0; consumed by the first instruction of the loop body below)
117      fld1
118      fdiv dword ptr [ooz_right]
119
120      //calculate starting fractional and integral values for s and t
121      //esi = (r1_software_texture_ptr >> 1) + starting_s_coordinate >> 16 + starting_t_coordinate >> 16 << r1_software_twidth_log2
      //      (a 16-bit texel index; texels are fetched with word ptr [esi*2])
122      //ecx = starting_s_coordinate << 16
123      //edx = starting_t_coordinate << 16
124      //dx  = 0 after the shift (no light value is loaded in this unlit routine)
125
126      mov esi,dword ptr [r1_software_texture_ptr]
127      mov eax,dword ptr [left_s]
128
129      shr esi,1
130      mov ebx,dword ptr [left_t]
131   
132      sar eax,16
133      mov edx,dword ptr [left_t]
134
135      sar ebx,16
136      add esi,eax
137
      //cl is only a shift count here; ecx is reloaded with left_s right after
138      mov cl,byte ptr [r1_software_twidth_log2]
139      shl ebx,cl
140     
141      sal edx,16
142      mov ecx,dword ptr [left_s]
143   
144      sal ecx,16
145      add esi,ebx
146    }
147
    // One iteration per 16-pixel span. On entry to each iteration st0 holds
    // right_z for the span's right end, and esi/edi/ecx/edx carry over from
    // the previous _asm block.
148    while (num_subdivisions)
149    {
150      _asm
151      {
152        //right_s = qftoi(soz_right * right_z);
153        //right_t = qftoi(toz_right * right_z);
154       
155        //right_z is in st0
156        fld st(0)
157       
158        fmul dword ptr [soz_right]
159        fxch st(1)
160       
161        fmul dword ptr [toz_right]
162        fxch st(1)
163
        //both pops together consume right_z; the x87 stack is empty of our values here
164        fistp dword ptr [right_s]
165        fistp dword ptr [right_t]
166
167      //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are
168      //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover
169      //in the leftover span, calculate the end of that.
170
171      //if (num_subdivisions!=1)
172      //{
173          cmp dword ptr [num_subdivisions],1
174          je  last_subdivision
175       
176          //ooz_right += (cur_grads.doozdxspan);
177          //soz_right += (cur_grads.dsozdxspan);
178          //toz_right += (cur_grads.dtozdxspan);
179         
180          fld dword ptr [ooz_right]
181          fadd qword ptr [cur_grads]tri_gradients.doozdxspan
182
183          fld dword ptr [soz_right]
184          fadd dword ptr [cur_grads]tri_gradients.dsozdxspan
185                                   
186          fld dword ptr [toz_right]
187          fadd dword ptr [cur_grads]tri_gradients.dtozdxspan
188
          //stack is t s o; swap so the stores pop o, then s, then t
189          fxch st(2)
190          fstp dword ptr [ooz_right]
191
192          fstp dword ptr [soz_right]
193
194          fstp dword ptr [toz_right]
195
          //start the next right_z; it stays in st0 for the next loop iteration
196          fld1
197          fdiv dword ptr [ooz_right]
198
199          jmp not_last_subdivision
200      //}
201      //else
202      //if (num_leftover > 1)
203      //{
204
205      last_subdivision:
          //with 0 or 1 leftover pixels no further right_z is needed, so none is pushed
206          cmp dword ptr [num_leftover],1
207          jle not_last_subdivision
208       
209          //calculate the right_z for the end of the leftover span
210          //ooz_right += (cur_grads.doozdx * num_leftover);
211          //soz_right += (cur_grads.dsozdx * num_leftover);
212          //toz_right += (cur_grads.dtozdx * num_leftover);
213
          //num_leftover stays in st1 as the multiplier for all three products
214          fild dword ptr [num_leftover]
215         
216          //todo: pipeline these fpu ops
217          fld  qword ptr [cur_grads]tri_gradients.doozdx
218          fmul st(0),st(1)
219          fadd dword ptr [ooz_right]
220          fstp dword ptr [ooz_right]
221
222          fld  dword ptr [cur_grads]tri_gradients.dsozdx
223          fmul st(0),st(1)
224          fadd dword ptr [soz_right]
225          fstp dword ptr [soz_right]
226
227          fld  dword ptr [cur_grads]tri_gradients.dtozdx
228          fmul st(0),st(1)
229          fadd dword ptr [toz_right]
230          fstp dword ptr [toz_right]
231
          //discard the num_leftover multiplier
232          fstp st(0) //nifty thing i found, a 1 cycle fpu pop
233       
          //right_z for the leftover span; consumed by the leftover code after the loop
234          fld1
235          fdiv dword ptr [ooz_right]
236      //}
237           
238      not_last_subdivision:
239        //cap the right_s and right_t's so that they're valid
240
241        mov eax,dword ptr [right_s]
242        mov ebx,dword ptr [right_t]
243       
244        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
245        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
246 
247        //cap the right s and t
        //(signed clamp of eax to [0, s_mask] and ebx to [0, t_mask])
248        cmp eax,0
249        jge cmp_eax_high
250
251        mov eax,0
252        jmp cmp_ebx_low
253
254      cmp_eax_high:
255        cmp eax,dword ptr [s_mask]
256        jle cmp_ebx_low
257
258        mov eax,dword ptr [s_mask]
259
260      cmp_ebx_low:
261        cmp ebx,0
262        jge cmp_ebx_high
263
264        mov ebx,0
265        jmp done_compare
266     
267      cmp_ebx_high:
268        cmp ebx,dword ptr [t_mask]
269        jle done_compare
270
271        mov ebx,dword ptr [t_mask]
272
273      done_compare:
274
275        //store the right_s and right_t
276        //so they can be copied into left_s and left_t at the end of the 16-pixel span
277        //(they can't be copied now because we have to calculate (right_s-left_s)>>4 and (right_t-left_t)>>4
278       
279        mov dword ptr [right_s],eax
280        mov dword ptr [right_t],ebx
281
        //eax = (right_s-left_s)>>4, ebx = (right_t-left_t)>>4 : per-pixel 16.16 steps
        //ebp is saved here and restored after looper1
282        sub eax,dword ptr [left_s]
283        push ebp
284
285        sar eax,4
286        sub ebx,dword ptr [left_t]
287       
288        sar ebx,4
289        mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_s-left_s)>>4)<<16
290       
291        sar eax,16
292        mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_t-left_t)>>4)<<16
293       
294        sar ebx,16
        //writing cl leaves the s fraction in the upper 16 bits of ecx intact
295        mov cl,byte ptr [r1_software_twidth_log2]
296       
297        shl ebx,cl
298
299        add eax,ebx
300        mov ebx,0
301
302        //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2
303        //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width
        //([1] is used when the t fraction did not carry, [0] when it did)
304
305        mov dword ptr [s_t_carry+4],eax
306        add eax,dword ptr [r1_software_texture_width]
307               
308        mov dword ptr [s_t_carry],eax       
309        mov eax,0 //must make sure the high bits of these are zeroed out
310
311        mov cl,4 //loop is unrolled to 4 pixels - we want to draw 16, so loop 4 times
312        ALIGN 16
313
314        //high 16 bits of ecx is the fractional s component
315        //high 16 bits of edx is the fractional t component
316
317        //eax: ax receives the texel; the upper 16 bits stay zero so "and eax,eax" tests only the texel
318        //ebx: 0 or -1 from "sbb ebx,ebx" after the t-fraction add (t-carry), used to index s_t_carry
319        //ebp: saved/restored around this loop but not otherwise used in it
320        //cl  is the loop count variable
        //
        //per pixel: "add edx" steps the t fraction and "sbb ebx,ebx" captures its carry;
        //"add ecx" steps the s fraction and the following "adc esi" consumes that carry
        //(the mov in between does not change flags) while adding s_t_carry[1+ebx].
        //the texel is fetched before esi is stepped and tested after, so the
        //instruction order here must not be changed.
321
322    looper1:
        //edi advances 4 pixels (8 bytes) up front; stores use negative offsets
323        add edi,8
324        add edx,dword ptr [dtdx_frac]
325       
326        sbb ebx,ebx
327        add ecx,dword ptr [dsdx_frac]
328
329        mov ax,word ptr [esi*2]
330        nop
331
332        adc esi,dword ptr [4+s_t_carry+ebx*4]
        //t step for pixel 2 is issued before pixel 1 is tested/stored
333        add edx,dword ptr [dtdx_frac]
334   
335        sbb ebx,ebx
336        and eax,eax //test to see if its zero
337
338        jz skipped_1_pixel
339        mov word ptr [edi-8],ax    //store 1 pixel
340           
341      skipped_1_pixel:
342        mov ax,word ptr [esi*2]
343        add ecx,dword ptr [dsdx_frac]
344
345        adc esi,dword ptr [4+s_t_carry+ebx*4]
346        add edx,dword ptr [dtdx_frac]
347   
348        sbb ebx,ebx
349        and eax,eax //test to see if its zero
350
351        jz skipped_2_pixel
352        mov word ptr [edi-6],ax
353
354      skipped_2_pixel:
355        mov ax,word ptr [esi*2]
356        add ecx,dword ptr [dsdx_frac]
357
358        adc esi,dword ptr [4+s_t_carry+ebx*4]
359        add edx,dword ptr [dtdx_frac]
360   
361        sbb ebx,ebx
362        and eax,eax //test to see if its zero
363
364        jz skipped_3_pixel
365        mov word ptr [edi-4],ax
366     
367     skipped_3_pixel:
        //4th pixel: no t step here; the next one happens at the top of looper1
368        mov ax,word ptr [esi*2]
369        add ecx,dword ptr [dsdx_frac]
370
371        adc esi,dword ptr [4+s_t_carry+ebx*4]
372        and eax,eax //test to see if its zero
373
374        jz skipped_4_pixel
375        mov word ptr [edi-2],ax   
376
377      skipped_4_pixel:
378        dec cl
379        jnz looper1
380
381        pop ebp
382
383        //store right_s and right_t in left_s and left_t
384        //right_s is what left_s starts at on the next 16 pixel span
385        //right_t is what left_t starts at on the next 16 pixel span
386
387        mov eax,dword ptr [right_s]
388        mov ebx,dword ptr [right_t]
389
390        mov dword ptr [left_s],eax
391        mov dword ptr [left_t],ebx
392      }
393         
394      _asm dec dword ptr [num_subdivisions]
395    }
396   
397    //store these so that the C code below actually works
    //(edi has advanced past every pixel drawn by the 16-pixel spans)
398    _asm mov dword ptr [start_pixel],edi
399  }
400   
  // Leftover pixels (width & 15)
401  if (num_leftover)
402  {       
403    if (num_leftover > 1)
404    {     
405      if (had_subdivisions==0)
406      {
407        //calculate the right_z for the end of span
408        ooz_right = left->ooz + (cur_grads.doozdx * num_leftover);
409        soz_right = left->soz + (cur_grads.dsozdx * num_leftover);
410        toz_right = left->toz + (cur_grads.dtozdx * num_leftover);
411       
412        //calculate the z at the right endpoint
        //(left in st0 for the _asm block below)
413        _asm fld1
414        _asm fdiv dword ptr [ooz_right]
415      }
416      else
417      {
418        //the correct ending right_z is already being calculated
419        //(see the last_subdivision: path in the span loop above, which leaves it in st0)
420      }
421
422      _asm
423      {
424        //calculate starting fractional and integral values for s and t           
        //(same register setup as before the 16-pixel spans, recomputed from left_s/left_t)
425       
426        mov esi,dword ptr [r1_software_texture_ptr]
427        mov eax,dword ptr [left_s]
428
429        shr esi,1
430        mov ebx,dword ptr [left_t]
431   
432        sar eax,16
433        mov edx,dword ptr [left_t]
434
435        sar ebx,16
436        add esi,eax
437
438        mov cl,byte ptr [r1_software_twidth_log2]
439        shl ebx,cl
440     
441        sal edx,16
442        mov ecx,dword ptr [left_s]
443     
444        sal ecx,16
445        add esi,ebx
446       
447        mov edi,dword ptr [start_pixel]
448
449        //calculate the right endpoint
450        //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;
451        //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;
452       
453        //right_z is in st0
454        fld st(0)
455       
456        fmul dword ptr [soz_right]
457        fxch st(1)
458       
459        fmul dword ptr [toz_right]
460        fxch st(1)
461
462        fistp dword ptr [right_s]
463        fistp dword ptr [right_t]
464
465        mov eax,dword ptr [right_s]
466        mov ebx,dword ptr [right_t]
467       
468        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
469        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
470 
471        //cap the right s and t
        //(signed clamp of eax to [0, s_mask] and ebx to [0, t_mask])
472        cmp eax,0
473        jge cmp_eax_high_2
474
475        mov eax,0
476        jmp cmp_ebx_low_2
477
478      cmp_eax_high_2:
479        cmp eax,dword ptr [s_mask]
480        jle cmp_ebx_low_2
481
482        mov eax,dword ptr [s_mask]
483
484      cmp_ebx_low_2:
485        cmp ebx,0
486        jge cmp_ebx_high_2
487
488        mov ebx,0
489        jmp done_compare_2
490     
491      cmp_ebx_high_2:
492        cmp ebx,dword ptr [t_mask]
493        jle done_compare_2
494
495        mov ebx,dword ptr [t_mask]
496
497      done_compare_2:
498           
499        //calculate the deltas (left to right)
500        //temp_dsdx = qftoi((float)(right_s - left_s) * inverse_leftover_lookup[num_leftover]);
501        //temp_dtdx = qftoi((float)(right_t - left_t) * inverse_leftover_lookup[num_leftover]);
        //(inverse_leftover_lookup is not defined in this file; presumably entry n holds 1.0f/n - verify)
502       
        //ebp is borrowed as the table index, and later as the t-carry register in looper3
503        push ebp
504        mov ebp,num_leftover
505
506        sub eax,dword ptr [left_s]
507        sub ebx,dword ptr [left_t]
508
509        mov dword ptr [temp_dsdx],eax
510        mov dword ptr [temp_dtdx],ebx
511
512        fild dword ptr [temp_dsdx]
513        fild dword ptr [temp_dtdx]
514
        //st0 = dt, st1 = ds; scale both, ending with dt in st0 so it is stored first
515        fmul dword ptr [inverse_leftover_lookup + ebp*4]
516        fxch st(1)
517
518        fmul dword ptr [inverse_leftover_lookup + ebp*4]
519        fxch st(1)
520
521        fistp dword ptr [temp_dtdx]
522        fistp dword ptr [temp_dsdx]
523       
524        //calculate the fractional and integral delta vars
525        //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;
526        //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);
527        //dsdx_frac    = (temp_dsdx<<16);
528        //dtdx_frac    = (temp_dtdx<<16);
529
530        mov eax,dword ptr [temp_dsdx]
531        mov ebx,dword ptr [temp_dtdx]
532       
533        mov word ptr [dsdx_frac+2],ax
534        mov word ptr [dtdx_frac+2],bx
535
536        sar eax,16
537        nop //mov dx,word ptr [left_l]
538
539        sar ebx,16
540        mov cl,byte ptr [r1_software_twidth_log2]
541       
542        shl ebx,cl
543
544        add eax,ebx
545        nop //mov ebx,0
546
547        mov dword ptr [s_t_carry+4],eax
548        add eax,dword ptr [r1_software_texture_width]
549       
550        mov dword ptr [s_t_carry],eax
        //cl = loop count, one iteration per leftover pixel
551        mov cl, byte ptr [num_leftover]
552       
553        mov eax,0 //make sure these high bits are clear
554       
555        ALIGN 16
556
        //one pixel per iteration; same carry scheme as looper1, with ebp
        //(0 or -1 after sbb) selecting s_t_carry[1] or s_t_carry[0]
557      looper3:
558        mov ax,word ptr [esi*2]
559        add edx,dword ptr [dtdx_frac]
560
561        sbb ebp,ebp
562        add edi,2 //the only convenient place for the stepping of edi was way up here
563
        //this add must come after "add edi,2" so that its carry is the one adc sees
564        add ecx,dword ptr [dsdx_frac]
565        nop
566       
567        adc esi,dword ptr [4+s_t_carry+ebp*4]
568        and eax,eax
569
        //zero texel = hole: leave the destination pixel as it is
570        jz  skip_a_pixel
571        mov word ptr [edi-2],ax
572
573      skip_a_pixel:
574        dec cl
575        jnz looper3
576
577        pop ebp
578      }
579    }
580    else
581    {
582      //highly unoptimized single pixel drawer
      //(num_leftover == 1: sample at left_s/left_t, no stepping needed)
583      register w16 texel = *(r1_software_texture_ptr + (left_s>>16) + ((left_t>>16)<<r1_software_twidth_log2));
584     
      //zero texel = hole, same as in the asm loops
585      if (texel)
586        *start_pixel = texel;
587    }
588  }
589}
Note: See TracBrowser for help on using the repository browser.