source: golgotha/src/render/software/perspective_map_unlit_asm.cc

Last change on this file was 80, checked in by Sam Hocevar, 11 years ago
  • Adding the Golgotha source code. Not sure what's going to be interesting in there, but since it's all public domain, there's certainly stuff to pick up.
File size: 15.6 KB
RevLine 
/********************************************************************** <BR>
  This file is part of Crack dot Com's free source code release of
  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
  information about compiling & licensing issues visit this URL</a>
  <PRE> If that doesn't help, contact Jonathan Clark at
  golgotha_source@usa.net (Subject should have "GOLG" in it)
***********************************************************************/
8
9#include "software/r1_software_globals.hh"
10#include "software/inline_fpu.hh"
11
12extern sw32 had_subdivisions;
13
14void texture_scanline_perspective_unlit(w16 *start_pixel,
15                                        sw32 start_x,
16                                        void *_left,//perspective_span *left,
17                                        sw32 width)
18{
19  start_pixel = (w16 *)((w8 *)start_pixel + start_x);
20
21  perspective_span *left = (perspective_span *)_left;
22
23  _asm
24  {
25    //left_z = 1.f / left->ooz;
26    //left_s = qftoi(left->soz * left_z) + cur_grads.s_adjust;
27    //left_t = qftoi(left->toz * left_z) + cur_grads.t_adjust;
28   
29    //sw32 had_subdivisions = width & (~15);
30    //num_subdivisions = width >> 4;
31    //num_leftover     = width & 15;
32   
33    mov esi,dword ptr [left]
34    mov eax,dword ptr [width]
35
36    fld1
37    fdiv qword ptr [esi]perspective_span.ooz
38
39    mov ebx,eax
40    and eax,15
41
42    shr ebx,4
43    mov ecx,width
44
45    and ecx,(~15)
46    mov dword ptr [num_leftover],eax
47   
48    mov dword ptr [num_subdivisions],ebx
49    mov dword ptr [had_subdivisions],ecx
50   
51    fld st(0)
52       
53    fmul dword ptr [esi]perspective_span.soz
54    fxch st(1)
55       
56    fmul dword ptr [esi]perspective_span.toz
57    fxch st(1)
58
59    fistp dword ptr [left_s]
60    fistp dword ptr [left_t]
61
62    mov eax,dword ptr [cur_grads].s_adjust
63    mov ebx,dword ptr [cur_grads].t_adjust
64
65    add eax,dword ptr [left_s]
66    add ebx,dword ptr [left_t]
67
68    mov dword ptr [left_s],eax
69    mov dword ptr [left_t],ebx
70   
71    //clear these out
72    mov dword ptr [dsdx_frac],0
73    mov dword ptr [dtdx_frac],0
74  }
75
76  if (num_subdivisions)
77  {
78    _asm
79    {
80      //ooz_right = left->ooz + (cur_grads.doozdxspan);
81      //soz_right = left->soz + (cur_grads.dsozdxspan);
82      //toz_right = left->toz + (cur_grads.dtozdxspan);
83
84      mov esi,dword ptr [left]
85      mov edi,dword ptr [start_pixel]
86
87      fld qword ptr [esi]perspective_span.ooz
88      fld dword ptr [esi]perspective_span.soz
89      fld dword ptr [esi]perspective_span.toz
90
91      //t s o
92      fadd dword ptr [cur_grads]tri_gradients.dtozdxspan
93      fxch st(2)
94
95      //o s t
96
97      fadd qword ptr [cur_grads]tri_gradients.doozdxspan
98      fxch st(1)
99
100      //s o t
101
102      fadd dword ptr [cur_grads]tri_gradients.dsozdxspan
103      fxch st(2)
104
105      //t o s
106
107      fstp dword ptr [toz_right]
108      fxch st(1)
109
110      //s o
111
112      fstp dword ptr [soz_right]
113
114      fstp dword ptr [ooz_right]
115     
116      //calculate the 1st right_z
117      fld1
118      fdiv dword ptr [ooz_right]
119
120      //calculate starting fractional and integral values for s and t
121      //esi = starting_s_coordinate >> 16 + starting_t_coordinate >> 16 << r1_software_twidth_log2
122      //ecx = starting_s_coordinate << 16
123      //edx = starting_t_coordinate << 16
124      //dx  = starting_light_value
125
126      mov esi,dword ptr [r1_software_texture_ptr]
127      mov eax,dword ptr [left_s]
128
129      shr esi,1
130      mov ebx,dword ptr [left_t]
131   
132      sar eax,16
133      mov edx,dword ptr [left_t]
134
135      sar ebx,16
136      add esi,eax
137
138      mov cl,byte ptr [r1_software_twidth_log2]
139      shl ebx,cl
140     
141      sal edx,16
142      mov ecx,dword ptr [left_s]
143   
144      sal ecx,16
145      add esi,ebx
146    }
147
148    while (num_subdivisions)
149    {
150      _asm
151      {
152        //right_s = qftoi(soz_right * right_z);
153        //right_t = qftoi(toz_right * right_z);
154       
155        //right_z is in st0
156        fld st(0)
157       
158        fmul dword ptr [soz_right]
159        fxch st(1)
160       
161        fmul dword ptr [toz_right]
162        fxch st(1)
163
164        fistp dword ptr [right_s]
165        fistp dword ptr [right_t]
166
167      //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are
168      //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover
169      //in the leftover span, calculate the end of that.
170
171      //if (num_subdivisions!=1)
172      //{
173          cmp dword ptr [num_subdivisions],1
174          je  last_subdivision
175       
176          //ooz_right += (cur_grads.doozdxspan);
177          //soz_right += (cur_grads.dsozdxspan);
178          //toz_right += (cur_grads.dtozdxspan);
179         
180          fld dword ptr [ooz_right]
181          fadd qword ptr [cur_grads]tri_gradients.doozdxspan
182
183          fld dword ptr [soz_right]
184          fadd dword ptr [cur_grads]tri_gradients.dsozdxspan
185                                   
186          fld dword ptr [toz_right]
187          fadd dword ptr [cur_grads]tri_gradients.dtozdxspan
188
189          fxch st(2)
190          fstp dword ptr [ooz_right]
191
192          fstp dword ptr [soz_right]
193
194          fstp dword ptr [toz_right]
195
196          fld1
197          fdiv dword ptr [ooz_right]
198
199          jmp not_last_subdivision
200      //}
201      //else
202      //if (num_leftover > 1)
203      //{
204
205      last_subdivision:
206          cmp dword ptr [num_leftover],1
207          jle not_last_subdivision
208       
209          //calculate the right_z for the end of the leftover span
210          //ooz_right += (cur_grads.doozdx * num_leftover);
211          //soz_right += (cur_grads.dsozdx * num_leftover);
212          //toz_right += (cur_grads.dtozdx * num_leftover);
213
214          fild dword ptr [num_leftover]
215         
216          //todo: pipeline these fpu ops
217          fld  qword ptr [cur_grads]tri_gradients.doozdx
218          fmul st(0),st(1)
219          fadd dword ptr [ooz_right]
220          fstp dword ptr [ooz_right]
221
222          fld  dword ptr [cur_grads]tri_gradients.dsozdx
223          fmul st(0),st(1)
224          fadd dword ptr [soz_right]
225          fstp dword ptr [soz_right]
226
227          fld  dword ptr [cur_grads]tri_gradients.dtozdx
228          fmul st(0),st(1)
229          fadd dword ptr [toz_right]
230          fstp dword ptr [toz_right]
231
232          fstp st(0) //nifty thing i found, a 1 cycle fpu pop
233       
234          fld1
235          fdiv dword ptr [ooz_right]
236      //}
237           
238      not_last_subdivision:
239        //cap the right_s and right_t's so that they're valid
240
241        mov eax,dword ptr [right_s]
242        mov ebx,dword ptr [right_t]
243       
244        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
245        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
246 
247        //cap the right s and t
248        cmp eax,0
249        jge cmp_eax_high
250
251        mov eax,0
252        jmp cmp_ebx_low
253
254      cmp_eax_high:
255        cmp eax,dword ptr [s_mask]
256        jle cmp_ebx_low
257
258        mov eax,dword ptr [s_mask]
259
260      cmp_ebx_low:
261        cmp ebx,0
262        jge cmp_ebx_high
263
264        mov ebx,0
265        jmp done_compare
266     
267      cmp_ebx_high:
268        cmp ebx,dword ptr [t_mask]
269        jle done_compare
270
271        mov ebx,dword ptr [t_mask]
272
273      done_compare:
274
275        //store the right_s and right_t
276        //so they can be copied into left_s and left_t at the end of the 16-pixel span
277        //(the cant be copied now because we have to calculate (right_s-left_s)>>4 and (right_t-left_t)>>4
278       
279        mov dword ptr [right_s],eax
280        mov dword ptr [right_t],ebx
281
282        sub eax,dword ptr [left_s]
283        push ebp
284
285        sar eax,4
286        sub ebx,dword ptr [left_t]
287       
288        sar ebx,4
289        mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_s-left_s)>>4)<<16
290       
291        sar eax,16
292        mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_t-left_t)>>4)<<16
293       
294        sar ebx,16
295        mov cl,byte ptr [r1_software_twidth_log2]
296       
297        shl ebx,cl
298
299        add eax,ebx
300        mov ebx,0
301
302        //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2
303        //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width
304
305        mov dword ptr [s_t_carry+4],eax
306        add eax,dword ptr [r1_software_texture_width]
307               
308        mov dword ptr [s_t_carry],eax       
309        mov eax,0 //must make sure the high bits of these are zeroed out
310
311        mov cl,4 //loop is unrolled to 4 pixels - we want to draw 16, so loop 4 times
312        ALIGN 16
313
314        //high 16 bits of ecx is the fractional s component
315        //high 16 bits of edx is the fractional t component
316
317        //eax is used to lookup the texel as well as the low 8-bits of the lit texel
318        //ebx is used to lookup the high 8-bits of the lit texel
319        //ebp is used to detect a t-carry as well as lookup the lit texel
320        //cl  is the loop count variable
321
322    looper1:
323        add edx,dword ptr [dtdx_frac]
324        nop
325 
326        sbb ebp,ebp
327        add edi,8 //the only convenient place for the stepping of edi was way up here
328
329        mov ax,word ptr [esi*2]
330        add ecx,dword ptr [dsdx_frac]
331
332        adc esi,dword ptr [4+s_t_carry+ebp*4]
333        add edx,dword ptr [dtdx_frac]
334       
335        sbb ebp,ebp
336        mov word ptr [edi-8],ax //1
337
338        mov ax,word ptr [esi*2]
339        add ecx,dword ptr [dsdx_frac]
340
341        adc esi,dword ptr [4+s_t_carry+ebp*4]
342        add edx,dword ptr [dtdx_frac]
343       
344        sbb ebp,ebp
345        mov word ptr [edi-6],ax //2
346
347        mov ax,word ptr [esi*2]
348        add ecx,dword ptr [dsdx_frac]
349
350        adc esi,dword ptr [4+s_t_carry+ebp*4]
351        add edx,dword ptr [dtdx_frac]
352       
353        sbb ebp,ebp
354        mov word ptr [edi-4],ax //3
355
356        mov ax,word ptr [esi*2]
357        add ecx,dword ptr [dsdx_frac]
358
359        adc esi,dword ptr [4+s_t_carry+ebp*4]
360        mov word ptr [edi-2],ax //4
361       
362        dec cl //thank god this doesnt modify the carry flag (the above add ch,dl needs to effect the adc bh,dh at the top of the loop)
363     
364        jnz looper1
365        pop ebp
366
367        //store right_s and right_s in left_s and left_t
368        //right_s is what left_s starts at on the next 16 pixel span
369        //right_t is what left_t starts at on the next 16 pixel span
370
371        mov eax,dword ptr [right_s]
372        mov ebx,dword ptr [right_t]
373
374        mov dword ptr [left_s],eax
375        mov dword ptr [left_t],ebx
376      }
377         
378      _asm dec dword ptr [num_subdivisions]
379    }
380   
381    //store these so that the C code below actually works
382    _asm mov dword ptr [start_pixel],edi
383  }
384   
385  if (num_leftover)
386  {       
387    if (num_leftover > 1)
388    {     
389      if (had_subdivisions==0)
390      {
391        //calculate the right_z for the end of span
392        ooz_right = left->ooz + (cur_grads.doozdx * num_leftover);
393        soz_right = left->soz + (cur_grads.dsozdx * num_leftover);
394        toz_right = left->toz + (cur_grads.dtozdx * num_leftover);
395       
396        //calculate the z at the right endpoint
397        _asm fld1
398        _asm fdiv dword ptr [ooz_right]
399      }
400      else
401      {
402        //the correct ending right_z is already being calculated
403        //(see the if (num_subdivisions!=1) case above
404      }
405
406      _asm
407      {
408        //calculate starting fractional and integral values for s and t           
409       
410        mov esi,dword ptr [r1_software_texture_ptr]
411        mov eax,dword ptr [left_s]
412
413        shr esi,1
414        mov ebx,dword ptr [left_t]
415   
416        sar eax,16
417        mov edx,dword ptr [left_t]
418
419        sar ebx,16
420        add esi,eax
421
422        mov cl,byte ptr [r1_software_twidth_log2]
423        shl ebx,cl
424     
425        sal edx,16
426        mov ecx,dword ptr [left_s]
427     
428        sal ecx,16
429        add esi,ebx
430       
431        mov edi,dword ptr [start_pixel]
432
433        //calculate the right endpoint
434        //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;
435        //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;
436       
437        //right_z is in st0
438        fld st(0)
439       
440        fmul dword ptr [soz_right]
441        fxch st(1)
442       
443        fmul dword ptr [toz_right]
444        fxch st(1)
445
446        fistp dword ptr [right_s]
447        fistp dword ptr [right_t]
448
449        mov eax,dword ptr [right_s]
450        mov ebx,dword ptr [right_t]
451       
452        add eax,dword ptr [cur_grads]tri_gradients.s_adjust
453        add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
454 
455        //cap the right s and t
456        cmp eax,0
457        jge cmp_eax_high_2
458
459        mov eax,0
460        jmp cmp_ebx_low_2
461
462      cmp_eax_high_2:
463        cmp eax,dword ptr [s_mask]
464        jle cmp_ebx_low_2
465
466        mov eax,dword ptr [s_mask]
467
468      cmp_ebx_low_2:
469        cmp ebx,0
470        jge cmp_ebx_high_2
471
472        mov ebx,0
473        jmp done_compare_2
474     
475      cmp_ebx_high_2:
476        cmp ebx,dword ptr [t_mask]
477        jle done_compare_2
478
479        mov ebx,dword ptr [t_mask]
480
481      done_compare_2:
482           
483        //calculate the deltas (left to right)
484        //temp_dsdx = qftoi((float)(right_s - left_s) * inverse_leftover_lookup[num_leftover]);
485        //temp_dtdx = qftoi((float)(right_t - left_t) * inverse_leftover_lookup[num_leftover]);
486       
487        push ebp
488        mov ebp,num_leftover
489
490        sub eax,dword ptr [left_s]
491        sub ebx,dword ptr [left_t]
492
493        mov dword ptr [temp_dsdx],eax
494        mov dword ptr [temp_dtdx],ebx
495
496        fild dword ptr [temp_dsdx]
497        fild dword ptr [temp_dtdx]
498
499        fmul dword ptr [inverse_leftover_lookup + ebp*4]
500        fxch st(1)
501
502        fmul dword ptr [inverse_leftover_lookup + ebp*4]
503        fxch st(1)
504
505        fistp dword ptr [temp_dtdx]
506        fistp dword ptr [temp_dsdx]
507       
508        //calculate the fractional and integral delta vars
509        //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;
510        //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);
511        //dsdx_frac    = (temp_dsdx<<16);
512        //dtdx_frac    = (temp_dtdx<<16);
513
514        mov eax,dword ptr [temp_dsdx]
515        mov ebx,dword ptr [temp_dtdx]
516       
517        mov word ptr [dsdx_frac+2],ax
518        mov word ptr [dtdx_frac+2],bx
519
520        sar eax,16
521        nop //mov dx,word ptr [left_l]
522
523        sar ebx,16
524        mov cl,byte ptr [r1_software_twidth_log2]
525       
526        shl ebx,cl
527
528        add eax,ebx
529        nop //mov ebx,0
530
531        mov dword ptr [s_t_carry+4],eax
532        add eax,dword ptr [r1_software_texture_width]
533       
534        mov dword ptr [s_t_carry],eax
535        mov cl, byte ptr [num_leftover]
536       
537        //mov eax,0
538        //mov ch,dl //setup the initial lighting error
539
540        //mov bh,byte ptr [last_bh2] //setup the initial dither
541        //add ch,0 //clear the carry bit
542
543        ALIGN 16
544
545      looper3:
546        mov ax,word ptr [esi*2]
547        add edx,dword ptr [dtdx_frac]
548       
549        sbb ebp,ebp
550        mov word ptr [edi],ax //1
551
552        add edi,2 //the only convenient place for the stepping of edi was way up here
553        add ecx,dword ptr [dsdx_frac]
554
555        adc esi,dword ptr [4+s_t_carry+ebp*4]
556        dec cl
557
558        jnz looper3
559        pop ebp
560      }
561    }
562    else
563    {
564      //highly unoptimized single pixel drawer
565      *start_pixel = *(r1_software_texture_ptr + (left_s>>16) + ((left_t>>16)<<r1_software_twidth_log2));
566    }
567  }
568}
Note: See TracBrowser for help on using the repository browser.