1  /********************************************************************** <BR>


2  This file is part of Crack dot Com's free source code release of


3  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for


4  information about compiling & licensing issues visit this URL</a>


5  <PRE> If that doesn't help, contact Jonathan Clark at


6  golgotha_source@usa.net (Subject should have "GOLG" in it)


7  ***********************************************************************/


8 


9  #include "software/r1_software_globals.hh"


10  #include "software/inline_fpu.hh"


11 


12  extern sw32 had_subdivisions;


13 


14  void texture_scanline_perspective_unlit(w16 *start_pixel,


15  sw32 start_x,


16  void *_left,//perspective_span *left,


17  sw32 width)


18  {


19  start_pixel = (w16 *)((w8 *)start_pixel + start_x);


20 


21  perspective_span *left = (perspective_span *)_left;


22 


23  _asm


24  {


25  //left_z = 1.f / left>ooz;


26  //left_s = qftoi(left>soz * left_z) + cur_grads.s_adjust;


27  //left_t = qftoi(left>toz * left_z) + cur_grads.t_adjust;


28 


29  //sw32 had_subdivisions = width & (~15);


30  //num_subdivisions = width >> 4;


31  //num_leftover = width & 15;


32 


33  mov esi,dword ptr [left]


34  mov eax,dword ptr [width]


35 


36  fld1


37  fdiv qword ptr [esi]perspective_span.ooz


38 


39  mov ebx,eax


40  and eax,15


41 


42  shr ebx,4


43  mov ecx,width


44 


45  and ecx,(~15)


46  mov dword ptr [num_leftover],eax


47 


48  mov dword ptr [num_subdivisions],ebx


49  mov dword ptr [had_subdivisions],ecx


50 


51  fld st(0)


52 


53  fmul dword ptr [esi]perspective_span.soz


54  fxch st(1)


55 


56  fmul dword ptr [esi]perspective_span.toz


57  fxch st(1)


58 


59  fistp dword ptr [left_s]


60  fistp dword ptr [left_t]


61 


62  mov eax,dword ptr [cur_grads].s_adjust


63  mov ebx,dword ptr [cur_grads].t_adjust


64 


65  add eax,dword ptr [left_s]


66  add ebx,dword ptr [left_t]


67 


68  mov dword ptr [left_s],eax


69  mov dword ptr [left_t],ebx


70 


71  //clear these out


72  mov dword ptr [dsdx_frac],0


73  mov dword ptr [dtdx_frac],0


74  }


75 


76  if (num_subdivisions)


77  {


78  _asm


79  {


80  //ooz_right = left>ooz + (cur_grads.doozdxspan);


81  //soz_right = left>soz + (cur_grads.dsozdxspan);


82  //toz_right = left>toz + (cur_grads.dtozdxspan);


83 


84  mov esi,dword ptr [left]


85  mov edi,dword ptr [start_pixel]


86 


87  fld qword ptr [esi]perspective_span.ooz


88  fld dword ptr [esi]perspective_span.soz


89  fld dword ptr [esi]perspective_span.toz


90 


91  //t s o


92  fadd dword ptr [cur_grads]tri_gradients.dtozdxspan


93  fxch st(2)


94 


95  //o s t


96 


97  fadd qword ptr [cur_grads]tri_gradients.doozdxspan


98  fxch st(1)


99 


100  //s o t


101 


102  fadd dword ptr [cur_grads]tri_gradients.dsozdxspan


103  fxch st(2)


104 


105  //t o s


106 


107  fstp dword ptr [toz_right]


108  fxch st(1)


109 


110  //s o


111 


112  fstp dword ptr [soz_right]


113 


114  fstp dword ptr [ooz_right]


115 


116  //calculate the 1st right_z


117  fld1


118  fdiv dword ptr [ooz_right]


119 


120  //calculate starting fractional and integral values for s and t


121  //esi = starting_s_coordinate >> 16 + starting_t_coordinate >> 16 << r1_software_twidth_log2


122  //ecx = starting_s_coordinate << 16


123  //edx = starting_t_coordinate << 16


124  //dx = starting_light_value


125 


126  mov esi,dword ptr [r1_software_texture_ptr]


127  mov eax,dword ptr [left_s]


128 


129  shr esi,1


130  mov ebx,dword ptr [left_t]


131 


132  sar eax,16


133  mov edx,dword ptr [left_t]


134 


135  sar ebx,16


136  add esi,eax


137 


138  mov cl,byte ptr [r1_software_twidth_log2]


139  shl ebx,cl


140 


141  sal edx,16


142  mov ecx,dword ptr [left_s]


143 


144  sal ecx,16


145  add esi,ebx


146  }


147 


148  while (num_subdivisions)


149  {


150  _asm


151  {


152  //right_s = qftoi(soz_right * right_z);


153  //right_t = qftoi(toz_right * right_z);


154 


155  //right_z is in st0


156  fld st(0)


157 


158  fmul dword ptr [soz_right]


159  fxch st(1)


160 


161  fmul dword ptr [toz_right]


162  fxch st(1)


163 


164  fistp dword ptr [right_s]


165  fistp dword ptr [right_t]


166 


167  //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are


168  //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover


169  //in the leftover span, calculate the end of that.


170 


171  //if (num_subdivisions!=1)


172  //{


173  cmp dword ptr [num_subdivisions],1


174  je last_subdivision


175 


176  //ooz_right += (cur_grads.doozdxspan);


177  //soz_right += (cur_grads.dsozdxspan);


178  //toz_right += (cur_grads.dtozdxspan);


179 


180  fld dword ptr [ooz_right]


181  fadd qword ptr [cur_grads]tri_gradients.doozdxspan


182 


183  fld dword ptr [soz_right]


184  fadd dword ptr [cur_grads]tri_gradients.dsozdxspan


185 


186  fld dword ptr [toz_right]


187  fadd dword ptr [cur_grads]tri_gradients.dtozdxspan


188 


189  fxch st(2)


190  fstp dword ptr [ooz_right]


191 


192  fstp dword ptr [soz_right]


193 


194  fstp dword ptr [toz_right]


195 


196  fld1


197  fdiv dword ptr [ooz_right]


198 


199  jmp not_last_subdivision


200  //}


201  //else


202  //if (num_leftover > 1)


203  //{


204 


205  last_subdivision:


206  cmp dword ptr [num_leftover],1


207  jle not_last_subdivision


208 


209  //calculate the right_z for the end of the leftover span


210  //ooz_right += (cur_grads.doozdx * num_leftover);


211  //soz_right += (cur_grads.dsozdx * num_leftover);


212  //toz_right += (cur_grads.dtozdx * num_leftover);


213 


214  fild dword ptr [num_leftover]


215 


216  //todo: pipeline these fpu ops


217  fld qword ptr [cur_grads]tri_gradients.doozdx


218  fmul st(0),st(1)


219  fadd dword ptr [ooz_right]


220  fstp dword ptr [ooz_right]


221 


222  fld dword ptr [cur_grads]tri_gradients.dsozdx


223  fmul st(0),st(1)


224  fadd dword ptr [soz_right]


225  fstp dword ptr [soz_right]


226 


227  fld dword ptr [cur_grads]tri_gradients.dtozdx


228  fmul st(0),st(1)


229  fadd dword ptr [toz_right]


230  fstp dword ptr [toz_right]


231 


232  fstp st(0) //nifty thing i found, a 1 cycle fpu pop


233 


234  fld1


235  fdiv dword ptr [ooz_right]


236  //}


237 


238  not_last_subdivision:


239  //cap the right_s and right_t's so that they're valid


240 


241  mov eax,dword ptr [right_s]


242  mov ebx,dword ptr [right_t]


243 


244  add eax,dword ptr [cur_grads]tri_gradients.s_adjust


245  add ebx,dword ptr [cur_grads]tri_gradients.t_adjust


246 


247  //cap the right s and t


248  cmp eax,0


249  jge cmp_eax_high


250 


251  mov eax,0


252  jmp cmp_ebx_low


253 


254  cmp_eax_high:


255  cmp eax,dword ptr [s_mask]


256  jle cmp_ebx_low


257 


258  mov eax,dword ptr [s_mask]


259 


260  cmp_ebx_low:


261  cmp ebx,0


262  jge cmp_ebx_high


263 


264  mov ebx,0


265  jmp done_compare


266 


267  cmp_ebx_high:


268  cmp ebx,dword ptr [t_mask]


269  jle done_compare


270 


271  mov ebx,dword ptr [t_mask]


272 


273  done_compare:


274 


275  //store the right_s and right_t


276  //so they can be copied into left_s and left_t at the end of the 16pixel span


277  //(the cant be copied now because we have to calculate (right_sleft_s)>>4 and (right_tleft_t)>>4


278 


279  mov dword ptr [right_s],eax


280  mov dword ptr [right_t],ebx


281 


282  sub eax,dword ptr [left_s]


283  push ebp


284 


285  sar eax,4


286  sub ebx,dword ptr [left_t]


287 


288  sar ebx,4


289  mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_sleft_s)>>4)<<16


290 


291  sar eax,16


292  mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_tleft_t)>>4)<<16


293 


294  sar ebx,16


295  mov cl,byte ptr [r1_software_twidth_log2]


296 


297  shl ebx,cl


298 


299  add eax,ebx


300  mov ebx,0


301 


302  //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2


303  //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width


304 


305  mov dword ptr [s_t_carry+4],eax


306  add eax,dword ptr [r1_software_texture_width]


307 


308  mov dword ptr [s_t_carry],eax


309  mov eax,0 //must make sure the high bits of these are zeroed out


310 


311  mov cl,4 //loop is unrolled to 4 pixels  we want to draw 16, so loop 4 times


312  ALIGN 16


313 


314  //high 16 bits of ecx is the fractional s component


315  //high 16 bits of edx is the fractional t component


316 


317  //eax is used to lookup the texel as well as the low 8bits of the lit texel


318  //ebx is used to lookup the high 8bits of the lit texel


319  //ebp is used to detect a tcarry as well as lookup the lit texel


320  //cl is the loop count variable


321 


322  looper1:


323  add edx,dword ptr [dtdx_frac]


324  nop


325 


326  sbb ebp,ebp


327  add edi,8 //the only convenient place for the stepping of edi was way up here


328 


329  mov ax,word ptr [esi*2]


330  add ecx,dword ptr [dsdx_frac]


331 


332  adc esi,dword ptr [4+s_t_carry+ebp*4]


333  add edx,dword ptr [dtdx_frac]


334 


335  sbb ebp,ebp


336  mov word ptr [edi8],ax //1


337 


338  mov ax,word ptr [esi*2]


339  add ecx,dword ptr [dsdx_frac]


340 


341  adc esi,dword ptr [4+s_t_carry+ebp*4]


342  add edx,dword ptr [dtdx_frac]


343 


344  sbb ebp,ebp


345  mov word ptr [edi6],ax //2


346 


347  mov ax,word ptr [esi*2]


348  add ecx,dword ptr [dsdx_frac]


349 


350  adc esi,dword ptr [4+s_t_carry+ebp*4]


351  add edx,dword ptr [dtdx_frac]


352 


353  sbb ebp,ebp


354  mov word ptr [edi4],ax //3


355 


356  mov ax,word ptr [esi*2]


357  add ecx,dword ptr [dsdx_frac]


358 


359  adc esi,dword ptr [4+s_t_carry+ebp*4]


360  mov word ptr [edi2],ax //4


361 


362  dec cl //thank god this doesnt modify the carry flag (the above add ch,dl needs to effect the adc bh,dh at the top of the loop)


363 


364  jnz looper1


365  pop ebp


366 


367  //store right_s and right_s in left_s and left_t


368  //right_s is what left_s starts at on the next 16 pixel span


369  //right_t is what left_t starts at on the next 16 pixel span


370 


371  mov eax,dword ptr [right_s]


372  mov ebx,dword ptr [right_t]


373 


374  mov dword ptr [left_s],eax


375  mov dword ptr [left_t],ebx


376  }


377 


378  _asm dec dword ptr [num_subdivisions]


379  }


380 


381  //store these so that the C code below actually works


382  _asm mov dword ptr [start_pixel],edi


383  }


384 


385  if (num_leftover)


386  {


387  if (num_leftover > 1)


388  {


389  if (had_subdivisions==0)


390  {


391  //calculate the right_z for the end of span


392  ooz_right = left>ooz + (cur_grads.doozdx * num_leftover);


393  soz_right = left>soz + (cur_grads.dsozdx * num_leftover);


394  toz_right = left>toz + (cur_grads.dtozdx * num_leftover);


395 


396  //calculate the z at the right endpoint


397  _asm fld1


398  _asm fdiv dword ptr [ooz_right]


399  }


400  else


401  {


402  //the correct ending right_z is already being calculated


403  //(see the if (num_subdivisions!=1) case above


404  }


405 


406  _asm


407  {


408  //calculate starting fractional and integral values for s and t


409 


410  mov esi,dword ptr [r1_software_texture_ptr]


411  mov eax,dword ptr [left_s]


412 


413  shr esi,1


414  mov ebx,dword ptr [left_t]


415 


416  sar eax,16


417  mov edx,dword ptr [left_t]


418 


419  sar ebx,16


420  add esi,eax


421 


422  mov cl,byte ptr [r1_software_twidth_log2]


423  shl ebx,cl


424 


425  sal edx,16


426  mov ecx,dword ptr [left_s]


427 


428  sal ecx,16


429  add esi,ebx


430 


431  mov edi,dword ptr [start_pixel]


432 


433  //calculate the right endpoint


434  //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;


435  //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;


436 


437  //right_z is in st0


438  fld st(0)


439 


440  fmul dword ptr [soz_right]


441  fxch st(1)


442 


443  fmul dword ptr [toz_right]


444  fxch st(1)


445 


446  fistp dword ptr [right_s]


447  fistp dword ptr [right_t]


448 


449  mov eax,dword ptr [right_s]


450  mov ebx,dword ptr [right_t]


451 


452  add eax,dword ptr [cur_grads]tri_gradients.s_adjust


453  add ebx,dword ptr [cur_grads]tri_gradients.t_adjust


454 


455  //cap the right s and t


456  cmp eax,0


457  jge cmp_eax_high_2


458 


459  mov eax,0


460  jmp cmp_ebx_low_2


461 


462  cmp_eax_high_2:


463  cmp eax,dword ptr [s_mask]


464  jle cmp_ebx_low_2


465 


466  mov eax,dword ptr [s_mask]


467 


468  cmp_ebx_low_2:


469  cmp ebx,0


470  jge cmp_ebx_high_2


471 


472  mov ebx,0


473  jmp done_compare_2


474 


475  cmp_ebx_high_2:


476  cmp ebx,dword ptr [t_mask]


477  jle done_compare_2


478 


479  mov ebx,dword ptr [t_mask]


480 


481  done_compare_2:


482 


483  //calculate the deltas (left to right)


484  //temp_dsdx = qftoi((float)(right_s  left_s) * inverse_leftover_lookup[num_leftover]);


485  //temp_dtdx = qftoi((float)(right_t  left_t) * inverse_leftover_lookup[num_leftover]);


486 


487  push ebp


488  mov ebp,num_leftover


489 


490  sub eax,dword ptr [left_s]


491  sub ebx,dword ptr [left_t]


492 


493  mov dword ptr [temp_dsdx],eax


494  mov dword ptr [temp_dtdx],ebx


495 


496  fild dword ptr [temp_dsdx]


497  fild dword ptr [temp_dtdx]


498 


499  fmul dword ptr [inverse_leftover_lookup + ebp*4]


500  fxch st(1)


501 


502  fmul dword ptr [inverse_leftover_lookup + ebp*4]


503  fxch st(1)


504 


505  fistp dword ptr [temp_dtdx]


506  fistp dword ptr [temp_dsdx]


507 


508  //calculate the fractional and integral delta vars


509  //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;


510  //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);


511  //dsdx_frac = (temp_dsdx<<16);


512  //dtdx_frac = (temp_dtdx<<16);


513 


514  mov eax,dword ptr [temp_dsdx]


515  mov ebx,dword ptr [temp_dtdx]


516 


517  mov word ptr [dsdx_frac+2],ax


518  mov word ptr [dtdx_frac+2],bx


519 


520  sar eax,16


521  nop //mov dx,word ptr [left_l]


522 


523  sar ebx,16


524  mov cl,byte ptr [r1_software_twidth_log2]


525 


526  shl ebx,cl


527 


528  add eax,ebx


529  nop //mov ebx,0


530 


531  mov dword ptr [s_t_carry+4],eax


532  add eax,dword ptr [r1_software_texture_width]


533 


534  mov dword ptr [s_t_carry],eax


535  mov cl, byte ptr [num_leftover]


536 


537  //mov eax,0


538  //mov ch,dl //setup the initial lighting error


539 


540  //mov bh,byte ptr [last_bh2] //setup the initial dither


541  //add ch,0 //clear the carry bit


542 


543  ALIGN 16


544 


545  looper3:


546  mov ax,word ptr [esi*2]


547  add edx,dword ptr [dtdx_frac]


548 


549  sbb ebp,ebp


550  mov word ptr [edi],ax //1


551 


552  add edi,2 //the only convenient place for the stepping of edi was way up here


553  add ecx,dword ptr [dsdx_frac]


554 


555  adc esi,dword ptr [4+s_t_carry+ebp*4]


556  dec cl


557 


558  jnz looper3


559  pop ebp


560  }


561  }


562  else


563  {


564  //highly unoptimized single pixel drawer


565  *start_pixel = *(r1_software_texture_ptr + (left_s>>16) + ((left_t>>16)<<r1_software_twidth_log2));


566  }


567  }


568  }

