/********************************************************************** <BR>
  This file is part of Crack dot Com's free source code release of
  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for
  information about compiling & licensing issues visit this URL</a>
  <PRE> If that doesn't help, contact Jonathan Clark at
  golgotha_source@usa.net (Subject should have "GOLG" in it)
***********************************************************************/


8 


9  #include "software/r1_software_globals.hh"


10  #include "software/inline_fpu.hh"


11  #include "software/amd3d/amd3d.h"


12 


13  extern sw32 had_subdivisions;


14 


15  //instead of using left_s, left_t, right_s, and right_t,


16  //the divides and multiplies are nicely vectorized by the amd3d,


17  //and storing them is a single quad store to an array of 2 floats,


18  //rather than two dword stores to two seperate floats


19 


20  extern sw32 left_s_t[2];


21  extern sw32 right_s_t[2];


22 


23  extern float mmx0[2];


24  extern float mmx1[2];


25  extern float mmx2[2];


26  extern float mmx3[2];


27  extern float mmx4[2];


28  extern float mmx5[2];


29  extern float mmx6[2];


30  extern float mmx7[2];


31 


32  void texture_scanline_perspective_unlit_amd3d(w16 *start_pixel,


33  sw32 start_x,


34  void *_left,//perspective_span *left,


35  sw32 width)


36  {


37  start_pixel = (w16 *)((w8 *)start_pixel + start_x);


38 


39  perspective_span *left = (perspective_span *)_left;


40 


41  _asm


42  {


43  //left_z = 1.f / left>ooz;


44  //left_s = qftoi(left>soz * left_z) + cur_grads.s_adjust;


45  //left_t = qftoi(left>toz * left_z) + cur_grads.t_adjust;


46 


47  //sw32 had_subdivisions = width & (~15);


48  //num_subdivisions = width >> 4;


49  //num_leftover = width & 15;


50 


51  mov edi,dword ptr [left]


52  mov eax,dword ptr [width]


53 


54  movd mm0, dword ptr [edi]perspective_span.ooz


55  mov ebx,eax


56 


57  pfrcp (m1, m0)


58  and eax,15


59 


60  shr ebx,4


61  punpckldq mm0, mm0 //duplicate low 32bits of m0 into high 32 bits of m0


62 


63  pfrcpit1 (m0, m1)


64  mov ecx,dword ptr [width]


65 


66  movq mm2, qword ptr [edi]perspective_span.soz


67  mov dword ptr [num_leftover],eax


68 


69  pfrcpit2 (m0, m1)


70  and ecx,(~15)


71 


72  //mov eax,dword ptr [edi]perspective_span.l


73  mov dword ptr [num_subdivisions],ebx


74 


75  pfmul (m2, m0)


76  mov dword ptr [had_subdivisions],ecx


77 


78  //mov dword ptr [left_l],eax


79  //clear these out


80  mov dword ptr [dsdx_frac],0


81 


82  //high 32 bits of mm2  toz / ooz (aka t)


83  //low 32 bits of mm2  soz / ooz (aka s)


84 


85  pf2id (m3, m2)


86  mov dword ptr [dtdx_frac],0


87 


88  //high 32 bits of mm3  toz / ooz (aka t)  truncated ints


89  //low 32 bits of mm3  soz / ooz (aka s)  truncated ints


90 


91  paddd mm3, qword ptr [cur_grads]tri_gradients.s_adjust


92 


93  //high 32 bits of mm3  t + t_adjust


94  //low 32 bits of mm3  s + s_adjust


95 


96  movq qword ptr [left_s_t], mm3


97  }


98 


99  if (num_subdivisions)


100  {


101  _asm


102  {


103  //ooz_right = left>ooz + (cur_grads.doozdxspan);


104  //soz_right = left>soz + (cur_grads.dsozdxspan);


105  //toz_right = left>toz + (cur_grads.dtozdxspan);


106 


107  //edi still has dword ptr [left]


108  lea ebx,dword ptr [cur_grads]


109  nop


110 


111  movd mm1, dword ptr [edi]perspective_span.ooz


112  mov esi,dword ptr [r1_software_texture_ptr]


113 


114  movd mm3, dword ptr [ebx]tri_gradients.doozdxspan


115  mov eax,dword ptr [left_s_t] //left_s


116 


117  shr esi,1


118  movq mm0, qword ptr [edi]perspective_span.soz


119 


120  pfadd (m1, m3)


121  movq mm2, qword ptr [ebx]tri_gradients.dsozdxspan


122 


123  sar eax,16 //get integral left_s into eax


124  mov edi,dword ptr [start_pixel]


125 


126  pfrcp (m6, m1)


127  movq mm7,mm1


128 


129  pfadd (m0, m2)


130  mov ebx,dword ptr [left_s_t+4] //left_t


131 


132  //calculate the 1st right_z in mm7


133  sar ebx,16 //get integral left_t into ebx


134  punpckldq mm7, mm7 //duplicate high 32bits of mm7 into low 32 bits of mm7


135 


136  pfrcpit1 (m7, m6)


137  mov edx,dword ptr [left_s_t+4] //left_t


138 


139  mov cl,byte ptr [r1_software_twidth_log2]


140  add esi,eax


141 


142  pfrcpit2 (m7, m6)


143 


144  //calculate starting fractional and integral values for s and t


145  //esi = starting_s_coordinate >> 16 + starting_t_coordinate >> 16 << r1_software_twidth_log2


146  //ecx = starting_s_coordinate << 16


147  //edx = starting_t_coordinate << 16


148 


149  //some stuff has been moved up, interleaved w/the mmx code above


150 


151  shl ebx,cl //multiply integral left_t by texture width


152 


153  sal edx,16 //get fractional left_t into edx


154  mov ecx,dword ptr [left_s_t] //left_s


155 


156  sal ecx,16


157  add esi,ebx


158  }


159 


160  while (num_subdivisions)


161  {


162  _asm


163  {


164  //right_s = qftoi(soz_right * right_z);


165  //right_t = qftoi(toz_right * right_z);


166 


167  //soz_right and toz_right are in mm0


168  //right_z is in mm7


169  pfmul (m7, m0)


170 


171  pf2id (m7, m7)


172 


173  movq qword ptr [right_s_t],mm7


174 


175  //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are


176  //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover


177  //in the leftover span, calculate the end of that.


178 


179  //if (num_subdivisions!=1)


180  //{


181  cmp dword ptr [num_subdivisions],1


182  je last_subdivision


183 


184  //ooz_right += (cur_grads.doozdxspan);


185  //soz_right += (cur_grads.dsozdxspan);


186  //toz_right += (cur_grads.dtozdxspan);


187 


188  pfadd (m0, m2)


189  pfadd (m1, m3)


190 


191  jmp proceed_with_mapping


192  //}


193  //else


194  //if (num_leftover > 1)


195  //{


196 


197  last_subdivision:


198  cmp dword ptr [num_leftover],1


199  jle proceed_with_mapping


200 


201  //calculate the right_z for the end of the leftover span


202  //ooz_right += (cur_grads.doozdx * num_leftover);


203  //soz_right += (cur_grads.dsozdx * num_leftover);


204  //toz_right += (cur_grads.dtozdx * num_leftover);


205 


206  movd mm2,dword ptr [num_leftover]


207  movd mm3, dword ptr [cur_grads]tri_gradients.dsozdx


208 


209  pi2fd (m2, m2)


210  movd mm4, dword ptr [cur_grads]tri_gradients.dtozdx


211 


212  pfmul (m3, m2)


213  movd mm5, dword ptr [cur_grads]tri_gradients.doozdx


214 


215  pfmul (m4, m2)


216  pfmul (m5, m2)


217 


218  pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3


219 


220  pfadd (m0, m3)


221  pfadd (m1, m5)


222  //}


223 


224  proceed_with_mapping:


225  //cap the right_s and right_t's so that they're valid


226 


227  mov eax,dword ptr [right_s_t] //right_s


228  mov ebx,dword ptr [right_s_t+4] //right_t


229 


230  add eax,dword ptr [cur_grads]tri_gradients.s_adjust


231  add ebx,dword ptr [cur_grads]tri_gradients.t_adjust


232 


233  //cap the right s and t


234  cmp eax,0


235  jge cmp_eax_high


236 


237  mov eax,0


238  jmp cmp_ebx_low


239 


240  cmp_eax_high:


241  cmp eax,dword ptr [s_mask]


242  jle cmp_ebx_low


243 


244  mov eax,dword ptr [s_mask]


245 


246  cmp_ebx_low:


247  cmp ebx,0


248  jge cmp_ebx_high


249 


250  mov ebx,0


251  jmp done_compare


252 


253  cmp_ebx_high:


254  cmp ebx,dword ptr [t_mask]


255  jle done_compare


256 


257  mov ebx,dword ptr [t_mask]


258 


259  done_compare:


260 


261  //store the right_s and right_t


262  //so they can be copied into left_s and left_t at the end of the 16pixel span


263  //(the cant be copied now because we have to calculate (right_sleft_s)>>4 and (right_tleft_t)>>4


264 


265  //calculate the next right_z in mm7


266  //unfortunately, if the span is a multiple of 16, and this is the last set of 16, it will


267  //calculate an unnecessary z. but its best to have the code here mixed in w/integer ops so


268  //that the amd3d code has something for its executation latencies to sit through


269  movq mm7, mm1


270  pfrcp (m6, m1)


271 


272  mov dword ptr [right_s_t],eax //right_s


273  mov dword ptr [right_s_t+4],ebx //right_t


274 


275  punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7


276  sub eax,dword ptr [left_s_t] //left_s


277 


278  sar eax,4


279  push ebp


280 


281  pfrcpit1 (m7, m6)


282  sub ebx,dword ptr [left_s_t+4] //left_t


283 


284  sar ebx,4


285  mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_sleft_s)>>4)<<16


286 


287  pfrcpit2 (m7, m6)


288  nop


289 


290  sar eax,16


291  mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_tleft_t)>>4)<<16


292 


293  sar ebx,16


294  mov cl,byte ptr [r1_software_twidth_log2]


295 


296  shl ebx,cl


297 


298  add eax,ebx


299 


300  //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2


301  //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width


302 


303  mov dword ptr [s_t_carry+4],eax


304  add eax,dword ptr [r1_software_texture_width]


305 


306  mov dword ptr [s_t_carry],eax


307  mov cl,4 //loop is unrolled to 4 pixels  we want to draw 16, so loop 4 times


308 


309  ALIGN 16


310 


311  //high 16 bits of ecx is the fractional s component


312  //high 16 bits of edx is the fractional t component


313 


314  //eax is used to lookup the texel as well as the low 8bits of the lit texel


315  //ebx is used to lookup the high 8bits of the lit texel


316  //ebp is used to detect a tcarry as well as lookup the lit texel


317  //cl is the loop count variable


318 


319  looper1:


320  add edx,dword ptr [dtdx_frac]


321  nop


322 


323  sbb ebp,ebp


324  add edi,8 //the only convenient place for the stepping of edi was way up here


325 


326  movzx eax,word ptr [esi*2]


327  add ecx,dword ptr [dsdx_frac]


328 


329  adc esi,dword ptr [4+s_t_carry+ebp*4]


330  add edx,dword ptr [dtdx_frac]


331 


332  sbb ebp,ebp


333  mov word ptr [edi8],ax //1


334 


335  movzx eax,word ptr [esi*2]


336  add ecx,dword ptr [dsdx_frac]


337 


338  adc esi,dword ptr [4+s_t_carry+ebp*4]


339  add edx,dword ptr [dtdx_frac]


340 


341  sbb ebp,ebp


342  mov word ptr [edi6],ax //2


343 


344  movzx eax,word ptr [esi*2]


345  add ecx,dword ptr [dsdx_frac]


346 


347  adc esi,dword ptr [4+s_t_carry+ebp*4]


348  add edx,dword ptr [dtdx_frac]


349 


350  sbb ebp,ebp


351  mov word ptr [edi4],ax //3


352 


353  movzx eax,word ptr [esi*2]


354  add ecx,dword ptr [dsdx_frac]


355 


356  adc esi,dword ptr [4+s_t_carry+ebp*4]


357  mov word ptr [edi2],ax //4


358 


359  dec cl //thank god this doesnt modify the carry flag (the above add ch,dl needs to effect the adc bh,dh at the top of the loop)


360 


361  jnz looper1


362  pop ebp


363 


364  //store right_s and right_t in left_s and left_t


365  //right_s is what left_s starts at on the next 16 pixel span


366  //right_t is what left_t starts at on the next 16 pixel span


367 


368  mov eax,dword ptr [right_s_t] //right_s


369  mov ebx,dword ptr [right_s_t+4] //right_t


370 


371  mov dword ptr [left_s_t],eax //left_s


372  mov dword ptr [left_s_t+4],ebx //left_t


373  }


374 


375  _asm dec dword ptr [num_subdivisions]


376  }


377 


378  //store these so that the C code below actually works


379  _asm mov dword ptr [start_pixel],edi


380  }


381 


382  if (num_leftover)


383  {


384  if (num_leftover > 1)


385  {


386  if (had_subdivisions==0)


387  {


388  //calculate the right_z for the end of span


389  //ooz_right = left>ooz + (cur_grads.doozdx * num_leftover);


390  //soz_right = left>soz + (cur_grads.dsozdx * num_leftover);


391  //toz_right = left>toz + (cur_grads.dtozdx * num_leftover);


392 


393  _asm


394  {


395  movd mm2,dword ptr [num_leftover]


396  lea ebx,dword ptr [cur_grads]


397 


398  movd mm3, dword ptr [ebx]tri_gradients.dsozdx


399  mov edi,dword ptr [left]


400 


401  movd mm4, dword ptr [ebx]tri_gradients.dtozdx


402  pi2fd (m2, m2)


403 


404  movd mm5, dword ptr [ebx]tri_gradients.doozdx


405  pfmul (m3, m2)


406 


407  movq mm0, qword ptr [edi]perspective_span.soz


408  pfmul (m4, m2)


409 


410  movd mm1, dword ptr [edi]perspective_span.ooz


411  pfmul (m5, m2)


412 


413  pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3


414 


415  pfadd (m1, m5) //ooz += doozdx*num_leftover


416  pfadd (m0, m3) //soz += dsozdx*num_leftover AND toz += dtozdx*num_leftover


417 


418  //calculate the z at the right endpoint in mm7


419  movq mm7, mm1


420  pfrcp (m6, m1)


421 


422  punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7


423 


424  pfrcpit1 (m7, m6) //terrible stalls. oh well


425 


426  pfrcpit2 (m7, m6)


427  }


428  }


429  else


430  {


431  //the correct ending right_z is already being calculated


432  //(see the if (num_subdivisions!=1) case above


433  }


434 


435  _asm


436  {


437  //calculate starting fractional and integral values for s and t


438 


439  //calculate the right endpoint


440  //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;


441  //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;


442 


443  //soz_right and toz_right are in mm0


444  //right_z is in mm7


445  pfmul (m7, m0) //calculate right_s and right_t


446  mov edi,dword ptr [start_pixel]


447 


448  mov esi,dword ptr [r1_software_texture_ptr]


449  mov eax,dword ptr [left_s_t] //left_s


450 


451  shr esi,1


452  pf2id (m7, m7) //truncate right_s and right_t


453 


454  sar eax,16


455  mov ebx,dword ptr [left_s_t+4] //left_t


456 


457  sar ebx,16


458  movq qword ptr [right_s_t],mm7


459 


460  mov edx,dword ptr [left_s_t+4] //left_t


461  add esi,eax


462 


463  mov cl,byte ptr [r1_software_twidth_log2]


464  shl ebx,cl


465 


466  sal edx,16


467  mov ecx,dword ptr [left_s_t] //left_s


468 


469  sal ecx,16


470  add esi,ebx


471 


472  mov eax,dword ptr [right_s_t] //right_s


473  mov ebx,dword ptr [right_s_t+4] //right_t


474 


475  add eax,dword ptr [cur_grads]tri_gradients.s_adjust


476  add ebx,dword ptr [cur_grads]tri_gradients.t_adjust


477 


478  //cap the right s and t


479  cmp eax,0


480  jge cmp_eax_high_2


481 


482  mov eax,0


483  jmp cmp_ebx_low_2


484 


485  cmp_eax_high_2:


486  cmp eax,dword ptr [s_mask]


487  jle cmp_ebx_low_2


488 


489  mov eax,dword ptr [s_mask]


490 


491  cmp_ebx_low_2:


492  cmp ebx,0


493  jge cmp_ebx_high_2


494 


495  mov ebx,0


496  jmp done_compare_2


497 


498  cmp_ebx_high_2:


499  cmp ebx,dword ptr [t_mask]


500  jle done_compare_2


501 


502  mov ebx,dword ptr [t_mask]


503 


504  done_compare_2:


505 


506  //calculate the deltas (left to right)


507  //temp_dsdx = qftoi((float)(right_s  left_s) * inverse_leftover_lookup[num_leftover]);


508  //temp_dtdx = qftoi((float)(right_t  left_t) * inverse_leftover_lookup[num_leftover]);


509 


510  sub eax,dword ptr [left_s_t] //left_s


511  sub ebx,dword ptr [left_s_t+4] //left_t


512 


513  movd mm0,eax //temp_dsdx


514  push ebp


515 


516  movd mm1,ebx //temp_dtdx


517  mov ebp, dword ptr [num_leftover]


518 


519  pi2fd (m0, m0)


520  movd mm2, dword ptr [inverse_leftover_lookup + ebp*4]


521 


522  pi2fd (m1, m1)


523  pfmul (m0, m2)


524 


525  pfmul (m1, m2) //bad stalls here


526  pf2id (m0, m0)


527 


528  pf2id (m1, m1)


529 


530  movd eax, mm0 //temp_dsdx


531  movd ebx, mm1 //temp_dtdx


532 


533  //calculate the fractional and integral delta vars


534  //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;


535  //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);


536  //dsdx_frac = (temp_dsdx<<16);


537  //dtdx_frac = (temp_dtdx<<16);


538 


539  mov word ptr [dsdx_frac+2],ax


540  mov word ptr [dtdx_frac+2],bx


541 


542  sar eax,16


543  mov dx,word ptr [left_l]


544 


545  sar ebx,16


546  mov cl,byte ptr [r1_software_twidth_log2]


547 


548  shl ebx,cl


549 


550  add eax,ebx


551  nop //mov ebx,0


552 


553  mov dword ptr [s_t_carry+4],eax


554  add eax,dword ptr [r1_software_texture_width]


555 


556  mov dword ptr [s_t_carry],eax


557  mov cl, byte ptr [num_leftover]


558 


559  ALIGN 16


560 


561  looper3:


562  movzx eax,word ptr [esi*2]


563  add edx,dword ptr [dtdx_frac]


564 


565  sbb ebp,ebp


566  mov word ptr [edi],ax //1


567 


568  add edi,2 //the only convenient place for the stepping of edi was way up here


569  add ecx,dword ptr [dsdx_frac]


570 


571  adc esi,dword ptr [4+s_t_carry+ebp*4]


572  dec cl


573 


574  jnz looper3


575 


576  pop ebp


577  }


578  }


579  else


580  {


581  //highly unoptimized single pixel drawer


582  *start_pixel = *(r1_software_texture_ptr + (left_s_t[0]>>16) + ((left_s_t[1]>>16)<<r1_software_twidth_log2));


583  }


584  }


585 


586  return;


587 


588  _asm


589  {


590  dumpmmxregs:


591  movq qword ptr [mmx0],mm0


592  movq qword ptr [mmx1],mm1


593  movq qword ptr [mmx2],mm2


594  movq qword ptr [mmx3],mm3


595  movq qword ptr [mmx4],mm4


596  movq qword ptr [mmx5],mm5


597  movq qword ptr [mmx6],mm6


598  movq qword ptr [mmx7],mm7


599  ret


600  }


601 


602  }


603 

