1  /********************************************************************** <BR>


2  This file is part of Crack dot Com's free source code release of


3  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for


4  information about compiling & licensing issues visit this URL</a>


5  <PRE> If that doesn't help, contact Jonathan Clark at


6  golgotha_source@usa.net (Subject should have "GOLG" in it)


7  ***********************************************************************/


8 


9  #include "software/r1_software_globals.hh"


10  #include "software/inline_fpu.hh"


11  #include "software/amd3d/amd3d.h"


12 


13  extern sw32 had_subdivisions;


14 


//instead of using separate left_s, left_t, right_s, and right_t variables,
//s and t are kept side by side (s in element 0, t in element 1):
//the divides and multiplies are nicely vectorized by the amd3d,
//and storing them is a single quad store to a 2-element array,
//rather than two dword stores to two separate variables


19 


20  extern sw32 left_s_t[2];


21  extern sw32 right_s_t[2];


22 


23  extern float mmx0[2];


24  extern float mmx1[2];


25  extern float mmx2[2];


26  extern float mmx3[2];


27  extern float mmx4[2];


28  extern float mmx5[2];


29  extern float mmx6[2];


30  extern float mmx7[2];


31 


32  void texture_scanline_perspective_unlit_holy_amd3d(w16 *start_pixel,


33  sw32 start_x,


34  void *_left,//perspective_span *left,


35  sw32 width)


36  {


37  start_pixel = (w16 *)((w8 *)start_pixel + start_x);


38 


39  perspective_span *left = (perspective_span *)_left;


40 


41  _asm


42  {


43  //left_z = 1.f / left>ooz;


44  //left_s = qftoi(left>soz * left_z) + cur_grads.s_adjust;


45  //left_t = qftoi(left>toz * left_z) + cur_grads.t_adjust;


46 


47  //sw32 had_subdivisions = width & (~15);


48  //num_subdivisions = width >> 4;


49  //num_leftover = width & 15;


50 


51  mov edi,dword ptr [left]


52  mov eax,dword ptr [width]


53 


54  movd mm0, dword ptr [edi]perspective_span.ooz


55  mov ebx,eax


56 


57  pfrcp (m1, m0)


58  and eax,15


59 


60  shr ebx,4


61  punpckldq mm0, mm0 //duplicate low 32bits of m0 into high 32 bits of m0


62 


63  pfrcpit1 (m0, m1)


64  mov ecx,dword ptr [width]


65 


66  movq mm2, qword ptr [edi]perspective_span.soz


67  mov dword ptr [num_leftover],eax


68 


69  pfrcpit2 (m0, m1)


70  and ecx,(~15)


71 


72  //mov eax,dword ptr [edi]perspective_span.l


73  mov dword ptr [num_subdivisions],ebx


74 


75  pfmul (m2, m0)


76  mov dword ptr [had_subdivisions],ecx


77 


78  //mov dword ptr [left_l],eax


79  //clear these out


80  mov dword ptr [dsdx_frac],0


81 


82  //high 32 bits of mm2  toz / ooz (aka t)


83  //low 32 bits of mm2  soz / ooz (aka s)


84 


85  pf2id (m3, m2)


86  mov dword ptr [dtdx_frac],0


87 


88  //high 32 bits of mm3  toz / ooz (aka t)  truncated ints


89  //low 32 bits of mm3  soz / ooz (aka s)  truncated ints


90 


91  paddd mm3, qword ptr [cur_grads]tri_gradients.s_adjust


92 


93  //high 32 bits of mm3  t + t_adjust


94  //low 32 bits of mm3  s + s_adjust


95 


96  movq qword ptr [left_s_t], mm3


97  }


98 


99  if (num_subdivisions)


100  {


101  _asm


102  {


103  //ooz_right = left>ooz + (cur_grads.doozdxspan);


104  //soz_right = left>soz + (cur_grads.dsozdxspan);


105  //toz_right = left>toz + (cur_grads.dtozdxspan);


106 


107  //edi still has dword ptr [left]


108  lea ebx,dword ptr [cur_grads]


109  nop


110 


111  movd mm1, dword ptr [edi]perspective_span.ooz


112  mov esi,dword ptr [r1_software_texture_ptr]


113 


114  movd mm3, dword ptr [ebx]tri_gradients.doozdxspan


115  mov eax,dword ptr [left_s_t] //left_s


116 


117  shr esi,1


118  movq mm0, qword ptr [edi]perspective_span.soz


119 


120  pfadd (m1, m3)


121  movq mm2, qword ptr [ebx]tri_gradients.dsozdxspan


122 


123  sar eax,16 //get integral left_s into eax


124  mov edi,dword ptr [start_pixel]


125 


126  pfrcp (m6, m1)


127  movq mm7,mm1


128 


129  pfadd (m0, m2)


130  mov ebx,dword ptr [left_s_t+4] //left_t


131 


132  //calculate the 1st right_z in mm7


133  sar ebx,16 //get integral left_t into ebx


134  punpckldq mm7, mm7 //duplicate high 32bits of mm7 into low 32 bits of mm7


135 


136  pfrcpit1 (m7, m6)


137  mov edx,dword ptr [left_s_t+4] //left_t


138 


139  mov cl,byte ptr [r1_software_twidth_log2]


140  add esi,eax


141 


142  pfrcpit2 (m7, m6)


143 


144  //calculate starting fractional and integral values for s and t


145  //esi = starting_s_coordinate >> 16 + starting_t_coordinate >> 16 << r1_software_twidth_log2


146  //ecx = starting_s_coordinate << 16


147  //edx = starting_t_coordinate << 16


148 


149  //some stuff has been moved up, interleaved w/the mmx code above


150 


151  shl ebx,cl //multiply integral left_t by texture width


152 


153  sal edx,16 //get fractional left_t into edx


154  mov ecx,dword ptr [left_s_t] //left_s


155 


156  sal ecx,16


157  add esi,ebx


158  }


159 


160  while (num_subdivisions)


161  {


162  _asm


163  {


164  //right_s = qftoi(soz_right * right_z);


165  //right_t = qftoi(toz_right * right_z);


166 


167  //soz_right and toz_right are in mm0


168  //right_z is in mm7


169  pfmul (m7, m0)


170 


171  pf2id (m7, m7)


172 


173  movq qword ptr [right_s_t],mm7


174 


175  //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are


176  //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover


177  //in the leftover span, calculate the end of that.


178 


179  //if (num_subdivisions!=1)


180  //{


181  cmp dword ptr [num_subdivisions],1


182  je last_subdivision


183 


184  //ooz_right += (cur_grads.doozdxspan);


185  //soz_right += (cur_grads.dsozdxspan);


186  //toz_right += (cur_grads.dtozdxspan);


187 


188  pfadd (m0, m2)


189  pfadd (m1, m3)


190 


191  jmp proceed_with_mapping


192  //}


193  //else


194  //if (num_leftover > 1)


195  //{


196 


197  last_subdivision:


198  cmp dword ptr [num_leftover],1


199  jle proceed_with_mapping


200 


201  //calculate the right_z for the end of the leftover span


202  //ooz_right += (cur_grads.doozdx * num_leftover);


203  //soz_right += (cur_grads.dsozdx * num_leftover);


204  //toz_right += (cur_grads.dtozdx * num_leftover);


205 


206  movd mm2,dword ptr [num_leftover]


207  movd mm3, dword ptr [cur_grads]tri_gradients.dsozdx


208 


209  pi2fd (m2, m2)


210  movd mm4, dword ptr [cur_grads]tri_gradients.dtozdx


211 


212  pfmul (m3, m2)


213  movd mm5, dword ptr [cur_grads]tri_gradients.doozdx


214 


215  pfmul (m4, m2)


216  pfmul (m5, m2)


217 


218  pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3


219 


220  pfadd (m0, m3)


221  pfadd (m1, m5)


222  //}


223 


224  proceed_with_mapping:


225  //cap the right_s and right_t's so that they're valid


226 


227  mov eax,dword ptr [right_s_t] //right_s


228  mov ebx,dword ptr [right_s_t+4] //right_t


229 


230  add eax,dword ptr [cur_grads]tri_gradients.s_adjust


231  add ebx,dword ptr [cur_grads]tri_gradients.t_adjust


232 


233  //cap the right s and t


234  cmp eax,0


235  jge cmp_eax_high


236 


237  mov eax,0


238  jmp cmp_ebx_low


239 


240  cmp_eax_high:


241  cmp eax,dword ptr [s_mask]


242  jle cmp_ebx_low


243 


244  mov eax,dword ptr [s_mask]


245 


246  cmp_ebx_low:


247  cmp ebx,0


248  jge cmp_ebx_high


249 


250  mov ebx,0


251  jmp done_compare


252 


253  cmp_ebx_high:


254  cmp ebx,dword ptr [t_mask]


255  jle done_compare


256 


257  mov ebx,dword ptr [t_mask]


258 


259  done_compare:


260 


261  //store the right_s and right_t


262  //so they can be copied into left_s and left_t at the end of the 16pixel span


263  //(the cant be copied now because we have to calculate (right_sleft_s)>>4 and (right_tleft_t)>>4


264 


265  //calculate the next right_z in mm7


266  //unfortunately, if the span is a multiple of 16, and this is the last set of 16, it will


267  //calculate an unnecessary z. but its best to have the code here mixed in w/integer ops so


268  //that the amd3d code has something for its executation latencies to sit through


269  movq mm7, mm1


270  pfrcp (m6, m1)


271 


272  mov dword ptr [right_s_t],eax //right_s


273  mov dword ptr [right_s_t+4],ebx //right_t


274 


275  punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7


276  sub eax,dword ptr [left_s_t] //left_s


277 


278  sar eax,4


279  push ebp


280 


281  pfrcpit1 (m7, m6)


282  sub ebx,dword ptr [left_s_t+4] //left_t


283 


284  sar ebx,4


285  mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_sleft_s)>>4)<<16


286 


287  pfrcpit2 (m7, m6)


288  nop


289 


290  sar eax,16


291  mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_tleft_t)>>4)<<16


292 


293  sar ebx,16


294  mov cl,byte ptr [r1_software_twidth_log2]


295 


296  shl ebx,cl


297 


298  add eax,ebx


299  mov cl,4 //loop is unrolled to 4 pixels  we want to draw 16, so loop 4 times


300 


301  //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2


302  //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width


303 


304  mov dword ptr [s_t_carry+4],eax


305  add eax,dword ptr [r1_software_texture_width]


306 


307  mov dword ptr [s_t_carry],eax


308 


309  ALIGN 16


310 


311  //high 16 bits of ecx is the fractional s component


312  //high 16 bits of edx is the fractional t component


313 


314  //eax is used to lookup the texel as well as the low 8bits of the lit texel


315  //ebx is used to lookup the high 8bits of the lit texel


316  //ebp is used to detect a tcarry as well as lookup the lit texel


317  //cl is the loop count variable


318 


319  looper1:


320  add edi,8


321  add edx,dword ptr [dtdx_frac]


322 


323  sbb ebx,ebx


324  add ecx,dword ptr [dsdx_frac]


325 


326  movzx eax,word ptr [esi*2]


327 


328  adc esi,dword ptr [4+s_t_carry+ebx*4]


329  add edx,dword ptr [dtdx_frac]


330 


331  sbb ebx,ebx


332  and eax,eax //test to see if its zero


333 


334  jz skipped_1_pixel


335  mov word ptr [edi8],ax //store 1 pixel


336 


337  skipped_1_pixel:


338  movzx eax,word ptr [esi*2]


339  add ecx,dword ptr [dsdx_frac]


340 


341  adc esi,dword ptr [4+s_t_carry+ebx*4]


342  add edx,dword ptr [dtdx_frac]


343 


344  sbb ebx,ebx


345  and eax,eax //test to see if its zero


346 


347  jz skipped_2_pixel


348  mov word ptr [edi6],ax


349 


350  skipped_2_pixel:


351  movzx eax,word ptr [esi*2]


352  add ecx,dword ptr [dsdx_frac]


353 


354  adc esi,dword ptr [4+s_t_carry+ebx*4]


355  add edx,dword ptr [dtdx_frac]


356 


357  sbb ebx,ebx


358  and eax,eax //test to see if its zero


359 


360  jz skipped_3_pixel


361  mov word ptr [edi4],ax


362 


363  skipped_3_pixel:


364  movzx eax,word ptr [esi*2]


365  add ecx,dword ptr [dsdx_frac]


366 


367  adc esi,dword ptr [4+s_t_carry+ebx*4]


368  and eax,eax //test to see if its zero


369 


370  jz skipped_4_pixel


371  mov word ptr [edi2],ax


372 


373  skipped_4_pixel:


374  dec cl


375  jnz looper1


376 


377  pop ebp


378 


379  //store right_s and right_t in left_s and left_t


380  //right_s is what left_s starts at on the next 16 pixel span


381  //right_t is what left_t starts at on the next 16 pixel span


382 


383  mov eax,dword ptr [right_s_t] //right_s


384  mov ebx,dword ptr [right_s_t+4] //right_t


385 


386  mov dword ptr [left_s_t],eax //left_s


387  mov dword ptr [left_s_t+4],ebx //left_t


388  }


389 


390  _asm dec dword ptr [num_subdivisions]


391  }


392 


393  //store these so that the C code below actually works


394  _asm mov dword ptr [start_pixel],edi


395  }


396 


397  if (num_leftover)


398  {


399  if (num_leftover > 1)


400  {


401  if (had_subdivisions==0)


402  {


403  //calculate the right_z for the end of span


404  //ooz_right = left>ooz + (cur_grads.doozdx * num_leftover);


405  //soz_right = left>soz + (cur_grads.dsozdx * num_leftover);


406  //toz_right = left>toz + (cur_grads.dtozdx * num_leftover);


407 


408  _asm


409  {


410  movd mm2,dword ptr [num_leftover]


411  lea ebx,dword ptr [cur_grads]


412 


413  movd mm3, dword ptr [ebx]tri_gradients.dsozdx


414  mov edi,dword ptr [left]


415 


416  movd mm4, dword ptr [ebx]tri_gradients.dtozdx


417  pi2fd (m2, m2)


418 


419  movd mm5, dword ptr [ebx]tri_gradients.doozdx


420  pfmul (m3, m2)


421 


422  movq mm0, qword ptr [edi]perspective_span.soz


423  pfmul (m4, m2)


424 


425  movd mm1, dword ptr [edi]perspective_span.ooz


426  pfmul (m5, m2)


427 


428  pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3


429 


430  pfadd (m1, m5) //ooz += doozdx*num_leftover


431  pfadd (m0, m3) //soz += dsozdx*num_leftover AND toz += dtozdx*num_leftover


432 


433  //calculate the z at the right endpoint in mm7


434  movq mm7, mm1


435  pfrcp (m6, m1)


436 


437  punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7


438 


439  pfrcpit1 (m7, m6) //terrible stalls. oh well


440 


441  pfrcpit2 (m7, m6)


442  }


443  }


444  else


445  {


446  //the correct ending right_z is already being calculated


447  //(see the if (num_subdivisions!=1) case above


448  }


449 


450  _asm


451  {


452  //calculate starting fractional and integral values for s and t


453 


454  //calculate the right endpoint


455  //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;


456  //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;


457 


458  //soz_right and toz_right are in mm0


459  //right_z is in mm7


460  pfmul (m7, m0) //calculate right_s and right_t


461  mov edi,dword ptr [start_pixel]


462 


463  mov esi,dword ptr [r1_software_texture_ptr]


464  mov eax,dword ptr [left_s_t] //left_s


465 


466  shr esi,1


467  pf2id (m7, m7) //truncate right_s and right_t


468 


469  sar eax,16


470  mov ebx,dword ptr [left_s_t+4] //left_t


471 


472  sar ebx,16


473  movq qword ptr [right_s_t],mm7


474 


475  mov edx,dword ptr [left_s_t+4] //left_t


476  add esi,eax


477 


478  mov cl,byte ptr [r1_software_twidth_log2]


479  shl ebx,cl


480 


481  sal edx,16


482  mov ecx,dword ptr [left_s_t] //left_s


483 


484  sal ecx,16


485  add esi,ebx


486 


487  mov eax,dword ptr [right_s_t] //right_s


488  mov ebx,dword ptr [right_s_t+4] //right_t


489 


490  add eax,dword ptr [cur_grads]tri_gradients.s_adjust


491  add ebx,dword ptr [cur_grads]tri_gradients.t_adjust


492 


493  //cap the right s and t


494  cmp eax,0


495  jge cmp_eax_high_2


496 


497  mov eax,0


498  jmp cmp_ebx_low_2


499 


500  cmp_eax_high_2:


501  cmp eax,dword ptr [s_mask]


502  jle cmp_ebx_low_2


503 


504  mov eax,dword ptr [s_mask]


505 


506  cmp_ebx_low_2:


507  cmp ebx,0


508  jge cmp_ebx_high_2


509 


510  mov ebx,0


511  jmp done_compare_2


512 


513  cmp_ebx_high_2:


514  cmp ebx,dword ptr [t_mask]


515  jle done_compare_2


516 


517  mov ebx,dword ptr [t_mask]


518 


519  done_compare_2:


520 


521  //calculate the deltas (left to right)


522  //temp_dsdx = qftoi((float)(right_s  left_s) * inverse_leftover_lookup[num_leftover]);


523  //temp_dtdx = qftoi((float)(right_t  left_t) * inverse_leftover_lookup[num_leftover]);


524 


525  sub eax,dword ptr [left_s_t] //left_s


526  sub ebx,dword ptr [left_s_t+4] //left_t


527 


528  movd mm0,eax //temp_dsdx


529  push ebp


530 


531  movd mm1,ebx //temp_dtdx


532  mov ebp, dword ptr [num_leftover]


533 


534  pi2fd (m0, m0)


535  movd mm2, dword ptr [inverse_leftover_lookup + ebp*4]


536 


537  pi2fd (m1, m1)


538  pfmul (m0, m2)


539 


540  pfmul (m1, m2) //bad stalls here


541  pf2id (m0, m0)


542 


543  pf2id (m1, m1)


544 


545  movd eax, mm0 //temp_dsdx


546  movd ebx, mm1 //temp_dtdx


547 


548  //calculate the fractional and integral delta vars


549  //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;


550  //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);


551  //dsdx_frac = (temp_dsdx<<16);


552  //dtdx_frac = (temp_dtdx<<16);


553 


554  mov word ptr [dsdx_frac+2],ax


555  mov word ptr [dtdx_frac+2],bx


556 


557  sar eax,16


558  mov dx,word ptr [left_l]


559 


560  sar ebx,16


561  mov cl,byte ptr [r1_software_twidth_log2]


562 


563  shl ebx,cl


564 


565  add eax,ebx


566  mov cl, byte ptr [num_leftover]


567 


568  mov dword ptr [s_t_carry+4],eax


569  add eax,dword ptr [r1_software_texture_width]


570 


571  mov dword ptr [s_t_carry],eax


572 


573  ALIGN 16


574 


575  looper3:


576  movzx eax,word ptr [esi*2]


577  add edx,dword ptr [dtdx_frac]


578 


579  sbb ebp,ebp


580  add edi,2 //the only convenient place for the stepping of edi was way up here


581 


582  add ecx,dword ptr [dsdx_frac]


583 


584  adc esi,dword ptr [4+s_t_carry+ebp*4]


585  and eax,eax


586 


587  jz skip_a_pixel


588  mov word ptr [edi2],ax


589 


590  skip_a_pixel:


591  dec cl


592  jnz looper3


593 


594  pop ebp


595  }


596  }


597  else


598  {


599  //highly unoptimized single pixel drawer


600  register w16 texel = *(r1_software_texture_ptr + (left_s_t[0]>>16) + ((left_s_t[1]>>16)<<r1_software_twidth_log2));


601 


602  if (texel)


603  *start_pixel = texel;


604  }


605  }


606 


607  return;


608 


609  _asm


610  {


611  dumpmmxregs:


612  movq qword ptr [mmx0],mm0


613  movq qword ptr [mmx1],mm1


614  movq qword ptr [mmx2],mm2


615  movq qword ptr [mmx3],mm3


616  movq qword ptr [mmx4],mm4


617  movq qword ptr [mmx5],mm5


618  movq qword ptr [mmx6],mm6


619  movq qword ptr [mmx7],mm7


620  ret


621  }


622 


623  }


624 

