1  /********************************************************************** <BR>


2  This file is part of Crack dot Com's free source code release of


3  Golgotha. <a href="http://www.crack.com/golgotha_release"> <BR> for


4  information about compiling & licensing issues visit this URL</a>


5  <PRE> If that doesn't help, contact Jonathan Clark at


6  golgotha_source@usa.net (Subject should have "GOLG" in it)


7  ***********************************************************************/


8 


9  #include "software/r1_software_globals.hh"


10  #include "software/inline_fpu.hh"


11  #include "software/amd3d/amd3d.h"


12 


13  extern sw32 had_subdivisions;


14  extern w8 last_alpha_accumulated;


15 


16  //instead of using left_s, left_t, right_s, and right_t,


17  //the divides and multiplies are nicely vectorized by the amd3d,


18  //and storing them is a single quad store to an array of 2 floats,


19  //rather than two dword stores to two separate floats


20 


21  extern sw32 left_s_t[2];


22  extern sw32 right_s_t[2];


23 


24  extern float mmx0[2];


25  extern float mmx1[2];


26  extern float mmx2[2];


27  extern float mmx3[2];


28  extern float mmx4[2];


29  extern float mmx5[2];


30  extern float mmx6[2];


31  extern float mmx7[2];


32 


33  void texture_scanline_perspective_unlit_alpha_amd3d(w16 *start_pixel,


34  sw32 start_x,


35  void *_left,//perspective_span *left,


36  sw32 width)


37  {


38  start_pixel = (w16 *)((w8 *)start_pixel + start_x);


39 


40  perspective_span *left = (perspective_span *)_left;


41 


42  last_alpha_accumulated = 16;


43 


44  _asm


45  {


46  //left_z = 1.f / left>ooz;


47  //left_s = qftoi(left>soz * left_z) + cur_grads.s_adjust;


48  //left_t = qftoi(left>toz * left_z) + cur_grads.t_adjust;


49 


50  //sw32 had_subdivisions = width & (~15);


51  //num_subdivisions = width >> 4;


52  //num_leftover = width & 15;


53 


54  mov edi,dword ptr [left]


55  mov eax,dword ptr [width]


56 


57  movd mm0, dword ptr [edi]perspective_span.ooz


58  mov ebx,eax


59 


60  pfrcp (m1, m0)


61  and eax,15


62 


63  shr ebx,4


64  punpckldq mm0, mm0 //duplicate low 32bits of m0 into high 32 bits of m0


65 


66  pfrcpit1 (m0, m1)


67  mov ecx,dword ptr [width]


68 


69  movq mm2, qword ptr [edi]perspective_span.soz


70  mov dword ptr [num_leftover],eax


71 


72  pfrcpit2 (m0, m1)


73  and ecx,(~15)


74 


75  //mov eax,dword ptr [edi]perspective_span.l


76  mov dword ptr [num_subdivisions],ebx


77 


78  pfmul (m2, m0)


79  mov dword ptr [had_subdivisions],ecx


80 


81  //mov dword ptr [left_l],eax


82  //clear these out


83  mov dword ptr [dsdx_frac],0


84 


85  //high 32 bits of mm2  toz / ooz (aka t)


86  //low 32 bits of mm2  soz / ooz (aka s)


87 


88  pf2id (m3, m2)


89  mov dword ptr [dtdx_frac],0


90 


91  //high 32 bits of mm3  toz / ooz (aka t)  truncated ints


92  //low 32 bits of mm3  soz / ooz (aka s)  truncated ints


93 


94  paddd mm3, qword ptr [cur_grads]tri_gradients.s_adjust


95 


96  //high 32 bits of mm3  t + t_adjust


97  //low 32 bits of mm3  s + s_adjust


98 


99  movq qword ptr [left_s_t], mm3


100  }


101 


102  if (num_subdivisions)


103  {


104  _asm


105  {


106  //ooz_right = left>ooz + (cur_grads.doozdxspan);


107  //soz_right = left>soz + (cur_grads.dsozdxspan);


108  //toz_right = left>toz + (cur_grads.dtozdxspan);


109 


110  //edi still has dword ptr [left]


111  lea ebx,dword ptr [cur_grads]


112  nop


113 


114  movd mm1, dword ptr [edi]perspective_span.ooz


115  mov esi,dword ptr [r1_software_texture_ptr]


116 


117  movd mm3, dword ptr [ebx]tri_gradients.doozdxspan


118  mov eax,dword ptr [left_s_t] //left_s


119 


120  shr esi,1


121  movq mm0, qword ptr [edi]perspective_span.soz


122 


123  pfadd (m1, m3)


124  movq mm2, qword ptr [ebx]tri_gradients.dsozdxspan


125 


126  sar eax,16 //get integral left_s into eax


127  mov edi,dword ptr [start_pixel]


128 


129  pfrcp (m6, m1)


130  movq mm7,mm1


131 


132  pfadd (m0, m2)


133  mov ebx,dword ptr [left_s_t+4] //left_t


134 


135  //calculate the 1st right_z in mm7


136  sar ebx,16 //get integral left_t into ebx


137  punpckldq mm7, mm7 //duplicate high 32bits of mm7 into low 32 bits of mm7


138 


139  pfrcpit1 (m7, m6)


140  mov edx,dword ptr [left_s_t+4] //left_t


141 


142  mov cl,byte ptr [r1_software_twidth_log2]


143  add esi,eax


144 


145  pfrcpit2 (m7, m6)


146 


147  //calculate starting fractional and integral values for s and t


148  //esi = starting_s_coordinate >> 16 + starting_t_coordinate >> 16 << r1_software_twidth_log2


149  //ecx = starting_s_coordinate << 16


150  //edx = starting_t_coordinate << 16


151 


152  //some stuff has been moved up, interleaved w/the mmx code above


153 


154  shl ebx,cl //multiply integral left_t by texture width


155 


156  sal edx,16 //get fractional left_t into edx


157  mov ecx,dword ptr [left_s_t] //left_s


158 


159  sal ecx,16


160  add esi,ebx


161  }


162 


163  while (num_subdivisions)


164  {


165  _asm


166  {


167  //right_s = qftoi(soz_right * right_z);


168  //right_t = qftoi(toz_right * right_z);


169 


170  //soz_right and toz_right are in mm0


171  //right_z is in mm7


172  pfmul (m7, m0)


173 


174  pf2id (m7, m7)


175 


176  movq qword ptr [right_s_t],mm7


177 


178  //calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span. if there are


179  //more subdivisions, calculate the end of the next span. if there are no more and there is > 1 leftover


180  //in the leftover span, calculate the end of that.


181 


182  //if (num_subdivisions!=1)


183  //{


184  cmp dword ptr [num_subdivisions],1


185  je last_subdivision


186 


187  //ooz_right += (cur_grads.doozdxspan);


188  //soz_right += (cur_grads.dsozdxspan);


189  //toz_right += (cur_grads.dtozdxspan);


190 


191  pfadd (m0, m2)


192  pfadd (m1, m3)


193 


194  jmp proceed_with_mapping


195  //}


196  //else


197  //if (num_leftover > 1)


198  //{


199 


200  last_subdivision:


201  cmp dword ptr [num_leftover],1


202  jle proceed_with_mapping


203 


204  //calculate the right_z for the end of the leftover span


205  //ooz_right += (cur_grads.doozdx * num_leftover);


206  //soz_right += (cur_grads.dsozdx * num_leftover);


207  //toz_right += (cur_grads.dtozdx * num_leftover);


208 


209  movd mm2,dword ptr [num_leftover]


210  movd mm3, dword ptr [cur_grads]tri_gradients.dsozdx


211 


212  pi2fd (m2, m2)


213  movd mm4, dword ptr [cur_grads]tri_gradients.dtozdx


214 


215  pfmul (m3, m2)


216  movd mm5, dword ptr [cur_grads]tri_gradients.doozdx


217 


218  pfmul (m4, m2)


219  pfmul (m5, m2)


220 


221  pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3


222 


223  pfadd (m0, m3)


224  pfadd (m1, m5)


225  //}


226 


227  proceed_with_mapping:


228  //cap the right_s and right_t's so that they're valid


229 


230  mov eax,dword ptr [right_s_t] //right_s


231  mov ebx,dword ptr [right_s_t+4] //right_t


232 


233  add eax,dword ptr [cur_grads]tri_gradients.s_adjust


234  add ebx,dword ptr [cur_grads]tri_gradients.t_adjust


235 


236  //cap the right s and t


237  cmp eax,0


238  jge cmp_eax_high


239 


240  mov eax,0


241  jmp cmp_ebx_low


242 


243  cmp_eax_high:


244  cmp eax,dword ptr [s_mask]


245  jle cmp_ebx_low


246 


247  mov eax,dword ptr [s_mask]


248 


249  cmp_ebx_low:


250  cmp ebx,0


251  jge cmp_ebx_high


252 


253  mov ebx,0


254  jmp done_compare


255 


256  cmp_ebx_high:


257  cmp ebx,dword ptr [t_mask]


258  jle done_compare


259 


260  mov ebx,dword ptr [t_mask]


261 


262  done_compare:


263 


264  //store the right_s and right_t


265  //so they can be copied into left_s and left_t at the end of the 16pixel span


266  //(the cant be copied now because we have to calculate (right_sleft_s)>>4 and (right_tleft_t)>>4


267 


268  //calculate the next right_z in mm7


269  //unfortunately, if the span is a multiple of 16, and this is the last set of 16, it will


270  //calculate an unnecessary z. but its best to have the code here mixed in w/integer ops so


271  //that the amd3d code has something for its executation latencies to sit through


272  movq mm7, mm1


273  pfrcp (m6, m1)


274 


275  mov dword ptr [right_s_t],eax //right_s


276  mov dword ptr [right_s_t+4],ebx //right_t


277 


278  punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7


279  sub eax,dword ptr [left_s_t] //left_s


280 


281  sar eax,4


282  push ebp


283 


284  pfrcpit1 (m7, m6)


285  sub ebx,dword ptr [left_s_t+4] //left_t


286 


287  sar ebx,4


288  mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_sleft_s)>>4)<<16


289 


290  pfrcpit2 (m7, m6)


291  nop


292 


293  sar eax,16


294  mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_tleft_t)>>4)<<16


295 


296  sar ebx,16


297  mov cl,byte ptr [r1_software_twidth_log2]


298 


299  shl ebx,cl


300 


301  add eax,ebx


302  mov bh,byte ptr [last_alpha_accumulated]


303 


304  //s_t_carry[1] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2


305  //s_t_carry[0] = integral_dsdx + integral_dtdx<<r1_software_twidth_log2 + r1_software_texture_width


306 


307  mov dword ptr [s_t_carry+4],eax


308  add eax,dword ptr [r1_software_texture_width]


309 


310  mov dword ptr [s_t_carry],eax


311  mov cl,4 //loop is unrolled to 4 pixels  we want to draw 16, so loop 4 times


312 


313  ALIGN 16


314 


315  //high 16 bits of ecx is the fractional s component


316  //high 16 bits of edx is the fractional t component


317 


318  //eax is used to lookup the texel as well as the low 8bits of the lit texel


319  //ebx is used to lookup the high 8bits of the lit texel


320  //ebp is used to detect a tcarry as well as lookup the lit texel


321  //cl is the loop count variable


322  //bh is used to dither the alpha


323 


324  looper1:


325  movzx eax,word ptr [esi*2]


326  add edx,dword ptr [dtdx_frac]


327 


328  sbb ebp,ebp


329  mov bl,ah


330 


331  and eax,4095


332  add ecx,dword ptr [dsdx_frac]


333 


334  adc esi,dword ptr [4+s_t_carry+ebp*4]


335  and bl,240


336 


337  movzx eax,word ptr [alpha_table+eax*2]


338  add bh,bl


339 


340  jnc skip_pixel_1


341 


342  mov word ptr [edi],ax


343  add bh,16


344 


345  skip_pixel_1:


346  movzx eax,word ptr [esi*2]


347  add edx,dword ptr [dtdx_frac]


348 


349  sbb ebp,ebp


350  mov bl,ah


351 


352  and eax,4095


353  add ecx,dword ptr [dsdx_frac]


354 


355  adc esi,dword ptr [4+s_t_carry+ebp*4]


356  and bl,240


357 


358  movzx eax,word ptr [alpha_table+eax*2]


359  add bh,bl


360 


361  jnc skip_pixel_2


362 


363  mov word ptr [edi+2],ax


364  add bh,16


365 


366  skip_pixel_2:


367  movzx eax,word ptr [esi*2]


368  add edx,dword ptr [dtdx_frac]


369 


370  sbb ebp,ebp


371  mov bl,ah


372 


373  and eax,4095


374  add ecx,dword ptr [dsdx_frac]


375 


376  adc esi,dword ptr [4+s_t_carry+ebp*4]


377  and bl,240


378 


379  movzx eax,word ptr [alpha_table+eax*2]


380  add bh,bl


381 


382  jnc skip_pixel_3


383 


384  mov word ptr [edi+4],ax


385  add bh,16


386 


387  skip_pixel_3:


388  movzx eax,word ptr [esi*2]


389  add edx,dword ptr [dtdx_frac]


390 


391  sbb ebp,ebp


392  mov bl,ah


393 


394  and eax,4095


395  add ecx,dword ptr [dsdx_frac]


396 


397  adc esi,dword ptr [4+s_t_carry+ebp*4]


398  and bl,240


399 


400  movzx eax,word ptr [alpha_table+eax*2]


401  add bh,bl


402 


403  jnc skip_pixel_4


404 


405  mov word ptr [edi+6],ax


406  add bh,16


407 


408  skip_pixel_4:


409  add edi,8


410  dec cl //thank god this doesnt modify the carry flag (the above add ch,dl needs to effect the adc bh,dh at the top of the loop)


411 


412  jnz looper1


413  pop ebp


414 


415  mov byte ptr [last_alpha_accumulated],bh //save it


416 


417  //store right_s and right_t in left_s and left_t


418  //right_s is what left_s starts at on the next 16 pixel span


419  //right_t is what left_t starts at on the next 16 pixel span


420 


421  mov eax,dword ptr [right_s_t] //right_s


422  mov ebx,dword ptr [right_s_t+4] //right_t


423 


424  mov dword ptr [left_s_t],eax //left_s


425  mov dword ptr [left_s_t+4],ebx //left_t


426  }


427 


428  _asm dec dword ptr [num_subdivisions]


429  }


430 


431  //store these so that the C code below actually works


432  _asm mov dword ptr [start_pixel],edi


433  }


434 


435  if (num_leftover)


436  {


437  if (num_leftover > 1)


438  {


439  if (had_subdivisions==0)


440  {


441  //calculate the right_z for the end of span


442  //ooz_right = left>ooz + (cur_grads.doozdx * num_leftover);


443  //soz_right = left>soz + (cur_grads.dsozdx * num_leftover);


444  //toz_right = left>toz + (cur_grads.dtozdx * num_leftover);


445 


446  _asm


447  {


448  movd mm2,dword ptr [num_leftover]


449  lea ebx,dword ptr [cur_grads]


450 


451  movd mm3, dword ptr [ebx]tri_gradients.dsozdx


452  mov edi,dword ptr [left]


453 


454  movd mm4, dword ptr [ebx]tri_gradients.dtozdx


455  pi2fd (m2, m2)


456 


457  movd mm5, dword ptr [ebx]tri_gradients.doozdx


458  pfmul (m3, m2)


459 


460  movq mm0, qword ptr [edi]perspective_span.soz


461  pfmul (m4, m2)


462 


463  movd mm1, dword ptr [edi]perspective_span.ooz


464  pfmul (m5, m2)


465 


466  pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3


467 


468  pfadd (m1, m5) //ooz += doozdx*num_leftover


469  pfadd (m0, m3) //soz += dsozdx*num_leftover AND toz += dtozdx*num_leftover


470 


471  //calculate the z at the right endpoint in mm7


472  movq mm7, mm1


473  pfrcp (m6, m1)


474 


475  punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7


476 


477  pfrcpit1 (m7, m6) //terrible stalls. oh well


478 


479  pfrcpit2 (m7, m6)


480  }


481  }


482  else


483  {


484  //the correct ending right_z is already being calculated


485  //(see the if (num_subdivisions!=1) case above


486  }


487 


488  _asm


489  {


490  //calculate starting fractional and integral values for s and t


491 


492  //calculate the right endpoint


493  //right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;


494  //right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;


495 


496  //soz_right and toz_right are in mm0


497  //right_z is in mm7


498  pfmul (m7, m0) //calculate right_s and right_t


499  mov edi,dword ptr [start_pixel]


500 


501  mov esi,dword ptr [r1_software_texture_ptr]


502  mov eax,dword ptr [left_s_t] //left_s


503 


504  shr esi,1


505  pf2id (m7, m7) //truncate right_s and right_t


506 


507  sar eax,16


508  mov ebx,dword ptr [left_s_t+4] //left_t


509 


510  sar ebx,16


511  movq qword ptr [right_s_t],mm7


512 


513  mov edx,dword ptr [left_s_t+4] //left_t


514  add esi,eax


515 


516  mov cl,byte ptr [r1_software_twidth_log2]


517  shl ebx,cl


518 


519  sal edx,16


520  mov ecx,dword ptr [left_s_t] //left_s


521 


522  sal ecx,16


523  add esi,ebx


524 


525  mov eax,dword ptr [right_s_t] //right_s


526  mov ebx,dword ptr [right_s_t+4] //right_t


527 


528  add eax,dword ptr [cur_grads]tri_gradients.s_adjust


529  add ebx,dword ptr [cur_grads]tri_gradients.t_adjust


530 


531  //cap the right s and t


532  cmp eax,0


533  jge cmp_eax_high_2


534 


535  mov eax,0


536  jmp cmp_ebx_low_2


537 


538  cmp_eax_high_2:


539  cmp eax,dword ptr [s_mask]


540  jle cmp_ebx_low_2


541 


542  mov eax,dword ptr [s_mask]


543 


544  cmp_ebx_low_2:


545  cmp ebx,0


546  jge cmp_ebx_high_2


547 


548  mov ebx,0


549  jmp done_compare_2


550 


551  cmp_ebx_high_2:


552  cmp ebx,dword ptr [t_mask]


553  jle done_compare_2


554 


555  mov ebx,dword ptr [t_mask]


556 


557  done_compare_2:


558 


559  //calculate the deltas (left to right)


560  //temp_dsdx = qftoi((float)(right_s  left_s) * inverse_leftover_lookup[num_leftover]);


561  //temp_dtdx = qftoi((float)(right_t  left_t) * inverse_leftover_lookup[num_leftover]);


562 


563  sub eax,dword ptr [left_s_t] //left_s


564  sub ebx,dword ptr [left_s_t+4] //left_t


565 


566  movd mm0,eax //temp_dsdx


567  push ebp


568 


569  movd mm1,ebx //temp_dtdx


570  mov ebp, dword ptr [num_leftover]


571 


572  pi2fd (m0, m0)


573  movd mm2, dword ptr [inverse_leftover_lookup + ebp*4]


574 


575  pi2fd (m1, m1)


576  pfmul (m0, m2)


577 


578  pfmul (m1, m2) //bad stalls here


579  pf2id (m0, m0)


580 


581  pf2id (m1, m1)


582 


583  movd eax, mm0 //temp_dsdx


584  movd ebx, mm1 //temp_dtdx


585 


586  //calculate the fractional and integral delta vars


587  //s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2) + r1_software_texture_width;


588  //s_t_carry[1] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2);


589  //dsdx_frac = (temp_dsdx<<16);


590  //dtdx_frac = (temp_dtdx<<16);


591 


592  mov word ptr [dsdx_frac+2],ax


593  mov word ptr [dtdx_frac+2],bx


594 


595  sar eax,16


596  mov dx,word ptr [left_l]


597 


598  sar ebx,16


599  mov cl,byte ptr [r1_software_twidth_log2]


600 


601  shl ebx,cl


602 


603  add eax,ebx


604  mov bl,byte ptr [last_alpha_accumulated]


605 


606  mov dword ptr [s_t_carry+4],eax


607  add eax,dword ptr [r1_software_texture_width]


608 


609  mov dword ptr [s_t_carry],eax


610  mov cl, byte ptr [num_leftover]


611 


612  ALIGN 16


613 


614  looper3:


615  movzx eax,word ptr [esi*2]


616  add edx,dword ptr [dtdx_frac]


617 


618  sbb ebp,ebp


619  add ecx,dword ptr [dsdx_frac]


620 


621  adc esi,dword ptr [4+s_t_carry+ebp*4]


622  add bl,ah


623 


624  jnc skip_a_pixel


625 


626  and eax,4095


627  mov ax,word ptr [alpha_table+eax*2]


628  mov word ptr [edi],ax


629 


630  skip_a_pixel:


631  and bl,240


632  add edi,2


633 


634  dec cl


635  jnz looper3


636 


637  pop ebp


638  }


639  }


640  else


641  {


642  //highly unoptimized single pixel drawer


643  register w16 texel = *( r1_software_texture_ptr + (left_s_t[0]>>16) + ((left_s_t[1]>>16) << r1_software_twidth_log2) );


644 


645  if (texel & (15<<12) == (15<<12))


646  {


647  //*start_pixel = alpha_table[texel & 4095];


648  }


649  }


650  }


651 


652  return;


653 


654  _asm


655  {


656  dumpmmxregs:


657  movq qword ptr [mmx0],mm0


658  movq qword ptr [mmx1],mm1


659  movq qword ptr [mmx2],mm2


660  movq qword ptr [mmx3],mm3


661  movq qword ptr [mmx4],mm4


662  movq qword ptr [mmx5],mm5


663  movq qword ptr [mmx6],mm6


664  movq qword ptr [mmx7],mm7


665  ret


666  }


667 


668  }


669 

