2
* Mesa 3-D graphics library
5
* Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
7
* Permission is hereby granted, free of charge, to any person obtaining a
8
* copy of this software and associated documentation files (the "Software"),
9
* to deal in the Software without restriction, including without limitation
10
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
11
* and/or sell copies of the Software, and to permit persons to whom the
12
* Software is furnished to do so, subject to the following conditions:
14
* The above copyright notice and this permission notice shall be included
15
* in all copies or substantial portions of the Software.
17
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20
* BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
21
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26
* Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
27
* using the rtasm runtime assembler. Based on the old
28
* t_vb_arb_program_sse.c
32
#include "util/u_memory.h"
33
#include "util/u_math.h"
34
#include "pipe/p_shader_tokens.h"
35
#include "util/u_debug.h"
36
#include "tgsi/tgsi_parse.h"
37
#include "tgsi/tgsi_util.h"
38
#include "tgsi/tgsi_exec.h"
39
#include "tgsi/tgsi_dump.h"
42
#include "draw_vs_aos.h"
44
#include "rtasm/rtasm_x86sse.h"
50
/* Name table indexed by register-file id; the debug printf in spill()
 * looks up files[cp->xmm[idx].file].
 * NOTE(review): the initializer is not visible here -- confirm that the
 * entries line up with the register-file enum values.
 */
static const char *files[] =
63
/* Compare two x86_reg descriptors.  The visible part of the test
 * requires a.file == b.file; the remaining conjuncts are not shown here.
 */
static INLINE boolean eq( struct x86_reg a,
66
return (a.file == b.file &&
72
/* Make a general-purpose register hold one of the pointers stored in
 * struct aos_machine (immediates, constants or buffer).
 *
 * cp->x86_reg[which_reg] remembers which value the register currently
 * holds: a load from [machine_EDX + offset] is emitted only when that
 * cached value differs from 'value', after which the cache is updated.
 * The asserts tie immediates and buffer to slot 0 and constants to slot 1.
 */
struct x86_reg aos_get_x86( struct aos_compilation *cp,
73
unsigned which_reg, /* quick hack */
83
if (cp->x86_reg[which_reg] != value) {
88
assert(which_reg == 0);
89
offset = Offset(struct aos_machine, immediates);
92
assert(which_reg == 1);
93
offset = Offset(struct aos_machine, constants);
96
assert(which_reg == 0);
97
offset = Offset(struct aos_machine, buffer);
105
x86_mov(cp->func, reg,
106
x86_make_disp(cp->machine_EDX, offset));
108
cp->x86_reg[which_reg] = value;
115
/* Build a memory operand addressing the in-memory copy of a shader
 * register, selected by register file and index 'idx'.
 *
 * INPUT, OUTPUT, TEMPORARY and INTERNAL registers live at fixed offsets
 * inside struct aos_machine and are addressed relative to machine_EDX.
 * IMMEDIATE and CONSTANT registers are reached through a pointer register
 * obtained from aos_get_x86(), displaced by idx * 4 * sizeof(float).
 * An unrecognised file is reported through AOS_ERROR and yields
 * x86_make_reg(0,0).
 *
 * NOTE(review): unlike the other cases, the OUTPUT case has no visible
 * bounds assert on idx.
 */
static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
119
struct x86_reg ptr = cp->machine_EDX;
122
case TGSI_FILE_INPUT:
123
assert(idx < MAX_INPUTS);
124
return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));
126
case TGSI_FILE_OUTPUT:
127
return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));
129
case TGSI_FILE_TEMPORARY:
130
assert(idx < MAX_TEMPS);
131
return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));
133
case AOS_FILE_INTERNAL:
134
assert(idx < MAX_INTERNALS);
135
return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));
137
case TGSI_FILE_IMMEDIATE:
138
assert(idx < MAX_IMMEDIATES); /* just a sanity check */
139
return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));
141
case TGSI_FILE_CONSTANT:
142
assert(idx < MAX_CONSTANTS); /* just a sanity check */
143
return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));
146
AOS_ERROR(cp, "unknown reg file");
147
return x86_make_reg(0,0);
153
/* Bit-field values for the x87 FPU control word:
 *   bits 0-5   exception bits (invalid op, denormal, zero divide,
 *              overflow, underflow, precision)
 *   bits 8-9   precision control (with X87_CW_PRECISION_MASK)
 *   bits 10-11 rounding control (with X87_CW_ROUND_MASK)
 *   bit  12    infinity control
 */
#define X87_CW_EXCEPTION_INV_OP (1<<0)
154
#define X87_CW_EXCEPTION_DENORM_OP (1<<1)
155
#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2)
156
#define X87_CW_EXCEPTION_OVERFLOW (1<<3)
157
#define X87_CW_EXCEPTION_UNDERFLOW (1<<4)
158
#define X87_CW_EXCEPTION_PRECISION (1<<5)
159
#define X87_CW_PRECISION_SINGLE (0<<8)
160
#define X87_CW_PRECISION_RESERVED (1<<8)
161
#define X87_CW_PRECISION_DOUBLE (2<<8)
162
#define X87_CW_PRECISION_DOUBLE_EXT (3<<8)
163
#define X87_CW_PRECISION_MASK (3<<8)
164
#define X87_CW_ROUND_NEAREST (0<<10)
165
#define X87_CW_ROUND_DOWN (1<<10)
166
#define X87_CW_ROUND_UP (2<<10)
167
#define X87_CW_ROUND_ZERO (3<<10)
168
#define X87_CW_ROUND_MASK (3<<10)
169
#define X87_CW_INFINITY (1<<12)
174
/* Write the value cached in xmm register 'idx' back to the in-memory
 * copy of the shader register it holds (movaps to the address from
 * get_reg_ptr()) and clear the slot's dirty flag.
 *
 * Only dirty slots holding an INPUT, OUTPUT or TEMPORARY register may be
 * spilled; anything else is reported as "invalid spill" via AOS_ERROR.
 */
static void spill( struct aos_compilation *cp, unsigned idx )
176
if (!cp->xmm[idx].dirty ||
177
(cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
178
cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
179
cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
180
AOS_ERROR(cp, "invalid spill");
184
struct x86_reg oldval = get_reg_ptr(cp,
188
if (0) debug_printf("\nspill %s[%d]",
189
files[cp->xmm[idx].file],
192
assert(cp->xmm[idx].dirty);
193
sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
194
cp->xmm[idx].dirty = 0;
199
/* Walk all 8 xmm slots, testing each slot's dirty flag and releasing
 * slots through aos_release_xmm_reg().
 * NOTE(review): the statement guarded by the dirty test is not visible
 * here -- presumably a spill() of that slot; confirm.
 */
void aos_spill_all( struct aos_compilation *cp )
203
for (i = 0; i < 8; i++) {
204
if (cp->xmm[i].dirty)
206
aos_release_xmm_reg(cp, i);
211
/* Obtain an xmm register holding the value of 'reg' that is safe to
 * overwrite.  When 'reg' is not an xmm register, or is an xmm register
 * that currently caches a shader register (file != TGSI_FILE_NULL), the
 * value is copied with movaps into a freshly allocated xmm register.
 * The register's last_used is stamped with the current insn_counter.
 */
static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
214
if (reg.file != file_XMM ||
215
cp->xmm[reg.idx].file != TGSI_FILE_NULL)
217
struct x86_reg tmp = aos_get_xmm_reg(cp);
218
sse_movaps(cp->func, tmp, reg);
222
cp->xmm[reg.idx].last_used = cp->insn_counter;
226
/* Make sure the value of 'reg' is in an xmm register.  A non-xmm operand
 * is copied with movaps into a freshly allocated xmm register; the
 * register's last_used is stamped with the current insn_counter.
 * Unlike get_xmm_writable(), the visible test does not check whether the
 * register caches a shader register.
 */
static struct x86_reg get_xmm( struct aos_compilation *cp,
229
if (reg.file != file_XMM)
231
struct x86_reg tmp = aos_get_xmm_reg(cp);
232
sse_movaps(cp->func, tmp, reg);
236
cp->xmm[reg.idx].last_used = cp->insn_counter;
241
/* Allocate an empty xmm register, either as a temporary or later to
242
* "adopt" as a shader reg.
244
/* Pick one of the 8 xmm registers for a new use.
 *
 * The first loop looks for a slot that holds no shader register and has
 * not been touched by the current instruction; the second loop tracks the
 * slot with the smallest last_used stamp.  A dirty victim gets extra
 * handling before reuse (statement not visible here).  The chosen slot is
 * reset (file NULL, idx 0, clean), stamped with the current insn_counter
 * and returned as an xmm operand.
 */
struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
248
boolean found = FALSE;
250
for (i = 0; i < 8; i++)
251
if (cp->xmm[i].last_used != cp->insn_counter &&
252
cp->xmm[i].file == TGSI_FILE_NULL) {
258
for (i = 0; i < 8; i++)
259
if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
263
/* Need to write out the old value?
265
if (cp->xmm[oldest].dirty)
268
assert(cp->xmm[oldest].last_used != cp->insn_counter);
270
cp->xmm[oldest].file = TGSI_FILE_NULL;
271
cp->xmm[oldest].idx = 0;
272
cp->xmm[oldest].dirty = 0;
273
cp->xmm[oldest].last_used = cp->insn_counter;
274
return x86_make_reg(file_XMM, oldest);
277
/* Forget whatever xmm slot 'idx' was tracking: file, idx, dirty and
 * last_used are all reset.  The visible body emits no store, so a dirty
 * value that still matters has to be written back by the caller first.
 */
void aos_release_xmm_reg( struct aos_compilation *cp,
280
cp->xmm[idx].file = TGSI_FILE_NULL;
281
cp->xmm[idx].idx = 0;
282
cp->xmm[idx].dirty = 0;
283
cp->xmm[idx].last_used = 0;
287
/* For an xmm operand, back-date last_used to insn_counter - 1 so the slot
 * no longer counts as touched by the current instruction (the allocator
 * in aos_get_xmm_reg() tests last_used against insn_counter).  Asserts
 * that the slot was stamped by the current instruction.  Non-xmm operands
 * are left alone.
 */
static void aos_soft_release_xmm( struct aos_compilation *cp,
290
if (reg.file == file_XMM) {
291
assert(cp->xmm[reg.idx].last_used == cp->insn_counter);
292
cp->xmm[reg.idx].last_used = cp->insn_counter - 1;
298
/* Mark an xmm reg as holding the current copy of a shader reg.
300
/* Record that xmm register 'reg' now holds shader register file[idx].
 *
 * Any slot already tracking that shader register is released first, with
 * its dirty flag OR-ed into 'dirty' so a pending write-back is not lost.
 * The slot for 'reg' is then tagged with file, idx and dirty and stamped
 * with the current insn_counter.  A non-xmm 'reg' takes a separate path
 * whose body is not visible here.
 */
void aos_adopt_xmm_reg( struct aos_compilation *cp,
308
if (reg.file != file_XMM) {
314
/* If any xmm reg thinks it holds this shader reg, break the
317
for (i = 0; i < 8; i++) {
318
if (cp->xmm[i].file == file &&
319
cp->xmm[i].idx == idx)
321
/* If an xmm reg is already holding this shader reg, take into account its
324
dirty |= cp->xmm[i].dirty;
325
aos_release_xmm_reg(cp, i);
329
cp->xmm[reg.idx].file = file;
330
cp->xmm[reg.idx].idx = idx;
331
cp->xmm[reg.idx].dirty = dirty;
332
cp->xmm[reg.idx].last_used = cp->insn_counter;
336
/* Return a pointer to the in-memory copy of the reg, making sure it is up to date.
338
/* Return the memory operand for shader register file[idx], as produced
 * by get_reg_ptr(), after scanning the xmm slots for one that caches that
 * register.
 * NOTE(review): the rest of the match condition and the action taken on
 * a match are not visible here -- presumably a dirty copy is spilled so
 * memory is current; confirm.
 */
static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
344
/* Ensure the in-memory copy of this reg is up-to-date
346
for (i = 0; i < 8; i++) {
347
if (cp->xmm[i].file == file &&
348
cp->xmm[i].idx == idx &&
354
return get_reg_ptr( cp, file, idx );
358
/* As above, but return a pointer. Note - this pointer may alias
359
* those returned by get_arg_ptr().
361
/* Return the memory operand for a destination register (file and index
 * taken from dst->Register).  Any xmm slot caching that register is
 * released, with dirty slots receiving extra handling first (statement
 * not visible here), so no xmm copy remains once memory is written.
 */
static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
362
const struct tgsi_full_dst_register *dst )
364
unsigned file = dst->Register.File;
365
unsigned idx = dst->Register.Index;
369
/* Ensure in-memory copy of this reg is up-to-date and invalidate
372
for (i = 0; i < 8; i++) {
373
if (cp->xmm[i].file == file &&
374
cp->xmm[i].idx == idx)
376
if (cp->xmm[i].dirty)
379
aos_release_xmm_reg(cp, i);
383
return get_reg_ptr( cp, file, idx );
390
/* Return an XMM reg if the argument is resident, otherwise return a
391
* base+offset pointer to the saved value.
393
/* Look up shader register file[idx].  If an xmm slot caches it, the slot
 * is stamped as used by the current instruction and returned as an xmm
 * operand; otherwise the memory operand from get_reg_ptr() is returned.
 */
struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
399
for (i = 0; i < 8; i++) {
400
if (cp->xmm[i].file == file &&
401
cp->xmm[i].idx == idx)
403
cp->xmm[i].last_used = cp->insn_counter;
404
return x86_make_reg(file_XMM, i);
408
/* If not found in the XMM register file, return an indirect
409
* reference to the in-memory copy:
411
return get_reg_ptr( cp, file, idx );
416
/* Bring shader register file[idx] into an xmm register: the operand from
 * aos_get_shader_reg() is passed through get_xmm(), and the resulting
 * register is then registered with aos_adopt_xmm_reg() (its remaining
 * arguments are not visible here).
 */
static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,
420
struct x86_reg reg = get_xmm( cp,
421
aos_get_shader_reg( cp, file, idx ) );
423
aos_adopt_xmm_reg( cp,
434
/* Fetch internal register 'imm' (file AOS_FILE_INTERNAL) into an xmm
 * register via aos_get_shader_reg_xmm().
 */
struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
437
return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
441
/* Return an operand for internal register 'imm' (file AOS_FILE_INTERNAL)
 * via aos_get_shader_reg(): an xmm register if resident, otherwise a
 * memory reference.
 */
struct x86_reg aos_get_internal( struct aos_compilation *cp,
444
return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
451
/* Emulate pshufd insn in regular SSE, if necessary:
453
/* Emit dst = shuffle(arg0, shuf).  Two code paths are visible: a single
 * SSE2 pshufd, and a plain-SSE equivalent made of movaps dst,arg0
 * followed by shufps dst,dst,shuf.  The condition choosing between them
 * is not visible here.
 */
static void emit_pshufd( struct aos_compilation *cp,
459
sse2_pshufd(cp->func, dst, arg0, shuf);
463
sse_movaps(cp->func, dst, arg0);
465
sse_shufps(cp->func, dst, dst, shuf);
469
/* load masks (pack into negs??)
470
* pshufd - shuffle according to writemask
475
/* Merge 'result' into 'dst' under a 4-bit writemask.
 *
 * A per-channel mask m is built in a temporary by shuffling the IMM_SWZ
 * internal constant: element 2 is selected for channels whose bit is set
 * in 'mask', element 3 otherwise.  The andps/andnps/orps sequence then
 * computes dst = (dst & m) | (result & ~m).  The temporary is released
 * afterwards.
 */
static boolean mask_write( struct aos_compilation *cp,
477
struct x86_reg result,
480
struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
481
struct x86_reg tmp = aos_get_xmm_reg(cp);
483
emit_pshufd(cp, tmp, imm_swz,
484
SHUF((mask & 1) ? 2 : 3,
487
(mask & 8) ? 2 : 3));
489
sse_andps(cp->func, dst, tmp);
490
sse_andnps(cp->func, tmp, result);
491
sse_orps(cp->func, dst, tmp);
493
aos_release_xmm_reg(cp, tmp.idx);
500
/* Helper for writemask:
502
/* Combine arg0 and arg1 into dst with shuffles: arg1 is shuffled by
 * 'shuf' into dst and arg0 into a temporary, the two are merged with
 * shufps(dst, tmp, XYZW), and the merged value is shuffled by 'shuf'
 * once more.  The temporary is released at the end.
 */
static boolean emit_shuf_copy2( struct aos_compilation *cp,
508
struct x86_reg tmp = aos_get_xmm_reg(cp);
510
emit_pshufd(cp, dst, arg1, shuf);
511
emit_pshufd(cp, tmp, arg0, shuf);
512
sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
513
emit_pshufd(cp, dst, dst, shuf);
515
aos_release_xmm_reg(cp, tmp.idx);
521
/* Shuffle immediate for the identity swizzle: two bits per channel,
 * selecting elements 0,1,2,3 in order.
 */
#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
524
/* Locate a source register and perform any required (simple) swizzle.
526
* Just fail on complex swizzles at this point.
528
/* Fetch a source operand, applying its swizzle and sign modifiers.
 *
 * The four per-channel swizzles are packed into an SSE shuffle immediate
 * (two bits each) while the per-channel sign modes are collected.  If the
 * swizzle is the identity and no negate/abs is requested, the operand
 * from aos_get_shader_reg() needs no further work.  Otherwise a new xmm
 * register is allocated and filled by pshufd (or movaps for an identity
 * swizzle), then:
 *   - partial negate: multiply by a vector shuffled out of IMM_SWZ
 *     (element 1 for negated channels, element 0 otherwise);
 *   - negate on all channels: multiply by IMM_NEGS;
 *   - abs on all channels: max(x, x * IMM_NEGS);
 *   - partial abs: rejected with AOS_ERROR.
 * Unsupported sign modes are also reported through AOS_ERROR.
 */
static struct x86_reg fetch_src( struct aos_compilation *cp,
529
const struct tgsi_full_src_register *src )
531
struct x86_reg arg0 = aos_get_shader_reg(cp,
533
src->Register.Index);
539
for (i = 0; i < 4; i++) {
540
unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, i );
541
unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
543
swz |= (swizzle & 0x3) << (i * 2);
546
case TGSI_UTIL_SIGN_TOGGLE:
550
case TGSI_UTIL_SIGN_KEEP:
553
case TGSI_UTIL_SIGN_CLEAR:
558
AOS_ERROR(cp, "unsupported sign-mode");
563
if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
564
struct x86_reg dst = aos_get_xmm_reg(cp);
566
if (swz != SSE_SWIZZLE_NOOP)
567
emit_pshufd(cp, dst, arg0, swz);
569
sse_movaps(cp->func, dst, arg0);
571
if (negs && negs != 0xf) {
572
struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
573
struct x86_reg tmp = aos_get_xmm_reg(cp);
576
* Use neg as arg to pshufd
579
emit_pshufd(cp, tmp, imm_swz,
580
SHUF((negs & 1) ? 1 : 0,
583
(negs & 8) ? 1 : 0));
584
sse_mulps(cp->func, dst, tmp);
586
aos_release_xmm_reg(cp, tmp.idx);
587
aos_soft_release_xmm(cp, imm_swz);
590
struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
591
sse_mulps(cp->func, dst, imm_negs);
592
aos_soft_release_xmm(cp, imm_negs);
596
if (abs && abs != 0xf) {
597
AOS_ERROR(cp, "unsupported partial abs");
600
struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
601
struct x86_reg tmp = aos_get_xmm_reg(cp);
603
sse_movaps(cp->func, tmp, dst);
604
sse_mulps(cp->func, tmp, neg);
605
sse_maxps(cp->func, dst, tmp);
607
aos_release_xmm_reg(cp, tmp.idx);
608
aos_soft_release_xmm(cp, neg);
611
aos_soft_release_xmm(cp, arg0);
618
/* Push one channel of a source operand onto the x87 stack.
 *
 * The value is loaded from the in-memory copy of the register (obtained
 * through aos_get_shader_reg_ptr()) at the offset of the swizzled
 * component, then the sign mode is applied: TOGGLE emits fchs, CLEAR
 * emits fabs, SET emits fabs followed by fchs, and no instruction is
 * visible for KEEP.  Any other mode is reported through AOS_ERROR.
 */
static void x87_fld_src( struct aos_compilation *cp,
619
const struct tgsi_full_src_register *src,
622
struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
624
src->Register.Index);
626
unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, channel );
627
unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );
629
x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
632
case TGSI_UTIL_SIGN_TOGGLE:
635
x87_fchs( cp->func );
638
case TGSI_UTIL_SIGN_KEEP:
641
case TGSI_UTIL_SIGN_CLEAR:
642
x87_fabs( cp->func );
645
case TGSI_UTIL_SIGN_SET:
646
x87_fabs( cp->func );
647
x87_fchs( cp->func );
651
AOS_ERROR(cp, "unsupported sign-mode");
661
/* Used to implement write masking. This and most of the other instructions
662
* here would be easier to implement if there had been a translation
663
* to a 2 argument format (dst/arg0, arg1) at the shader level before
664
* attempting to translate to x86/sse code.
666
/* Write 'result' to destination register 'reg', honouring its writemask.
 *
 * For TGSI_WRITEMASK_XYZW a writable copy of the result is adopted
 * outright as the destination's value.  For partial masks the current
 * destination is fetched into an xmm register and merged with the result:
 * X uses movss, ZW and XY use shufps, YZW uses movss into the result, and
 * remaining masks go through mask_write().  The merged register is then
 * adopted via aos_adopt_xmm_reg().
 */
static void store_dest( struct aos_compilation *cp,
667
const struct tgsi_full_dst_register *reg,
668
struct x86_reg result )
672
switch (reg->Register.WriteMask) {
676
case TGSI_WRITEMASK_XYZW:
677
aos_adopt_xmm_reg(cp,
678
get_xmm_writable(cp, result),
687
dst = aos_get_shader_reg_xmm(cp,
689
reg->Register.Index);
691
switch (reg->Register.WriteMask) {
692
case TGSI_WRITEMASK_X:
693
sse_movss(cp->func, dst, get_xmm(cp, result));
696
case TGSI_WRITEMASK_ZW:
697
sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
700
case TGSI_WRITEMASK_XY:
701
result = get_xmm_writable(cp, result);
702
sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
706
case TGSI_WRITEMASK_YZW:
707
result = get_xmm_writable(cp, result);
708
sse_movss(cp->func, result, dst);
713
mask_write(cp, dst, result, reg->Register.WriteMask);
717
aos_adopt_xmm_reg(cp,
725
/* Insert the low scalar of 'result' into one channel of 'dst': shuffle
 * dst by 'swizzle', movss the scalar into element 0, then shuffle by the
 * same 'swizzle' again.  The callers in store_scalar_dest() pass swizzles
 * that exchange X with the target channel, so the second shuffle undoes
 * the first.
 */
static void inject_scalar( struct aos_compilation *cp,
727
struct x86_reg result,
730
sse_shufps(cp->func, dst, dst, swizzle);
731
sse_movss(cp->func, dst, result);
732
sse_shufps(cp->func, dst, dst, swizzle);
736
/* Store a scalar result held in element X of 'result'.
 *
 * If the writemask is not one of the single-channel masks tested below,
 * X is broadcast to all channels (shufps XXXX) and store_dest() does the
 * masking.  For a single-channel mask the destination is fetched into an
 * xmm register and the scalar is written with movss (X) or
 * inject_scalar() (Y, Z, W); the register is then adopted via
 * aos_adopt_xmm_reg().
 */
static void store_scalar_dest( struct aos_compilation *cp,
737
const struct tgsi_full_dst_register *reg,
738
struct x86_reg result )
740
unsigned writemask = reg->Register.WriteMask;
743
if (writemask != TGSI_WRITEMASK_X &&
744
writemask != TGSI_WRITEMASK_Y &&
745
writemask != TGSI_WRITEMASK_Z &&
746
writemask != TGSI_WRITEMASK_W &&
749
result = get_xmm_writable(cp, result); /* already true, right? */
750
sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
751
store_dest(cp, reg, result);
755
result = get_xmm(cp, result);
756
dst = aos_get_shader_reg_xmm(cp,
758
reg->Register.Index);
762
switch (reg->Register.WriteMask) {
763
case TGSI_WRITEMASK_X:
764
sse_movss(cp->func, dst, result);
767
case TGSI_WRITEMASK_Y:
768
inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
771
case TGSI_WRITEMASK_Z:
772
inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
775
case TGSI_WRITEMASK_W:
776
inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
783
aos_adopt_xmm_reg(cp,
792
/* If 'channel' is enabled in 'writemask', store st(0) (without popping)
 * to the float at ptr + channel * sizeof(float); otherwise emit nothing.
 * 'ptr' must be based on a 32-bit general-purpose register.
 */
static void x87_fst_or_nop( struct x86_function *func,
797
assert(ptr.file == file_REG32);
798
if (writemask & (1<<channel))
799
x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
802
/* If 'channel' is enabled in 'writemask', store-and-pop st(0) to the
 * float at ptr + channel * sizeof(float).  The other visible path is
 * fstp st(0), which discards the top of the x87 stack without writing
 * memory.  'ptr' must be based on a 32-bit general-purpose register.
 */
static void x87_fstp_or_pop( struct x86_function *func,
807
assert(ptr.file == file_REG32);
808
if (writemask & (1<<channel))
809
x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
811
x87_fstp( func, x86_make_reg( file_x87, 0 ));
818
/* Write st(0) to every enabled channel of 'dst' and pop it: channels
 * 0..2 use the non-popping store helper, channel 3 uses the
 * store-and-pop (or plain pop) helper so the stack value is consumed.
 */
static void x87_fstp_dest4( struct aos_compilation *cp,
819
const struct tgsi_full_dst_register *dst )
821
struct x86_reg ptr = get_dst_ptr(cp, dst);
822
unsigned writemask = dst->Register.WriteMask;
824
x87_fst_or_nop(cp->func, writemask, 0, ptr);
825
x87_fst_or_nop(cp->func, writemask, 1, ptr);
826
x87_fst_or_nop(cp->func, writemask, 2, ptr);
827
x87_fstp_or_pop(cp->func, writemask, 3, ptr);
830
/* Save current x87 state and put it into single precision mode.
832
/* Emit fnstcw to store the current x87 control word into
 * aos_machine.fpu_restore (addressed through machine_EDX).
 */
static void save_fpu_state( struct aos_compilation *cp )
834
x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,
835
Offset(struct aos_machine, fpu_restore)));
838
/* Emit fnclex followed by fldcw to reload the x87 control word from
 * aos_machine.fpu_restore, the location written by save_fpu_state().
 */
static void restore_fpu_state( struct aos_compilation *cp )
840
x87_fnclex(cp->func);
841
x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
842
Offset(struct aos_machine, fpu_restore)));
845
/* Switch the x87 control word to the one stored in
 * aos_machine.fpu_rnd_neg_inf.  cp->fpucntl tracks the mode currently
 * selected in the generated code, so the fnclex/fldcw pair is emitted
 * only when the mode actually changes.
 */
static void set_fpu_round_neg_inf( struct aos_compilation *cp )
847
if (cp->fpucntl != FPU_RND_NEG) {
848
cp->fpucntl = FPU_RND_NEG;
849
x87_fnclex(cp->func);
850
x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
851
Offset(struct aos_machine, fpu_rnd_neg_inf)));
855
/* Switch the x87 control word to the one stored in
 * aos_machine.fpu_rnd_nearest.  cp->fpucntl tracks the mode currently
 * selected in the generated code, so the fnclex/fldcw pair is emitted
 * only when the mode actually changes.
 */
static void set_fpu_round_nearest( struct aos_compilation *cp )
857
if (cp->fpucntl != FPU_RND_NEAREST) {
858
cp->fpucntl = FPU_RND_NEAREST;
859
x87_fnclex(cp->func);
860
x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
861
Offset(struct aos_machine, fpu_rnd_nearest)));
866
/* Emit x87 code that replaces st(0) = a with 2^a.
 *
 * a is split with frndint into an integer part and the remainder; 2^rem
 * is formed as f2xm1(rem) + 1, fscale multiplies it by 2^int, and the
 * final fstp drops the leftover integer part.  The trailing assert checks
 * that the tracked x87 stack depth is the same as on entry.
 */
static void x87_emit_ex2( struct aos_compilation *cp )
868
struct x86_reg st0 = x86_make_reg(file_x87, 0);
869
struct x86_reg st1 = x86_make_reg(file_x87, 1);
870
int stack = cp->func->x87_stack;
872
/* set_fpu_round_neg_inf( cp ); */
874
x87_fld(cp->func, st0); /* a a */
875
x87_fprndint( cp->func ); /* int(a) a*/
876
x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */
877
x87_fxch(cp->func, st1); /* frc(a) int(a) */
878
x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */
879
x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */
880
x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */
881
x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */
883
x87_fstp(cp->func, st1); /* 2^a */
885
assert( stack == cp->func->x87_stack);
891
/* cdecl debug helper whose address is embedded in generated code by
 * emit_print(): prints 'msg' followed by the four floats reg[0..3].
 */
static void PIPE_CDECL print_reg( const char *msg,
894
debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
899
/* Emit a call to print_reg() that dumps shader register file[idx] at run
 * time, labelled with 'message'.
 *
 * Requires an empty x87 stack.  The xmm slots are walked (dirty ones get
 * extra handling) and released, caller-save registers are pushed, the
 * address of the register and then the message pointer are pushed as
 * arguments, print_reg is called through ecx, the two arguments are
 * popped, and the caller-save registers are restored.
 *
 * NOTE(review): 'message' and the address of print_reg are cast to int,
 * which assumes pointers fit in 32 bits.
 */
static void emit_print( struct aos_compilation *cp,
900
const char *message, /* must point to a static string! */
904
struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
905
struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
908
/* There shouldn't be anything on the x87 stack. Can add this
909
* capacity later if need be.
911
assert(cp->func->x87_stack == 0);
913
/* For absolute correctness, need to spill/invalidate all XMM regs
914
* too. We're obviously not concerned about performance on this
915
* debug path, so here goes:
917
for (i = 0; i < 8; i++) {
918
if (cp->xmm[i].dirty)
921
aos_release_xmm_reg(cp, i);
924
/* Push caller-save (ie scratch) regs.
926
x86_cdecl_caller_push_regs( cp->func );
929
/* Push the arguments:
931
x86_lea( cp->func, ecx, arg );
932
x86_push( cp->func, ecx );
933
x86_push_imm32( cp->func, (int)message );
935
/* Call the helper. Could call debug_printf directly, but
936
* print_reg is a nice place to put a breakpoint if need be.
938
x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
939
x86_call( cp->func, ecx );
940
x86_pop( cp->func, ecx );
941
x86_pop( cp->func, ecx );
943
/* Pop caller-save regs
945
x86_cdecl_caller_pop_regs( cp->func );
953
* The traditional instructions. All operate on internal registers
954
* and ignore write masks and swizzling issues.
957
/* ABS: dst = |src0|, computed per channel as max(src0 * IMM_NEGS, src0)
 * in a temporary xmm register that is then handed to store_dest().
 */
static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
959
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
960
struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
961
struct x86_reg tmp = aos_get_xmm_reg(cp);
963
sse_movaps(cp->func, tmp, arg0);
964
sse_mulps(cp->func, tmp, neg);
965
sse_maxps(cp->func, tmp, arg0);
967
store_dest(cp, &op->Dst[0], tmp);
971
/* ADD: dst = src0 + src1.  src0 is made writable, addps adds src1 into
 * it, and the sum is stored through store_dest().
 */
static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
973
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
974
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
975
struct x86_reg dst = get_xmm_writable(cp, arg0);
977
sse_addps(cp->func, dst, arg1);
979
store_dest(cp, &op->Dst[0], dst);
983
/* COS via the x87 unit: channel 0 of src0 is pushed onto the x87 stack
 * and the top of stack is written to the enabled destination channels.
 * NOTE(review): the instruction that computes the cosine between the
 * load and the store is not visible here.
 */
static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
985
x87_fld_src(cp, &op->Src[0], 0);
987
x87_fstp_dest4(cp, &op->Dst[0]);
991
/* The dotproduct instructions don't really do that well in sse:
992
* XXX: produces wrong results -- disabled.
994
/* DP3: three-component dot product of src0 and src1.  The operands are
 * multiplied with mulps, then the first three products are summed into
 * element 0 using movhlps/addss and a pshufd/addss pair; the scalar is
 * stored with store_scalar_dest().
 * NOTE(review): the comment preceding this function in the file flags
 * the dot-product path as producing wrong results.
 */
static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
996
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
997
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
998
struct x86_reg tmp = aos_get_xmm_reg(cp);
999
struct x86_reg dst = get_xmm_writable(cp, arg0);
1001
sse_mulps(cp->func, dst, arg1);
1002
/* Now the hard bit: sum the first 3 values:
1004
sse_movhlps(cp->func, tmp, dst);
1005
sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
1006
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
1007
sse_addss(cp->func, dst, tmp);
1009
aos_release_xmm_reg(cp, tmp.idx);
1010
store_scalar_dest(cp, &op->Dst[0], dst);
1014
/* DP4: four-component dot product of src0 and src1.  After mulps, the
 * high pair is folded onto the low pair (movhlps + addps), then elements
 * X and Y are summed via pshufd + addss; the scalar in element 0 is
 * stored with store_scalar_dest().
 */
static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1016
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1017
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1018
struct x86_reg tmp = aos_get_xmm_reg(cp);
1019
struct x86_reg dst = get_xmm_writable(cp, arg0);
1021
sse_mulps(cp->func, dst, arg1);
1023
/* Now the hard bit: sum the values:
1025
sse_movhlps(cp->func, tmp, dst);
1026
sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
1027
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
1028
sse_addss(cp->func, dst, tmp);
1030
aos_release_xmm_reg(cp, tmp.idx);
1031
store_scalar_dest(cp, &op->Dst[0], dst);
1035
/* DPH: homogeneous dot product.  Uses the same three-component sum as
 * emit_DP3 and then adds src1.w (broadcast with pshufd WWWW) to the
 * scalar before storing it with store_scalar_dest().
 */
static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1037
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1038
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1039
struct x86_reg tmp = aos_get_xmm_reg(cp);
1040
struct x86_reg dst = get_xmm_writable(cp, arg0);
1042
sse_mulps(cp->func, dst, arg1);
1044
/* Now the hard bit: sum the values (from DP3):
1046
sse_movhlps(cp->func, tmp, dst);
1047
sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
1048
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
1049
sse_addss(cp->func, dst, tmp);
1050
emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
1051
sse_addss(cp->func, dst, tmp);
1053
aos_release_xmm_reg(cp, tmp.idx);
1054
store_scalar_dest(cp, &op->Dst[0], dst);
1058
/* DST (distance vector).  Each operand is blended with the IMM_ONES
 * constant by emit_shuf_copy2() so that, per the channel comments below,
 * the product of the two blended vectors is
 * (1, arg0.y * arg1.y, arg0.z, arg1.w).  The product is stored with
 * store_dest().
 */
static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1060
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1061
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1062
struct x86_reg dst = aos_get_xmm_reg(cp);
1063
struct x86_reg tmp = aos_get_xmm_reg(cp);
1064
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1066
/* dst[0] = 1.0 * 1.0F; */
1067
/* dst[1] = arg0[1] * arg1[1]; */
1068
/* dst[2] = arg0[2] * 1.0; */
1069
/* dst[3] = 1.0 * arg1[3]; */
1071
emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
1072
emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
1073
sse_mulps(cp->func, dst, tmp);
1075
aos_release_xmm_reg(cp, tmp.idx);
1076
store_dest(cp, &op->Dst[0], dst);
1080
/* LG2 via the x87 unit: pushes 1.0 and channel 0 of src0, then fyl2x
 * leaves 1 * log2(src0.x) on the stack, which is written to the enabled
 * destination channels.
 */
static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1082
x87_fld1(cp->func); /* 1 */
1083
x87_fld_src(cp, &op->Src[0], 0); /* a0 1 */
1084
x87_fyl2x(cp->func); /* log2(a0) */
1085
x87_fstp_dest4(cp, &op->Dst[0]);
1090
/* EX2 via the x87 unit: channel 0 of src0 is pushed onto the x87 stack
 * and the top of stack is written to the enabled destination channels.
 * NOTE(review): the step that computes 2^x between the load and the
 * store is not visible here -- presumably x87_emit_ex2(); confirm.
 */
static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1092
x87_fld_src(cp, &op->Src[0], 0);
1094
x87_fstp_dest4(cp, &op->Dst[0]);
1100
/* FLR: per-channel floor using the x87 unit with the round-toward-
 * negative-infinity control word selected.  All enabled source channels
 * are pushed first (in reverse order, since the destination may alias
 * the source); each is then rounded with frndint and popped into its
 * destination channel.
 */
static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1102
struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
1103
unsigned writemask = op->Dst[0].Register.WriteMask;
1106
set_fpu_round_neg_inf( cp );
1108
/* Load all sources first to avoid aliasing
1110
for (i = 3; i >= 0; i--) {
1111
if (writemask & (1<<i)) {
1112
x87_fld_src(cp, &op->Src[0], i);
1116
for (i = 0; i < 4; i++) {
1117
if (writemask & (1<<i)) {
1118
x87_fprndint( cp->func );
1119
x87_fstp(cp->func, x86_make_disp(dst, i*4));
1127
/* RND: per-channel round using the x87 unit with the round-to-nearest
 * control word selected.  All enabled source channels are pushed first
 * (in reverse order, since the destination may alias the source); each
 * is then rounded with frndint and popped into its destination channel.
 */
static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1129
struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
1130
unsigned writemask = op->Dst[0].Register.WriteMask;
1133
set_fpu_round_nearest( cp );
1135
/* Load all sources first to avoid aliasing
1137
for (i = 3; i >= 0; i--) {
1138
if (writemask & (1<<i)) {
1139
x87_fld_src(cp, &op->Src[0], i);
1143
for (i = 0; i < 4; i++) {
1144
if (writemask & (1<<i)) {
1145
x87_fprndint( cp->func );
1146
x87_fstp(cp->func, x86_make_disp(dst, i*4));
1154
/* FRC: per-channel fractional part using the x87 unit with the
 * round-toward-negative-infinity control word selected.  All enabled
 * source channels are pushed first (the destination may alias the
 * source); for each one the value is duplicated, rounded with frndint
 * and combined with the original by fsubp to leave frc(a), which is
 * popped into the destination channel.
 */
static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1156
struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
1157
struct x86_reg st0 = x86_make_reg(file_x87, 0);
1158
struct x86_reg st1 = x86_make_reg(file_x87, 1);
1159
unsigned writemask = op->Dst[0].Register.WriteMask;
1162
set_fpu_round_neg_inf( cp );
1164
/* suck all the source values onto the stack before writing out any
1165
* dst, which may alias...
1167
for (i = 3; i >= 0; i--) {
1168
if (writemask & (1<<i)) {
1169
x87_fld_src(cp, &op->Src[0], i);
1173
for (i = 0; i < 4; i++) {
1174
if (writemask & (1<<i)) {
1175
x87_fld(cp->func, st0); /* a a */
1176
x87_fprndint( cp->func ); /* flr(a) a */
1177
x87_fsubp(cp->func, st1); /* frc(a) */
1178
x87_fstp(cp->func, x86_make_disp(dst, i*4));
1190
/* LIT implemented as a call out of the generated code.
 *
 * Each LIT gets its own index (cp->lit_count is post-incremented).  The
 * xmm slots are walked and released before the call.  The result is
 * written to the destination's memory copy when the full XYZW mask is
 * set, or to the scratch slot aos_machine.tmp[0] otherwise; an argument
 * resident in xmm is first stored to aos_machine.tmp[1].
 *
 * Arguments pushed, in order: lit_count, address of arg0, address of
 * result, machine pointer.  For lit_count < MAX_LIT_INFO the callee is
 * read from machine->lit_info[lit_count].func; the other visible path
 * uses the address of aos_do_lit.  Four pops remove the arguments and
 * the caller-save registers are restored.  For a partial writemask the
 * scratch result is post-processed (statement only partly visible here).
 *
 * NOTE(review): the address of aos_do_lit is cast to int, which assumes
 * pointers fit in 32 bits.
 */
static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1192
struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
1193
unsigned writemask = op->Dst[0].Register.WriteMask;
1194
unsigned lit_count = cp->lit_count++;
1195
struct x86_reg result, arg0;
1199
/* For absolute correctness, need to spill/invalidate all XMM regs
1202
for (i = 0; i < 8; i++) {
1203
if (cp->xmm[i].dirty)
1205
aos_release_xmm_reg(cp, i);
1209
if (writemask != TGSI_WRITEMASK_XYZW)
1210
result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
1212
result = get_dst_ptr(cp, &op->Dst[0]);
1215
arg0 = fetch_src( cp, &op->Src[0] );
1216
if (arg0.file == file_XMM) {
1217
struct x86_reg tmp = x86_make_disp(cp->machine_EDX,
1218
Offset(struct aos_machine, tmp[1]));
1219
sse_movaps( cp->func, tmp, arg0 );
1225
/* Push caller-save (ie scratch) regs.
1227
x86_cdecl_caller_push_regs( cp->func );
1229
/* Push the arguments:
1231
x86_push_imm32( cp->func, lit_count );
1233
x86_lea( cp->func, ecx, arg0 );
1234
x86_push( cp->func, ecx );
1236
x86_lea( cp->func, ecx, result );
1237
x86_push( cp->func, ecx );
1239
x86_push( cp->func, cp->machine_EDX );
1241
if (lit_count < MAX_LIT_INFO) {
1242
x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,
1243
Offset(struct aos_machine, lit_info) +
1244
lit_count * sizeof(struct lit_info) +
1245
Offset(struct lit_info, func)));
1248
x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
1251
x86_call( cp->func, ecx );
1253
x86_pop( cp->func, ecx ); /* fixme... */
1254
x86_pop( cp->func, ecx );
1255
x86_pop( cp->func, ecx );
1256
x86_pop( cp->func, ecx );
1258
x86_cdecl_caller_pop_regs( cp->func );
1260
if (writemask != TGSI_WRITEMASK_XYZW) {
1263
get_xmm_writable( cp, result ) );
1270
/* LIT generated inline with x87 instructions, writing straight to the
 * destination's memory copy.
 *
 * When Y or Z is written: channel 1 of the source is adjusted with
 * fcomi/fcmovb against constants loaded on the stack (a1'), channel 3 is
 * loaded, pow(a1', a3) is formed with fyl2x + x87_emit_ex2(), channel 0
 * is clamped against zero (a0'), a0' is stored to channel 1 if enabled,
 * and a second fcomi/fcmov pair selects the value stored to channel 2.
 * When X or W is written: the value on top of the x87 stack is stored
 * to channels 0 and 3.
 *
 * NOTE(review): several constant loads appear as alternatives here and
 * the load feeding the X/W stores is not visible; the stack-state
 * comments on the individual lines are the original author's.
 */
static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1272
struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
1273
unsigned writemask = op->Dst[0].Register.WriteMask;
1275
if (writemask & TGSI_WRITEMASK_YZ) {
1276
struct x86_reg st1 = x86_make_reg(file_x87, 1);
1277
struct x86_reg st2 = x86_make_reg(file_x87, 2);
1279
/* a1' = a1 <= 0 ? 1 : a1;
1281
x87_fldz(cp->func); /* 1 0 */
1283
x87_fld1(cp->func); /* 1 0 */
1285
/* Correct but slow due to fp exceptions generated in fyl2x - fix me.
1287
x87_fldz(cp->func); /* 1 0 */
1289
x87_fld_src(cp, &op->Src[0], 1); /* a1 1 0 */
1290
x87_fcomi(cp->func, st2); /* a1 1 0 */
1291
x87_fcmovb(cp->func, st1); /* a1' 1 0 */
1292
x87_fstp(cp->func, st1); /* a1' 0 */
1293
x87_fstp(cp->func, st1); /* a1' */
1295
x87_fld_src(cp, &op->Src[0], 3); /* a3 a1' */
1296
x87_fxch(cp->func, st1); /* a1' a3 */
1299
/* Compute pow(a1, a3)
1301
x87_fyl2x(cp->func); /* a3*log2(a1) */
1302
x87_emit_ex2( cp ); /* 2^(a3*log2(a1)) */
1305
/* a0' = max2(a0, 0):
1307
x87_fldz(cp->func); /* 0 r2 */
1308
x87_fld_src(cp, &op->Src[0], 0); /* a0 0 r2 */
1309
x87_fcomi(cp->func, st1);
1310
x87_fcmovb(cp->func, st1); /* a0' 0 r2 */
1312
x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */
1314
x87_fcomi(cp->func, st1); /* a0' 0 r2 */
1315
x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */
1317
x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */
1318
x87_fpop(cp->func); /* r2 */
1322
if (writemask & TGSI_WRITEMASK_XW) {
1324
x87_fst_or_nop(cp->func, writemask, 0, dst);
1325
x87_fstp_or_pop(cp->func, writemask, 3, dst);
1334
/* MAX: dst = max(src0, src1) per channel.  src0 is made writable, maxps
 * combines it with src1, and the result is stored through store_dest().
 */
static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1336
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1337
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1338
struct x86_reg dst = get_xmm_writable(cp, arg0);
1340
sse_maxps(cp->func, dst, arg1);
1342
store_dest(cp, &op->Dst[0], dst);
1347
/* MIN: dst = min(src0, src1) per channel.  src0 is made writable, minps
 * combines it with src1, and the result is stored through store_dest().
 */
static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1349
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1350
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1351
struct x86_reg dst = get_xmm_writable(cp, arg0);
1353
sse_minps(cp->func, dst, arg1);
1355
store_dest(cp, &op->Dst[0], dst);
1359
/* MOV: dst = src0.  The fetched source (already swizzled and
 * sign-adjusted by fetch_src()) is made writable and handed to
 * store_dest(); no arithmetic instruction is needed.
 */
static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1361
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1362
struct x86_reg dst = get_xmm_writable(cp, arg0);
1364
/* potentially nothing to do */
1366
store_dest(cp, &op->Dst[0], dst);
1370
/* MUL: dst = src0 * src1 per channel.  src0 is made writable, mulps
 * multiplies it by src1, and the product is stored through store_dest().
 */
static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1372
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1373
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1374
struct x86_reg dst = get_xmm_writable(cp, arg0);
1376
sse_mulps(cp->func, dst, arg1);
1378
store_dest(cp, &op->Dst[0], dst);
1383
/* MAD: dst = src0 * src1 + src2 per channel.  All three sources are
 * fetched, src0 is made writable and used as the accumulator for mulps
 * and addps, and the result is stored through store_dest().
 */
static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1385
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1386
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1387
struct x86_reg arg2 = fetch_src(cp, &op->Src[2]);
1389
/* If we can't clobber old contents of arg0, get a temporary & copy
1390
* it there, then clobber it...
1392
arg0 = get_xmm_writable(cp, arg0);
1394
sse_mulps(cp->func, arg0, arg1);
1395
sse_addps(cp->func, arg0, arg2);
1396
store_dest(cp, &op->Dst[0], arg0);
1402
/* A wrapper for powf().
1403
* Makes sure it is cdecl and operates on floats.
1405
/* cdecl float wrapper called from generated code by emit_POW().  Two
 * return statements are visible, one using util_fast_pow() and one using
 * powf(); what selects between them (presumably a preprocessor
 * conditional) is not visible here.
 */
static float PIPE_CDECL _powerf( float x, float y )
1408
return util_fast_pow(x, y);
1410
return powf( x, y );
1415
/* cdecl float wrapper around util_fast_exp2(), called from generated
 * code by emit_EXPBASE2().
 */
static float PIPE_CDECL _exp2(float x)
1417
return util_fast_exp2(x);
1422
/* Really not sufficient -- need to check for conditions that could
1423
* generate inf/nan values, which will slow things down hugely.
1425
/* POW: dst = src0.x ^ src1.x.  Two implementations are visible; what
 * selects between them is not shown here.
 *
 * Inline x87 version: push src1.x and src0.x, fyl2x gives
 * src1.x * log2(src0.x), x87_emit_ex2() raises 2 to that value, and the
 * result is written to the enabled destination channels.
 *
 * Call-out version: the xmm slots are walked and released, caller-save
 * registers are pushed, 8 bytes of stack are reserved and filled with
 * src0.x at [esp+0] and src1.x at [esp+4], _powerf is called through
 * tmp_EAX, the stack is restored, and the float return value (on the
 * x87 stack, hence the manual x87_stack++) is written to the enabled
 * destination channels.
 */
static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1428
x87_fld_src(cp, &op->Src[1], 0); /* a1.x */
1429
x87_fld_src(cp, &op->Src[0], 0); /* a0.x a1.x */
1430
x87_fyl2x(cp->func); /* a1*log2(a0) */
1432
x87_emit_ex2( cp ); /* 2^(a1*log2(a0)) */
1434
x87_fstp_dest4(cp, &op->Dst[0]);
1438
/* For absolute correctness, need to spill/invalidate all XMM regs
1441
for (i = 0; i < 8; i++) {
1442
if (cp->xmm[i].dirty)
1444
aos_release_xmm_reg(cp, i);
1447
/* Push caller-save (ie scratch) regs.
1449
x86_cdecl_caller_push_regs( cp->func );
1451
x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );
1453
x87_fld_src( cp, &op->Src[1], 0 );
1454
x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
1455
x87_fld_src( cp, &op->Src[0], 0 );
1456
x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
1458
/* tmp_EAX has been pushed & will be restored below */
1459
x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
1460
x86_call( cp->func, cp->tmp_EAX );
1462
x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );
1464
x86_cdecl_caller_pop_regs( cp->func );
1466
/* Note retval on x87 stack:
1468
cp->func->x87_stack++;
1470
x87_fstp_dest4( cp, &op->Dst[0] );
1477
// Emits code for TGSI EX2 by calling the C helper _exp2() with channel 0 of
// Src[0]; the float result is stored to Dst[0] through x87_fstp_dest4().
// NOTE(review): the declaration of i and the function braces and return are
// not present in this copy.
static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1481
/* For absolute correctness, need to spill/invalidate all XMM regs
1484
// Release each of the 8 tracked XMM registers that is marked dirty before
// the call is emitted.
for (i = 0; i < 8; i++) {
1485
if (cp->xmm[i].dirty)
1487
aos_release_xmm_reg(cp, i);
1490
/* Push caller-save (ie scratch) regs.
1492
x86_cdecl_caller_push_regs( cp->func );
1494
// Reserve 4 bytes of stack for the single float argument.
x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) );
1496
// Argument: channel 0 of Src[0], stored at ESP+0.
x87_fld_src( cp, &op->Src[0], 0 );
1497
x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
1499
/* tmp_EAX has been pushed & will be restored below */
1500
x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
1501
x86_call( cp->func, cp->tmp_EAX );
1503
// Give back the 4 bytes of argument space.
x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) );
1505
x86_cdecl_caller_pop_regs( cp->func );
1507
/* Note retval on x87 stack:
1509
// Bump the tracked x87 stack depth to account for the returned float.
cp->func->x87_stack++;
1511
x87_fstp_dest4( cp, &op->Dst[0] );
1518
// Emits code for TGSI RCP: a scalar reciprocal of Src[0] is produced in a
// newly acquired XMM register and written with store_scalar_dest().
// When cp->have_sse2 is set the rcpss instruction is emitted.
// NOTE(review): the line introducing the alternative branch is missing from
// this copy; the remaining statements load IMM_ONES and divide it by the
// source, which presumably is the fallback path -- confirm.
static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1520
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1521
struct x86_reg dst = aos_get_xmm_reg(cp);
1523
if (cp->have_sse2) {
1524
sse2_rcpss(cp->func, dst, arg0);
1525
/* extend precision here...
1529
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1530
// dst = ones, then dst = dst divided by arg0 (scalar).
sse_movss(cp->func, dst, ones);
1531
sse_divss(cp->func, dst, arg0);
1534
store_scalar_dest(cp, &op->Dst[0], dst);
1539
/* Although rsqrtps() and rcpps() are low precision on some/all SSE
1540
* implementations, it is possible to improve its precision at
1541
* fairly low cost, using a newton/raphson step, as below:
1543
* x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
1544
* x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
1546
* x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
1549
* See: http://softwarecommunity.intel.com/articles/eng/1818.htm
1551
// Emits code for TGSI RSQ (scalar reciprocal square root of Src[0]) and
// writes the result with store_scalar_dest().
// NOTE(review): two alternative bodies follow -- a plain rsqrtss, and a
// refined version that applies the newton/raphson step described in the
// comment preceding this function. The lines selecting between them are
// missing from this copy.
static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1554
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1555
struct x86_reg r = aos_get_xmm_reg(cp);
1556
sse_rsqrtss(cp->func, r, arg0);
1557
store_scalar_dest(cp, &op->Dst[0], r);
1561
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1562
struct x86_reg r = aos_get_xmm_reg(cp);
1564
// neg_half addresses the IMM_RSQ internal entry; one_point_five is the
// float 4 bytes after it.
struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
1565
struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
1566
struct x86_reg src = get_xmm_writable( cp, arg0 );
1567
struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
1568
struct x86_reg tmp = aos_get_xmm_reg(cp);
1570
// tmp = max(src * IMM_NEGS, src). vaos_run_linear asserts that element 0
// of IMM_NEGS is -1.0, so this presumably yields the absolute value.
sse_movaps(cp->func, tmp, src);
1571
sse_mulps(cp->func, tmp, neg);
1572
sse_maxps(cp->func, tmp, src);
1574
sse_rsqrtss( cp->func, r, tmp ); /* rsqrtss(a) */
1575
sse_mulss( cp->func, tmp, neg_half ); /* -.5 * a */
1576
sse_mulss( cp->func, tmp, r ); /* -.5 * a * r */
1577
sse_mulss( cp->func, tmp, r ); /* -.5 * a * r * r */
1578
sse_addss( cp->func, tmp, one_point_five ); /* 1.5 - .5 * a * r * r */
1579
sse_mulss( cp->func, r, tmp ); /* r * (1.5 - .5 * a * r * r) */
1581
store_scalar_dest(cp, &op->Dst[0], r);
1583
// The scratch register is released only after the result has been stored.
aos_release_xmm_reg(cp, tmp.idx);
1590
// Emits code for TGSI SGE. A writable copy of Src[0] is compared against
// Src[1] with the NotLessThan predicate, and the resulting mask is ANDed
// with IMM_ONES before being stored to Dst[0].
// vaos_run_linear asserts that element 0 of IMM_ONES is 1.0, so a lane where
// the predicate holds presumably ends up as 1.0 and any other lane as 0.0.
static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1592
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1593
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1594
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1595
struct x86_reg dst = get_xmm_writable(cp, arg0);
1597
sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
1598
sse_andps(cp->func, dst, ones);
1600
store_dest(cp, &op->Dst[0], dst);
1604
// Emits code for TGSI SIN: channel 0 of Src[0] is loaded onto the x87 stack
// and the top of stack is stored to Dst[0] through x87_fstp_dest4().
// NOTE(review): no instruction computing the sine is visible between the
// load and the store; presumably that line was lost from this copy --
// confirm against upstream.
static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1606
x87_fld_src(cp, &op->Src[0], 0);
1608
x87_fstp_dest4(cp, &op->Dst[0]);
1614
// Emits code for TGSI SLT. A writable copy of Src[0] is compared against
// Src[1] with the LessThan predicate, and the resulting mask is ANDed with
// IMM_ONES before being stored to Dst[0].
// vaos_run_linear asserts that element 0 of IMM_ONES is 1.0, so a lane where
// the predicate holds presumably ends up as 1.0 and any other lane as 0.0.
static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1616
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1617
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1618
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
1619
struct x86_reg dst = get_xmm_writable(cp, arg0);
1621
sse_cmpps(cp->func, dst, arg1, cc_LessThan);
1622
sse_andps(cp->func, dst, ones);
1624
store_dest(cp, &op->Dst[0], dst);
1628
// Emits code for TGSI SUB: Dst[0] = Src[0] - Src[1], computed with a packed
// subtract on a writable copy of Src[0].
static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1630
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1631
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1632
struct x86_reg dst = get_xmm_writable(cp, arg0);
1634
sse_subps(cp->func, dst, arg1);
1636
store_dest(cp, &op->Dst[0], dst);
1640
// Emits code for TGSI TRUNC: Src[0] is converted to packed integers with the
// truncating cvttps2dq and converted back to floats with cvtdq2ps in a newly
// acquired XMM register, which is then stored to Dst[0].
// NOTE(review): both conversions are SSE2 emits and no cp->have_sse2 check
// is visible in this function -- confirm that callers guarantee SSE2.
static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1642
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1643
struct x86_reg tmp0 = aos_get_xmm_reg(cp);
1645
sse2_cvttps2dq(cp->func, tmp0, arg0);
1646
sse2_cvtdq2ps(cp->func, tmp0, tmp0);
1648
store_dest(cp, &op->Dst[0], tmp0);
1652
// Emits code for TGSI XPD (cross product of Src[0] and Src[1]) using two
// scratch XMM registers. The per-component result is spelled out in the
// dst[] comments in the body.
static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
1654
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
1655
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
1656
struct x86_reg tmp0 = aos_get_xmm_reg(cp);
1657
struct x86_reg tmp1 = aos_get_xmm_reg(cp);
1659
// tmp1 = arg1.yzxw * arg0
emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
1660
sse_mulps(cp->func, tmp1, arg0);
1661
// tmp0 = arg0.yzxw * arg1
emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));
1662
sse_mulps(cp->func, tmp0, arg1);
1663
// tmp1 = (tmp1 - tmp0), then rotated with the same yzxw shuffle so each
// difference lands in the component listed below.
sse_subps(cp->func, tmp1, tmp0);
1664
sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));
1666
/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
1667
/* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
1668
/* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
1669
/* dst[3] is undef */
1672
// tmp0 is no longer needed; tmp1 carries the result.
aos_release_xmm_reg(cp, tmp0.idx);
1673
store_dest(cp, &op->Dst[0], tmp1);
1680
// Dispatches one TGSI instruction to the emit helper matching its opcode,
// after asserting that the x87 register stack is empty. For the opcodes that
// have a helper, the helper return value is returned directly.
// NOTE(review): the return type line, the statements following the
// commented-out EXP, LOG, LRP and CLAMP calls, the body of the END case and
// any default case are not visible in this copy.
emit_instruction( struct aos_compilation *cp,
1681
struct tgsi_full_instruction *inst )
1683
x87_assert_stack_empty(cp->func);
1685
switch( inst->Instruction.Opcode ) {
1686
case TGSI_OPCODE_MOV:
1687
return emit_MOV( cp, inst );
1689
case TGSI_OPCODE_LIT:
1690
return emit_LIT(cp, inst);
1692
case TGSI_OPCODE_RCP:
1693
return emit_RCP(cp, inst);
1695
case TGSI_OPCODE_RSQ:
1696
return emit_RSQ(cp, inst);
1698
case TGSI_OPCODE_EXP:
1699
/*return emit_EXP(cp, inst);*/
1702
case TGSI_OPCODE_LOG:
1703
/*return emit_LOG(cp, inst);*/
1706
case TGSI_OPCODE_MUL:
1707
return emit_MUL(cp, inst);
1709
case TGSI_OPCODE_ADD:
1710
return emit_ADD(cp, inst);
1712
case TGSI_OPCODE_DP3:
1713
return emit_DP3(cp, inst);
1715
case TGSI_OPCODE_DP4:
1716
return emit_DP4(cp, inst);
1718
case TGSI_OPCODE_DST:
1719
return emit_DST(cp, inst);
1721
case TGSI_OPCODE_MIN:
1722
return emit_MIN(cp, inst);
1724
case TGSI_OPCODE_MAX:
1725
return emit_MAX(cp, inst);
1727
case TGSI_OPCODE_SLT:
1728
return emit_SLT(cp, inst);
1730
case TGSI_OPCODE_SGE:
1731
return emit_SGE(cp, inst);
1733
case TGSI_OPCODE_MAD:
1734
return emit_MAD(cp, inst);
1736
case TGSI_OPCODE_SUB:
1737
return emit_SUB(cp, inst);
1739
case TGSI_OPCODE_LRP:
1740
/*return emit_LERP(cp, inst);*/
1743
case TGSI_OPCODE_FRC:
1744
return emit_FRC(cp, inst);
1746
case TGSI_OPCODE_CLAMP:
1747
/*return emit_CLAMP(cp, inst);*/
1750
case TGSI_OPCODE_FLR:
1751
return emit_FLR(cp, inst);
1753
case TGSI_OPCODE_ROUND:
1754
return emit_RND(cp, inst);
1756
// NOTE(review): EX2 shows two returns (emit_EXPBASE2, then emit_EX2);
// presumably a preprocessor conditional separating them was lost from this
// copy -- confirm which one is compiled.
case TGSI_OPCODE_EX2:
1758
return emit_EXPBASE2(cp, inst);
1760
/* this seems to fail for "larger" exponents.
1761
* See glean tvertProg1's EX2 test.
1763
return emit_EX2(cp, inst);
1768
case TGSI_OPCODE_LG2:
1769
return emit_LG2(cp, inst);
1771
case TGSI_OPCODE_POW:
1772
return emit_POW(cp, inst);
1774
case TGSI_OPCODE_XPD:
1775
return emit_XPD(cp, inst);
1777
case TGSI_OPCODE_ABS:
1778
return emit_ABS(cp, inst);
1780
case TGSI_OPCODE_DPH:
1781
return emit_DPH(cp, inst);
1783
case TGSI_OPCODE_COS:
1784
return emit_COS(cp, inst);
1786
case TGSI_OPCODE_SIN:
1787
return emit_SIN(cp, inst);
1789
case TGSI_OPCODE_TRUNC:
1790
return emit_TRUNC(cp, inst);
1792
case TGSI_OPCODE_END:
1801
// Emits the viewport transform for the position output:
// pos = pos * machine.scale + machine.translate, where scale and translate
// are addressed relative to the machine pointer held in EDX.
// NOTE(review): the remaining arguments of the aos_adopt_xmm_reg() call are
// not visible in this copy.
static boolean emit_viewport( struct aos_compilation *cp )
1803
struct x86_reg pos = aos_get_shader_reg_xmm(cp,
1805
cp->vaos->draw->vs.position_output );
1807
struct x86_reg scale = x86_make_disp(cp->machine_EDX,
1808
Offset(struct aos_machine, scale));
1810
struct x86_reg translate = x86_make_disp(cp->machine_EDX,
1811
Offset(struct aos_machine, translate));
1813
sse_mulps(cp->func, pos, scale);
1814
sse_addps(cp->func, pos, translate);
1816
aos_adopt_xmm_reg( cp,
1819
cp->vaos->draw->vs.position_output,
1825
/* This is useful to be able to see the results on softpipe. Doesn't
1826
* do proper clipping, just assumes the backend can do it during
1827
* rasterization -- for debug only...
1829
// Emits a perspective divide combined with the viewport transform for the
// position output: pos = pos * scale * (1 / pos.w) + translate.
// NOTE(review): the remaining arguments of the aos_adopt_xmm_reg() call are
// not visible in this copy.
static boolean emit_rhw_viewport( struct aos_compilation *cp )
1831
struct x86_reg tmp = aos_get_xmm_reg(cp);
1832
struct x86_reg pos = aos_get_shader_reg_xmm(cp,
1834
cp->vaos->draw->vs.position_output);
1836
struct x86_reg scale = x86_make_disp(cp->machine_EDX,
1837
Offset(struct aos_machine, scale));
1839
struct x86_reg translate = x86_make_disp(cp->machine_EDX,
1840
Offset(struct aos_machine, translate));
1844
// tmp = pos.wwww, scalar reciprocal in the low lane, then broadcast of the
// low lane to all four lanes.
emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
1845
sse2_rcpss(cp->func, tmp, tmp);
1846
sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));
1848
sse_mulps(cp->func, pos, scale);
1849
sse_mulps(cp->func, pos, tmp);
1850
sse_addps(cp->func, pos, translate);
1854
// Presumably replaces the W channel of pos with the reciprocal held in tmp;
// verify against the definition of mask_write().
mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);
1856
aos_adopt_xmm_reg( cp,
1859
cp->vaos->draw->vs.position_output,
1866
// Records one TGSI immediate in the machine immediate table. The slot is
// the current value of cp->num_immediates, which is then incremented, and
// NrTokens - 1 float components are copied from imm->u[].
// NOTE(review): the assert limits the component count to 4, but no bound
// check on the slot index pos is visible here. The declaration of j is
// also missing from this copy.
static boolean note_immediate( struct aos_compilation *cp,
1867
struct tgsi_full_immediate *imm )
1869
unsigned pos = cp->num_immediates++;
1872
assert( imm->Immediate.NrTokens <= 4 + 1 );
1873
for (j = 0; j < imm->Immediate.NrTokens - 1; j++) {
1874
cp->vaos->machine->immediate[pos][j] = imm->u[j].Float;
1884
// Walks the shader tokens once and, for each instruction destination
// register whose file matches, stores the current value of this_instruction
// in cp->output_last_write[] at the register index. Later writes overwrite
// earlier ones, so each entry ends up holding the last matching write.
// NOTE(review): the register file being compared against, the statement
// taken for non-instruction tokens and the update of this_instruction are
// not visible in this copy; the function name suggests the file is the
// output file -- confirm.
static void find_last_write_outputs( struct aos_compilation *cp )
1886
struct tgsi_parse_context parse;
1887
unsigned this_instruction = 0;
1890
tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
1892
while (!tgsi_parse_end_of_tokens( &parse )) {
1894
tgsi_parse_token( &parse );
1896
if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
1899
for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
1900
if (parse.FullToken.FullInstruction.Dst[i].Register.File ==
1903
unsigned idx = parse.FullToken.FullInstruction.Dst[i].Register.Index;
1904
cp->output_last_write[idx] = this_instruction;
1911
tgsi_parse_free( &parse );
1915
// Argument positions passed to x86_fn_arg() when build_vertex_program loads
// the generated function arguments into registers.
// NOTE(review): ARG_COUNT is used by build_vertex_program but its definition
// is not visible in this copy; the gap between 2 and 4 suggests it is 3 --
// confirm.
#define ARG_MACHINE 1
1916
#define ARG_START_ELTS 2
1918
#define ARG_OUTBUF 4
1921
// Generates the x86 code for one flavour of the vertex program into
// variant->func[0] when linear is true, or variant->func[1] otherwise.
// The emitted function saves EBX, ESI and EBP, loads its arguments into
// registers, skips all work when the count is zero, and otherwise runs a
// loop that fetches inputs, executes the translated shader, optionally
// applies a viewport transform, emits outputs and decrements the count.
// NOTE(review): many lines of this function are missing from this copy,
// including the second parameter (used below as linear), the declaration
// of i, the failure branches and the return statements.
static boolean build_vertex_program( struct draw_vs_variant_aos_sse *variant,
1924
struct tgsi_parse_context parse;
1925
struct aos_compilation cp;
1926
unsigned fixup, label;
1930
tgsi_parse_init( &parse, variant->base.vs->state.tokens );
1932
memset(&cp, 0, sizeof(cp));
1934
cp.insn_counter = 1;
1937
cp.func = &variant->func[ linear ? 0 : 1 ];
1939
// Fixed register assignment used by all emitted code.
cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX);
1940
cp.idx_EBX = x86_make_reg(file_REG32, reg_BX);
1941
cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
1942
cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
1943
cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
1944
cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
1945
cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );
1947
x86_init_func(cp.func);
1949
find_last_write_outputs(&cp);
1951
// Prologue: save EBX, ESI and EBP; they are popped in reverse order at the
// end of the emitted function.
x86_push(cp.func, cp.idx_EBX);
1952
x86_push(cp.func, cp.count_ESI);
1953
x86_push(cp.func, cp.temp_EBP);
1956
/* Load arguments into regs:
1958
x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
1959
x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
1960
x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
1961
x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));
1964
/* Compare count to zero and possibly bail.
1966
// EAX is zeroed and compared with the count; the forward jump taken on
// equality is patched by x86_fixup_fwd_jump() near the end.
x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
1967
x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
1968
fixup = x86_jcc_forward(cp.func, cc_E);
1971
save_fpu_state( &cp );
1972
set_fpu_round_nearest( &cp );
1974
aos_init_inputs( &cp, linear );
1979
/* Note address for loop jump
1981
label = x86_get_label(cp.func);
1983
/* Fetch inputs... TODO: fetch lazily...
1985
if (!aos_fetch_inputs( &cp, linear ))
1990
// Translate the shader token stream until it ends or cp.error is set.
while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
1992
tgsi_parse_token( &parse );
1994
switch (parse.FullToken.Token.Type) {
1995
case TGSI_TOKEN_TYPE_IMMEDIATE:
1997
if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
2002
case TGSI_TOKEN_TYPE_INSTRUCTION:
2004
tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );
2006
if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
2011
x87_assert_stack_empty(cp.func);
2021
// Forget every tracked XMM register that does not hold an output.
for (i = 0; i < 8; i++) {
2022
if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
2023
cp.xmm[i].file = TGSI_FILE_NULL;
2024
cp.xmm[i].dirty = 0;
2032
if (cp.vaos->base.key.clip) {
2033
/* not really handling clipping, just do the rhw so we can
2034
* see the results...
2036
emit_rhw_viewport(&cp);
2038
else if (cp.vaos->base.key.viewport) {
2042
/* Emit output... TODO: do this eagerly after the last write to a
2045
if (!aos_emit_outputs( &cp ))
2053
// NOTE(review): the start of this statement is missing; the visible
// operand is outbuf displaced by the output stride, presumably used to
// advance the output pointer to the next vertex -- confirm.
x86_make_disp(cp.outbuf_ECX,
2054
cp.vaos->base.key.output_stride));
2058
aos_incr_inputs( &cp, linear );
2060
/* decr count, loop if not zero
2062
x86_dec(cp.func, cp.count_ESI);
2063
x86_jcc(cp.func, cc_NZ, label);
2065
restore_fpu_state(&cp);
2067
/* Land forward jump here:
2069
x86_fixup_fwd_jump(cp.func, fixup);
2073
if (cp.func->need_emms)
2076
// Epilogue: restore the registers saved in the prologue.
x86_pop(cp.func, cp.temp_EBP);
2077
x86_pop(cp.func, cp.count_ESI);
2078
x86_pop(cp.func, cp.idx_EBX);
2080
x87_assert_stack_empty(cp.func);
2083
// NOTE(review): tgsi_parse_free() appears twice; presumably once on the
// success path and once on a failure path whose label and returns are
// missing from this copy.
tgsi_parse_free( &parse );
2087
tgsi_parse_free( &parse );
2093
// Casts a generic draw_vs_variant pointer to the AOS SSE variant type.
// No type check is performed on the incoming pointer.
static INLINE struct draw_vs_variant_aos_sse *
2094
draw_vs_variant_aos_sse(struct draw_vs_variant *variant)
2096
return (struct draw_vs_variant_aos_sse *) variant;
2100
// Records the base pointer and stride of vertex buffer number buf in the
// variant buffer table. Indices at or beyond vaos->nr_vb are ignored.
// NOTE(review): the parameters between variant and max_stride (used below
// as buf, ptr and stride) are missing from this copy, and max_stride is not
// referenced by any visible line.
static void vaos_set_buffer( struct draw_vs_variant *variant,
2104
unsigned max_stride)
2106
struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
2108
if (buf < vaos->nr_vb) {
2109
vaos->buffer[buf].base_ptr = (char *)ptr;
2110
vaos->buffer[buf].stride = stride;
2113
// Debug trace, compiled out by the constant-false condition.
if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
2118
// Runs the generated indexed (elts) vertex program. Before the call the
// shared aos_machine is refreshed with the current point size, the aligned
// constant buffer pointers, the shader immediates and this variant buffer
// table.
// NOTE(review): the count parameter, the declaration of i and the remaining
// arguments of the gen_run_elts() call are missing from this copy.
static void PIPE_CDECL vaos_run_elts( struct draw_vs_variant *variant,
2119
const unsigned *elts,
2121
void *output_buffer )
2123
struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
2124
struct aos_machine *machine = vaos->draw->vs.aos_machine;
2127
if (0) debug_printf("%s %d\n", __FUNCTION__, count);
2129
machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
2130
for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
2131
machine->constants[i] = vaos->draw->vs.aligned_constants[i];
2133
machine->immediates = vaos->base.vs->immediates;
2134
machine->buffer = vaos->buffer;
2136
vaos->gen_run_elts( machine,
2142
// Runs the generated linear (non-indexed) vertex program. Before the call
// the shared aos_machine is refreshed with the current point size, the
// aligned constant buffer pointers, the shader immediates and this variant
// buffer table. Afterwards three internal constants are spot checked.
// NOTE(review): the start and count parameters, the declaration of i and
// the remaining arguments of the gen_run_linear() call are missing from
// this copy.
static void PIPE_CDECL vaos_run_linear( struct draw_vs_variant *variant,
2145
void *output_buffer )
2147
struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
2148
struct aos_machine *machine = vaos->draw->vs.aos_machine;
2151
if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count,
2152
vaos->base.key.const_vbuffers);
2154
machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
2155
for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
2156
machine->constants[i] = vaos->draw->vs.aligned_constants[i];
2158
machine->immediates = vaos->base.vs->immediates;
2159
machine->buffer = vaos->buffer;
2161
vaos->gen_run_linear( machine,
2166
/* Sanity spot checks to make sure we didn't trash our constants */
2167
assert(machine->internal[IMM_ONES][0] == 1.0f);
2168
assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
2169
assert(machine->internal[IMM_NEGS][0] == -1.0f);
2174
// Tears down an AOS SSE variant: frees the buffer table and releases both
// generated functions (func[0] and func[1]).
// NOTE(review): no statement freeing the variant structure itself is
// visible in this copy -- confirm against upstream.
static void vaos_destroy( struct draw_vs_variant *variant )
2176
struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
2178
FREE( vaos->buffer );
2180
x86_release_func( &vaos->func[0] );
2181
x86_release_func( &vaos->func[1] );
2188
// Creates an AOS SSE variant for the given shader and key: allocates the
// variant, fills in its callbacks, sizes the vertex buffer table from the
// key, builds both generated programs and fetches their entry points.
// NOTE(review): the allocation checks, the statements taken when a build or
// lookup fails, the failure label and the return statements are missing
// from this copy, as is the declaration of i.
static struct draw_vs_variant *variant_aos_sse( struct draw_vertex_shader *vs,
2189
const struct draw_vs_variant_key *key )
2192
struct draw_vs_variant_aos_sse *vaos = CALLOC_STRUCT(draw_vs_variant_aos_sse);
2197
vaos->base.key = *key;
2199
vaos->base.set_buffer = vaos_set_buffer;
2200
vaos->base.destroy = vaos_destroy;
2201
vaos->base.run_linear = vaos_run_linear;
2202
vaos->base.run_elts = vaos_run_elts;
2204
vaos->draw = vs->draw;
2206
// nr_vb becomes one more than the highest buffer index used by any input
// element.
for (i = 0; i < key->nr_inputs; i++)
2207
vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
2209
vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
2214
debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
2217
tgsi_dump(vs->state.tokens, 0);
2220
// The linear program is generated into func[0], the elts program into
// func[1].
if (!build_vertex_program( vaos, TRUE ))
2223
if (!build_vertex_program( vaos, FALSE ))
2226
vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
2227
if (!vaos->gen_run_linear)
2230
vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
2231
if (!vaos->gen_run_elts)
2237
// Presumably the failure path: the buffer table and both generated
// functions are released here -- confirm once the missing lines are
// restored.
if (vaos && vaos->buffer)
2241
x86_release_func( &vaos->func[0] );
2244
x86_release_func( &vaos->func[1] );
2252
// Public entry point: tries to build an AOS SSE variant for the shader and
// key, and falls back to draw_vs_create_variant_generic() when that yields
// NULL.
// NOTE(review): the closing of the fallback branch and the return statement
// are missing from this copy.
struct draw_vs_variant *
2253
draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs,
2254
const struct draw_vs_variant_key *key )
2256
struct draw_vs_variant *variant = variant_aos_sse( vs, key );
2258
if (variant == NULL) {
2259
variant = draw_vs_create_variant_generic( vs, key );
2267
#endif /* PIPE_ARCH_X86 */