Commit d56ea8d9 by Richard Henderson

ffi64.c (ffi_prep_cif_machdep): Save sse-used flag in bit 11 of flags.

        * src/x86/ffi64.c (ffi_prep_cif_machdep): Save sse-used flag in
        bit 11 of flags.
        (ffi_call): Mask return type field.  Pass ssecount to ffi_call_unix64.
        (ffi_prep_closure): Set carry bit if sse-used flag set.
        * src/x86/unix64.S (ffi_call_unix64): Add ssecount argument.
        Only load sse registers if ssecount non-zero.
        (ffi_closure_unix64): Only save sse registers if carry set on entry.

From-SVN: r99257
parent 08cce8fe
2005-05-29  Ralf Corsepius  <ralf.corsepius@rtems.org>

2005-05-04  Andreas Degert  <ad@papyrus-gmbh.de>
	    Richard Henderson  <rth@redhat.com>
* src/x86/ffi64.c (ffi_prep_cif_machdep): Save sse-used flag in
bit 11 of flags.
(ffi_call): Mask return type field. Pass ssecount to ffi_call_unix64.
(ffi_prep_closure): Set carry bit if sse-used flag set.
* src/x86/unix64.S (ffi_call_unix64): Add ssecount argument.
Only load sse registers if ssecount non-zero.
(ffi_closure_unix64): Only save sse registers if carry set on entry.
2005-04-29 Ralf Corsepius <ralf.corsepius@rtems.org>
	* configure.ac: Add i*86-*-rtems*, sparc*-*-rtems*,
	powerpc-*rtems*, arm*-*-rtems*, sh-*-rtems*.
......
@@ -42,7 +42,7 @@ struct register_args
 };
extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags, extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
void *raddr, void (*fnaddr)()); void *raddr, void (*fnaddr)(), unsigned ssecount);
/* All reference to register classes here is identical to the code in /* All reference to register classes here is identical to the code in
gcc/config/i386/i386.c. Do *not* change one without the other. */ gcc/config/i386/i386.c. Do *not* change one without the other. */
@@ -303,10 +303,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
else if (sse0 && sse1) else if (sse0 && sse1)
flags |= 1 << 10; flags |= 1 << 10;
/* Mark the true size of the structure. */ /* Mark the true size of the structure. */
flags |= cif->rtype->size << 11; flags |= cif->rtype->size << 12;
} }
} }
cif->flags = flags;
/* Go over all arguments and determine the way they should be passed. /* Go over all arguments and determine the way they should be passed.
If it's in a register and there is space for it, let that be so. If If it's in a register and there is space for it, let that be so. If
...@@ -331,6 +330,9 @@ ffi_prep_cif_machdep (ffi_cif *cif) ...@@ -331,6 +330,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
ssecount += nsse; ssecount += nsse;
} }
} }
if (ssecount)
flags |= 1 << 11;
cif->flags = flags;
cif->bytes = bytes; cif->bytes = bytes;
return FFI_OK; return FFI_OK;
...@@ -353,7 +355,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue) ...@@ -353,7 +355,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue)
address then we need to make one. Note the setting of flags to address then we need to make one. Note the setting of flags to
VOID above in ffi_prep_cif_machdep. */ VOID above in ffi_prep_cif_machdep. */
ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
&& cif->flags == FFI_TYPE_VOID); && (cif->flags & 0xff) == FFI_TYPE_VOID);
if (rvalue == NULL && ret_in_memory) if (rvalue == NULL && ret_in_memory)
rvalue = alloca (cif->rtype->size); rvalue = alloca (cif->rtype->size);
...@@ -424,7 +426,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue) ...@@ -424,7 +426,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue)
} }
ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args), ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
cif->flags, rvalue, fn); cif->flags, rvalue, fn, ssecount);
} }
...@@ -439,13 +441,18 @@ ffi_prep_closure (ffi_closure* closure, ...@@ -439,13 +441,18 @@ ffi_prep_closure (ffi_closure* closure,
volatile unsigned short *tramp; volatile unsigned short *tramp;
tramp = (volatile unsigned short *) &closure->tramp[0]; tramp = (volatile unsigned short *) &closure->tramp[0];
tramp[0] = 0xbb49; /* mov <code>, %r11 */ tramp[0] = 0xbb49; /* mov <code>, %r11 */
tramp[5] = 0xba49; /* mov <data>, %r10 */
tramp[10] = 0xff49; /* jmp *%r11 */
tramp[11] = 0x00e3;
*(void * volatile *) &tramp[1] = ffi_closure_unix64; *(void * volatile *) &tramp[1] = ffi_closure_unix64;
tramp[5] = 0xba49; /* mov <data>, %r10 */
*(void * volatile *) &tramp[6] = closure; *(void * volatile *) &tramp[6] = closure;
/* Set the carry bit iff the function uses any sse registers.
This is clc or stc, together with the first byte of the jmp. */
tramp[10] = cif->flags & (1 << 11) ? 0x49f9 : 0x49f8;
tramp[11] = 0xe3ff; /* jmp *%r11 */
closure->cif = cif; closure->cif = cif;
closure->fun = fun; closure->fun = fun;
closure->user_data = user_data; closure->user_data = user_data;
......
...@@ -53,6 +53,7 @@ ffi_call_unix64: ...@@ -53,6 +53,7 @@ ffi_call_unix64:
.LUW1: .LUW1:
movq %rdi, %r10 /* Save a copy of the register area. */ movq %rdi, %r10 /* Save a copy of the register area. */
movq %r8, %r11 /* Save a copy of the target fn. */ movq %r8, %r11 /* Save a copy of the target fn. */
movl %r9d, %eax /* Set number of SSE registers. */
/* Load up all argument registers. */ /* Load up all argument registers. */
movq (%r10), %rdi movq (%r10), %rdi
...@@ -61,14 +62,9 @@ ffi_call_unix64: ...@@ -61,14 +62,9 @@ ffi_call_unix64:
movq 24(%r10), %rcx movq 24(%r10), %rcx
movq 32(%r10), %r8 movq 32(%r10), %r8
movq 40(%r10), %r9 movq 40(%r10), %r9
movdqa 48(%r10), %xmm0 testl %eax, %eax
movdqa 64(%r10), %xmm1 jnz .Lload_sse
movdqa 80(%r10), %xmm2 .Lret_from_load_sse:
movdqa 96(%r10), %xmm3
movdqa 112(%r10), %xmm4
movdqa 128(%r10), %xmm5
movdqa 144(%r10), %xmm6
movdqa 160(%r10), %xmm7
/* Deallocate the reg arg area. */ /* Deallocate the reg arg area. */
leaq 176(%r10), %rsp leaq 176(%r10), %rsp
...@@ -181,12 +177,29 @@ ffi_call_unix64: ...@@ -181,12 +177,29 @@ ffi_call_unix64:
movq %rax, (%rsi) movq %rax, (%rsi)
movq %rdx, 8(%rsi) movq %rdx, 8(%rsi)
/* Bits 11-31 contain the true size of the structure. Copy from /* Bits 12-31 contain the true size of the structure. Copy from
the scratch area to the true destination. */ the scratch area to the true destination. */
shrl $11, %ecx shrl $12, %ecx
rep movsb rep movsb
ret ret
/* Many times we can avoid loading any SSE registers at all.
It's not worth an indirect jump to load the exact set of
SSE registers needed; zero or all is a good compromise. */
.align 2
.LUW3: .LUW3:
.Lload_sse:
movdqa 48(%r10), %xmm0
movdqa 64(%r10), %xmm1
movdqa 80(%r10), %xmm2
movdqa 96(%r10), %xmm3
movdqa 112(%r10), %xmm4
movdqa 128(%r10), %xmm5
movdqa 144(%r10), %xmm6
movdqa 160(%r10), %xmm7
jmp .Lret_from_load_sse
.LUW4:
.size ffi_call_unix64,.-ffi_call_unix64 .size ffi_call_unix64,.-ffi_call_unix64
.align 2 .align 2
...@@ -194,24 +207,19 @@ ffi_call_unix64: ...@@ -194,24 +207,19 @@ ffi_call_unix64:
.type ffi_closure_unix64,@function .type ffi_closure_unix64,@function
ffi_closure_unix64: ffi_closure_unix64:
.LUW4:
subq $200, %rsp
.LUW5: .LUW5:
/* The carry flag is set by the trampoline iff SSE registers
are used. Don't clobber it before the branch instruction. */
leaq -200(%rsp), %rsp
.LUW6:
movq %rdi, (%rsp) movq %rdi, (%rsp)
movq %rsi, 8(%rsp) movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp) movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp) movq %rcx, 24(%rsp)
movq %r8, 32(%rsp) movq %r8, 32(%rsp)
movq %r9, 40(%rsp) movq %r9, 40(%rsp)
movdqa %xmm0, 48(%rsp) jc .Lsave_sse
movdqa %xmm1, 64(%rsp) .Lret_from_save_sse:
movdqa %xmm2, 80(%rsp)
movdqa %xmm3, 96(%rsp)
movdqa %xmm4, 112(%rsp)
movdqa %xmm5, 128(%rsp)
movdqa %xmm6, 144(%rsp)
movdqa %xmm7, 160(%rsp)
movq %r10, %rdi movq %r10, %rdi
leaq 176(%rsp), %rsi leaq 176(%rsp), %rsi
...@@ -221,7 +229,7 @@ ffi_closure_unix64: ...@@ -221,7 +229,7 @@ ffi_closure_unix64:
/* Deallocate stack frame early; return value is now in redzone. */ /* Deallocate stack frame early; return value is now in redzone. */
addq $200, %rsp addq $200, %rsp
.LUW6: .LUW7:
/* The first byte of the return value contains the FFI_TYPE. */ /* The first byte of the return value contains the FFI_TYPE. */
movzbl %al, %r10d movzbl %al, %r10d
...@@ -300,7 +308,22 @@ ffi_closure_unix64: ...@@ -300,7 +308,22 @@ ffi_closure_unix64:
movq -24(%rsp), %rax movq -24(%rsp), %rax
cmovnz %rdx, %rax cmovnz %rdx, %rax
ret ret
.LUW7:
/* See the comment above .Lload_sse; the same logic applies here. */
.align 2
.LUW8:
.Lsave_sse:
movdqa %xmm0, 48(%rsp)
movdqa %xmm1, 64(%rsp)
movdqa %xmm2, 80(%rsp)
movdqa %xmm3, 96(%rsp)
movdqa %xmm4, 112(%rsp)
movdqa %xmm5, 128(%rsp)
movdqa %xmm6, 144(%rsp)
movdqa %xmm7, 160(%rsp)
jmp .Lret_from_save_sse
.LUW9:
.size ffi_closure_unix64,.-ffi_closure_unix64 .size ffi_closure_unix64,.-ffi_closure_unix64
.section .eh_frame,"a",@progbits .section .eh_frame,"a",@progbits
...@@ -327,7 +350,7 @@ ffi_closure_unix64: ...@@ -327,7 +350,7 @@ ffi_closure_unix64:
.LASFDE1: .LASFDE1:
.long .LASFDE1-.Lframe1 /* FDE CIE offset */ .long .LASFDE1-.Lframe1 /* FDE CIE offset */
.long .LUW0-. /* FDE initial location */ .long .LUW0-. /* FDE initial location */
.long .LUW3-.LUW0 /* FDE address range */ .long .LUW4-.LUW0 /* FDE address range */
.uleb128 0x0 /* Augmentation size */ .uleb128 0x0 /* Augmentation size */
.byte 0x4 /* DW_CFA_advance_loc4 */ .byte 0x4 /* DW_CFA_advance_loc4 */
...@@ -345,6 +368,7 @@ ffi_closure_unix64: ...@@ -345,6 +368,7 @@ ffi_closure_unix64:
.uleb128 32 .uleb128 32
.byte 0x80+6 /* DW_CFA_offset, %rbp offset 2*-8 */ .byte 0x80+6 /* DW_CFA_offset, %rbp offset 2*-8 */
.uleb128 2 .uleb128 2
.byte 0xa /* DW_CFA_remember_state */
.byte 0x4 /* DW_CFA_advance_loc4 */ .byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW2-.LUW1 .long .LUW2-.LUW1
...@@ -352,23 +376,36 @@ ffi_closure_unix64: ...@@ -352,23 +376,36 @@ ffi_closure_unix64:
.uleb128 7 .uleb128 7
.uleb128 8 .uleb128 8
.byte 0xc0+6 /* DW_CFA_restore, %rbp */ .byte 0xc0+6 /* DW_CFA_restore, %rbp */
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW3-.LUW2
.byte 0xb /* DW_CFA_restore_state */
.align 8 .align 8
.LEFDE1: .LEFDE1:
.LSFDE3: .LSFDE3:
.long .LEFDE3-.LASFDE3 /* FDE Length */ .long .LEFDE3-.LASFDE3 /* FDE Length */
.LASFDE3: .LASFDE3:
.long .LASFDE3-.Lframe1 /* FDE CIE offset */ .long .LASFDE3-.Lframe1 /* FDE CIE offset */
.long .LUW4-. /* FDE initial location */ .long .LUW5-. /* FDE initial location */
.long .LUW7-.LUW4 /* FDE address range */ .long .LUW9-.LUW5 /* FDE address range */
.uleb128 0x0 /* Augmentation size */ .uleb128 0x0 /* Augmentation size */
.byte 0x4 /* DW_CFA_advance_loc4 */ .byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW5-.LUW4 .long .LUW6-.LUW5
.byte 0xe /* DW_CFA_def_cfa_offset */ .byte 0xe /* DW_CFA_def_cfa_offset */
.uleb128 208 .uleb128 208
.byte 0xa /* DW_CFA_remember_state */
.byte 0x4 /* DW_CFA_advance_loc4 */ .byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW6-.LUW5 .long .LUW7-.LUW6
.byte 0xe /* DW_CFA_def_cfa_offset */ .byte 0xe /* DW_CFA_def_cfa_offset */
.uleb128 8 .uleb128 8
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW8-.LUW7
.byte 0xb /* DW_CFA_restore_state */
.align 8 .align 8
.LEFDE3: .LEFDE3:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment