Commit d56ea8d9 by Richard Henderson

ffi64.c (ffi_prep_cif_machdep): Save sse-used flag in bit 11 of flags.

        * src/x86/ffi64.c (ffi_prep_cif_machdep): Save sse-used flag in
        bit 11 of flags.
        (ffi_call): Mask return type field.  Pass ssecount to ffi_call_unix64.
        (ffi_prep_closure): Set carry bit if sse-used flag set.
        * src/x86/unix64.S (ffi_call_unix64): Add ssecount argument.
        Only load sse registers if ssecount non-zero.
        (ffi_closure_unix64): Only save sse registers if carry set on entry.

From-SVN: r99257
parent 08cce8fe
2005-05-29 Ralf Corsepius <ralf.corsepius@rtems.org>
2005-05-04 Andreas Degert <ad@papyrus-gmbh.de>
Richard Henderson <rth@redhat.com>
* src/x86/ffi64.c (ffi_prep_cif_machdep): Save sse-used flag in
bit 11 of flags.
(ffi_call): Mask return type field. Pass ssecount to ffi_call_unix64.
(ffi_prep_closure): Set carry bit if sse-used flag set.
* src/x86/unix64.S (ffi_call_unix64): Add ssecount argument.
Only load sse registers if ssecount non-zero.
(ffi_closure_unix64): Only save sse registers if carry set on entry.
2005-04-29 Ralf Corsepius <ralf.corsepius@rtems.org>
* configure.ac: Add i*86-*-rtems*, sparc*-*-rtems*,
powerpc-*rtems*, arm*-*-rtems*, sh-*-rtems*.
......
...@@ -42,7 +42,7 @@ struct register_args ...@@ -42,7 +42,7 @@ struct register_args
}; };
extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags, extern void ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
void *raddr, void (*fnaddr)()); void *raddr, void (*fnaddr)(), unsigned ssecount);
/* All reference to register classes here is identical to the code in /* All reference to register classes here is identical to the code in
gcc/config/i386/i386.c. Do *not* change one without the other. */ gcc/config/i386/i386.c. Do *not* change one without the other. */
...@@ -303,10 +303,9 @@ ffi_prep_cif_machdep (ffi_cif *cif) ...@@ -303,10 +303,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
else if (sse0 && sse1) else if (sse0 && sse1)
flags |= 1 << 10; flags |= 1 << 10;
/* Mark the true size of the structure. */ /* Mark the true size of the structure. */
flags |= cif->rtype->size << 11; flags |= cif->rtype->size << 12;
} }
} }
cif->flags = flags;
/* Go over all arguments and determine the way they should be passed. /* Go over all arguments and determine the way they should be passed.
If it's in a register and there is space for it, let that be so. If If it's in a register and there is space for it, let that be so. If
...@@ -331,6 +330,9 @@ ffi_prep_cif_machdep (ffi_cif *cif) ...@@ -331,6 +330,9 @@ ffi_prep_cif_machdep (ffi_cif *cif)
ssecount += nsse; ssecount += nsse;
} }
} }
if (ssecount)
flags |= 1 << 11;
cif->flags = flags;
cif->bytes = bytes; cif->bytes = bytes;
return FFI_OK; return FFI_OK;
...@@ -353,7 +355,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue) ...@@ -353,7 +355,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue)
address then we need to make one. Note the setting of flags to address then we need to make one. Note the setting of flags to
VOID above in ffi_prep_cif_machdep. */ VOID above in ffi_prep_cif_machdep. */
ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT ret_in_memory = (cif->rtype->type == FFI_TYPE_STRUCT
&& cif->flags == FFI_TYPE_VOID); && (cif->flags & 0xff) == FFI_TYPE_VOID);
if (rvalue == NULL && ret_in_memory) if (rvalue == NULL && ret_in_memory)
rvalue = alloca (cif->rtype->size); rvalue = alloca (cif->rtype->size);
...@@ -424,7 +426,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue) ...@@ -424,7 +426,7 @@ ffi_call (ffi_cif *cif, void (*fn)(), void *rvalue, void **avalue)
} }
ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args), ffi_call_unix64 (stack, cif->bytes + sizeof (struct register_args),
cif->flags, rvalue, fn); cif->flags, rvalue, fn, ssecount);
} }
...@@ -439,13 +441,18 @@ ffi_prep_closure (ffi_closure* closure, ...@@ -439,13 +441,18 @@ ffi_prep_closure (ffi_closure* closure,
volatile unsigned short *tramp; volatile unsigned short *tramp;
tramp = (volatile unsigned short *) &closure->tramp[0]; tramp = (volatile unsigned short *) &closure->tramp[0];
tramp[0] = 0xbb49; /* mov <code>, %r11 */ tramp[0] = 0xbb49; /* mov <code>, %r11 */
tramp[5] = 0xba49; /* mov <data>, %r10 */
tramp[10] = 0xff49; /* jmp *%r11 */
tramp[11] = 0x00e3;
*(void * volatile *) &tramp[1] = ffi_closure_unix64; *(void * volatile *) &tramp[1] = ffi_closure_unix64;
tramp[5] = 0xba49; /* mov <data>, %r10 */
*(void * volatile *) &tramp[6] = closure; *(void * volatile *) &tramp[6] = closure;
/* Set the carry bit iff the function uses any sse registers.
This is clc or stc, together with the first byte of the jmp. */
tramp[10] = cif->flags & (1 << 11) ? 0x49f9 : 0x49f8;
tramp[11] = 0xe3ff; /* jmp *%r11 */
closure->cif = cif; closure->cif = cif;
closure->fun = fun; closure->fun = fun;
closure->user_data = user_data; closure->user_data = user_data;
......
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
.text .text
/* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags, /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
void *raddr, void (*fnaddr)()); void *raddr, void (*fnaddr)());
Bit o trickiness here -- ARGS+BYTES is the base of the stack frame Bit o trickiness here -- ARGS+BYTES is the base of the stack frame
for this function. This has been allocated by ffi_call. We also for this function. This has been allocated by ffi_call. We also
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
.align 2 .align 2
.globl ffi_call_unix64 .globl ffi_call_unix64
.type ffi_call_unix64,@function .type ffi_call_unix64,@function
ffi_call_unix64: ffi_call_unix64:
.LUW0: .LUW0:
...@@ -53,6 +53,7 @@ ffi_call_unix64: ...@@ -53,6 +53,7 @@ ffi_call_unix64:
.LUW1: .LUW1:
movq %rdi, %r10 /* Save a copy of the register area. */ movq %rdi, %r10 /* Save a copy of the register area. */
movq %r8, %r11 /* Save a copy of the target fn. */ movq %r8, %r11 /* Save a copy of the target fn. */
movl %r9d, %eax /* Set number of SSE registers. */
/* Load up all argument registers. */ /* Load up all argument registers. */
movq (%r10), %rdi movq (%r10), %rdi
...@@ -61,14 +62,9 @@ ffi_call_unix64: ...@@ -61,14 +62,9 @@ ffi_call_unix64:
movq 24(%r10), %rcx movq 24(%r10), %rcx
movq 32(%r10), %r8 movq 32(%r10), %r8
movq 40(%r10), %r9 movq 40(%r10), %r9
movdqa 48(%r10), %xmm0 testl %eax, %eax
movdqa 64(%r10), %xmm1 jnz .Lload_sse
movdqa 80(%r10), %xmm2 .Lret_from_load_sse:
movdqa 96(%r10), %xmm3
movdqa 112(%r10), %xmm4
movdqa 128(%r10), %xmm5
movdqa 144(%r10), %xmm6
movdqa 160(%r10), %xmm7
/* Deallocate the reg arg area. */ /* Deallocate the reg arg area. */
leaq 176(%r10), %rsp leaq 176(%r10), %rsp
...@@ -181,37 +177,49 @@ ffi_call_unix64: ...@@ -181,37 +177,49 @@ ffi_call_unix64:
movq %rax, (%rsi) movq %rax, (%rsi)
movq %rdx, 8(%rsi) movq %rdx, 8(%rsi)
/* Bits 11-31 contain the true size of the structure. Copy from /* Bits 12-31 contain the true size of the structure. Copy from
the scratch area to the true destination. */ the scratch area to the true destination. */
shrl $11, %ecx shrl $12, %ecx
rep movsb rep movsb
ret ret
/* Many times we can avoid loading any SSE registers at all.
It's not worth an indirect jump to load the exact set of
SSE registers needed; zero or all is a good compromise. */
.align 2
.LUW3: .LUW3:
.Lload_sse:
movdqa 48(%r10), %xmm0
movdqa 64(%r10), %xmm1
movdqa 80(%r10), %xmm2
movdqa 96(%r10), %xmm3
movdqa 112(%r10), %xmm4
movdqa 128(%r10), %xmm5
movdqa 144(%r10), %xmm6
movdqa 160(%r10), %xmm7
jmp .Lret_from_load_sse
.LUW4:
.size ffi_call_unix64,.-ffi_call_unix64 .size ffi_call_unix64,.-ffi_call_unix64
.align 2 .align 2
.globl ffi_closure_unix64 .globl ffi_closure_unix64
.type ffi_closure_unix64,@function .type ffi_closure_unix64,@function
ffi_closure_unix64: ffi_closure_unix64:
.LUW4:
subq $200, %rsp
.LUW5: .LUW5:
/* The carry flag is set by the trampoline iff SSE registers
are used. Don't clobber it before the branch instruction. */
leaq -200(%rsp), %rsp
.LUW6:
movq %rdi, (%rsp) movq %rdi, (%rsp)
movq %rsi, 8(%rsp) movq %rsi, 8(%rsp)
movq %rdx, 16(%rsp) movq %rdx, 16(%rsp)
movq %rcx, 24(%rsp) movq %rcx, 24(%rsp)
movq %r8, 32(%rsp) movq %r8, 32(%rsp)
movq %r9, 40(%rsp) movq %r9, 40(%rsp)
movdqa %xmm0, 48(%rsp) jc .Lsave_sse
movdqa %xmm1, 64(%rsp) .Lret_from_save_sse:
movdqa %xmm2, 80(%rsp)
movdqa %xmm3, 96(%rsp)
movdqa %xmm4, 112(%rsp)
movdqa %xmm5, 128(%rsp)
movdqa %xmm6, 144(%rsp)
movdqa %xmm7, 160(%rsp)
movq %r10, %rdi movq %r10, %rdi
leaq 176(%rsp), %rsi leaq 176(%rsp), %rsi
...@@ -221,7 +229,7 @@ ffi_closure_unix64: ...@@ -221,7 +229,7 @@ ffi_closure_unix64:
/* Deallocate stack frame early; return value is now in redzone. */ /* Deallocate stack frame early; return value is now in redzone. */
addq $200, %rsp addq $200, %rsp
.LUW6: .LUW7:
/* The first byte of the return value contains the FFI_TYPE. */ /* The first byte of the return value contains the FFI_TYPE. */
movzbl %al, %r10d movzbl %al, %r10d
...@@ -300,7 +308,22 @@ ffi_closure_unix64: ...@@ -300,7 +308,22 @@ ffi_closure_unix64:
movq -24(%rsp), %rax movq -24(%rsp), %rax
cmovnz %rdx, %rax cmovnz %rdx, %rax
ret ret
.LUW7:
/* See the comment above .Lload_sse; the same logic applies here. */
.align 2
.LUW8:
.Lsave_sse:
movdqa %xmm0, 48(%rsp)
movdqa %xmm1, 64(%rsp)
movdqa %xmm2, 80(%rsp)
movdqa %xmm3, 96(%rsp)
movdqa %xmm4, 112(%rsp)
movdqa %xmm5, 128(%rsp)
movdqa %xmm6, 144(%rsp)
movdqa %xmm7, 160(%rsp)
jmp .Lret_from_save_sse
.LUW9:
.size ffi_closure_unix64,.-ffi_closure_unix64 .size ffi_closure_unix64,.-ffi_closure_unix64
.section .eh_frame,"a",@progbits .section .eh_frame,"a",@progbits
...@@ -327,24 +350,25 @@ ffi_closure_unix64: ...@@ -327,24 +350,25 @@ ffi_closure_unix64:
.LASFDE1: .LASFDE1:
.long .LASFDE1-.Lframe1 /* FDE CIE offset */ .long .LASFDE1-.Lframe1 /* FDE CIE offset */
.long .LUW0-. /* FDE initial location */ .long .LUW0-. /* FDE initial location */
.long .LUW3-.LUW0 /* FDE address range */ .long .LUW4-.LUW0 /* FDE address range */
.uleb128 0x0 /* Augmentation size */ .uleb128 0x0 /* Augmentation size */
.byte 0x4 /* DW_CFA_advance_loc4 */ .byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW1-.LUW0 .long .LUW1-.LUW0
/* New stack frame based off rbp. This is a itty bit of unwind /* New stack frame based off rbp. This is a itty bit of unwind
trickery in that the CFA *has* changed. There is no easy way trickery in that the CFA *has* changed. There is no easy way
to describe it correctly on entry to the function. Fortunately, to describe it correctly on entry to the function. Fortunately,
it doesn't matter too much since at all points we can correctly it doesn't matter too much since at all points we can correctly
unwind back to ffi_call. Note that the location to which we unwind back to ffi_call. Note that the location to which we
moved the return address is (the new) CFA-8, so from the moved the return address is (the new) CFA-8, so from the
perspective of the unwind info, it hasn't moved. */ perspective of the unwind info, it hasn't moved. */
.byte 0xc /* DW_CFA_def_cfa, %rbp offset 32 */ .byte 0xc /* DW_CFA_def_cfa, %rbp offset 32 */
.uleb128 6 .uleb128 6
.uleb128 32 .uleb128 32
.byte 0x80+6 /* DW_CFA_offset, %rbp offset 2*-8 */ .byte 0x80+6 /* DW_CFA_offset, %rbp offset 2*-8 */
.uleb128 2 .uleb128 2
.byte 0xa /* DW_CFA_remember_state */
.byte 0x4 /* DW_CFA_advance_loc4 */ .byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW2-.LUW1 .long .LUW2-.LUW1
...@@ -352,23 +376,36 @@ ffi_closure_unix64: ...@@ -352,23 +376,36 @@ ffi_closure_unix64:
.uleb128 7 .uleb128 7
.uleb128 8 .uleb128 8
.byte 0xc0+6 /* DW_CFA_restore, %rbp */ .byte 0xc0+6 /* DW_CFA_restore, %rbp */
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW3-.LUW2
.byte 0xb /* DW_CFA_restore_state */
.align 8 .align 8
.LEFDE1: .LEFDE1:
.LSFDE3: .LSFDE3:
.long .LEFDE3-.LASFDE3 /* FDE Length */ .long .LEFDE3-.LASFDE3 /* FDE Length */
.LASFDE3: .LASFDE3:
.long .LASFDE3-.Lframe1 /* FDE CIE offset */ .long .LASFDE3-.Lframe1 /* FDE CIE offset */
.long .LUW4-. /* FDE initial location */ .long .LUW5-. /* FDE initial location */
.long .LUW7-.LUW4 /* FDE address range */ .long .LUW9-.LUW5 /* FDE address range */
.uleb128 0x0 /* Augmentation size */ .uleb128 0x0 /* Augmentation size */
.byte 0x4 /* DW_CFA_advance_loc4 */ .byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW5-.LUW4 .long .LUW6-.LUW5
.byte 0xe /* DW_CFA_def_cfa_offset */ .byte 0xe /* DW_CFA_def_cfa_offset */
.uleb128 208 .uleb128 208
.byte 0xa /* DW_CFA_remember_state */
.byte 0x4 /* DW_CFA_advance_loc4 */ .byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW6-.LUW5 .long .LUW7-.LUW6
.byte 0xe /* DW_CFA_def_cfa_offset */ .byte 0xe /* DW_CFA_def_cfa_offset */
.uleb128 8 .uleb128 8
.byte 0x4 /* DW_CFA_advance_loc4 */
.long .LUW8-.LUW7
.byte 0xb /* DW_CFA_restore_state */
.align 8 .align 8
.LEFDE3: .LEFDE3:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment