Commit 61d3c5d

Auto merge of #141538 - folkertdev:systemv-x86_64-va_arg, r=<try>
implement `va_arg` for x86_64 systemv

tracking issue: #44930

Turns out LLVM's `va_arg` is also unreliable for this target (llvm/llvm-project#141361), so, like clang, we implement our own. I used:

- the spec at https://gitlab.com/x86-psABIs/x86-64-ABI
- the clang implementation at https://github.com/llvm/llvm-project/blob/9a440f84773c56d3803f330774acb2b4f471d5b4/clang/lib/CodeGen/Targets/X86.cpp#L3041

We can take a bunch of shortcuts because the return type of `va_arg` must implement `VaArgSafe`. I also extended some of the tests, because up to 11 floats can be stored in the `reg_save_area` for this calling convention.

r? `@workingjubilee`

`@rustbot` label +F-c_variadic

try-job: x86_64-apple-1
2 parents be42293 + fd76915 commit 61d3c5d
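For orientation, this is the shape of source code the new lowering serves: a C-variadic Rust function reads its arguments through `ap.arg::<T>()`, whose `T: VaArgSafe` bound is what permits the shortcuts mentioned above. A minimal sketch, assuming a nightly compiler with the unstable `c_variadic` feature (the function name `sum_ints` is illustrative, not part of the patch):

    #![feature(c_variadic)]

    /// Sums `count` integer varargs; callable from C as `sum_ints(3, 1, 2, 3)`.
    #[no_mangle]
    pub unsafe extern "C" fn sum_ints(count: i32, mut ap: ...) -> i32 {
        let mut total = 0;
        for _ in 0..count {
            // `arg::<T>` requires `T: VaArgSafe`; only pointers, f64, i32,
            // u32, i64 and u64 qualify, so the lowering never has to handle
            // exotic layouts.
            total += ap.arg::<i32>();
        }
        total
    }

Each `ap.arg::<T>()` call is what `emit_va_arg` lowers; on x86_64 SysV targets it previously fell back to LLVM's unreliable `va_arg` instruction and now uses the hand-rolled lowering in this commit.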

File tree

3 files changed: +316 -11 lines changed

compiler/rustc_codegen_llvm/src/va_arg.rs

Lines changed: 304 additions & 9 deletions
@@ -1,7 +1,10 @@
-use rustc_abi::{Align, Endian, HasDataLayout, Size};
+use rustc_abi::{Align, BackendRepr, Endian, HasDataLayout, Primitive, Size, TyAndLayout};
+use rustc_codegen_ssa::MemFlags;
 use rustc_codegen_ssa::common::IntPredicate;
 use rustc_codegen_ssa::mir::operand::OperandRef;
-use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods, ConstCodegenMethods};
+use rustc_codegen_ssa::traits::{
+    BaseTypeCodegenMethods, BuilderMethods, ConstCodegenMethods, LayoutTypeCodegenMethods,
+};
 use rustc_middle::ty::Ty;
 use rustc_middle::ty::layout::{HasTyCtxt, LayoutOf};

@@ -303,6 +306,298 @@ fn emit_s390x_va_arg<'ll, 'tcx>(
     bx.load(val_type, val_addr, layout.align.abi)
 }
 
+fn emit_x86_64_sysv64_va_arg<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    list: OperandRef<'tcx, &'ll Value>,
+    target_ty: Ty<'tcx>,
+) -> &'ll Value {
+    let dl = bx.cx.data_layout();
+
+    // Implementation of the systemv x86_64 ABI calling convention for va_args, see
+    // https://gitlab.com/x86-psABIs/x86-64-ABI (section 3.5.7). This implementation is heavily
+    // based on the one in clang.
+
+    // We're able to take some shortcuts because the return type of `va_arg` must implement the
+    // `VaArgSafe` trait. Currently, only pointers, f64, i32, u32, i64 and u64 implement this trait.
+
+    // typedef struct __va_list_tag {
+    //     unsigned int gp_offset;
+    //     unsigned int fp_offset;
+    //     void *overflow_arg_area;
+    //     void *reg_save_area;
+    // } va_list[1];
+    let va_list_addr = list.immediate();
+
+    // Peel off any newtype wrappers.
+    let layout = {
+        let mut layout = bx.cx.layout_of(target_ty);
+
+        while let Some((_, inner)) = layout.non_1zst_field(bx.cx) {
+            layout = inner;
+        }
+
+        layout
+    };
+
+    // AMD64-ABI 3.5.7p5: Step 1. Determine whether type may be passed
+    // in the registers. If not go to step 7.
+
+    // AMD64-ABI 3.5.7p5: Step 2. Compute num_gp to hold the number of
+    // general purpose registers needed to pass type and num_fp to hold
+    // the number of floating point registers needed.
+
+    let mut num_gp_registers = 0;
+    let mut num_fp_registers = 0;
+
+    let mut registers_for_primitive = |p| match p {
+        Primitive::Int(integer, _is_signed) => {
+            num_gp_registers += integer.size().bytes().div_ceil(8) as u32;
+        }
+        Primitive::Float(float) => {
+            num_fp_registers += float.size().bytes().div_ceil(16) as u32;
+        }
+        Primitive::Pointer(_) => {
+            num_gp_registers += 1;
+        }
+    };
+
+    match layout.layout.backend_repr() {
+        BackendRepr::Scalar(scalar) => {
+            registers_for_primitive(scalar.primitive());
+        }
+        BackendRepr::ScalarPair(scalar1, scalar2) => {
+            registers_for_primitive(scalar1.primitive());
+            registers_for_primitive(scalar2.primitive());
+        }
+        BackendRepr::SimdVector { .. } => {
+            // Because no instance of VaArgSafe uses a non-scalar `BackendRepr`.
+            unreachable!(
+                "No x86-64 SysV va_arg implementation for {:?}",
+                layout.layout.backend_repr()
+            )
+        }
+        BackendRepr::Memory { .. } => {
+            let mem_addr = x86_64_sysv64_va_arg_from_memory(bx, va_list_addr, layout);
+            return bx.load(layout.llvm_type(bx), mem_addr, layout.align.abi);
+        }
+    };
+
+    // AMD64-ABI 3.5.7p5: Step 3. Verify whether arguments fit into
+    // registers. In the case: l->gp_offset > 48 - num_gp * 8 or
+    // l->fp_offset > 176 - num_fp * 16 go to step 7.
+
+    let unsigned_int_offset = 4;
+    let ptr_offset = 8;
+    let gp_offset_ptr = va_list_addr;
+    let fp_offset_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(unsigned_int_offset));
+
+    let gp_offset_v = bx.load(bx.type_i32(), gp_offset_ptr, Align::from_bytes(8).unwrap());
+    let fp_offset_v = bx.load(bx.type_i32(), fp_offset_ptr, Align::from_bytes(4).unwrap());
+
+    let mut use_regs = bx.const_bool(false);
+
+    if num_gp_registers > 0 {
+        let max_offset_val = 48u32 - num_gp_registers * 8;
+        let fits_in_gp = bx.icmp(IntPredicate::IntULE, gp_offset_v, bx.const_u32(max_offset_val));
+        use_regs = fits_in_gp;
+    }
+
+    if num_fp_registers > 0 {
+        let max_offset_val = 176u32 - num_fp_registers * 16;
+        let fits_in_fp = bx.icmp(IntPredicate::IntULE, fp_offset_v, bx.const_u32(max_offset_val));
+        use_regs = if num_gp_registers > 0 { bx.and(use_regs, fits_in_fp) } else { fits_in_fp };
+    }
+
+    let in_reg = bx.append_sibling_block("va_arg.in_reg");
+    let in_mem = bx.append_sibling_block("va_arg.in_mem");
+    let end = bx.append_sibling_block("va_arg.end");
+
+    bx.cond_br(use_regs, in_reg, in_mem);
+
+    // Emit code to load the value if it was passed in a register.
+    bx.switch_to_block(in_reg);
+
+    // AMD64-ABI 3.5.7p5: Step 4. Fetch type from l->reg_save_area with
+    // an offset of l->gp_offset and/or l->fp_offset. This may require
+    // copying to a temporary location in case the parameter is passed
+    // in different register classes or requires an alignment greater
+    // than 8 for general purpose registers and 16 for XMM registers.
+    //
+    // FIXME(llvm): This really results in shameful code when we end up needing to
+    // collect arguments from different places; often what should result in a
+    // simple assembling of a structure from scattered addresses has many more
+    // loads than necessary. Can we clean this up?
+    let reg_save_area_ptr =
+        bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(2 * unsigned_int_offset + ptr_offset));
+    let reg_save_area_v = bx.load(bx.type_ptr(), reg_save_area_ptr, dl.pointer_align.abi);
+
+    let reg_addr = match layout.layout.backend_repr() {
+        BackendRepr::Scalar(scalar) => match scalar.primitive() {
+            Primitive::Int(_, _) | Primitive::Pointer(_) => {
+                let reg_addr = bx.inbounds_ptradd(reg_save_area_v, gp_offset_v);
+
+                // Copy into a temporary if the type is more aligned than the register save area.
+                copy_to_temporary_if_more_aligned(bx, reg_addr, layout)
+            }
+            Primitive::Float(_) => bx.inbounds_ptradd(reg_save_area_v, fp_offset_v),
+        },
+        BackendRepr::ScalarPair(scalar1, scalar2) => {
+            let ty_lo = bx.cx().scalar_pair_element_backend_type(layout, 0, false);
+            let ty_hi = bx.cx().scalar_pair_element_backend_type(layout, 1, false);
+
+            let align_lo = layout.field(bx.cx, 0).layout.align().abi;
+            let align_hi = layout.field(bx.cx, 1).layout.align().abi;
+
+            match (scalar1.primitive(), scalar2.primitive()) {
+                (Primitive::Float(_), Primitive::Float(_)) => {
+                    // SSE registers are spaced 16 bytes apart in the register save
+                    // area, we need to collect the two eightbytes together.
+                    // The ABI isn't explicit about this, but it seems reasonable
+                    // to assume that the slots are 16-byte aligned, since the stack is
+                    // naturally 16-byte aligned and the prologue is expected to store
+                    // all the SSE registers to the RSA.
+                    let reg_lo_addr = bx.inbounds_ptradd(reg_save_area_v, fp_offset_v);
+                    let reg_hi_addr = bx.inbounds_ptradd(reg_lo_addr, bx.const_i32(16));
+
+                    let align = layout.layout.align().abi;
+                    let tmp = bx.alloca(layout.layout.size(), align);
+
+                    let reg_lo = bx.load(ty_lo, reg_lo_addr, align_lo);
+                    let reg_hi = bx.load(ty_hi, reg_hi_addr, align_hi);
+
+                    let offset = scalar1.size(bx.cx).align_to(align_hi).bytes();
+                    let field0 = tmp;
+                    let field1 = bx.inbounds_ptradd(tmp, bx.const_u32(offset as u32));
+
+                    bx.store(reg_lo, field0, align);
+                    bx.store(reg_hi, field1, align);
+
+                    tmp
+                }
+                (Primitive::Float(_), _) | (_, Primitive::Float(_)) => {
+                    let gp_addr = bx.inbounds_ptradd(reg_save_area_v, gp_offset_v);
+                    let fp_addr = bx.inbounds_ptradd(reg_save_area_v, fp_offset_v);
+
+                    let (reg_lo_addr, reg_hi_addr) = match scalar1.primitive() {
+                        Primitive::Float(_) => (fp_addr, gp_addr),
+                        Primitive::Int(_, _) | Primitive::Pointer(_) => (gp_addr, fp_addr),
+                    };
+
+                    let tmp = bx.alloca(layout.layout.size(), layout.layout.align().abi);
+
+                    let reg_lo = bx.load(ty_lo, reg_lo_addr, align_lo);
+                    let reg_hi = bx.load(ty_hi, reg_hi_addr, align_hi);
+
+                    let offset = scalar1.size(bx.cx).align_to(align_hi).bytes();
+                    let field0 = tmp;
+                    let field1 = bx.inbounds_ptradd(tmp, bx.const_u32(offset as u32));
+
+                    bx.store(reg_lo, field0, align_lo);
+                    bx.store(reg_hi, field1, align_hi);
+
+                    tmp
+                }
+                (_, _) => {
+                    // Two integer/pointer values are just contiguous in memory.
+                    let reg_addr = bx.inbounds_ptradd(reg_save_area_v, gp_offset_v);
+
+                    // Copy into a temporary if the type is more aligned than the register save area.
+                    copy_to_temporary_if_more_aligned(bx, reg_addr, layout)
+                }
+            }
+        }
+        BackendRepr::SimdVector { .. } => {
+            unreachable!("panics in the previous match on `backend_repr`")
+        }
+        BackendRepr::Memory { .. } => {
+            unreachable!("early returns in the previous match on `backend_repr`")
+        }
+    };
+
+    // AMD64-ABI 3.5.7p5: Step 5. Set:
+    // l->gp_offset = l->gp_offset + num_gp * 8
+    if num_gp_registers > 0 {
+        let offset = bx.const_u32(num_gp_registers * 8);
+        let sum = bx.add(gp_offset_v, offset);
+        bx.store(sum, gp_offset_ptr, Align::from_bytes(8).unwrap());
+    }
+
+    // l->fp_offset = l->fp_offset + num_fp * 16.
+    if num_fp_registers > 0 {
+        let offset = bx.const_u32(num_fp_registers * 16);
+        let sum = bx.add(fp_offset_v, offset);
+        bx.store(sum, fp_offset_ptr, Align::from_bytes(4).unwrap());
+    }
+
+    bx.br(end);
+
+    bx.switch_to_block(in_mem);
+    let mem_addr = x86_64_sysv64_va_arg_from_memory(bx, va_list_addr, layout);
+    bx.br(end);
+
+    bx.switch_to_block(end);
+
+    let val_type = layout.llvm_type(bx);
+    let val_addr = bx.phi(bx.type_ptr(), &[reg_addr, mem_addr], &[in_reg, in_mem]);
+
+    bx.load(val_type, val_addr, layout.align.abi)
+}
+
+/// Copy into a temporary if the type is more aligned than the register save area.
+fn copy_to_temporary_if_more_aligned<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    reg_addr: &'ll Value,
+    layout: TyAndLayout<'tcx, Ty<'tcx>>,
+) -> &'ll Value {
+    if layout.layout.align.abi.bytes() > 8 {
+        let tmp = bx.alloca(layout.layout.size(), layout.layout.align().abi);
+        bx.memcpy(
+            tmp,
+            layout.layout.align.abi,
+            reg_addr,
+            Align::from_bytes(8).unwrap(),
+            bx.const_u32(layout.layout.size().bytes() as u32),
+            MemFlags::empty(),
+        );
+        tmp
+    } else {
+        reg_addr
+    }
+}
+
+fn x86_64_sysv64_va_arg_from_memory<'ll, 'tcx>(
+    bx: &mut Builder<'_, 'll, 'tcx>,
+    va_list_addr: &'ll Value,
+    layout: TyAndLayout<'tcx, Ty<'tcx>>,
+) -> &'ll Value {
+    let dl = bx.cx.data_layout();
+
+    let overflow_arg_area_ptr = bx.inbounds_ptradd(va_list_addr, bx.const_usize(8));
+
+    let overflow_arg_area_v = bx.load(bx.type_ptr(), overflow_arg_area_ptr, dl.pointer_align.abi);
+    // AMD64-ABI 3.5.7p5: Step 7. Align l->overflow_arg_area upwards to a 16
+    // byte boundary if alignment needed by type exceeds 8 byte boundary.
+    // It isn't stated explicitly in the standard, but in practice we use
+    // alignment greater than 16 where necessary.
+    if layout.layout.align.abi.bytes() > 8 {
+        unreachable!("all instances of VaArgSafe have an alignment <= 8");
+    }
+
+    // AMD64-ABI 3.5.7p5: Step 8. Fetch type from l->overflow_arg_area.
+    let mem_addr = overflow_arg_area_v;
+
+    // AMD64-ABI 3.5.7p5: Step 9. Set l->overflow_arg_area to:
+    // l->overflow_arg_area + sizeof(type).
+    // AMD64-ABI 3.5.7p5: Step 10. Align l->overflow_arg_area upwards to
+    // an 8 byte boundary.
+    let size_in_bytes = layout.layout.size().bytes();
+    let offset = bx.const_i32(size_in_bytes.next_multiple_of(8) as i32);
+    let overflow_arg_area = bx.inbounds_ptradd(overflow_arg_area_v, offset);
+    bx.store(overflow_arg_area, overflow_arg_area_ptr, dl.pointer_align.abi);
+
+    mem_addr
+}
+
 fn emit_xtensa_va_arg<'ll, 'tcx>(
     bx: &mut Builder<'_, 'll, 'tcx>,
     list: OperandRef<'tcx, &'ll Value>,
@@ -334,8 +629,7 @@ fn emit_xtensa_va_arg<'ll, 'tcx>(
     // (*va).va_ndx
     let va_reg_offset = 4;
     let va_ndx_offset = va_reg_offset + 4;
-    let offset_ptr =
-        bx.inbounds_gep(bx.type_i8(), va_list_addr, &[bx.cx.const_usize(va_ndx_offset)]);
+    let offset_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(va_ndx_offset));
 
     let offset = bx.load(bx.type_i32(), offset_ptr, bx.tcx().data_layout.i32_align.abi);
     let offset = round_up_to_alignment(bx, offset, layout.align.abi);
@@ -356,11 +650,10 @@ fn emit_xtensa_va_arg<'ll, 'tcx>(
     bx.store(offset_next, offset_ptr, bx.tcx().data_layout.pointer_align.abi);
 
     // (*va).va_reg
-    let regsave_area_ptr =
-        bx.inbounds_gep(bx.type_i8(), va_list_addr, &[bx.cx.const_usize(va_reg_offset)]);
+    let regsave_area_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(va_reg_offset));
     let regsave_area =
         bx.load(bx.type_ptr(), regsave_area_ptr, bx.tcx().data_layout.pointer_align.abi);
-    let regsave_value_ptr = bx.inbounds_gep(bx.type_i8(), regsave_area, &[offset]);
+    let regsave_value_ptr = bx.inbounds_ptradd(regsave_area, offset);
     bx.br(end);
 
     bx.switch_to_block(from_stack);
@@ -381,9 +674,9 @@ fn emit_xtensa_va_arg<'ll, 'tcx>(
     bx.store(offset_next_corrected, offset_ptr, bx.tcx().data_layout.pointer_align.abi);
 
     // let stack_value_ptr = unsafe { (*va).va_stk.byte_add(offset_corrected) };
-    let stack_area_ptr = bx.inbounds_gep(bx.type_i8(), va_list_addr, &[bx.cx.const_usize(0)]);
+    let stack_area_ptr = bx.inbounds_ptradd(va_list_addr, bx.cx.const_usize(0));
     let stack_area = bx.load(bx.type_ptr(), stack_area_ptr, bx.tcx().data_layout.pointer_align.abi);
-    let stack_value_ptr = bx.inbounds_gep(bx.type_i8(), stack_area, &[offset_corrected]);
+    let stack_value_ptr = bx.inbounds_ptradd(stack_area, offset_corrected);
     bx.br(end);
 
     bx.switch_to_block(end);
@@ -449,6 +742,8 @@ pub(super) fn emit_va_arg<'ll, 'tcx>(
                 AllowHigherAlign::No,
             )
         }
+        // This includes `target.is_like_darwin`, which on x86_64 targets is like sysv64.
+        "x86_64" => emit_x86_64_sysv64_va_arg(bx, addr, target_ty),
         "xtensa" => emit_xtensa_va_arg(bx, addr, target_ty),
        // For all other architecture/OS combinations fall back to using
        // the LLVM va_arg instruction.
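
For reference, the hardcoded offsets in this lowering (gp_offset at byte 0, fp_offset at byte 4, overflow_arg_area at byte 8, reg_save_area at byte 16) follow from the C typedef quoted in the implementation. A Rust mirror of that layout, purely illustrative and not part of the patch:

    // Illustrative mirror of the SysV x86-64 `__va_list_tag`.
    #[repr(C)]
    struct VaListTag {
        gp_offset: u32,             // byte 0: offset of the next GP slot; 48 means "use the stack"
        fp_offset: u32,             // byte 4: offset of the next SSE slot; 176 means "use the stack"
        overflow_arg_area: *mut u8, // byte 8: stack-passed arguments
        reg_save_area: *mut u8,     // byte 16: 6 GP slots of 8 bytes, then 8 SSE slots of 16 bytes
    }

The fast path's bounds checks, `gp_offset <= 48 - num_gp * 8` and `fp_offset <= 176 - num_fp * 16`, fall directly out of these slot sizes.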

tests/run-make/c-link-to-rust-va-list-fn/checkrust.rs

Lines changed: 9 additions & 0 deletions
@@ -112,6 +112,9 @@ pub unsafe extern "C" fn check_varargs_4(_: c_double, mut ap: ...) -> usize {
     continue_if!(ap.arg::<c_double>() == 8.0);
     continue_if!(ap.arg::<c_double>() == 9.0);
     continue_if!(ap.arg::<c_double>() == 10.0);
+    continue_if!(ap.arg::<c_double>() == 11.0);
+    continue_if!(ap.arg::<c_double>() == 12.0);
+    continue_if!(ap.arg::<c_double>() == 13.0);
     0
 }
 
@@ -137,5 +140,11 @@ pub unsafe extern "C" fn check_varargs_5(_: c_int, mut ap: ...) -> usize {
     continue_if!(ap.arg::<c_double>() == 9.0);
     continue_if!(ap.arg::<c_int>() == 10);
     continue_if!(ap.arg::<c_double>() == 10.0);
+    continue_if!(ap.arg::<c_int>() == 11);
+    continue_if!(ap.arg::<c_double>() == 11.0);
+    continue_if!(ap.arg::<c_int>() == 12);
+    continue_if!(ap.arg::<c_double>() == 12.0);
+    continue_if!(ap.arg::<c_int>() == 13);
+    continue_if!(ap.arg::<c_double>() == 13.0);
     0
 }

tests/run-make/c-link-to-rust-va-list-fn/test.c

Lines changed: 3 additions & 2 deletions
@@ -41,10 +41,11 @@ int main(int argc, char* argv[]) {
 
     assert(check_varargs_3(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) == 0);
 
-    assert(check_varargs_4(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0) == 0);
+    assert(check_varargs_4(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0,
+                           13.0) == 0);
 
     assert(check_varargs_5(0, 1.0, 1, 2.0, 2, 3.0, 3, 4.0, 4, 5, 5.0, 6, 6.0, 7, 7.0, 8, 8.0,
-                           9, 9.0, 10, 10.0) == 0);
+                           9, 9.0, 10, 10.0, 11, 11.0, 12, 12.0, 13, 13.0) == 0);
 
     return 0;
 }

0 commit comments
