diff --git a/build.rs b/build.rs index 8ac0808cf16e..fbaf95c34362 100644 --- a/build.rs +++ b/build.rs @@ -190,10 +190,8 @@ fn x64_should_panic(testsuite: &str, testname: &str, strategy: &str) -> bool { match (testsuite, testname) { ("simd", "simd_conversions") => return true, // unknown operator or unexpected token: tests/spec_testsuite/proposals/simd/simd_conversions.wast:724:6 - ("simd", "simd_i16x8_extadd_pairwise_i8x16") => return true, ("simd", "simd_i16x8_extmul_i8x16") => return true, ("simd", "simd_i16x8_q15mulr_sat_s") => return true, - ("simd", "simd_i32x4_extadd_pairwise_i16x8") => return true, ("simd", "simd_i32x4_extmul_i16x8") => return true, ("simd", "simd_i32x4_trunc_sat_f64x2") => return true, ("simd", "simd_i64x2_extmul_i32x4") => return true, @@ -229,9 +227,8 @@ fn ignore(testsuite: &str, testname: &str, strategy: &str) -> bool { // These are new instructions that are not really implemented in any backend. ("simd", "simd_conversions") - | ("simd", "simd_i16x8_extadd_pairwise_i8x16") | ("simd", "simd_i16x8_extmul_i8x16") - | ("simd", "simd_i32x4_extadd_pairwise_i16x8") + | ("simd", "simd_i16x8_q15mulr_sat_s") | ("simd", "simd_i32x4_extmul_i16x8") | ("simd", "simd_i32x4_trunc_sat_f64x2") | ("simd", "simd_i64x2_extmul_i32x4") => return true, diff --git a/cranelift/codegen/meta/src/shared/instructions.rs b/cranelift/codegen/meta/src/shared/instructions.rs index 8759fd347dfa..547b0768330c 100644 --- a/cranelift/codegen/meta/src/shared/instructions.rs +++ b/cranelift/codegen/meta/src/shared/instructions.rs @@ -4095,7 +4095,34 @@ pub(crate) fn define( Inst::new( "uwiden_high", r#" - Widen the high lanes of `x` using unsigned extension. + Lane-wise integer extended pairwise addition producing extended results + (twice wider results than the input) + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "extended_pairwise_add_signed", + r#" + Widen the high lanes of `x` using signed extension. + + This will double the lane width and halve the number of lanes. + "#, + &formats.unary, + ) + .operands_in(vec![x]) + .operands_out(vec![a]), + ); + + ig.push( + Inst::new( + "extended_pairwise_add_unsigned", + r#" + Widen the high lanes of `x` extending with zeros. This will double the lane width and halve the number of lanes. "#, diff --git a/cranelift/codegen/src/isa/aarch64/lower_inst.rs b/cranelift/codegen/src/isa/aarch64/lower_inst.rs index 6a5b70351c32..2c32070b0893 100644 --- a/cranelift/codegen/src/isa/aarch64/lower_inst.rs +++ b/cranelift/codegen/src/isa/aarch64/lower_inst.rs @@ -3545,6 +3545,8 @@ pub(crate) fn lower_insn_to_regs>( | Opcode::Fvdemote | Opcode::FvpromoteLow | Opcode::Vconcat + | Opcode::ExtendedPairwiseAddSigned + | Opcode::ExtendedPairwiseAddUnsigned | Opcode::Vsplit => unimplemented!("lowering {}", op), } diff --git a/cranelift/codegen/src/isa/s390x/lower.rs b/cranelift/codegen/src/isa/s390x/lower.rs index 188d7884a345..157ac060085f 100644 --- a/cranelift/codegen/src/isa/s390x/lower.rs +++ b/cranelift/codegen/src/isa/s390x/lower.rs @@ -2867,7 +2867,9 @@ fn lower_insn_to_regs>( | Opcode::WideningPairwiseDotProductS | Opcode::SqmulRoundSat | Opcode::FvpromoteLow - | Opcode::Fvdemote => { + | Opcode::Fvdemote + | Opcode::ExtendedPairwiseAddSigned + | Opcode::ExtendedPairwiseAddUnsigned => { // TODO unimplemented!("Vector ops not implemented."); } diff --git a/cranelift/codegen/src/isa/x64/inst/args.rs b/cranelift/codegen/src/isa/x64/inst/args.rs index c362075061d2..6e0f224a136c 100644 --- a/cranelift/codegen/src/isa/x64/inst/args.rs +++ b/cranelift/codegen/src/isa/x64/inst/args.rs @@ -567,6 +567,7 @@ pub enum SseOpcode { Pinsrb, Pinsrw, Pinsrd, + Pmaddubsw, Pmaddwd, Pmaxsb, Pmaxsw, @@ -734,6 +735,7 @@ impl SseOpcode { | SseOpcode::Pcmpgtd | SseOpcode::Pextrw | SseOpcode::Pinsrw + | SseOpcode::Pmaddubsw | SseOpcode::Pmaddwd | SseOpcode::Pmaxsw | SseOpcode::Pmaxub @@ -925,6 +927,7 @@ impl fmt::Debug for SseOpcode { SseOpcode::Pinsrb => "pinsrb", SseOpcode::Pinsrw => "pinsrw", SseOpcode::Pinsrd => "pinsrd", + SseOpcode::Pmaddubsw => "pmaddubsw", SseOpcode::Pmaddwd => "pmaddwd", SseOpcode::Pmaxsb => "pmaxsb", SseOpcode::Pmaxsw => "pmaxsw", diff --git a/cranelift/codegen/src/isa/x64/inst/emit.rs b/cranelift/codegen/src/isa/x64/inst/emit.rs index 441d89fa918d..b2f3d8b853de 100644 --- a/cranelift/codegen/src/isa/x64/inst/emit.rs +++ b/cranelift/codegen/src/isa/x64/inst/emit.rs @@ -1483,6 +1483,7 @@ pub(crate) fn emit( SseOpcode::Paddsw => (LegacyPrefixes::_66, 0x0FED, 2), SseOpcode::Paddusb => (LegacyPrefixes::_66, 0x0FDC, 2), SseOpcode::Paddusw => (LegacyPrefixes::_66, 0x0FDD, 2), + SseOpcode::Pmaddubsw => (LegacyPrefixes::_66, 0x0F3804, 3), SseOpcode::Pand => (LegacyPrefixes::_66, 0x0FDB, 2), SseOpcode::Pandn => (LegacyPrefixes::_66, 0x0FDF, 2), SseOpcode::Pavgb => (LegacyPrefixes::_66, 0x0FE0, 2), diff --git a/cranelift/codegen/src/isa/x64/lower.rs b/cranelift/codegen/src/isa/x64/lower.rs index 1cf1da4e9bd1..888889fbccc1 100644 --- a/cranelift/codegen/src/isa/x64/lower.rs +++ b/cranelift/codegen/src/isa/x64/lower.rs @@ -4495,6 +4495,128 @@ fn lower_insn_to_regs>( } } } + Opcode::ExtendedPairwiseAddSigned | Opcode::ExtendedPairwiseAddUnsigned => { + // Extended pairwise addition instructions computes extended sums within adjacent + // pairs of lanes of a SIMD vector, producing a SIMD vector with half as many lanes. + // Instruction sequences taken from instruction SPEC PR https://github.com/WebAssembly/simd/pull/380 + /* + let input_ty = ctx.input_ty(insn, 0); + let output_ty = ctx.output_ty(insn, 0); + let src = put_input_in_reg(ctx, inputs[0]); + let dst = get_output_reg(ctx, outputs[0]).only_reg().unwrap(); + unreachable!(); + match op { + Opcode::ExtendedPairwiseAddSigned => match (input_ty, output_ty) { + (types::I8X16, types::I16X8) => { + static MUL_CONST: [u8; 16] = [0x01; 16]; + let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST)); + let mul_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I8X16)); + ctx.emit(Inst::xmm_mov( + SseOpcode::Movdqa, + RegMem::reg(mul_const_reg.to_reg()), + dst, + )); + ctx.emit(Inst::xmm_rm_r(SseOpcode::Pmaddubsw, RegMem::reg(src), dst)); + } + (types::I16X8, types::I32X4) => { + static MUL_CONST: [u8; 16] = [ + 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, + 0x01, 0x00, 0x01, 0x00, + ]; + let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST)); + let mul_const_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I16X8)); + ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmaddwd, + RegMem::reg(mul_const_reg.to_reg()), + dst, + )); + } + _ => unreachable!( + "Type pattern not supported {:?}-{:?} not supported for {:?}.", + input_ty, output_ty, op + ), + }, + Opcode::ExtendedPairwiseAddUnsigned => match (input_ty, output_ty) { + (types::I8X16, types::I16X8) => { + static MUL_CONST: [u8; 16] = [0x01; 16]; + let mul_const = ctx.use_constant(VCodeConstantData::WellKnown(&MUL_CONST)); + let mul_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const(mul_const, mul_const_reg, types::I8X16)); + ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmaddubsw, + RegMem::reg(mul_const_reg.to_reg()), + dst, + )); + } + (types::I16X8, types::I32X4) => { + static PXOR_CONST: [u8; 16] = [ + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, + ]; + let pxor_const = + ctx.use_constant(VCodeConstantData::WellKnown(&PXOR_CONST)); + let pxor_const_reg = ctx.alloc_tmp(types::I16X8).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const( + pxor_const, + pxor_const_reg, + types::I16X8, + )); + ctx.emit(Inst::xmm_mov(SseOpcode::Movdqa, RegMem::reg(src), dst)); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pxor, + RegMem::reg(pxor_const_reg.to_reg()), + dst, + )); + + static MADD_CONST: [u8; 16] = [ + 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, + 0x01, 0x00, 0x01, 0x00, + ]; + let madd_const = + ctx.use_constant(VCodeConstantData::WellKnown(&MADD_CONST)); + let madd_const_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const( + madd_const, + madd_const_reg, + types::I16X8, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Pmaddwd, + RegMem::reg(madd_const_reg.to_reg()), + dst, + )); + + static ADDD_CONST2: [u8; 16] = [ + 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x01, 0x00, + ]; + let addd_const2 = + ctx.use_constant(VCodeConstantData::WellKnown(&ADDD_CONST2)); + let addd_const2_reg = ctx.alloc_tmp(types::I8X16).only_reg().unwrap(); + ctx.emit(Inst::xmm_load_const( + addd_const2, + addd_const2_reg, + types::I16X8, + )); + ctx.emit(Inst::xmm_rm_r( + SseOpcode::Paddd, + RegMem::reg(addd_const2_reg.to_reg()), + dst, + )); + } + _ => unreachable!( + "Type pattern not supported {:?}-{:?} not supported for {:?}.", + input_ty, output_ty, op + ), + }, + _ => unreachable!("{:?} not supported.", op), + } + */ + } Opcode::UwidenHigh | Opcode::UwidenLow | Opcode::SwidenHigh | Opcode::SwidenLow => { let input_ty = ctx.input_ty(insn, 0); let output_ty = ctx.output_ty(insn, 0); diff --git a/cranelift/interpreter/src/step.rs b/cranelift/interpreter/src/step.rs index 76ec43a814c3..c488e2fcd068 100644 --- a/cranelift/interpreter/src/step.rs +++ b/cranelift/interpreter/src/step.rs @@ -575,6 +575,8 @@ where Opcode::Fence => unimplemented!("Fence"), Opcode::WideningPairwiseDotProductS => unimplemented!("WideningPairwiseDotProductS"), Opcode::SqmulRoundSat => unimplemented!("SqmulRoundSat"), + Opcode::ExtendedPairwiseAddSigned => unimplemented!("ExtendedPairwiseAddSigned"), + Opcode::ExtendedPairwiseAddUnsigned => unimplemented!("ExtendedPairwiseAddUnsigned"), // TODO: these instructions should be removed once the new backend makes these obsolete // (see https://github.com/bytecodealliance/wasmtime/issues/1936); additionally, the diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs index 259d301df277..ffc3ab693658 100644 --- a/cranelift/wasm/src/code_translator.rs +++ b/cranelift/wasm/src/code_translator.rs @@ -1858,6 +1858,22 @@ pub fn translate_operator( let a = pop1_with_bitcast(state, I32X4, builder); state.push1(builder.ins().uwiden_high(a)) } + Operator::I16x8ExtAddPairwiseI8x16S => { + let a = pop1_with_bitcast(state, I8X16, builder); + state.push1(builder.ins().extended_pairwise_add_signed(a)) + } + Operator::I32x4ExtAddPairwiseI16x8S => { + let a = pop1_with_bitcast(state, I16X8, builder); + state.push1(builder.ins().extended_pairwise_add_signed(a)) + } + Operator::I16x8ExtAddPairwiseI8x16U => { + let a = pop1_with_bitcast(state, I8X16, builder); + state.push1(builder.ins().extended_pairwise_add_unsigned(a)) + } + Operator::I32x4ExtAddPairwiseI16x8U => { + let a = pop1_with_bitcast(state, I16X8, builder); + state.push1(builder.ins().extended_pairwise_add_unsigned(a)) + } Operator::F32x4Ceil | Operator::F64x2Ceil => { // This is something of a misuse of `type_of`, because that produces the return type // of `op`. In this case we want the arg type, but we know it's the same as the @@ -1902,10 +1918,6 @@ pub fn translate_operator( | Operator::I64x2ExtMulHighI32x4S | Operator::I64x2ExtMulLowI32x4U | Operator::I64x2ExtMulHighI32x4U - | Operator::I16x8ExtAddPairwiseI8x16S - | Operator::I16x8ExtAddPairwiseI8x16U - | Operator::I32x4ExtAddPairwiseI16x8S - | Operator::I32x4ExtAddPairwiseI16x8U | Operator::F64x2ConvertLowI32x4U | Operator::I32x4TruncSatF64x2SZero | Operator::I32x4TruncSatF64x2UZero => {