Commit fa0c433

[X86] Attempt to use VPMADD52L/VPMULUDQ instead of VPMULLQ on slow VPMULLQ targets (or when VPMULLQ is unavailable) (#171760)
This pull request introduces a new tuning flag "TuningSlowPMULLQ" and uses it to optimize 64-bit vector multiplication on Intel targets where "VPMULLQ" is slow. On recent Intel microarchitectures, the "VPMULLQ" instruction has a high latency of 15 cycles. In contrast, the "VPMADD52LUQ" instruction (available via AVX512-IFMA) performs a similar operation with a latency of only 4 cycles.

Reference data from uops.info (Ice Lake):
- "VPMULLQ": latency 15, throughput 1.5
- "VPMADD52LUQ": latency 4, throughput 0.5

Fixes #158854
1 parent 9cd8018 commit fa0c433
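For context on the combine added in X86ISelLowering.cpp below, this is roughly the IR shape it targets (a minimal, hypothetical sketch modelled on the new tests in this commit; the function name is illustrative). Both operands have their high bits masked off, so KnownBits can prove that the operands and the product fit in 52 bits, and the multiply can be lowered to vpmadd52luq with a zero addend instead of vpmullq:

; Operands are limited to 33 and 19 bits, so the product fits in 52 bits.
define <8 x i64> @mul_52bit_sketch(<8 x i64> %a, <8 x i64> %b) {
  %a_lo = and <8 x i64> %a, splat (i64 8589934591) ; 2^33 - 1
  %b_lo = and <8 x i64> %b, splat (i64 524287)     ; 2^19 - 1
  %res = mul <8 x i64> %a_lo, %b_lo
  ret <8 x i64> %res
}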

3 files changed, +284 −3 lines changed

llvm/lib/Target/X86/X86.td

Lines changed: 9 additions & 3 deletions
@@ -485,6 +485,9 @@ def TuningSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
 def TuningSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
                                         "PMULLD instruction is slow (compared to PMULLW/PMULHW and PMULUDQ)">;
 
+def TuningSlowPMULLQ : SubtargetFeature<"slow-pmullq", "IsPMULLQSlow", "true",
+                                        "PMULLQ instruction is slow">;
+
 def TuningSlowPMADDWD : SubtargetFeature<"slow-pmaddwd", "IsPMADDWDSlow",
                                          "true",
                                          "PMADDWD is slower than PMULLD">;

@@ -1065,7 +1068,8 @@ def ProcessorFeatures {
                                        TuningNoDomainDelayMov,
                                        TuningNoDomainDelayShuffle,
                                        TuningNoDomainDelayBlend,
-                                       TuningFastImmVectorShift];
+                                       TuningFastImmVectorShift,
+                                       TuningSlowPMULLQ];
   list<SubtargetFeature> CNLFeatures =
       !listconcat(SKLFeatures, CNLAdditionalFeatures);

@@ -1094,7 +1098,8 @@ def ProcessorFeatures {
                                        TuningNoDomainDelayMov,
                                        TuningNoDomainDelayShuffle,
                                        TuningNoDomainDelayBlend,
-                                       TuningFastImmVectorShift];
+                                       TuningFastImmVectorShift,
+                                       TuningSlowPMULLQ];
   list<SubtargetFeature> ICLFeatures =
       !listconcat(CNLFeatures, ICLAdditionalFeatures);

@@ -1291,7 +1296,8 @@ def ProcessorFeatures {
                                                  FeatureWAITPKG];
   list<SubtargetFeature> ADLAdditionalTuning = [TuningPERMFalseDeps,
                                                 TuningPreferMovmskOverVTest,
-                                                TuningFastImmVectorShift];
+                                                TuningFastImmVectorShift,
+                                                TuningSlowPMULLQ];
   list<SubtargetFeature> ADLRemoveTuning = [TuningPOPCNTFalseDeps];
   list<SubtargetFeature> ADLTuning =
       !listremove(!listconcat(SKLTuning, ADLAdditionalTuning), ADLRemoveTuning);
llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 37 additions & 0 deletions
@@ -49986,6 +49986,40 @@ static SDValue combineMulToPMULDQ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineMulToPMADD52(SDNode *N, const SDLoc &DL,
+                                   SelectionDAG &DAG,
+                                   const X86Subtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  // Only optimize vXi64 when the standard PMULLQ instruction is slow.
+  if (VT.getScalarType() != MVT::i64 || !Subtarget.isPMULLQSlow())
+    return SDValue();
+  // Check hardware support:
+  // 512-bit vectors (v8i64) require AVX512-IFMA.
+  // 128/256-bit vectors (v2i64/v4i64) require either AVX512-IFMA + VLX, or
+  // AVX-IFMA.
+  bool Supported512 = (VT == MVT::v8i64) && Subtarget.hasIFMA();
+  bool SupportedSmall =
+      (VT == MVT::v2i64 || VT == MVT::v4i64) &&
+      ((Subtarget.hasIFMA() && Subtarget.hasVLX()) || Subtarget.hasAVXIFMA());
+
+  if (!Supported512 && !SupportedSmall)
+    return SDValue();
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  // Use KnownBits analysis to verify if the high bits are zero.
+  KnownBits Known0 = DAG.computeKnownBits(Op0);
+  KnownBits Known1 = DAG.computeKnownBits(Op1);
+  KnownBits KnownMul = KnownBits::mul(Known0, Known1, Op0 == Op1);
+  // If inputs and the result fit in 52 bits, VPMADD52L is safe to use.
+  // We pass a zero vector as the addend since we only need the multiply result.
+  if (Known0.countMaxActiveBits() <= 52 && Known1.countMaxActiveBits() <= 52 &&
+      KnownMul.countMaxActiveBits() <= 52) {
+    SDValue Zero = getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
+    return DAG.getNode(X86ISD::VPMADD52L, DL, VT, Op0, Op1, Zero);
+  }
+  return SDValue();
+}
+
 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {

@@ -49998,6 +50032,9 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
   if (SDValue V = combineMulToPMULDQ(N, DL, DAG, Subtarget))
     return V;
 
+  if (SDValue V = combineMulToPMADD52(N, DL, DAG, Subtarget))
+    return V;
+
   if (DCI.isBeforeLegalize() && VT.isVector())
     return reduceVMULWidth(N, DL, DAG, Subtarget);
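A note on why the three countMaxActiveBits() <= 52 checks above are sufficient (a sketch, assuming the documented VPMADD52LUQ behaviour of adding the low 52 bits of the 52x52-bit product of its sources to the destination):

$$\mathrm{dst}' = \mathrm{dst} + \bigl((a \bmod 2^{52})\,(b \bmod 2^{52})\bigr) \bmod 2^{52}$$

With a zero addend and $a < 2^{52}$, $b < 2^{52}$, $ab < 2^{52}$, both the operand masking and the final reduction modulo $2^{52}$ are no-ops, so $\mathrm{dst}' = ab$: the VPMADD52L node computes exactly the original 64-bit multiply.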

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=cannonlake | FileCheck %s --check-prefix=CNL
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=cannonlake -mattr=-avx512vl | FileCheck %s --check-prefix=NOVLX
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512ifma,+avx512dq,+avx512vl,+slow-pmullq | FileCheck %s --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512ifma,+avx512dq,-avx512vl,+slow-pmullq | FileCheck %s --check-prefix=GENERIC-NOVLX

; ============================================================================
; Case 1: 52-bit Optimization (vpmadd52luq)
; ============================================================================

define <8 x i64> @test_mul_52bit_fits(<8 x i64> %a, <8 x i64> %b) {
; CNL-LABEL: test_mul_52bit_fits:
; CNL: # %bb.0:
; CNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm2
; CNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
; CNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CNL-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
; CNL-NEXT: retq
;
; NOVLX-LABEL: test_mul_52bit_fits:
; NOVLX: # %bb.0:
; NOVLX-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm2
; NOVLX-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
; NOVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; NOVLX-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
; NOVLX-NEXT: retq
;
; GENERIC-LABEL: test_mul_52bit_fits:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm2
; GENERIC-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0
; GENERIC-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
; GENERIC-NEXT: retq
;
; GENERIC-NOVLX-LABEL: test_mul_52bit_fits:
; GENERIC-NOVLX: # %bb.0:
; GENERIC-NOVLX-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm2
; GENERIC-NOVLX-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
; GENERIC-NOVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; GENERIC-NOVLX-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
; GENERIC-NOVLX-NEXT: retq
  %a_masked = and <8 x i64> %a, splat (i64 8589934591)
  %b_masked = and <8 x i64> %b, splat (i64 524287)

  %res = mul <8 x i64> %a_masked, %b_masked
  ret <8 x i64> %res
}

; ============================================================================
; Case 1.5: Non-constant test (using Logical Shift Right to clear high bits)
; ============================================================================

define <8 x i64> @test_mul_shift_high_bits(<8 x i64> %a, <8 x i64> %b) {
; CNL-LABEL: test_mul_shift_high_bits:
; CNL: # %bb.0:
; CNL-NEXT: vpsrlq $31, %zmm0, %zmm2
; CNL-NEXT: vpsrlq $45, %zmm1, %zmm1
; CNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CNL-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
; CNL-NEXT: retq
;
; NOVLX-LABEL: test_mul_shift_high_bits:
; NOVLX: # %bb.0:
; NOVLX-NEXT: vpsrlq $31, %zmm0, %zmm2
; NOVLX-NEXT: vpsrlq $45, %zmm1, %zmm1
; NOVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; NOVLX-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
; NOVLX-NEXT: retq
;
; GENERIC-LABEL: test_mul_shift_high_bits:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpsrlq $31, %zmm0, %zmm2
; GENERIC-NEXT: vpsrlq $45, %zmm1, %zmm1
; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0
; GENERIC-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
; GENERIC-NEXT: retq
;
; GENERIC-NOVLX-LABEL: test_mul_shift_high_bits:
; GENERIC-NOVLX: # %bb.0:
; GENERIC-NOVLX-NEXT: vpsrlq $31, %zmm0, %zmm2
; GENERIC-NOVLX-NEXT: vpsrlq $45, %zmm1, %zmm1
; GENERIC-NOVLX-NEXT: vpxor %xmm0, %xmm0, %xmm0
; GENERIC-NOVLX-NEXT: vpmadd52luq %zmm1, %zmm2, %zmm0
; GENERIC-NOVLX-NEXT: retq
  %a_shifted = lshr <8 x i64> %a, <i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31, i64 31>
  %b_shifted = lshr <8 x i64> %b, <i64 45, i64 45, i64 45, i64 45, i64 45, i64 45, i64 45, i64 45>

  %res = mul <8 x i64> %a_shifted, %b_shifted
  ret <8 x i64> %res
}

; ============================================================================
; Case 2: 32-bit Optimization (vpmuludq)
; ============================================================================

define <8 x i64> @test_mul_32bit_fits(<8 x i64> %a, <8 x i64> %b) {
; CNL-LABEL: test_mul_32bit_fits:
; CNL: # %bb.0:
; CNL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; CNL-NEXT: retq
;
; NOVLX-LABEL: test_mul_32bit_fits:
; NOVLX: # %bb.0:
; NOVLX-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; NOVLX-NEXT: retq
;
; GENERIC-LABEL: test_mul_32bit_fits:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; GENERIC-NEXT: retq
;
; GENERIC-NOVLX-LABEL: test_mul_32bit_fits:
; GENERIC-NOVLX: # %bb.0:
; GENERIC-NOVLX-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; GENERIC-NOVLX-NEXT: retq

  %a_masked = and <8 x i64> %a, splat (i64 4294967295)
  %b_masked = and <8 x i64> %b, splat (i64 4294967295)

  %res = mul <8 x i64> %a_masked, %b_masked
  ret <8 x i64> %res
}

; ============================================================================
; Case 3: No Optimization (Full 64-bit)
; ============================================================================

define <8 x i64> @test_mul_full_64bit(<8 x i64> %a, <8 x i64> %b) {
; CNL-LABEL: test_mul_full_64bit:
; CNL: # %bb.0:
; CNL-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; CNL-NEXT: retq
;
; NOVLX-LABEL: test_mul_full_64bit:
; NOVLX: # %bb.0:
; NOVLX-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; NOVLX-NEXT: retq
;
; GENERIC-LABEL: test_mul_full_64bit:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; GENERIC-NEXT: retq
;
; GENERIC-NOVLX-LABEL: test_mul_full_64bit:
; GENERIC-NOVLX: # %bb.0:
; GENERIC-NOVLX-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; GENERIC-NOVLX-NEXT: retq
  %res = mul <8 x i64> %a, %b
  ret <8 x i64> %res
}

; ============================================================================
; Case 4: Vector Width Variety (Check 256-bit / YMM)
; ============================================================================

define <4 x i64> @test_mul_52bit_ymm(<4 x i64> %a, <4 x i64> %b) {
; CNL-LABEL: test_mul_52bit_ymm:
; CNL: # %bb.0:
; CNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
; CNL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm1
; CNL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; CNL-NEXT: vpmadd52luq %ymm1, %ymm2, %ymm0
; CNL-NEXT: retq
;
; NOVLX-LABEL: test_mul_52bit_ymm:
; NOVLX: # %bb.0:
; NOVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [8589934591,8589934591,8589934591,8589934591]
; NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; NOVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [524287,524287,524287,524287]
; NOVLX-NEXT: vpand %ymm2, %ymm1, %ymm1
; NOVLX-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; NOVLX-NEXT: retq
;
; GENERIC-LABEL: test_mul_52bit_ymm:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm2
; GENERIC-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm1
; GENERIC-NEXT: vpxor %xmm0, %xmm0, %xmm0
; GENERIC-NEXT: vpmadd52luq %ymm1, %ymm2, %ymm0
; GENERIC-NEXT: retq
;
; GENERIC-NOVLX-LABEL: test_mul_52bit_ymm:
; GENERIC-NOVLX: # %bb.0:
; GENERIC-NOVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [8589934591,8589934591,8589934591,8589934591]
; GENERIC-NOVLX-NEXT: vpand %ymm2, %ymm0, %ymm0
; GENERIC-NOVLX-NEXT: vpbroadcastq {{.*#+}} ymm2 = [524287,524287,524287,524287]
; GENERIC-NOVLX-NEXT: vpand %ymm2, %ymm1, %ymm1
; GENERIC-NOVLX-NEXT: vpmullq %zmm1, %zmm0, %zmm0
; GENERIC-NOVLX-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; GENERIC-NOVLX-NEXT: retq

  %a_masked = and <4 x i64> %a, splat (i64 8589934591)
  %b_masked = and <4 x i64> %b, splat (i64 524287)

  %res = mul <4 x i64> %a_masked, %b_masked
  ret <4 x i64> %res
}

; ============================================================================
; Case 1.5: 32-bit Signed Optimization (vpmuldq)
; ============================================================================

define <8 x i64> @test_mul_32bit_signed(<8 x i32> %a, <8 x i32> %b) {
; CNL-LABEL: test_mul_32bit_signed:
; CNL: # %bb.0:
; CNL-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; CNL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; CNL-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
; CNL-NEXT: retq
;
; NOVLX-LABEL: test_mul_32bit_signed:
; NOVLX: # %bb.0:
; NOVLX-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; NOVLX-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; NOVLX-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
; NOVLX-NEXT: retq
;
; GENERIC-LABEL: test_mul_32bit_signed:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; GENERIC-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; GENERIC-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
; GENERIC-NEXT: retq
;
; GENERIC-NOVLX-LABEL: test_mul_32bit_signed:
; GENERIC-NOVLX: # %bb.0:
; GENERIC-NOVLX-NEXT: vpmovzxdq {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero
; GENERIC-NOVLX-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; GENERIC-NOVLX-NEXT: vpmuldq %zmm1, %zmm0, %zmm0
; GENERIC-NOVLX-NEXT: retq
  %a_ = sext <8 x i32> %a to <8 x i64>
  %b_ = sext <8 x i32> %b to <8 x i64>

  %res = mul <8 x i64> %a_, %b_
  ret <8 x i64> %res
}
